diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py index 6b5370f471203..a4cfecf0cef90 100644 --- a/.ci/compute_projects.py +++ b/.ci/compute_projects.py @@ -150,6 +150,7 @@ "mlir": "check-mlir", "openmp": "check-openmp", "polly": "check-polly", + "lit": "check-lit", } RUNTIMES = {"libcxx", "libcxxabi", "libunwind", "compiler-rt", "libc", "flang-rt"} @@ -166,8 +167,12 @@ ("llvm", "utils", "gn"): "gn", (".github", "workflows", "premerge.yaml"): ".ci", ("third-party",): ".ci", + ("llvm", "utils", "lit"): "lit", } +# Projects that should run tests but cannot be explicitly built. +SKIP_BUILD_PROJECTS = ["CIR", "lit"] + # Projects that should not run any tests. These need to be metaprojects. SKIP_PROJECTS = ["docs", "gn"] @@ -315,7 +320,9 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]: # clang build, but it requires an explicit option to enable. We set that # option here, and remove it from the projects_to_build list. enable_cir = "ON" if "CIR" in projects_to_build else "OFF" - projects_to_build.discard("CIR") + # Remove any metaprojects from the list of projects to build. 
+ for project in SKIP_BUILD_PROJECTS: + projects_to_build.discard(project) # We use a semicolon to separate the projects/runtimes as they get passed # to the CMake invocation and thus we need to use the CMake list separator diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py index bb1d24174ca30..fe1bf07eae8ff 100644 --- a/.ci/compute_projects_test.py +++ b/.ci/compute_projects_test.py @@ -413,6 +413,30 @@ def test_third_party_benchmark(self): "check-cxx check-cxxabi check-unwind", ) + def test_lit(self): + env_variables = compute_projects.get_env_variables( + ["llvm/utils/lit/CMakeLists.txt"], "Linux" + ) + self.assertEqual( + env_variables["projects_to_build"], + "bolt;clang;clang-tools-extra;flang;lld;lldb;llvm;mlir;polly", + ) + self.assertEqual( + env_variables["project_check_targets"], + "check-bolt check-clang check-clang-tools check-flang check-lit check-lld check-lldb check-llvm check-mlir check-polly", + ) + self.assertEqual( + env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind" + ) + self.assertEqual( + env_variables["runtimes_check_targets"], + "", + ) + self.assertEqual( + env_variables["runtimes_check_targets_needs_reconfig"], + "check-cxx check-cxxabi check-unwind", + ) + if __name__ == "__main__": unittest.main() diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 302b879e1c1f1..742bdbea3023d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -132,6 +132,9 @@ /mlir/**/Transforms/Mem2Reg.* @moxinilian /mlir/**/Transforms/SROA.* @moxinilian +# MLIR IRDL-related +/mlir/**/*IRDL* @moxinilian + # BOLT /bolt/ @aaupov @maksfb @rafaelauler @ayermolo @yota9 @paschalis-mpeis @yozhu diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 8c1dfd39b82c4..0000000000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,18 +0,0 @@ -version: 2 -updates: - - package-ecosystem: "github-actions" - directory: "/" - schedule: - interval: "monthly" - groups: - github-actions: - patterns: - 
- "*" - - package-ecosystem: "pip" - directory: "/llvm/docs" - schedule: - interval: "monthly" - groups: - llvm-docs-requirements: - patterns: - - "*" diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml index 55a269c001c2b..167e7cf06b3b2 100644 --- a/.github/workflows/build-ci-container-windows.yml +++ b/.github/workflows/build-ci-container-windows.yml @@ -61,7 +61,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: Download container - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: container - name: Push Container diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml index 3e91c49a51d19..67f35fd30701f 100644 --- a/.github/workflows/build-ci-container.yml +++ b/.github/workflows/build-ci-container.yml @@ -88,7 +88,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: Download container - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 - name: Push Container run: | diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml index 265fd73cc0bb7..cadcaa9a42e8f 100644 --- a/.github/workflows/build-metrics-container.yml +++ b/.github/workflows/build-metrics-container.yml @@ -66,7 +66,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: Download Container - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: container - name: Push Container diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b627803f61b27..8cdd39c164cca 100644 --- a/.github/workflows/docs.yml +++ 
b/.github/workflows/docs.yml @@ -60,7 +60,7 @@ jobs: fetch-depth: 2 - name: Get subprojects that have doc changes id: docs-changed-subprojects - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 with: skip_initial_fetch: true base_sha: 'HEAD~1' diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml new file mode 100644 index 0000000000000..efb8143877c4e --- /dev/null +++ b/.github/workflows/gha-codeql.yml @@ -0,0 +1,37 @@ +name: Github Actions CodeQL + +permissions: + contents: read + +on: + pull_request: + branches: + - main + paths: + - '.github/**' + schedule: + - cron: '30 0 * * *' + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: true + +jobs: + codeql: + name: 'Github Actions CodeQL' + runs-on: ubuntu-24.04 + permissions: + security-events: write + steps: + - name: Checkout LLVM + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + sparse-checkout: | + .github/ + - name: Initialize CodeQL + uses: github/codeql-action/init@192325c86100d080feab897ff886c34abd4c83a3 # v3.30.3 + with: + languages: actions + queries: security-extended + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@192325c86100d080feab897ff886c34abd4c83a3 # v3.30.3 diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index 3836cc56a7c22..d53a2f306afa2 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -113,10 +113,10 @@ jobs: ./configure sudo make install - name: Download source code - uses: llvm/actions/get-llvm-project-src@main + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: ${{ matrix.ref }} - repo: ${{ matrix.repo }} + repository: ${{ matrix.repo }} - name: Configure run: | mkdir install @@ -144,12 +144,12 @@ jobs: - abi-dump steps: - name: 
Download baseline - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: build-baseline path: build-baseline - name: Download latest - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: build-latest path: build-latest diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml new file mode 100644 index 0000000000000..17a97df029ba5 --- /dev/null +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -0,0 +1,112 @@ +# This file defines a workflow that runs the libc++ benchmarks when a comment is added to the PR. +# +# The comment is of the form: +# +# /libcxx-bot benchmark +# +# That will cause the specified benchmarks to be run on the PR and on the pull-request target, and +# their results to be compared. 
+ +name: Benchmark libc++ + +permissions: + contents: read + +on: + issue_comment: + types: + - created + - edited + +env: + CC: clang-22 + CXX: clang++-22 + +jobs: + run-benchmarks: + permissions: + pull-requests: write + + if: >- + github.event.issue.pull_request && + contains(github.event.comment.body, '/libcxx-bot benchmark') + + runs-on: llvm-premerge-libcxx-next-runners # TODO: This should run on a dedicated set of machines + steps: + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: '3.10' + + - name: Extract information from the PR + id: vars + env: + COMMENT_BODY: ${{ github.event.comment.body }} + run: | + python3 -m venv .venv + source .venv/bin/activate + python -m pip install pygithub + + cat <> ${GITHUB_OUTPUT} + import github + repo = github.Github("${{ github.token }}").get_repo("${{ github.repository }}") + pr = repo.get_pull(${{ github.event.issue.number }}) + print(f"pr_base={pr.base.sha}") + print(f"pr_head={pr.head.sha}") + EOF + BENCHMARKS=$(echo "$COMMENT_BODY" | sed -nE 's/\/libcxx-bot benchmark (.+)/\1/p') + echo "benchmarks=${BENCHMARKS}" >> ${GITHUB_OUTPUT} + + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + ref: ${{ steps.vars.outputs.pr_head }} + fetch-depth: 0 + fetch-tags: true # This job requires access to all the Git branches so it can diff against (usually) main + path: repo # Avoid nuking the workspace, where we have the Python virtualenv + + - name: Run baseline + run: | + source .venv/bin/activate && cd repo + python -m pip install -r libcxx/utils/requirements.txt + baseline_commit=$(git merge-base ${{ steps.vars.outputs.pr_base }} ${{ steps.vars.outputs.pr_head }}) + ./libcxx/utils/test-at-commit --commit ${baseline_commit} -B build/baseline -- -sv -j1 --param optimization=speed ${{ steps.vars.outputs.benchmarks }} + ./libcxx/utils/consolidate-benchmarks build/baseline | tee baseline.lnt + + - name: Run candidate + run: | + source 
.venv/bin/activate && cd repo + ./libcxx/utils/test-at-commit --commit ${{ steps.vars.outputs.pr_head }} -B build/candidate -- -sv -j1 --param optimization=speed ${{ steps.vars.outputs.benchmarks }} + ./libcxx/utils/consolidate-benchmarks build/candidate | tee candidate.lnt + + - name: Compare baseline and candidate runs + run: | + source .venv/bin/activate && cd repo + ./libcxx/utils/compare-benchmarks baseline.lnt candidate.lnt | tee results.txt + + - name: Update comment with results + run: | + source .venv/bin/activate && cd repo + cat < + + Benchmark results: + + + \`\`\` + {benchmark_results} + \`\`\` + + + """ + + comment.edit(new_comment_text) + EOF diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml index 52b486e7e62fc..ea80e229512d5 100644 --- a/.github/workflows/llvm-tests.yml +++ b/.github/workflows/llvm-tests.yml @@ -101,10 +101,10 @@ jobs: ./configure sudo make install - name: Download source code - uses: llvm/actions/get-llvm-project-src@main + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: ${{ matrix.ref }} - repo: ${{ matrix.repo }} + repository: ${{ matrix.repo }} - name: Configure run: | mkdir install @@ -148,17 +148,17 @@ jobs: - abi-dump steps: - name: Download baseline - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: build-baseline path: build-baseline - name: Download latest - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: build-latest path: build-latest - name: Download symbol list - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: symbol-list path: symbol-list diff --git 
a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 9341eaf3ce7c2..9396bf019e1ac 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -25,7 +25,7 @@ jobs: - name: Get changed files id: changed-files - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 with: separator: "," skip_initial_fetch: true diff --git a/.github/workflows/release-binaries-setup-stage/action.yml b/.github/workflows/release-binaries-setup-stage/action.yml index f5e5db27e6595..8f45e22886b6e 100644 --- a/.github/workflows/release-binaries-setup-stage/action.yml +++ b/.github/workflows/release-binaries-setup-stage/action.yml @@ -44,7 +44,7 @@ runs: - name: Download Previous Stage Artifact if: ${{ inputs.previous-artifact }} id: download - uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: pattern: ${{ runner.os }}-${{ runner.arch }}-${{ inputs.previous-artifact }}-* merge-multiple: true diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 116bdfb3929d3..8f422a0147748 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -294,7 +294,7 @@ jobs: sparse-checkout-cone-mode: false - name: 'Download artifact' - uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1 + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: pattern: '*-release-binary' merge-multiple: true diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 91ecf89da618c..72c8817daa714 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -288,6 +288,12 @@ class BinaryContext { /// overwritten, 
but it is okay to re-generate debug info for them. std::set ProcessedCUs; + /// DWARF-related container to manage lifecycle of groups of rows from line + /// tables associated with instructions. Since binary functions can span + /// multiple compilation units, instructions may reference debug line + /// information from multiple CUs. + ClusteredRowsContainer ClusteredRows; + // Setup MCPlus target builder void initializeTarget(std::unique_ptr TargetBuilder) { MIB = std::move(TargetBuilder); diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index b59926cc75571..51b139a15e1a0 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -35,6 +35,7 @@ #include "bolt/Core/JumpTable.h" #include "bolt/Core/MCPlus.h" #include "bolt/Utils/NameResolver.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" @@ -423,8 +424,9 @@ class BinaryFunction { /// Original LSDA type encoding unsigned LSDATypeEncoding{dwarf::DW_EH_PE_omit}; - /// Containing compilation unit for the function. - DWARFUnit *DwarfUnit{nullptr}; + /// All compilation units this function belongs to. + /// Maps DWARF unit offset to the unit pointer. + DenseMap DwarfUnitMap; /// Last computed hash value. Note that the value could be recomputed using /// different parameters by every pass. @@ -2409,15 +2411,21 @@ class BinaryFunction { void computeBlockHashes(HashFunction HashFunction = HashFunction::Default) const; - void setDWARFUnit(DWARFUnit *Unit) { DwarfUnit = Unit; } + void addDWARFUnit(DWARFUnit *Unit) { DwarfUnitMap[Unit->getOffset()] = Unit; } - /// Return DWARF compile unit for this function. - DWARFUnit *getDWARFUnit() const { return DwarfUnit; } + void removeDWARFUnit(DWARFUnit *Unit) { + DwarfUnitMap.erase(Unit->getOffset()); + } + + /// Return DWARF compile units for this function. 
+ /// Returns a reference to the map of DWARF unit offsets to units. + const DenseMap &getDWARFUnits() const { + return DwarfUnitMap; + } - /// Return line info table for this function. - const DWARFDebugLine::LineTable *getDWARFLineTable() const { - return getDWARFUnit() ? BC.DwCtx->getLineTableForUnit(getDWARFUnit()) - : nullptr; + const DWARFDebugLine::LineTable * + getDWARFLineTableForUnit(DWARFUnit *Unit) const { + return BC.DwCtx->getLineTableForUnit(Unit); } /// Finalize profile for the function. diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h index 6ea3b1af1024f..814978965ce3a 100644 --- a/bolt/include/bolt/Core/DebugData.h +++ b/bolt/include/bolt/Core/DebugData.h @@ -135,8 +135,6 @@ struct DebugLineTableRowRef { uint32_t DwCompileUnitIndex; uint32_t RowIndex; - const static DebugLineTableRowRef NULL_ROW; - bool operator==(const DebugLineTableRowRef &Rhs) const { return DwCompileUnitIndex == Rhs.DwCompileUnitIndex && RowIndex == Rhs.RowIndex; @@ -145,24 +143,6 @@ struct DebugLineTableRowRef { bool operator!=(const DebugLineTableRowRef &Rhs) const { return !(*this == Rhs); } - - static DebugLineTableRowRef fromSMLoc(const SMLoc &Loc) { - union { - decltype(Loc.getPointer()) Ptr; - DebugLineTableRowRef Ref; - } U; - U.Ptr = Loc.getPointer(); - return U.Ref; - } - - SMLoc toSMLoc() const { - union { - decltype(SMLoc().getPointer()) Ptr; - DebugLineTableRowRef Ref; - } U; - U.Ref = *this; - return SMLoc::getFromPointer(U.Ptr); - } }; /// Common buffer vector used for debug info handling. @@ -210,7 +190,7 @@ class DebugRangesSectionWriter { static bool classof(const DebugRangesSectionWriter *Writer) { return Writer->getKind() == RangesWriterKind::DebugRangesWriter; } - + /// Append a range to the main buffer. void appendToRangeBuffer(const DebugBufferVector &CUBuffer); @@ -852,6 +832,97 @@ class DwarfLineTable { // Returns DWARF Version for this line table. 
uint16_t getDwarfVersion() const { return DwarfVersion; } }; + +/// ClusteredRows represents a collection of debug line table row references. +/// +/// MEMORY LAYOUT AND DESIGN: +/// This class uses a flexible array member pattern to store all +/// DebugLineTableRowRef elements in a single contiguous memory allocation. +/// The memory layout is: +/// +/// +------------------+ +/// | ClusteredRows | <- Object header (Size + first element) +/// | - Size | +/// | - Rows (element) | <- First DebugLineTableRowRef element +/// +------------------+ +/// | element[1] | <- Additional DebugLineTableRowRef elements +/// | element[2] | stored immediately after the object +/// | ... | +/// | element[Size-1] | +/// +------------------+ +/// +/// The 'Rows' member serves as both the first element storage and the base +/// address for pointer arithmetic to access subsequent elements. +class ClusteredRows { +public: + ArrayRef getRows() const { + return ArrayRef(beginPtrConst(), Size); + } + + /// Returns the number of elements in the array. + uint64_t size() const { return Size; } + + /// We re-purpose SMLoc inside MCInst to store the pointer + /// to ClusteredRows. fromSMLoc() and toSMLoc() are helper + /// functions to convert between SMLoc and ClusteredRows. + + static const ClusteredRows *fromSMLoc(const SMLoc &Loc) { + return reinterpret_cast(Loc.getPointer()); + } + SMLoc toSMLoc() const { + return SMLoc::getFromPointer(reinterpret_cast(this)); + } + + /// Given a vector of DebugLineTableRowRef, this method + /// copies the elements into pre-allocated memory. + template void populate(const T Vec) { + assert(Vec.size() == Size && "Sizes must match"); + DebugLineTableRowRef *CurRawPtr = beginPtr(); + for (DebugLineTableRowRef RowRef : Vec) { + *CurRawPtr = RowRef; + ++CurRawPtr; + } + } + +private: + uint64_t Size; + DebugLineTableRowRef Rows; + + ClusteredRows(uint64_t Size) : Size(Size) {} + + /// Total size of the object including the array. 
+ static uint64_t getTotalSize(uint64_t Size) { + assert(Size > 0 && "Size must be greater than 0"); + return sizeof(ClusteredRows) + (Size - 1) * sizeof(DebugLineTableRowRef); + } + const DebugLineTableRowRef *beginPtrConst() const { + return reinterpret_cast(&Rows); + } + DebugLineTableRowRef *beginPtr() { + return reinterpret_cast(&Rows); + } + + friend class ClusteredRowsContainer; +}; + +/// ClusteredRowsContainer manages the lifecycle of ClusteredRows objects. +class ClusteredRowsContainer { +public: + ClusteredRows *createClusteredRows(uint64_t Size) { + auto *CR = new (std::malloc(ClusteredRows::getTotalSize(Size))) + ClusteredRows(Size); + Clusters.push_back(CR); + return CR; + } + ~ClusteredRowsContainer() { + for (auto *CR : Clusters) + std::free(CR); + } + +private: + std::vector Clusters; +}; + } // namespace bolt } // namespace llvm diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 1c630ab8efc10..90129d475d870 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -2216,7 +2216,8 @@ class MCPlusBuilder { } /// Print each annotation attached to \p Inst. - void printAnnotations(const MCInst &Inst, raw_ostream &OS) const; + void printAnnotations(const MCInst &Inst, raw_ostream &OS, + bool PrintMemData = false) const; /// Remove annotation with a given \p Index. 
/// diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 23a5a65c2c5f0..72c72bbaf4a65 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -33,6 +33,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Regex.h" #include #include @@ -1632,11 +1633,20 @@ void BinaryContext::preprocessDWODebugInfo() { DwarfUnit->getUnitDIE().find( {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), ""); - SmallString<16> AbsolutePath; + SmallString<16> AbsolutePath(DWOName); + std::string DWOCompDir = DwarfUnit->getCompilationDir(); if (!opts::CompDirOverride.empty()) { - sys::path::append(AbsolutePath, opts::CompDirOverride); - sys::path::append(AbsolutePath, DWOName); + DWOCompDir = opts::CompDirOverride; + } else if (!sys::fs::exists(DWOCompDir) && sys::fs::exists(DWOName)) { + DWOCompDir = "."; + this->outs() + << "BOLT-WARNING: Debug Fission: Debug Compilation Directory of " + << DWOName + << " does not exist. Relative path will be used to process .dwo " + "files.\n"; } + // Prevent failures when DWOName is already an absolute path. + sys::fs::make_absolute(DWOCompDir, AbsolutePath); DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false, AbsolutePath).getDwarfUnit(); if (!DWOCU->isDWOUnit()) { @@ -1644,7 +1654,8 @@ void BinaryContext::preprocessDWODebugInfo() { << "BOLT-WARNING: Debug Fission: DWO debug information for " << DWOName << " was not retrieved and won't be updated. 
Please check " - "relative path.\n"; + "relative path or use '--comp-dir-override' to specify the base " + "location.\n"; continue; } DWOCUs[*DWOId] = DWOCU; @@ -1693,22 +1704,39 @@ void BinaryContext::preprocessDebugInfo() { auto It = llvm::partition_point( AllRanges, [=](CURange R) { return R.HighPC <= FunctionAddress; }); - if (It != AllRanges.end() && It->LowPC <= FunctionAddress) - Function.setDWARFUnit(It->Unit); + if (It == AllRanges.end() || It->LowPC > FunctionAddress) { + continue; + } + Function.addDWARFUnit(It->Unit); + + // Go forward and add all units from ranges that cover the function. + while (++It != AllRanges.end()) { + if (It->LowPC > FunctionAddress || FunctionAddress >= It->HighPC) + break; + Function.addDWARFUnit(It->Unit); + } } // Discover units with debug info that needs to be updated. for (const auto &KV : BinaryFunctions) { const BinaryFunction &BF = KV.second; - if (shouldEmit(BF) && BF.getDWARFUnit()) - ProcessedCUs.insert(BF.getDWARFUnit()); + if (shouldEmit(BF) && !BF.getDWARFUnits().empty()) + for (const auto &[_, Unit] : BF.getDWARFUnits()) + ProcessedCUs.insert(Unit); } - // Clear debug info for functions from units that we are not going to process. 
for (auto &KV : BinaryFunctions) { BinaryFunction &BF = KV.second; - if (BF.getDWARFUnit() && !ProcessedCUs.count(BF.getDWARFUnit())) - BF.setDWARFUnit(nullptr); + // Collect units to remove to avoid iterator invalidation + SmallVector UnitsToRemove; + for (const auto &[_, Unit] : BF.getDWARFUnits()) { + if (!ProcessedCUs.count(Unit)) + UnitsToRemove.push_back(Unit); + } + // Remove the collected units + for (auto *Unit : UnitsToRemove) { + BF.removeDWARFUnit(Unit); + } } if (opts::Verbosity >= 1) { @@ -1903,23 +1931,23 @@ bool BinaryContext::isMarker(const SymbolRef &Symbol) const { static void printDebugInfo(raw_ostream &OS, const MCInst &Instruction, const BinaryFunction *Function, DWARFContext *DwCtx) { - DebugLineTableRowRef RowRef = - DebugLineTableRowRef::fromSMLoc(Instruction.getLoc()); - if (RowRef == DebugLineTableRowRef::NULL_ROW) + const ClusteredRows *LineTableRows = + ClusteredRows::fromSMLoc(Instruction.getLoc()); + if (LineTableRows == nullptr) return; - const DWARFDebugLine::LineTable *LineTable; - if (Function && Function->getDWARFUnit() && - Function->getDWARFUnit()->getOffset() == RowRef.DwCompileUnitIndex) { - LineTable = Function->getDWARFLineTable(); - } else { - LineTable = DwCtx->getLineTableForUnit( - DwCtx->getCompileUnitForOffset(RowRef.DwCompileUnitIndex)); - } - assert(LineTable && "line table expected for instruction with debug info"); + // File name and line number should be the same for all CUs. + // So it is sufficient to check the first one. 
+ DebugLineTableRowRef RowRef = LineTableRows->getRows().front(); + const DWARFDebugLine::LineTable *LineTable = DwCtx->getLineTableForUnit( + DwCtx->getCompileUnitForOffset(RowRef.DwCompileUnitIndex)); + + if (!LineTable) + return; const DWARFDebugLine::Row &Row = LineTable->Rows[RowRef.RowIndex - 1]; StringRef FileName = ""; + if (std::optional FName = dwarf::toString(LineTable->Prologue.getFileNameEntry(Row.File).Name)) FileName = *FName; @@ -2027,7 +2055,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, if (MCSymbol *Label = MIB->getInstLabel(Instruction)) OS << " # Label: " << *Label; - MIB->printAnnotations(Instruction, OS); + MIB->printAnnotations(Instruction, OS, PrintMemData || opts::PrintMemData); if (opts::PrintDebugInfo) printDebugInfo(OS, Instruction, Function, DwCtx.get()); diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 7b5cd276fee89..7aaf721da9769 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -177,7 +177,8 @@ class BinaryEmitter { /// Note that it does not automatically result in the insertion of the EOS /// marker in the line table program, but provides one to the DWARF generator /// when it needs it. - void emitLineInfoEnd(const BinaryFunction &BF, MCSymbol *FunctionEndSymbol); + void emitLineInfoEnd(const BinaryFunction &BF, MCSymbol *FunctionEndSymbol, + const DWARFUnit &Unit); /// Emit debug line info for unprocessed functions from CUs that include /// emitted functions. @@ -436,8 +437,9 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, Streamer.emitELFSize(StartSymbol, SizeExpr); } - if (opts::UpdateDebugSections && Function.getDWARFUnit()) - emitLineInfoEnd(Function, EndSymbol); + if (opts::UpdateDebugSections && !Function.getDWARFUnits().empty()) + for (const auto &[_, Unit] : Function.getDWARFUnits()) + emitLineInfoEnd(Function, EndSymbol, *Unit); // Exception handling info for the function. 
emitLSDA(Function, FF); @@ -486,7 +488,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, // A symbol to be emitted before the instruction to mark its location. MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr); - if (opts::UpdateDebugSections && BF.getDWARFUnit()) { + if (opts::UpdateDebugSections && !BF.getDWARFUnits().empty()) { LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen, FirstInstr, InstrLabel); FirstInstr = false; @@ -679,74 +681,100 @@ void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart, SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc, SMLoc PrevLoc, bool FirstInstr, MCSymbol *&InstrLabel) { - DWARFUnit *FunctionCU = BF.getDWARFUnit(); - const DWARFDebugLine::LineTable *FunctionLineTable = BF.getDWARFLineTable(); - assert(FunctionCU && "cannot emit line info for function without CU"); - - DebugLineTableRowRef RowReference = DebugLineTableRowRef::fromSMLoc(NewLoc); - - // Check if no new line info needs to be emitted. - if (RowReference == DebugLineTableRowRef::NULL_ROW || + if (NewLoc.getPointer() == nullptr || NewLoc.getPointer() == PrevLoc.getPointer()) return PrevLoc; + const ClusteredRows *Cluster = ClusteredRows::fromSMLoc(NewLoc); + + auto addToLineTable = [&](DebugLineTableRowRef RowReference, + const DWARFUnit &TargetCU, unsigned Flags, + MCSymbol &InstrLabel, + const DWARFDebugLine::Row &CurrentRow) { + const uint64_t TargetUnitIndex = TargetCU.getOffset(); + unsigned TargetFilenum = CurrentRow.File; + const uint32_t CurrentUnitIndex = RowReference.DwCompileUnitIndex; + // If the CU id from the current instruction location does not + // match the target CU id, it means that we have come across some + // inlined code (by BOLT). We must look up the CU for the instruction's + // original function and get the line table from that. + if (TargetUnitIndex != CurrentUnitIndex) { + // Add filename from the inlined function to the current CU. 
+ TargetFilenum = BC.addDebugFilenameToUnit( + TargetUnitIndex, CurrentUnitIndex, CurrentRow.File); + } + BC.Ctx->setCurrentDwarfLoc(TargetFilenum, CurrentRow.Line, + CurrentRow.Column, Flags, CurrentRow.Isa, + CurrentRow.Discriminator); + const MCDwarfLoc &DwarfLoc = BC.Ctx->getCurrentDwarfLoc(); + BC.Ctx->clearDwarfLocSeen(); + const MCLineSection::MCLineDivisionMap &MapLineEntries = + BC.getDwarfLineTable(TargetUnitIndex) + .getMCLineSections() + .getMCLineEntries(); + const auto *It = MapLineEntries.find(Streamer.getCurrentSectionOnly()); + MCDwarfLineEntry NewLineEntry = MCDwarfLineEntry(&InstrLabel, DwarfLoc); + + // Check if line table exists and has entries before doing comparison. + if (It != MapLineEntries.end() && !It->second.empty()) { + // Check if the new line entry has the same debug info as the last one + // to avoid duplicates. We don't compare labels since different + // instructions can have the same line info. + const auto &LastEntry = It->second.back(); + if (LastEntry.getFileNum() == NewLineEntry.getFileNum() && + LastEntry.getLine() == NewLineEntry.getLine() && + LastEntry.getColumn() == NewLineEntry.getColumn() && + LastEntry.getFlags() == NewLineEntry.getFlags() && + LastEntry.getIsa() == NewLineEntry.getIsa() && + LastEntry.getDiscriminator() == NewLineEntry.getDiscriminator()) + return; + } - unsigned CurrentFilenum = 0; - const DWARFDebugLine::LineTable *CurrentLineTable = FunctionLineTable; - - // If the CU id from the current instruction location does not - // match the CU id from the current function, it means that we - // have come across some inlined code. We must look up the CU - // for the instruction's original function and get the line table - // from that. 
- const uint64_t FunctionUnitIndex = FunctionCU->getOffset(); - const uint32_t CurrentUnitIndex = RowReference.DwCompileUnitIndex; - if (CurrentUnitIndex != FunctionUnitIndex) { - CurrentLineTable = BC.DwCtx->getLineTableForUnit( - BC.DwCtx->getCompileUnitForOffset(CurrentUnitIndex)); - // Add filename from the inlined function to the current CU. - CurrentFilenum = BC.addDebugFilenameToUnit( - FunctionUnitIndex, CurrentUnitIndex, - CurrentLineTable->Rows[RowReference.RowIndex - 1].File); - } - - const DWARFDebugLine::Row &CurrentRow = - CurrentLineTable->Rows[RowReference.RowIndex - 1]; - if (!CurrentFilenum) - CurrentFilenum = CurrentRow.File; - - unsigned Flags = (DWARF2_FLAG_IS_STMT * CurrentRow.IsStmt) | - (DWARF2_FLAG_BASIC_BLOCK * CurrentRow.BasicBlock) | - (DWARF2_FLAG_PROLOGUE_END * CurrentRow.PrologueEnd) | - (DWARF2_FLAG_EPILOGUE_BEGIN * CurrentRow.EpilogueBegin); - - // Always emit is_stmt at the beginning of function fragment. - if (FirstInstr) - Flags |= DWARF2_FLAG_IS_STMT; - - BC.Ctx->setCurrentDwarfLoc(CurrentFilenum, CurrentRow.Line, CurrentRow.Column, - Flags, CurrentRow.Isa, CurrentRow.Discriminator); - const MCDwarfLoc &DwarfLoc = BC.Ctx->getCurrentDwarfLoc(); - BC.Ctx->clearDwarfLocSeen(); + BC.getDwarfLineTable(TargetUnitIndex) + .getMCLineSections() + .addLineEntry(NewLineEntry, Streamer.getCurrentSectionOnly()); + }; if (!InstrLabel) InstrLabel = BC.Ctx->createTempSymbol(); - - BC.getDwarfLineTable(FunctionUnitIndex) - .getMCLineSections() - .addLineEntry(MCDwarfLineEntry(InstrLabel, DwarfLoc), - Streamer.getCurrentSectionOnly()); + for (DebugLineTableRowRef RowReference : Cluster->getRows()) { + const DWARFDebugLine::LineTable *CurrentLineTable = + BC.DwCtx->getLineTableForUnit( + BC.DwCtx->getCompileUnitForOffset(RowReference.DwCompileUnitIndex)); + const DWARFDebugLine::Row &CurrentRow = + CurrentLineTable->Rows[RowReference.RowIndex - 1]; + unsigned Flags = (DWARF2_FLAG_IS_STMT * CurrentRow.IsStmt) | + (DWARF2_FLAG_BASIC_BLOCK * 
CurrentRow.BasicBlock) | + (DWARF2_FLAG_PROLOGUE_END * CurrentRow.PrologueEnd) | + (DWARF2_FLAG_EPILOGUE_BEGIN * CurrentRow.EpilogueBegin); + + // Always emit is_stmt at the beginning of function fragment. + if (FirstInstr) + Flags |= DWARF2_FLAG_IS_STMT; + const auto &FunctionDwarfUnits = BF.getDWARFUnits(); + auto It = FunctionDwarfUnits.find(RowReference.DwCompileUnitIndex); + if (It != FunctionDwarfUnits.end()) { + addToLineTable(RowReference, *It->second, Flags, *InstrLabel, CurrentRow); + continue; + } + // This row is from a CU that did not contain the original function. + // This might happen if BOLT moved/inlined that instruction from other CUs. + // In this case, we need to insert it to all CUs that the function + // originally belonged to. + for (const auto &[_, Unit] : BF.getDWARFUnits()) { + addToLineTable(RowReference, *Unit, Flags, *InstrLabel, CurrentRow); + } + } return NewLoc; } void BinaryEmitter::emitLineInfoEnd(const BinaryFunction &BF, - MCSymbol *FunctionEndLabel) { - DWARFUnit *FunctionCU = BF.getDWARFUnit(); - assert(FunctionCU && "DWARF unit expected"); + MCSymbol *FunctionEndLabel, + const DWARFUnit &Unit) { BC.Ctx->setCurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_END_SEQUENCE, 0, 0); const MCDwarfLoc &DwarfLoc = BC.Ctx->getCurrentDwarfLoc(); BC.Ctx->clearDwarfLocSeen(); - BC.getDwarfLineTable(FunctionCU->getOffset()) + BC.getDwarfLineTable(Unit.getOffset()) .getMCLineSections() .addLineEntry(MCDwarfLineEntry(FunctionEndLabel, DwarfLoc), Streamer.getCurrentSectionOnly()); @@ -1115,36 +1143,40 @@ void BinaryEmitter::emitDebugLineInfoForOriginalFunctions() { if (Function.isEmitted()) continue; - const DWARFDebugLine::LineTable *LineTable = Function.getDWARFLineTable(); - if (!LineTable) - continue; // nothing to update for this function + // Loop through all CUs in the function + for (const auto &[_, Unit] : Function.getDWARFUnits()) { + const DWARFDebugLine::LineTable *LineTable = + Function.getDWARFLineTableForUnit(Unit); + if (!LineTable) + 
continue; // nothing to update for this unit + + const uint64_t Address = Function.getAddress(); + std::vector Results; + if (!LineTable->lookupAddressRange( + {Address, object::SectionedAddress::UndefSection}, + Function.getSize(), Results)) + continue; - const uint64_t Address = Function.getAddress(); - std::vector Results; - if (!LineTable->lookupAddressRange( - {Address, object::SectionedAddress::UndefSection}, - Function.getSize(), Results)) - continue; + if (Results.empty()) + continue; - if (Results.empty()) - continue; + // The first row returned could be the last row matching the start + // address. Find the first row with the same address that is not the end + // of the sequence. + uint64_t FirstRow = Results.front(); + while (FirstRow > 0) { + const DWARFDebugLine::Row &PrevRow = LineTable->Rows[FirstRow - 1]; + if (PrevRow.Address.Address != Address || PrevRow.EndSequence) + break; + --FirstRow; + } - // The first row returned could be the last row matching the start address. - // Find the first row with the same address that is not the end of the - // sequence. 
- uint64_t FirstRow = Results.front(); - while (FirstRow > 0) { - const DWARFDebugLine::Row &PrevRow = LineTable->Rows[FirstRow - 1]; - if (PrevRow.Address.Address != Address || PrevRow.EndSequence) - break; - --FirstRow; + const uint64_t EndOfSequenceAddress = + Function.getAddress() + Function.getMaxSize(); + BC.getDwarfLineTable(Unit->getOffset()) + .addLineTableSequence(LineTable, FirstRow, Results.back(), + EndOfSequenceAddress); } - - const uint64_t EndOfSequenceAddress = - Function.getAddress() + Function.getMaxSize(); - BC.getDwarfLineTable(Function.getDWARFUnit()->getOffset()) - .addLineTableSequence(LineTable, FirstRow, Results.back(), - EndOfSequenceAddress); } // For units that are completely unprocessed, use original debug line contents diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 6cac2d0cca2cb..578a87dc6c09d 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -179,37 +179,29 @@ template static bool emptyRange(const R &Range) { } /// Gets debug line information for the instruction located at the given -/// address in the original binary. The SMLoc's pointer is used -/// to point to this information, which is represented by a -/// DebugLineTableRowRef. The returned pointer is null if no debug line -/// information for this instruction was found. -static SMLoc findDebugLineInformationForInstructionAt( +/// address in the original binary. Returns an optional DebugLineTableRowRef +/// that references the corresponding row in the DWARF line table. Since binary +/// functions can span multiple compilation units, this function helps +/// associate instructions with their debug line information from the +/// appropriate CU. Returns std::nullopt if no debug line information for +/// this instruction was found. 
+static std::optional +findDebugLineInformationForInstructionAt( uint64_t Address, DWARFUnit *Unit, const DWARFDebugLine::LineTable *LineTable) { - // We use the pointer in SMLoc to store an instance of DebugLineTableRowRef, - // which occupies 64 bits. Thus, we can only proceed if the struct fits into - // the pointer itself. - static_assert( - sizeof(decltype(SMLoc().getPointer())) >= sizeof(DebugLineTableRowRef), - "Cannot fit instruction debug line information into SMLoc's pointer"); - - SMLoc NullResult = DebugLineTableRowRef::NULL_ROW.toSMLoc(); uint32_t RowIndex = LineTable->lookupAddress( {Address, object::SectionedAddress::UndefSection}); if (RowIndex == LineTable->UnknownRowIndex) - return NullResult; + return std::nullopt; assert(RowIndex < LineTable->Rows.size() && "Line Table lookup returned invalid index."); - decltype(SMLoc().getPointer()) Ptr; - DebugLineTableRowRef *InstructionLocation = - reinterpret_cast(&Ptr); - - InstructionLocation->DwCompileUnitIndex = Unit->getOffset(); - InstructionLocation->RowIndex = RowIndex + 1; + DebugLineTableRowRef InstructionLocation; + InstructionLocation.DwCompileUnitIndex = Unit->getOffset(); + InstructionLocation.RowIndex = RowIndex + 1; - return SMLoc::getFromPointer(Ptr); + return InstructionLocation; } static std::string buildSectionName(StringRef Prefix, StringRef Name, @@ -1496,9 +1488,24 @@ Error BinaryFunction::disassemble() { } add_instruction: - if (getDWARFLineTable()) { - Instruction.setLoc(findDebugLineInformationForInstructionAt( - AbsoluteInstrAddr, getDWARFUnit(), getDWARFLineTable())); + if (!getDWARFUnits().empty()) { + SmallVector Rows; + for (const auto &[_, Unit] : getDWARFUnits()) { + const DWARFDebugLine::LineTable *LineTable = + getDWARFLineTableForUnit(Unit); + if (!LineTable) + continue; + if (std::optional RowRef = + findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, + Unit, LineTable)) + Rows.emplace_back(*RowRef); + } + if (!Rows.empty()) { + ClusteredRows *Cluster = + 
BC.ClusteredRows.createClusteredRows(Rows.size()); + Cluster->populate(Rows); + Instruction.setLoc(Cluster->toSMLoc()); + } } // Record offset of the instruction for profile matching. diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp index 521eb8d91bbc0..e05f28f08572c 100644 --- a/bolt/lib/Core/DebugData.cpp +++ b/bolt/lib/Core/DebugData.cpp @@ -101,8 +101,6 @@ std::optional findAttributeInfo(const DWARFDie DIE, return findAttributeInfo(DIE, AbbrevDecl, *Index); } -const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{0, 0}; - LLVM_ATTRIBUTE_UNUSED static void printLE64(const std::string &S) { for (uint32_t I = 0, Size = S.size(); I < Size; ++I) { diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 7f962e14ea115..52475227eb32f 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -378,8 +378,8 @@ void MCPlusBuilder::stripAnnotations(MCInst &Inst, bool KeepTC) const { setTailCall(Inst); } -void MCPlusBuilder::printAnnotations(const MCInst &Inst, - raw_ostream &OS) const { +void MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS, + bool PrintMemData) const { std::optional FirstAnnotationOp = getFirstAnnotationOpIndex(Inst); if (!FirstAnnotationOp) return; @@ -390,7 +390,11 @@ void MCPlusBuilder::printAnnotations(const MCInst &Inst, const int64_t Value = extractAnnotationValue(Imm); const auto *Annotation = reinterpret_cast(Value); if (Index >= MCAnnotation::kGeneric) { - OS << " # " << AnnotationNames[Index - MCAnnotation::kGeneric] << ": "; + std::string AnnotationName = + AnnotationNames[Index - MCAnnotation::kGeneric]; + if (!PrintMemData && AnnotationName == "MemoryAccessProfile") + continue; + OS << " # " << AnnotationName << ": "; Annotation->print(OS); } } diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 0c1a1bac6c72e..6752489ad562a 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ 
b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -1846,15 +1846,16 @@ void DWARFRewriter::writeDWOFiles( } std::string CompDir = CU.getCompilationDir(); + SmallString<16> AbsolutePath(DWOName); if (!opts::DwarfOutputPath.empty()) CompDir = opts::DwarfOutputPath.c_str(); else if (!opts::CompDirOverride.empty()) CompDir = opts::CompDirOverride; - - SmallString<16> AbsolutePath; - sys::path::append(AbsolutePath, CompDir); - sys::path::append(AbsolutePath, DWOName); + else if (!sys::fs::exists(CompDir)) + CompDir = "."; + // Prevent failures when DWOName is already an absolute path. + sys::fs::make_absolute(CompDir, AbsolutePath); std::error_code EC; std::unique_ptr TempOut = diff --git a/bolt/test/AArch64/print-mem-data.test b/bolt/test/AArch64/print-mem-data.test new file mode 100644 index 0000000000000..09d4f4640a454 --- /dev/null +++ b/bolt/test/AArch64/print-mem-data.test @@ -0,0 +1,40 @@ +# Check that --print-mem-data option works properly in llvm-bolt + +# RUN: split-file %s %t +# RUN: %clang %cflags -fPIC -pie %t/main.s -o %t.exe -nostdlib -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --print-mem-data=true --print-cfg \ +# RUN: --data %t/fdata | FileCheck %s -check-prefix=CHECK-PRINT +# RUN: llvm-bolt %t.exe -o %t.bolt --print-cfg \ +# RUN: --data %t/fdata | FileCheck %s -check-prefix=CHECK-DEFAULT + +# CHECK-PRINT: ldr w2, [x1], #0x4 # MemoryAccessProfile: 7 total counts : +# CHECK-PRINT-NEXT: { 0x123: 1 }, +# CHECK-PRINT-NEXT: { 0x456: 2 }, +# CHECK-PRINT-NEXT: { 0xabc: 4 } +# CHECK-DEFAULT-NOT: MemoryAccessProfile + +#--- main.s + .text + .align 4 + .global main + .type main, %function +main: + sub sp, sp, #48 + add x1, sp, 8 + add x3, sp, 48 + mov w0, 0 +.L2: + ldr w2, [x1], 4 + add w0, w0, w2 + cmp x1, x3 + bne .L2 + add sp, sp, 48 + ret + .size main, .-main + +# The three memory access data generated by the load at +# offset 0x10 in the main. 
+#--- fdata +4 main 10 4 otherSym 123 1 +4 main 10 4 otherSym 456 2 +4 main 10 4 otherSym abc 4 diff --git a/bolt/test/X86/multi-cu-debug-line.s b/bolt/test/X86/multi-cu-debug-line.s new file mode 100644 index 0000000000000..15f49a211e58b --- /dev/null +++ b/bolt/test/X86/multi-cu-debug-line.s @@ -0,0 +1,327 @@ +## Test that BOLT correctly handles debug line information for functions +## that belong to multiple compilation units (e.g., inline functions in +## common header files). This is the assembly version of the multi-cu-debug-line.test. +## The test covers two scenarios: +## 1. Normal processing: .debug_line section shows lines for the function +## in all CUs where it was compiled, with no duplicate rows within CUs +## 2. Functions not processed: When BOLT doesn't process functions (using +## --funcs with nonexistent function), original debug info is preserved + +# REQUIRES: system-linux + +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/multi-cu-file1.s -o %t/multi-cu-file1.o +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/multi-cu-file2.s -o %t/multi-cu-file2.o +# RUN: %clang %cflags %t/multi-cu-file1.o %t/multi-cu-file2.o -o %t.exe -Wl,-q + +## Test 1: Normal BOLT processing (functions are processed/optimized) +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-dwarfdump --debug-line %t.bolt > %t.debug-line.txt +# RUN: FileCheck %s --check-prefix=BASIC --input-file %t.debug-line.txt + +## Check that debug line information is present for both compilation units +# BASIC: debug_line[{{.*}}] +# BASIC: file_names[{{.*}}]: +# BASIC: name: "{{.*}}multi-cu-file1.c" +# BASIC: debug_line[{{.*}}] +# BASIC: file_names[{{.*}}]: +# BASIC: name: "{{.*}}multi-cu-file2.c" + +## Use our helper script to create a normalized table without addresses +# RUN: process-debug-line %t.debug-line.txt > %t.normalized-debug-line.txt +# RUN: FileCheck %s --check-prefix=NORMALIZED --input-file 
%t.normalized-debug-line.txt + +## Check that we have line entries for the inline function (lines 5, 6, 7) from multi-cu-common.h +## in both compilation units +# NORMALIZED: multi-cu-file1.c 5 {{[0-9]+}} multi-cu-common.h +# NORMALIZED: multi-cu-file1.c 6 {{[0-9]+}} multi-cu-common.h +# NORMALIZED: multi-cu-file1.c 7 {{[0-9]+}} multi-cu-common.h +# NORMALIZED: multi-cu-file2.c 5 {{[0-9]+}} multi-cu-common.h +# NORMALIZED: multi-cu-file2.c 6 {{[0-9]+}} multi-cu-common.h +# NORMALIZED: multi-cu-file2.c 7 {{[0-9]+}} multi-cu-common.h + +## Verify that we have line entries for the inline function in multiple CUs +## by checking that the header file appears multiple times in different contexts +# RUN: grep -c "multi-cu-common.h" %t.debug-line.txt > %t.header-count.txt +# RUN: FileCheck %s --check-prefix=MULTI-CU --input-file %t.header-count.txt + +## The header should appear in debug line info for multiple CUs +# MULTI-CU: {{[2-9]|[1-9][0-9]+}} + +## Check that there are no duplicate line table rows within the same CU +## This verifies the fix for the bug where duplicate entries were created +# RUN: sort %t.normalized-debug-line.txt | uniq -c | \ +# RUN: awk '$1 > 1 {print "DUPLICATE_ROW: " $0}' > %t.duplicates.txt +# RUN: FileCheck %s --check-prefix=NO-DUPLICATES --input-file %t.duplicates.txt --allow-empty + +## Should have no duplicate normalized rows (file should be empty) +## Note: Cross-CU duplicates are expected and valid (same function in different CUs) +## but within-CU duplicates would indicate a bug +# NO-DUPLICATES-NOT: DUPLICATE_ROW + +## Test 2: Functions not processed by BOLT (using --funcs with nonexistent function) +## This tests the code path where BOLT preserves original debug info +# RUN: llvm-bolt %t.exe -o %t.not-emitted.bolt --update-debug-sections --funcs=nonexistent_function +# RUN: llvm-dwarfdump --debug-line %t.not-emitted.bolt > %t.not-emitted.debug-line.txt +# RUN: FileCheck %s --check-prefix=PRESERVED-BASIC --input-file 
%t.not-emitted.debug-line.txt + +## Check that debug line information is still present for both compilation units when functions aren't processed +# PRESERVED-BASIC: debug_line[{{.*}}] +# PRESERVED-BASIC: file_names[{{.*}}]: +# PRESERVED-BASIC: name: "{{.*}}multi-cu-file1.c" +# PRESERVED-BASIC: debug_line[{{.*}}] +# PRESERVED-BASIC: file_names[{{.*}}]: +# PRESERVED-BASIC: name: "{{.*}}multi-cu-file2.c" + +## Create normalized output for the not-emitted case +# RUN: process-debug-line %t.not-emitted.debug-line.txt > %t.not-emitted.normalized.txt +# RUN: FileCheck %s --check-prefix=PRESERVED-NORMALIZED --input-file %t.not-emitted.normalized.txt + +## Check that we have line entries for the inline function (lines 5, 6, 7) from multi-cu-common.h +## in both compilation units (preserved from original) +# PRESERVED-NORMALIZED: multi-cu-file1.c 5 {{[0-9]+}} multi-cu-common.h +# PRESERVED-NORMALIZED: multi-cu-file1.c 6 {{[0-9]+}} multi-cu-common.h +# PRESERVED-NORMALIZED: multi-cu-file1.c 7 {{[0-9]+}} multi-cu-common.h +# PRESERVED-NORMALIZED: multi-cu-file2.c 5 {{[0-9]+}} multi-cu-common.h +# PRESERVED-NORMALIZED: multi-cu-file2.c 6 {{[0-9]+}} multi-cu-common.h +# PRESERVED-NORMALIZED: multi-cu-file2.c 7 {{[0-9]+}} multi-cu-common.h + +## Verify that we have line entries for the inline function in multiple CUs (preserved) +## by checking that the header file appears multiple times in different contexts +# RUN: grep -c "multi-cu-common.h" %t.not-emitted.debug-line.txt > %t.preserved-header-count.txt +# RUN: FileCheck %s --check-prefix=PRESERVED-MULTI-CU --input-file %t.preserved-header-count.txt + +## The header should appear in debug line info for multiple CUs (preserved from original) +# PRESERVED-MULTI-CU: {{[2-9]|[1-9][0-9]+}} + +## Check that original debug info is preserved for main functions +# RUN: grep "multi-cu-file1.c.*multi-cu-file1.c" %t.not-emitted.normalized.txt > %t.preserved-main.txt +# RUN: FileCheck %s --check-prefix=PRESERVED-MAIN --input-file 
%t.preserved-main.txt + +# PRESERVED-MAIN: multi-cu-file1.c {{[0-9]+}} {{[0-9]+}} multi-cu-file1.c + +## Check that original debug info is preserved for file2 functions +# RUN: grep "multi-cu-file2.c.*multi-cu-file2.c" %t.not-emitted.normalized.txt > %t.preserved-file2.txt +# RUN: FileCheck %s --check-prefix=PRESERVED-FILE2 --input-file %t.preserved-file2.txt + +# PRESERVED-FILE2: multi-cu-file2.c {{[0-9]+}} {{[0-9]+}} multi-cu-file2.c + +;--- multi-cu-file1.s + .text + .file 1 "/repo/llvm-project" "bolt/test/Inputs/multi-cu-file1.c" + .file 2 "/repo/llvm-project" "bolt/test/Inputs/multi-cu-common.h" + + .globl main + .type main,@function +main: +.Lfunc_begin0: + .loc 1 4 0 + callq common_inline_function + .loc 1 8 0 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + + .type common_inline_function,@function +common_inline_function: +.Lfunc_begin1: + .loc 2 5 0 + movl $42, %eax + .loc 2 6 0 + addl $10, %eax + .loc 2 7 0 + retq +.Lfunc_end1: + .size common_inline_function, .Lfunc_end1-common_inline_function + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 14 # DW_FORM_strp + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section 
.debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x30 DW_TAG_compile_unit + .long .Linfo_string0 # DW_AT_producer + .short 29 # DW_AT_language + .long .Linfo_string1 # DW_AT_name + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Linfo_string2 # DW_AT_comp_dir + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end1-.Lfunc_begin0 # DW_AT_high_pc + .byte 2 # Abbrev [2] 0x2a:0x10 DW_TAG_subprogram + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Linfo_string3 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 2 # Abbrev [2] 0x3a:0x10 DW_TAG_subprogram + .quad .Lfunc_begin1 # DW_AT_low_pc + .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc + .long .Linfo_string4 # DW_AT_name + .byte 2 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 18.0.0" +.Linfo_string1: + .asciz "/repo/llvm-project/bolt/test/Inputs/multi-cu-file1.c" +.Linfo_string2: + .asciz "/repo/llvm-project" +.Linfo_string3: + .asciz "main" +.Linfo_string4: + .asciz "common_inline_function" + + .section .debug_line,"",@progbits +.Lline_table_start0: + +;--- multi-cu-file2.s + .text + .file 1 "/repo/llvm-project" "bolt/test/Inputs/multi-cu-file2.c" + .file 2 "/repo/llvm-project" "bolt/test/Inputs/multi-cu-common.h" + + .globl helper_function + .type helper_function,@function +helper_function: +.Lfunc_begin0: + .loc 1 4 0 + callq common_inline_function + .loc 1 8 0 + retq +.Lfunc_end0: + .size helper_function, .Lfunc_end0-helper_function + + .type common_inline_function,@function +common_inline_function: +.Lfunc_begin1: + .loc 2 5 0 + movl $42, %eax + .loc 2 6 0 + addl $10, %eax + .loc 2 7 0 + retq 
+.Lfunc_end1: + .size common_inline_function, .Lfunc_end1-common_inline_function + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 14 # DW_FORM_strp + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x30 DW_TAG_compile_unit + .long .Linfo_string0 # DW_AT_producer + .short 29 # DW_AT_language + .long .Linfo_string1 # DW_AT_name + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Linfo_string2 # DW_AT_comp_dir + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end1-.Lfunc_begin0 # DW_AT_high_pc + .byte 2 # Abbrev [2] 0x2a:0x10 DW_TAG_subprogram + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Linfo_string3 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 2 # Abbrev [2] 0x3a:0x10 DW_TAG_subprogram + .quad .Lfunc_begin1 # DW_AT_low_pc + .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc + .long .Linfo_string4 # DW_AT_name + .byte 2 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 18.0.0" +.Linfo_string1: + .asciz "/repo/llvm-project/bolt/test/Inputs/multi-cu-file2.c" +.Linfo_string2: + .asciz "/repo/llvm-project" +.Linfo_string3: + .asciz "helper_function" +.Linfo_string4: + .asciz "common_inline_function" + + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/dwo-name-retrieving.test b/bolt/test/dwo-name-retrieving.test new file mode 100755 index 0000000000000..39193ccc6637a --- /dev/null +++ b/bolt/test/dwo-name-retrieving.test @@ -0,0 +1,19 @@ +## Test DWO retrieval via relative path with a missing CompDir. +## Also, verify no crash for an absolute DWOName path. + +## The case where DWOName is a relative path, and debug compilation directory does not exist. 
+# RUN: rm -rf %t && mkdir -p %t && cd %t +# RUN: %clang %cflags -g -gsplit-dwarf -fdebug-compilation-dir=/path/does/not/exist %p/Inputs/hello.c -o main.exe +# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections 2>&1 | FileCheck %s -check-prefix=DWO-NAME-REL + +# DWO-NAME-REL: BOLT-WARNING: Debug Fission: Debug Compilation Directory of main.exe-hello.dwo does not exist. +# DWO-NAME-REL-NOT: Debug Fission: DWO debug information for + +## The case where DWOName is an absolute path, and a dwp file is provided. +# RUN: %clang %cflags -g -gsplit-dwarf %p/Inputs/hello.c -o %t/main.exe +# RUN: llvm-dwp -e %t/main.exe -o %t/main.exe.dwp +# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections -dwp=%t/main.exe.dwp 2>&1 | FileCheck %s -check-prefix=DWO-NAME-ABS + +# DWO-NAME-ABS-NOT: BOLT-WARNING: Debug Fission: Debug Compilation Directory of {{.*}}/main.exe-hello.dwo does not exist. +# DWO-NAME-ABS-NOT: Debug Fission: DWO debug information for +# DWO-NAME-ABS-NOT: Assertion `FD >= 0 && "File not yet open!"' failed. 
diff --git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py index bef570ba50a04..3299051db4983 100644 --- a/bolt/test/lit.cfg.py +++ b/bolt/test/lit.cfg.py @@ -138,6 +138,7 @@ unresolved="fatal", extra_args=[link_fdata_cmd], ), + ToolSubst("process-debug-line", unresolved="fatal"), ToolSubst("merge-fdata", unresolved="fatal"), ToolSubst("llvm-readobj", unresolved="fatal"), ToolSubst("llvm-dwp", unresolved="fatal"), diff --git a/bolt/test/process-debug-line b/bolt/test/process-debug-line new file mode 100755 index 0000000000000..44cbcd1e5984a --- /dev/null +++ b/bolt/test/process-debug-line @@ -0,0 +1,105 @@ +#!/bin/sh + +# Script to process llvm-dwarfdump --debug-line output and create a normalized table +# Usage: process-debug-line.sh +# +# Output format: CU_FILE LINE COLUMN FILE_NAME [additional_info] +# This strips addresses to make rows unique and adds context about which CU and file each line belongs to + +if [ $# -ne 1 ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +debug_line_file="$1" + +if [ ! 
-f "$debug_line_file" ]; then + echo "Error: File '$debug_line_file' not found" >&2 + exit 1 +fi + +awk ' +BEGIN { + cu_count = 0 + current_cu_file = "" + # Initialize file names array + for (i = 0; i < 100; i++) { + current_file_names[i] = "" + } +} + +# Track debug_line sections (new CU) +/^debug_line\[/ { + cu_count++ + current_cu_file = "" + # Clear file names array for new CU + for (i = 0; i < 100; i++) { + current_file_names[i] = "" + } + next +} + +# Capture file names and their indices +/^file_names\[.*\]:/ { + # Extract file index using simple string operations + line_copy = $0 + gsub(/file_names\[/, "", line_copy) + gsub(/\]:.*/, "", line_copy) + gsub(/[ \t]/, "", line_copy) + file_index = line_copy + + getline # Read the next line which contains the actual filename + # Extract filename from name: "filename" format + if (match($0, /name:[ \t]*"/)) { + filename = $0 + gsub(/.*name:[ \t]*"/, "", filename) + gsub(/".*/, "", filename) + current_file_names[file_index] = filename + + # Extract basename for main CU file (first .c/.cpp/.cc file we see) + if (current_cu_file == "" && match(filename, /\.(c|cpp|cc)$/)) { + cu_filename = filename + gsub(/.*\//, "", cu_filename) + current_cu_file = cu_filename + } + } + next +} + +# Process line table entries +/^0x[0-9a-f]+/ { + # Parse the line entry: Address Line Column File ISA Discriminator OpIndex Flags + if (NF >= 4) { + line = $2 + column = $3 + file_index = $4 + + # Get the filename for this file index + filename = current_file_names[file_index] + if (filename == "") { + filename = "UNKNOWN_FILE_" file_index + } else { + # Extract just the basename + basename = filename + gsub(/.*\//, "", basename) + filename = basename + } + + # Build additional info (flags, etc.) 
+ additional_info = "" + for (i = 8; i <= NF; i++) { + if (additional_info != "") { + additional_info = additional_info " " + } + additional_info = additional_info $i + } + + # Output normalized row: CU_FILE LINE COLUMN FILE_NAME [additional_info] + printf "%s %s %s %s", current_cu_file, line, column, filename + if (additional_info != "") { + printf " %s", additional_info + } + printf "\n" + } +} +' "$debug_line_file" diff --git a/bolt/unittests/Core/CMakeLists.txt b/bolt/unittests/Core/CMakeLists.txt index f10b0d9472067..297dec7449202 100644 --- a/bolt/unittests/Core/CMakeLists.txt +++ b/bolt/unittests/Core/CMakeLists.txt @@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS add_bolt_unittest(CoreTests BinaryContext.cpp + ClusteredRows.cpp MCPlusBuilder.cpp MemoryMaps.cpp DynoStats.cpp diff --git a/bolt/unittests/Core/ClusteredRows.cpp b/bolt/unittests/Core/ClusteredRows.cpp new file mode 100644 index 0000000000000..4665022c91fdd --- /dev/null +++ b/bolt/unittests/Core/ClusteredRows.cpp @@ -0,0 +1,141 @@ +//===- bolt/unittest/Core/ClusteredRows.cpp ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Core/DebugData.h" +#include "llvm/Support/SMLoc.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; +using namespace llvm::bolt; + +namespace { + +class ClusteredRowsTest : public ::testing::Test { +protected: + void SetUp() override { + Container = std::make_unique(); + } + + std::unique_ptr Container; +}; + +TEST_F(ClusteredRowsTest, CreateSingleElement) { + ClusteredRows *CR = Container->createClusteredRows(1); + ASSERT_NE(CR, nullptr); + EXPECT_EQ(CR->size(), 1u); + + // Test population with single element + std::vector TestRefs = {{42, 100}}; + CR->populate(TestRefs); + + ArrayRef Rows = CR->getRows(); + EXPECT_EQ(Rows.size(), 1u); + EXPECT_EQ(Rows[0].DwCompileUnitIndex, 42u); + EXPECT_EQ(Rows[0].RowIndex, 100u); +} + +TEST_F(ClusteredRowsTest, CreateMultipleElements) { + ClusteredRows *CR = Container->createClusteredRows(3); + ASSERT_NE(CR, nullptr); + EXPECT_EQ(CR->size(), 3u); + + // Test population with multiple elements + std::vector TestRefs = {{10, 20}, {30, 40}, {50, 60}}; + CR->populate(TestRefs); + + ArrayRef Rows = CR->getRows(); + EXPECT_EQ(Rows.size(), 3u); + + EXPECT_EQ(Rows[0].DwCompileUnitIndex, 10u); + EXPECT_EQ(Rows[0].RowIndex, 20u); + + EXPECT_EQ(Rows[1].DwCompileUnitIndex, 30u); + EXPECT_EQ(Rows[1].RowIndex, 40u); + + EXPECT_EQ(Rows[2].DwCompileUnitIndex, 50u); + EXPECT_EQ(Rows[2].RowIndex, 60u); +} + +TEST_F(ClusteredRowsTest, SMLoc_Conversion) { + ClusteredRows *CR = Container->createClusteredRows(2); + ASSERT_NE(CR, nullptr); + + // Test SMLoc conversion + SMLoc Loc = CR->toSMLoc(); + EXPECT_TRUE(Loc.isValid()); + + // Test round-trip conversion + const ClusteredRows *CR2 = ClusteredRows::fromSMLoc(Loc); + EXPECT_EQ(CR, CR2); + EXPECT_EQ(CR2->size(), 2u); +} + +TEST_F(ClusteredRowsTest, PopulateWithArrayRef) { + ClusteredRows *CR = 
Container->createClusteredRows(4); + ASSERT_NE(CR, nullptr); + + // Test population with ArrayRef + DebugLineTableRowRef TestArray[] = {{1, 2}, {3, 4}, {5, 6}, {7, 8}}; + ArrayRef TestRefs(TestArray, 4); + CR->populate(TestRefs); + + ArrayRef Rows = CR->getRows(); + EXPECT_EQ(Rows.size(), 4u); + + for (size_t i = 0; i < 4; ++i) { + EXPECT_EQ(Rows[i].DwCompileUnitIndex, TestArray[i].DwCompileUnitIndex); + EXPECT_EQ(Rows[i].RowIndex, TestArray[i].RowIndex); + } +} + +TEST_F(ClusteredRowsTest, MultipleClusteredRows) { + // Test creating multiple ClusteredRows objects + ClusteredRows *CR1 = Container->createClusteredRows(2); + ClusteredRows *CR2 = Container->createClusteredRows(3); + ClusteredRows *CR3 = Container->createClusteredRows(1); + + ASSERT_NE(CR1, nullptr); + ASSERT_NE(CR2, nullptr); + ASSERT_NE(CR3, nullptr); + + // Ensure they are different objects + EXPECT_NE(CR1, CR2); + EXPECT_NE(CR2, CR3); + EXPECT_NE(CR1, CR3); + + // Verify sizes + EXPECT_EQ(CR1->size(), 2u); + EXPECT_EQ(CR2->size(), 3u); + EXPECT_EQ(CR3->size(), 1u); + + // Populate each with different data + std::vector TestRefs1 = {{100, 200}, {300, 400}}; + std::vector TestRefs2 = {{10, 20}, {30, 40}, {50, 60}}; + std::vector TestRefs3 = {{999, 888}}; + + CR1->populate(TestRefs1); + CR2->populate(TestRefs2); + CR3->populate(TestRefs3); + + // Verify data integrity + ArrayRef Rows1 = CR1->getRows(); + ArrayRef Rows2 = CR2->getRows(); + ArrayRef Rows3 = CR3->getRows(); + + EXPECT_EQ(Rows1[0].DwCompileUnitIndex, 100u); + EXPECT_EQ(Rows1[1].RowIndex, 400u); + + EXPECT_EQ(Rows2[1].DwCompileUnitIndex, 30u); + EXPECT_EQ(Rows2[2].RowIndex, 60u); + + EXPECT_EQ(Rows3[0].DwCompileUnitIndex, 999u); + EXPECT_EQ(Rows3[0].RowIndex, 888u); +} + +} // namespace diff --git a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp index 30bb313524cbe..d2ae13c022b23 100644 --- a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp +++ 
b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp @@ -94,8 +94,7 @@ bool IncludeFixerActionFactory::runInvocation( // Create the compiler's actual diagnostics engine. We want to drop all // diagnostics here. - Compiler.createDiagnostics(Files->getVirtualFileSystem(), - new clang::IgnoringDiagConsumer, + Compiler.createDiagnostics(new clang::IgnoringDiagConsumer, /*ShouldOwnClient=*/true); Compiler.createSourceManager(*Files); diff --git a/clang-tools-extra/clang-tidy/.clang-tidy b/clang-tools-extra/clang-tidy/.clang-tidy index ae554c6668a84..0c2f34b529016 100644 --- a/clang-tools-extra/clang-tidy/.clang-tidy +++ b/clang-tools-extra/clang-tidy/.clang-tidy @@ -5,7 +5,6 @@ Checks: > -bugprone-branch-clone, -bugprone-easily-swappable-parameters, -bugprone-narrowing-conversions, - -bugprone-suspicious-stringview-data-usage, -bugprone-unchecked-optional-access, -bugprone-unused-return-value, modernize-*, @@ -16,7 +15,6 @@ Checks: > performance-*, -performance-enum-size, -performance-no-int-to-ptr, - -performance-type-promotion-in-math-fn, -performance-unnecessary-value-param, readability-*, -readability-avoid-nested-conditional-operator, diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index 2064c7826da0c..4c36bbccf44d9 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -1,4 +1,4 @@ -//===--- tools/extra/clang-tidy/ClangTidy.cpp - Clang tidy tool -----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/ClangTidy.h b/clang-tools-extra/clang-tidy/ClangTidy.h index d37d68ec0a5b9..3d1d3ca0b1791 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.h +++ b/clang-tools-extra/clang-tidy/ClangTidy.h @@ -1,4 +1,4 @@ -//===--- ClangTidy.h - clang-tidy -------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp index 88abcb6946779..d36cc3e6e23db 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp @@ -1,4 +1,4 @@ -//===--- ClangTidyCheck.cpp - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.h b/clang-tools-extra/clang-tidy/ClangTidyCheck.h index 399d45911549d..e53ae532d7e5f 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.h @@ -1,4 +1,4 @@ -//===--- ClangTidyCheck.h - clang-tidy --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index fac6e0418d163..d07f15a10555f 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -1,4 +1,4 @@ -//===--- tools/extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp ----------=== // +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h index 6e7cb7bb10e57..a854756d647c2 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h @@ -1,4 +1,4 @@ -//===--- ClangTidyDiagnosticConsumer.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/ClangTidyModule.cpp b/clang-tools-extra/clang-tidy/ClangTidyModule.cpp index 7432229fda800..4fb4144f835a3 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- tools/extra/clang-tidy/ClangTidyModule.cpp - Clang tidy tool -----===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/ClangTidyModule.h b/clang-tools-extra/clang-tidy/ClangTidyModule.h index 28f54331755a7..7407ab580d378 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyModule.h +++ b/clang-tools-extra/clang-tidy/ClangTidyModule.h @@ -1,4 +1,4 @@ -//===--- ClangTidyModule.h - clang-tidy -------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/ClangTidyModuleRegistry.h b/clang-tools-extra/clang-tidy/ClangTidyModuleRegistry.h index 8a07b05c26446..e0e5e35d4dae0 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyModuleRegistry.h +++ b/clang-tools-extra/clang-tidy/ClangTidyModuleRegistry.h @@ -1,4 +1,4 @@ -//===--- ClangTidyModuleRegistry.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index e59f157b468bc..dfa3521a25513 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -1,4 +1,4 @@ -//===--- ClangTidyOptions.cpp - clang-tidy ----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -99,6 +99,8 @@ void yamlize(IO &IO, ClangTidyOptions::OptionMap &Val, bool, for (auto &Option : SortedOptions) { bool UseDefault = false; void *SaveInfo = nullptr; + // Requires 'llvm::yaml::IO' to accept 'StringRef' + // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage) IO.preflightKey(Option.first.data(), true, false, UseDefault, SaveInfo); IO.scalarString(Option.second, needsQuotes(Option.second)); IO.postflightKey(SaveInfo); @@ -116,6 +118,8 @@ void yamlize(IO &IO, ClangTidyOptions::OptionMap &Val, bool, } else if (isa(I.getCurrentNode())) { IO.beginMapping(); for (StringRef Key : IO.keys()) { + // Requires 'llvm::yaml::IO' to accept 'StringRef' + // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage) IO.mapRequired(Key.data(), Val[Key].Value); } IO.endMapping(); diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.h b/clang-tools-extra/clang-tidy/ClangTidyOptions.h index 6ddc5f9b9cf9e..22a954d2ac645 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.h +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.h @@ -1,4 +1,4 @@ -//===--- ClangTidyOptions.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp b/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp index 89867ec30f51f..8ea6b76819804 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp @@ -1,4 +1,4 @@ -//===--- ClangTidyProfiling.cpp - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/ClangTidyProfiling.h b/clang-tools-extra/clang-tidy/ClangTidyProfiling.h index 76deede1716f4..59c213b181ef7 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyProfiling.h +++ b/clang-tools-extra/clang-tidy/ClangTidyProfiling.h @@ -1,4 +1,4 @@ -//===--- ClangTidyProfiling.h - clang-tidy ----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/FileExtensionsSet.h b/clang-tools-extra/clang-tidy/FileExtensionsSet.h index 7ca4e6ee01d3f..95c221c84da2e 100644 --- a/clang-tools-extra/clang-tidy/FileExtensionsSet.h +++ b/clang-tools-extra/clang-tidy/FileExtensionsSet.h @@ -1,4 +1,4 @@ -//===--- FileExtensionsSet.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/GlobList.cpp b/clang-tools-extra/clang-tidy/GlobList.cpp index 8f09ee075bbd6..667a25657a4c9 100644 --- a/clang-tools-extra/clang-tidy/GlobList.cpp +++ b/clang-tools-extra/clang-tidy/GlobList.cpp @@ -1,4 +1,4 @@ -//===--- tools/extra/clang-tidy/GlobList.cpp ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/GlobList.h b/clang-tools-extra/clang-tidy/GlobList.h index 4317928270adf..c9086df2b7973 100644 --- a/clang-tools-extra/clang-tidy/GlobList.h +++ b/clang-tools-extra/clang-tidy/GlobList.h @@ -1,4 +1,4 @@ -//===--- GlobList.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp b/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp index bbae2c171f790..ef20ee18347df 100644 --- a/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp +++ b/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp @@ -1,4 +1,4 @@ -//===-- clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp -----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.h b/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.h index e862195abaabb..e33d0f2781886 100644 --- a/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.h +++ b/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.h @@ -1,4 +1,4 @@ -//===-- clang-tools-extra/clang-tidy/NoLintDirectiveHandler.h ----*- C++ *-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp b/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp index 78605d59b4421..8971530bab9b2 100644 --- a/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp @@ -1,4 +1,4 @@ -//===------- AbseilTidyModule.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp b/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp index 8063fc540cce5..dd20ad8a4c269 100644 --- a/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp @@ -1,4 +1,4 @@ -//===--- CleanupCtadCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h b/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h index 5e2350e071bdf..414085146bfe4 100644 --- a/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h @@ -1,4 +1,4 @@ -//===--- CleanupCtadCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp index d4b8c9e6d8942..4e1bd3ae32ee5 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp @@ -1,4 +1,4 @@ -//===--- DurationAdditionCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h index ac71f34fed180..e740326a3d6de 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h @@ -1,4 +1,4 @@ -//===--- DurationAdditionCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp index 3baacb36c3f7a..cb8a478e288b6 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp @@ -1,4 +1,4 @@ -//===--- DurationComparisonCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.h index 65ab7a38eb289..d9fc8cb165235 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.h @@ -1,4 +1,4 @@ -//===--- DurationComparisonCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp index 869a0ec44556c..cf591d9589057 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp @@ -1,4 +1,4 @@ -//===--- DurationConversionCastCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.h index a898ba0483966..cd45bc078fde6 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.h @@ -1,4 +1,4 @@ -//===--- DurationConversionCastCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.cpp index 50e2d0366c768..b23d86c456c51 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.cpp @@ -1,4 +1,4 @@ -//===--- DurationDivisionCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.h index c8f259521b648..810f7d269f38f 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.h @@ -1,4 +1,4 @@ -//===--- DurationDivisionCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.cpp index 398f1691dca39..cccd7cf796150 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.cpp @@ -1,4 +1,4 @@ -//===--- DurationFactoryFloatCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.h index e7c3985a7fd92..1d688da43e268 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.h @@ -1,4 +1,4 @@ -//===--- DurationFactoryFloatCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp index 121892fd0daa9..1d6ff1ab17abd 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp @@ -1,4 +1,4 @@ -//===--- DurationFactoryScaleCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.h index f5f088c49897d..1d53d13fa9f9a 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.h @@ -1,4 +1,4 @@ -//===--- DurationFactoryScaleCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp b/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp index 4cdbbd43c1431..e57073e500ccc 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp @@ -1,4 +1,4 @@ -//===--- DurationRewriter.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationRewriter.h b/clang-tools-extra/clang-tidy/abseil/DurationRewriter.h index dc05b3fe3b55a..27d6ca0616985 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationRewriter.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationRewriter.h @@ -1,4 +1,4 @@ -//===--- DurationRewriter.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp index 48600298a20ca..fd5e2038f75d1 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp @@ -1,4 +1,4 @@ -//===--- DurationSubtractionCheck.cpp - clang-tidy ------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.h index c865f2f842a0d..b092561df909c 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.h @@ -1,4 +1,4 @@ -//===--- DurationSubtractionCheck.h - clang-tidy ----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp index 9bb1fd57a4401..805d7dacd4eec 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp @@ -1,5 +1,4 @@ -//===--- DurationUnnecessaryConversionCheck.cpp - clang-tidy -//-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h index fc9cf23459425..a5bd4dca6ce1f 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h @@ -1,4 +1,4 @@ -//===--- DurationUnnecessaryConversionCheck.h - clang-tidy ------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp index 4a6f17ed5f868..13d566087688f 100644 --- a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp @@ -1,4 +1,4 @@ -//===--- FasterStrsplitDelimiterCheck.cpp - clang-tidy---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.h b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.h index b6caacd505c31..96e261d86697b 100644 --- a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.h @@ -1,4 +1,4 @@ -//===--- FasterStrsplitDelimiterCheck.h - clang-tidy-------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp index 19409d0616f06..c090e5ac54222 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoInternalDependenciesCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h index 9c16524ff6b59..7b46ba55e008f 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h @@ -1,4 +1,4 @@ -//===--- NoInternalDependenciesCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.cpp index 6c2baa4c41412..74facceddac8b 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoNamespaceCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.h b/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.h index be56adad03bcf..d3ab5cc5219ef 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.h @@ -1,4 +1,4 @@ -//===--- NoNamespaceCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp index ef26a8a76cb37..d7cc0cacab6ea 100644 --- a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantStrcatCallsCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.h b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.h index a27899d3e821a..a5300a399c89d 100644 --- a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantStrcatCallsCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.cpp index ced92590be02e..e088e286214b0 100644 --- a/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.cpp @@ -1,4 +1,4 @@ -//===--- StrCatAppendCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.h b/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.h index fcd9d4b6e1885..93245c01cebb4 100644 --- a/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.h @@ -1,4 +1,4 @@ -//===--- StrCatAppendCheck.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp index 221e924c10f62..92d63057caf65 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp @@ -1,4 +1,4 @@ -//===--- StringFindStartswithCheck.cc - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h index de3bd4d422200..0d0866db29346 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h @@ -1,4 +1,4 @@ -//===--- StringFindStartswithCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp index 0c2fe285ce060..6eb559717077b 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp @@ -1,4 +1,4 @@ -//===--- StringFindStrContainsCheck.cc - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h index 68b827c5de0e1..f939c0b5791e5 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h @@ -1,4 +1,4 @@ -//===--- StringFindStrContainsCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp index 2bcd8064400f8..52121a57de0d1 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp @@ -1,5 +1,4 @@ -//===--- TimeComparisonCheck.cpp - clang-tidy -//--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h index bf22977e9d0df..bbf74bebd26ae 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h @@ -1,4 +1,4 @@ -//===--- TimeComparisonCheck.h - clang-tidy ---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp index 2def393938e67..228d974cd5e23 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp @@ -1,4 +1,4 @@ -//===--- TimeSubtractionCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.h b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.h index 9e2ec1c8def20..f8bb599d36d5d 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.h @@ -1,4 +1,4 @@ -//===--- TimeSubtractionCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp index b2eddf67edb3f..f7905e081170e 100644 --- a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UpgradeDurationConversionsCheck.cpp - clang-tidy -----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.h b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.h index 8e7d9829533d4..e4865b941f2ac 100644 --- a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.h @@ -1,4 +1,4 @@ -//===--- UpgradeDurationConversionsCheck.h - clang-tidy ---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py index 2b51a1dc40ebc..0035da288dcf5 100755 --- a/clang-tools-extra/clang-tidy/add_new_check.py +++ b/clang-tools-extra/clang-tidy/add_new_check.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# ===- add_new_check.py - clang-tidy check generator ---------*- python -*--===# +# ===-----------------------------------------------------------------------===# # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp b/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp index 02a43ba86d7bb..28733ef1d994c 100644 --- a/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- AlteraTidyModule.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp index 94db0a793cf53..49ba17ce643fe 100644 --- a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp @@ -1,4 +1,4 @@ -//===--- IdDependentBackwardBranchCheck.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h index cf964a2d5d6f6..0030faa3c5ec5 100644 --- a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h +++ b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h @@ -1,4 +1,4 @@ -//===--- IdDependentBackwardBranchCheck.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp index a94d6c8d7c4e6..4c740e31ae7be 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp @@ -1,4 +1,4 @@ -//===--- KernelNameRestrictionCheck.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h index 02d82f4d6891a..bd8aafecf4f76 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h @@ -1,4 +1,4 @@ -//===--- KernelNameRestrictionCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.cpp b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.cpp index c21b7cab1b8da..c9df658d9bd67 100644 --- a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.cpp @@ -1,4 +1,4 @@ -//===--- SingleWorkItemBarrierCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h index 62a055b094645..5560f2765f9f9 100644 --- a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h +++ b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h @@ -1,4 +1,4 @@ -//===--- SingleWorkItemBarrierCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp index 789327b196ab6..0a19378949f46 100644 --- a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp @@ -1,4 +1,4 @@ -//===--- StructPackAlignCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -76,9 +76,9 @@ void StructPackAlignCheck::check(const MatchFinder::MatchResult &Result) { CharUnits CurrSize = Result.Context->getASTRecordLayout(Struct).getSize(); CharUnits MinByteSize = CharUnits::fromQuantity(std::max( - ceil(static_cast(TotalBitSize) / CharSize), 1)); + std::ceil(static_cast(TotalBitSize) / CharSize), 1)); CharUnits MaxAlign = CharUnits::fromQuantity( - ceil((float)Struct->getMaxAlignment() / CharSize)); + std::ceil((float)Struct->getMaxAlignment() / CharSize)); CharUnits CurrAlign = Result.Context->getASTRecordLayout(Struct).getAlignment(); CharUnits NewAlign = computeRecommendedAlignment(MinByteSize); diff --git a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h index f360bcef4f14e..f6f2d1fa529e5 100644 --- a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h +++ b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h @@ -1,4 +1,4 @@ -//===--- StructPackAlignCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp index 0bb9d6e4a7cee..6aad3c6b191ed 100644 --- a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnrollLoopsCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -208,18 +208,20 @@ bool UnrollLoopsCheck::hasLargeNumIterations(const Stmt *Statement, return true; switch (Op->getOpcode()) { case (BO_AddAssign): - Iterations = ceil(float(EndValue - InitValue) / ConstantValue); + Iterations = std::ceil(float(EndValue - InitValue) / ConstantValue); break; case (BO_SubAssign): - Iterations = ceil(float(InitValue - EndValue) / ConstantValue); + Iterations = std::ceil(float(InitValue - EndValue) / ConstantValue); break; case (BO_MulAssign): - Iterations = 1 + (log((double)EndValue) - log((double)InitValue)) / - log((double)ConstantValue); + Iterations = + 1 + (std::log((double)EndValue) - std::log((double)InitValue)) / + std::log((double)ConstantValue); break; case (BO_DivAssign): - Iterations = 1 + (log((double)InitValue) - log((double)EndValue)) / - log((double)ConstantValue); + Iterations = + 1 + (std::log((double)InitValue) - std::log((double)EndValue)) / + std::log((double)ConstantValue); break; default: // All other operators are not handled; assume large bounds. diff --git a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.h b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.h index 0d8306e8437b7..453176fa4894a 100644 --- a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.h +++ b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.h @@ -1,4 +1,4 @@ -//===--- UnrollLoopsCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/AndroidTidyModule.cpp b/clang-tools-extra/clang-tidy/android/AndroidTidyModule.cpp index 17efa10909d0a..40362531f2daf 100644 --- a/clang-tools-extra/clang-tidy/android/AndroidTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/android/AndroidTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- AndroidTidyModule.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.cpp b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.cpp index 8c4bcc70b300e..8cf22ba2acb4a 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.cpp @@ -1,4 +1,4 @@ -//===--- CloexecAccept4Check.cpp - clang-tidy------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h index a34d12041cf3c..e7286dc519484 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h @@ -1,4 +1,4 @@ -//===--- CloexecAccept4Check.h - clang-tidy----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp index c90fc7ba1bb04..9cd888cca023b 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecAcceptCheck.cpp - clang-tidy-------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h index 013fa5fa1725e..9b982b2b104ca 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecAcceptCheck.h - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp index f4d657a7f4e90..cd83423adae05 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecCheck.cpp - clang-tidy-------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCheck.h index edbff70fbd33e..79f7ab3354d8d 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecCheck.h - clang-tidy-----------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.cpp index 8d9f45c3567f2..ae44efb629893 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecCreatCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h index b1ab914d57d15..e0629f2ac4061 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecCreatCheck.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp index 89191083c18c8..5ac1b6fb632e1 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecDupCheck.cpp - clang-tidy----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h index 9554b7cf4ed8f..3016867e56189 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecDupCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.cpp b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.cpp index 01b771b2072c8..f3c26b48c432a 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.cpp @@ -1,4 +1,4 @@ -//===--- CloexecEpollCreate1Check.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h index b7912aba30962..cb0d40b8b9f36 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h @@ -1,4 +1,4 @@ -//===--- CloexecEpollCreate1Check.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.cpp index fc02e542863bc..727f0bef662de 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecEpollCreateCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h index 1dbbcb1e98502..9010179bd7036 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecEpollCreateCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.cpp index bb9d0d2cb3da3..8ddd6a0523156 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecFopenCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h index 96517f55a5b55..1e0e7d76933c7 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecFopenCheck.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.cpp b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.cpp index 910793582d67c..c64ef82a35ad6 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.cpp @@ -1,4 +1,4 @@ -//===--- CloexecInotifyInit1Check.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h index c87f1fdd956e4..50bc4bbaa7de5 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h @@ -1,4 +1,4 @@ -//===--- CloexecInotifyInit1Check.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.cpp index ed591799d0656..d3502205d1642 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecInotifyInitCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h index 1ef07de6cad54..7db4ab15c2f9e 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecInotifyInitCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.cpp index 4e53d607d81e3..5ecf908aabb59 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecMemfdCreateCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h index 1518d20fd4c5c..43a27dd5658a5 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecMemfdCreateCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp index 623b6ab02e7ba..8c24482c73251 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecOpenCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h index 692d2a2319c1c..d95fe21fb3e88 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecOpenCheck.h - clang-tidy-----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.cpp b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.cpp index e32332bdfc953..a024ea3431ddf 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.cpp @@ -1,4 +1,4 @@ -//===--- CloexecPipe2Check.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h index 68e5f4270ceb0..17d9b4f326e86 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h @@ -1,4 +1,4 @@ -//===--- CloexecPipe2Check.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp index c59b127dc87ac..a475dff4a2682 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecPipeCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h index b5ef892196b5a..47a202e8542eb 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecPipeCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.cpp index 12b31a050c2c0..4e9f4c33f0b83 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.cpp @@ -1,4 +1,4 @@ -//===--- CloexecSocketCheck.cpp - clang-tidy-------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h index c046337f1d229..8ef02c1f197b7 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h @@ -1,4 +1,4 @@ -//===--- CloexecSocketCheck.h - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp index 4c33e5db6d5b9..78e58bccaeba1 100644 --- a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp @@ -1,4 +1,4 @@ -//===--- ComparisonInTempFailureRetryCheck.cpp - clang-tidy----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.h b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.h index 201c16fe70aa3..b7316e4c5f47a 100644 --- a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.h +++ b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.h @@ -1,4 +1,4 @@ -//===--- ComparisonInTempFailureRetryCheck.h - clang-tidy--------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/boost/BoostTidyModule.cpp b/clang-tools-extra/clang-tidy/boost/BoostTidyModule.cpp index f414fe750d023..c13a24401afba 100644 --- a/clang-tools-extra/clang-tidy/boost/BoostTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/boost/BoostTidyModule.cpp @@ -1,4 +1,4 @@ -//===------- BoostTidyModule.cpp - clang-tidy -----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp index e45687fde6d9f..34ecee5badb15 100644 --- a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseRangesCheck.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h index d91e6393a0e85..107d801969fc4 100644 --- a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h @@ -1,4 +1,4 @@ -//===--- UseRangesCheck.h - clang-tidy --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.cpp b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.cpp index bcd5def290599..3574108ee5697 100644 --- a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.cpp +++ b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseToStringCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h index f62df83ed8e54..a245d11ee1c8a 100644 --- a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h +++ b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h @@ -1,4 +1,4 @@ -//===--- UseToStringCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp index 15e7b53ed5be0..c0a778a027377 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp @@ -1,4 +1,4 @@ -//===--- ArgumentCommentCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h index 3d608df752c03..30fa32fad72e7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h @@ -1,4 +1,4 @@ -//===--- ArgumentCommentCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp index 0889a1a737189..227641d73885e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp @@ -1,4 +1,4 @@ -//===--- AssertSideEffectCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h index 5cd1132bbd839..b65e1a19e81ac 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h @@ -1,4 +1,4 @@ -//===--- AssertSideEffectCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp index e03cac6c5fd83..2c8856298e7be 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp @@ -1,4 +1,4 @@ -//===--- AssignmentInIfConditionCheck.cpp - clang-tidy --------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.h b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.h index 072b4dd79fe5d..3ae4f36913d5f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.h @@ -1,4 +1,4 @@ -//===--- AssignmentInIfConditionCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp index 8c13ce5a90e9b..e1d0538ab1644 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp @@ -1,4 +1,4 @@ -//===--- BadSignalToKillThreadCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h index f21b8c09eb0c6..aa4d83c89a08d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h @@ -1,4 +1,4 @@ -//===--- BadSignalToKillThreadCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.cpp index 0992e49b7f372..a9e7ae8734677 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.cpp @@ -1,4 +1,4 @@ -//===--- BitwisePointerCastCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.h b/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.h index 1515519b3c9fd..71dc159573619 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.h @@ -1,4 +1,4 @@ -//===--- BitwisePointerCastCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/BoolPointerImplicitConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BoolPointerImplicitConversionCheck.cpp index 09b198d24dc7a..df8552436241e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BoolPointerImplicitConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BoolPointerImplicitConversionCheck.cpp @@ -1,4 +1,4 @@ -//===--- BoolPointerImplicitConversionCheck.cpp - clang-tidy --------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/BoolPointerImplicitConversionCheck.h b/clang-tools-extra/clang-tidy/bugprone/BoolPointerImplicitConversionCheck.h index ef62e3d8b0fb1..19dcdf5218c35 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BoolPointerImplicitConversionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/BoolPointerImplicitConversionCheck.h @@ -1,4 +1,4 @@ -//===--- BoolPointerImplicitConversionCheck.h - clang-tidy ------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp index a6cd68edda55e..07bb08166a006 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp @@ -1,4 +1,4 @@ -//===--- BranchCloneCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.h b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.h index 599da14c136fd..22dbb2384900c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.h @@ -1,4 +1,4 @@ -//===--- BranchCloneCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index fe261e729539c..456f7a34c672a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- BugproneTidyModule.cpp - clang-tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -23,6 +23,7 @@ #include "CopyConstructorInitCheck.h" #include "CrtpConstructorAccessibilityCheck.h" #include "DanglingHandleCheck.h" +#include "DerivedMethodShadowingBaseMethodCheck.h" #include "DynamicStaticInitializersCheck.h" #include "EasilySwappableParametersCheck.h" #include "EmptyCatchCheck.h" @@ -134,6 +135,8 @@ class BugproneModule : public ClangTidyModule { "bugprone-copy-constructor-init"); CheckFactories.registerCheck( "bugprone-dangling-handle"); + CheckFactories.registerCheck( + "bugprone-derived-method-shadowing-base-method"); CheckFactories.registerCheck( "bugprone-dynamic-static-initializers"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index 46bc8efd44bc5..6bae7a4a71b2b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -19,6 +19,7 @@ add_clang_library(clangTidyBugproneModule STATIC CopyConstructorInitCheck.cpp CrtpConstructorAccessibilityCheck.cpp DanglingHandleCheck.cpp + DerivedMethodShadowingBaseMethodCheck.cpp DynamicStaticInitializersCheck.cpp EasilySwappableParametersCheck.cpp EmptyCatchCheck.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp index f188ae5ec81b1..a376de505dd70 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp @@ -1,4 +1,4 @@ -//===--- CapturingThisInMemberVariableCheck.cpp - clang-tidy --------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h index 934f99cd35797..6aba9ee84d2bd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h @@ -1,4 +1,4 @@ -//===--- CapturingThisInMemberVariableCheck.h - clang-tidy ------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp index f0a9ace229740..aaddf4bdd259e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp @@ -1,4 +1,4 @@ -//===--- CastingThroughVoidCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.h b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.h index 834676aaf0543..313f3f240f5b3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.h @@ -1,4 +1,4 @@ -//===--- CastingThroughVoidCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp index 7378f1a24ffd3..6af535f712d71 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp @@ -1,4 +1,4 @@ -//===--- ChainedComparisonCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h index a914149a42e69..bf8e3f709d30b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h @@ -1,4 +1,4 @@ -//===--- ChainedComparisonCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/ComparePointerToMemberVirtualFunctionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ComparePointerToMemberVirtualFunctionCheck.cpp index 1cbf1e22a33a7..602b63e43ad9e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ComparePointerToMemberVirtualFunctionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ComparePointerToMemberVirtualFunctionCheck.cpp @@ -1,4 +1,4 @@ -//===--- ComparePointerToMemberVirtualFunctionCheck.cpp - clang-tidy ------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp index 6b26b46d032f9..76bcbbbcdf680 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp @@ -1,4 +1,4 @@ -//===--- CopyConstructorInitCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h index 4aef892476c46..02755b5894b18 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h @@ -1,4 +1,4 @@ -//===--- CopyConstructorInitCheck.h - clang-tidy--------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp index 0625468d9da88..60f7be8996933 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp @@ -1,4 +1,4 @@ -//===--- CrtpConstructorAccessibilityCheck.cpp - clang-tidy ---------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.h b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.h index 785116218f468..c7d7c9f7c0e69 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.h @@ -1,4 +1,4 @@ -//===--- CrtpConstructorAccessibilityCheck.h - clang-tidy -------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp index e13b1ceacc539..5b741e8c35b9a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp @@ -1,4 +1,4 @@ -//===--- DanglingHandleCheck.cpp - clang-tidy------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h index 981e9b571a618..6443b0aa59548 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h @@ -1,4 +1,4 @@ -//===--- DanglingHandleCheck.h - clang-tidy----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp new file mode 100644 index 0000000000000..743e6cd27509b --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DerivedMethodShadowingBaseMethodCheck.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" + +using namespace clang::ast_matchers; + +namespace clang::tidy::bugprone { + +static bool sameBasicType(const ParmVarDecl *Lhs, const ParmVarDecl *Rhs) { + return Lhs && Rhs && + Lhs->getType() + .getCanonicalType() + .getNonReferenceType() + .getUnqualifiedType() == Rhs->getType() + .getCanonicalType() + .getNonReferenceType() + .getUnqualifiedType(); +} + +static bool namesCollide(const CXXMethodDecl &Lhs, const CXXMethodDecl &Rhs) { + if (Lhs.getNameAsString() != Rhs.getNameAsString()) + return false; + if (Lhs.isConst() != Rhs.isConst()) + return false; + if (Lhs.getNumParams() != Rhs.getNumParams()) + return false; + for (unsigned int It = 0; It < Lhs.getNumParams(); ++It) + if (!sameBasicType(Lhs.getParamDecl(It), Rhs.getParamDecl(It))) + return false; + return true; +} + +namespace { + +AST_MATCHER(CXXMethodDecl, nameCollidesWithMethodInBase) { + const CXXRecordDecl *DerivedClass = Node.getParent(); + for (const auto &Base : DerivedClass->bases()) { + llvm::SmallVector Stack; + Stack.push_back(&Base); + while (!Stack.empty()) { + const CXXBaseSpecifier *CurrentBaseSpec = Stack.back(); + Stack.pop_back(); + + if (CurrentBaseSpec->getAccessSpecifier() == + clang::AccessSpecifier::AS_private) + continue; + + const CXXRecordDecl *CurrentRecord = + CurrentBaseSpec->getType()->getAsCXXRecordDecl(); + if (!CurrentRecord) + continue; + + // For multiple inheritance, we ignore only the bases that come from the + // std:: namespace + if (CurrentRecord->isInStdNamespace()) + continue; + + for (const auto &BaseMethod : CurrentRecord->methods()) { + if (namesCollide(*BaseMethod, Node)) { + ast_matchers::internal::BoundNodesTreeBuilder Result(*Builder); + 
Builder->setBinding("base_method", + clang::DynTypedNode::create(*BaseMethod)); + return true; + } + } + + for (const auto &SubBase : CurrentRecord->bases()) + Stack.push_back(&SubBase); + } + } + return false; +} + +// Same as clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp, +// similar matchers are used elsewhere in LLVM +AST_MATCHER(CXXMethodDecl, isOutOfLine) { return Node.isOutOfLine(); } + +} // namespace + +DerivedMethodShadowingBaseMethodCheck::DerivedMethodShadowingBaseMethodCheck( + StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context) {} + +void DerivedMethodShadowingBaseMethodCheck::registerMatchers( + MatchFinder *Finder) { + Finder->addMatcher( + cxxMethodDecl( + unless(anyOf(isOutOfLine(), isStaticStorageClass(), isImplicit(), + cxxConstructorDecl(), isOverride(), isPrivate(), + // isFinal(), //included with isOverride, + // Templates are not handled yet + ast_matchers::isTemplateInstantiation(), + ast_matchers::isExplicitTemplateSpecialization())), + ofClass(cxxRecordDecl(isDerivedFrom(cxxRecordDecl())) + .bind("derived_class")), + nameCollidesWithMethodInBase()) + .bind("shadowing_method"), + this); +} + +void DerivedMethodShadowingBaseMethodCheck::check( + const MatchFinder::MatchResult &Result) { + const auto *ShadowingMethod = + Result.Nodes.getNodeAs("shadowing_method"); + const auto *DerivedClass = + Result.Nodes.getNodeAs("derived_class"); + const auto *BaseMethod = Result.Nodes.getNodeAs("base_method"); + + if (!ShadowingMethod || !DerivedClass || !BaseMethod) + llvm_unreachable("Required binding not found"); + + diag(ShadowingMethod->getBeginLoc(), + "'%0' shadows method with the same name in class %1") + << ShadowingMethod->getQualifiedNameAsString() << BaseMethod->getParent(); + diag(BaseMethod->getBeginLoc(), "previous definition of %0 is here", + DiagnosticIDs::Note) + << ShadowingMethod; +} + +} // namespace clang::tidy::bugprone diff --git 
a/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.h b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.h new file mode 100644 index 0000000000000..d157e84ebdd98 --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.h @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DERIVEDMETHODSHADOWINGBASEMETHODCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DERIVEDMETHODSHADOWINGBASEMETHODCHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::bugprone { + +/// Checks that a derived class does not define the same (non virtual) method as +/// a base class +/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.html +class DerivedMethodShadowingBaseMethodCheck : public ClangTidyCheck { +public: + DerivedMethodShadowingBaseMethodCheck(StringRef Name, + ClangTidyContext *Context); + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { + return LangOpts.CPlusPlus; + } + std::optional getCheckTraversalKind() const override { + return TK_IgnoreUnlessSpelledInSource; + } +}; + +} // namespace clang::tidy::bugprone + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DERIVEDMETHODSHADOWINGBASEMETHODCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp 
b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp index 3fe028b94771d..4d0428ec18598 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp @@ -1,4 +1,4 @@ -//===--- DynamicStaticInitializersCheck.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h index 66ed2828502b6..e02c62a53ffa0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h @@ -1,4 +1,4 @@ -//===--- DynamicStaticInitializersCheck.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp index 3c718f1ddbe95..d8207b30f1b5e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp @@ -1,4 +1,4 @@ -//===--- EasilySwappableParametersCheck.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -82,7 +82,7 @@ static constexpr bool DefaultModelImplicitConversions = true; /// used together. static constexpr bool DefaultSuppressParametersUsedTogether = true; -/// The default value for the NamePrefixSuffixSilenceDissimilarityTreshold +/// The default value for the NamePrefixSuffixSilenceDissimilarityThreshold /// check option. static constexpr std::size_t DefaultNamePrefixSuffixSilenceDissimilarityTreshold = 1; @@ -1435,7 +1435,7 @@ static MixableParameterRange modelMixingRange( StringRef PrevParamName = FD->getParamDecl(I - 1)->getName(); if (!ParamName.empty() && !PrevParamName.empty() && filter::prefixSuffixCoverUnderThreshold( - Check.NamePrefixSuffixSilenceDissimilarityTreshold, PrevParamName, + Check.NamePrefixSuffixSilenceDissimilarityThreshold, PrevParamName, ParamName)) { LLVM_DEBUG(llvm::dbgs() << "Parameter '" << ParamName << "' follows a pattern with previous parameter '" @@ -2108,8 +2108,8 @@ EasilySwappableParametersCheck::EasilySwappableParametersCheck( SuppressParametersUsedTogether( Options.get("SuppressParametersUsedTogether", DefaultSuppressParametersUsedTogether)), - NamePrefixSuffixSilenceDissimilarityTreshold( - Options.get("NamePrefixSuffixSilenceDissimilarityTreshold", + NamePrefixSuffixSilenceDissimilarityThreshold( + Options.get("NamePrefixSuffixSilenceDissimilarityThreshold", DefaultNamePrefixSuffixSilenceDissimilarityTreshold)) {} void EasilySwappableParametersCheck::storeOptions( @@ -2123,8 +2123,8 @@ void EasilySwappableParametersCheck::storeOptions( Options.store(Opts, "ModelImplicitConversions", ModelImplicitConversions); Options.store(Opts, "SuppressParametersUsedTogether", SuppressParametersUsedTogether); - Options.store(Opts, "NamePrefixSuffixSilenceDissimilarityTreshold", - NamePrefixSuffixSilenceDissimilarityTreshold); + Options.store(Opts, "NamePrefixSuffixSilenceDissimilarityThreshold", + NamePrefixSuffixSilenceDissimilarityThreshold); } void EasilySwappableParametersCheck::registerMatchers(MatchFinder *Finder) { diff 
--git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.h b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.h index 055ae80dee8f3..284b4f5b9935e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.h @@ -1,4 +1,4 @@ -//===--- EasilySwappableParametersCheck.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -54,7 +54,7 @@ class EasilySwappableParametersCheck : public ClangTidyCheck { /// either end for the report about the parameters to be silenced. /// E.g. the names "LHS" and "RHS" are 1-dissimilar suffixes of each other, /// while "Text1" and "Text2" are 1-dissimilar prefixes of each other. - const std::size_t NamePrefixSuffixSilenceDissimilarityTreshold; + const std::size_t NamePrefixSuffixSilenceDissimilarityThreshold; }; } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp index be0a2a1baa12a..eebab847d1070 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp @@ -1,4 +1,4 @@ -//===--- EmptyCatchCheck.cpp - clang-tidy ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.h b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.h index b0694384f5c2f..acef43934adba 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.h @@ -1,4 +1,4 @@ -//===--- EmptyCatchCheck.h - clang-tidy -------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp index 8eb7881a47a26..3d839b5111cc8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp @@ -1,4 +1,4 @@ -//===--- ExceptionEscapeCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h index 14b9e8cc0a77f..974b07c42407d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h @@ -1,4 +1,4 @@ -//===--- ExceptionEscapeCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.cpp index d70cd2836c80f..96e5d5d06ad70 100644 --- a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.cpp @@ -1,4 +1,4 @@ -//===--- FoldInitTypeCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h index 435c440ddd29f..72dab600e3330 100644 --- a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h @@ -1,4 +1,4 @@ -//===--- FoldInitTypeCheck.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp index 070ed04efffc4..c3db8fa9b3af2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp @@ -1,4 +1,4 @@ -//===--- ForwardDeclarationNamespaceCheck.cpp - clang-tidy ------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.h b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.h index 266346960a141..700e52f7bb86d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.h @@ -1,4 +1,4 @@ -//===--- ForwardDeclarationNamespaceCheck.h - clang-tidy --------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp index 10b747e17e2ad..d372cbd798b2e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp @@ -1,4 +1,4 @@ -//===--- ForwardingReferenceOverloadCheck.cpp - clang-tidy-----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.h b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.h index 75297dbbdf41d..ead0edb6a2b3a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.h @@ -1,4 +1,4 @@ -//===--- ForwardingReferenceOverloadCheck.h - clang-tidy---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp index 46bf20e34ce04..2211a0ba24ebc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp @@ -1,4 +1,4 @@ -//===--- ImplicitWideningOfMultiplicationResultCheck.cpp - clang-tidy -----===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h index 077a4b847cd9c..74c64eb43f3c9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h @@ -1,4 +1,4 @@ -//===--- ImplicitWideningOfMultiplicationResultCheck.h ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp index 92425890a0ea8..b0dd9017c8426 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp @@ -1,4 +1,4 @@ -//===--- InaccurateEraseCheck.cpp - clang-tidy-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.h b/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.h index 5bf29d04e4068..3485ffdd89257 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.h @@ -1,4 +1,4 @@ -//===--- InaccurateEraseCheck.h - clang-tidy---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.cpp index 73bffe93146e6..9ce6d42344cdf 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.cpp @@ -1,4 +1,4 @@ -//===--- IncDecInConditionsCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.h index 1f2f1690041fd..2e2dcb1cde7bc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.h @@ -1,4 +1,4 @@ -//===--- IncDecInConditionsCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp index 07cd90d64c2a4..84a99c36523ac 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp @@ -1,4 +1,4 @@ -//===--- IncorrectEnableIfCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.h b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.h index 37a52b425aa80..ea9cb4ecd0006 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.h @@ -1,4 +1,4 @@ -//===--- IncorrectEnableIfCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.cpp index 425e46cf6c88c..1b3c4fe847af0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.cpp @@ -1,4 +1,4 @@ -//===--- IncorrectEnableSharedFromThisCheck.cpp - clang-tidy --------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h index 987c56059259b..866ae56631e36 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h @@ -1,4 +1,4 @@ -//===--- IncorrectEnableSharedFromThisCheck.h - clang-tidy ------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectRoundingsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncorrectRoundingsCheck.cpp index c2b0732a3e7bd..a4965c298adbc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectRoundingsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectRoundingsCheck.cpp @@ -1,4 +1,4 @@ -//===--- IncorrectRoundingsCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectRoundingsCheck.h b/clang-tools-extra/clang-tidy/bugprone/IncorrectRoundingsCheck.h index 292cfbaa688f9..a671a4af95f12 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectRoundingsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectRoundingsCheck.h @@ -1,4 +1,4 @@ -//===--- IncorrectRoundingsCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp index cda9c4e7a6e58..1e516c1573219 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp @@ -1,4 +1,4 @@ -//===--- InfiniteLoopCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.h b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.h index 6d0c6d75f2262..0766e2fa3c35d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.h @@ -1,4 +1,4 @@ -//===--- InfiniteLoopCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.cpp index 63c932c94b6fb..a262f9b9fc4bc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.cpp @@ -1,4 +1,4 @@ -//===--- IntegerDivisionCheck.cpp - clang-tidy-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h index b74b206f07373..b191cf693029e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h @@ -1,4 +1,4 @@ -//===--- IntegerDivisionCheck.h - clang-tidy---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp index 7d92ef301aec3..1e657888b0fc0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp @@ -1,4 +1,4 @@ -//===--- InvalidEnumDefaultInitializationCheck.cpp - clang-tidy -----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h index 0746c4d025d1f..b9b4f20d111fc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h @@ -1,4 +1,4 @@ -//===--- InvalidEnumDefaultInitializationCheck.h - clang-tidy -*- C++ -*---===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp index 8517d2bac0d59..fb73e896fdb13 100644 --- a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp @@ -1,4 +1,4 @@ -//===--- LambdaFunctionNameCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.h b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.h index 04ba3596167e3..9e53951c4a7bd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.h @@ -1,4 +1,4 @@ -//===--- LambdaFunctionNameCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.cpp index 7d89e107a62d2..b16119daaad8a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.cpp @@ -1,4 +1,4 @@ -//===--- MacroParenthesesCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.h b/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.h index 47c18d8d60bdd..34ea582153ebb 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.h @@ -1,4 +1,4 @@ -//===--- MacroParenthesesCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp index 879040177079a..78a53d12bd312 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp @@ -1,4 +1,4 @@ -//===--- MacroRepeatedSideEffectsCheck.cpp - clang-tidy--------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.h b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.h index a15b8d4671e1b..25b33ba3082af 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.h @@ -1,4 +1,4 @@ -//===--- MacroRepeatedSideEffectsCheck.h - clang-tidy -----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.cpp index 23de8d971898e..ff7f3020102ad 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.cpp @@ -1,4 +1,4 @@ -//===--- MisleadingSetterOfReferenceCheck.cpp - clang-tidy-----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.h index 99e7a9435cfa9..b44f7a4ccb795 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.h @@ -1,4 +1,4 @@ -//===--- MisleadingSetterOfReferenceCheck.h - clang-tidy---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.cpp index 5b166b4b3a9bf..5925e28eed734 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.cpp @@ -1,4 +1,4 @@ -//===--- MisplacedOperatorInStrlenInAllocCheck.cpp - clang-tidy------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h index 93cf50d0b1c6f..764fd3ff97fed 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h @@ -1,4 +1,4 @@ -//===--- MisplacedOperatorInStrlenInAllocCheck.h - clang-tidy----*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.cpp index 86785d36696dc..f5acafb7637ad 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.cpp @@ -1,4 +1,4 @@ -//===--- MisplacedPointerArithmeticInAllocCheck.cpp - clang-tidy-----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h index 83cd0ddf3cc49..a86d2a33d503f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h @@ -1,4 +1,4 @@ -//===--- MisplacedPointerArithmeticInAllocCheck.h - clang-tidy---*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp index 219972e0bdad7..d508e2aaba53c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp @@ -1,4 +1,4 @@ -//===--- MisplacedWideningCastCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.h index d03506838d07e..5fde5c7d0e46d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.h @@ -1,4 +1,4 @@ -//===--- MisplacedWideningCastCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp index 5dc988d6662df..66559a0e5d7b5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp @@ -1,4 +1,4 @@ -//===--- MoveForwardingReferenceCheck.cpp - clang-tidy --------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.h b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.h index 2bbafe716124b..4fc876a232f37 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.h @@ -1,4 +1,4 @@ -//===--- MoveForwardingReferenceCheck.h - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp index 1a23473fdd229..2eff013b2ab7d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp @@ -1,4 +1,4 @@ -//===--- MultiLevelImplicitPointerConversionCheck.cpp - clang-tidy --------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.h b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.h index 5ec78be0ea79b..d0a9a21523862 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.h @@ -1,4 +1,4 @@ -//===--- MultiLevelImplicitPointerConversionCheck.h - clang-tidy *- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp index 6344b4bb6271e..17aea9392bd26 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp @@ -1,4 +1,4 @@ -//===--- MultipleNewInOneExpressionCheck.cpp - clang-tidy------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.h b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.h index 29eea12ff7192..53ad4a514bcc7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.h @@ -1,4 +1,4 @@ -//===--- MultipleNewInOneExpressionCheck.h - clang-tidy----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp index de05cc0e4f7fb..390f3dd472a5b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp @@ -1,4 +1,4 @@ -//===--- MultipleStatementMacroCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h index 626cddce6734c..73a00fa493797 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h @@ -1,4 +1,4 @@ -//===--- MultipleStatementMacroCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp index 249c77ca0c432..287ee95a4db55 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp @@ -1,4 +1,4 @@ -//===--- NarrowingConversionsCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h index 116a8cba8d321..1f37086e3af55 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h @@ -1,4 +1,4 @@ -//===--- NarrowingConversionsCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.cpp index 8023e32d53278..6d21c521bbca7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoEscapeCheck.cpp - clang-tidy -----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.h index 476c7749d6e04..4760b171e75ce 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.h @@ -1,4 +1,4 @@ -//===--- NoEscapeCheck.h - clang-tidy ---------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.cpp index e0b0df98d3409..067577f184281 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.cpp @@ -1,4 +1,4 @@ -//===--- NonZeroEnumToBoolConversionCheck.cpp - clang-tidy ----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.h b/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.h index f1cb81f05a723..977545fd5b65c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.h @@ -1,4 +1,4 @@ -//===--- NonZeroNonZeroEnumToBoolConversionCheck.h - clang-tidy -*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp index 2ddcfa02bfb96..abde115d10a1b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp @@ -1,4 +1,4 @@ -//===----- NondeterministicPointerIterationOrderCheck.cpp - clang-tidy ----===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp index 203170d55f694..d4676842a97ff 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp @@ -1,4 +1,4 @@ -//===--- NotNullTerminatedResultCheck.cpp - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h index 1eeead02b17ab..fa2ca59b65300 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h @@ -1,4 +1,4 @@ -//===--- NotNullTerminatedResultCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.cpp index cda9288c0531a..1b1e0401556e0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.cpp @@ -1,4 +1,4 @@ -//===--- OptionalValueConversionCheck.cpp - clang-tidy --------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.h b/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.h index 888d29fc937bd..83e08e7359224 100644 --- a/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.h @@ -1,4 +1,4 @@ -//===--- OptionalValueConversionCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.cpp index 56576ba360399..3c0ced96c05ac 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.cpp @@ -1,4 +1,4 @@ -//===--- ParentVirtualCallCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.h b/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.h index 293069fd24665..2f86d75a6d64d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.h @@ -1,4 +1,4 @@ -//===--- ParentVirtualCallCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.cpp index 6e6ad10fabbb3..c21abad947912 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.cpp @@ -1,4 +1,4 @@ -//===--- PointerArithmeticOnPolymorphicObjectCheck.cpp - clang-tidy--------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h index 84f2d8e74ba87..13f9df656c98c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h @@ -1,4 +1,4 @@ -//===--- PointerArithmeticOnPolymorphicObjectCheck.h ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp index f05924b81c4c0..57196adf38fb6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp @@ -1,4 +1,4 @@ -//===--- PosixReturnCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h index e9d2263b06bc9..d72c86c060fb9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h @@ -1,4 +1,4 @@ -//===--- PosixReturnCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp index e717564847e4a..6abe53f47b8f9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantBranchConditionCheck.cpp - clang-tidy--------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h index 1dfab69db0a22..854de520807fc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantBranchConditionCheck.h - clang-tidy -----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp index 5812c18a2ccca..62e22450800ea 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp @@ -1,4 +1,4 @@ -//===--- ReservedIdentifierCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h b/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h index 474dc25f6386c..5b6fbff266217 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h @@ -1,4 +1,4 @@ -//===--- ReservedIdentifierCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp index 295955a971d7e..a3265293bef58 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp @@ -1,4 +1,4 @@ -//===--- ReturnConstRefFromParameterCheck.cpp - clang-tidy ----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.h b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.h index 8768d07087383..8149bd29030b3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.h @@ -1,4 +1,4 @@ -//===--- ReturnConstRefFromParameterCheck.h - clang-tidy --------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.cpp index 72036aaff158c..2997d7d3167e4 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.cpp @@ -1,4 +1,4 @@ -//===--- SharedPtrArrayMismatchCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.h b/clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.h index 3787eb345b857..c163c6b7fbafb 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.h @@ -1,4 +1,4 @@ -//===--- SharedPtrArrayMismatchCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp index fa38f5e07f832..86af5cbd94374 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp @@ -1,4 +1,4 @@ -//===--- SignalHandlerCheck.cpp - clang-tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h index 3573cdf49e1d1..6589b19fbe048 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h @@ -1,4 +1,4 @@ -//===--- SignalHandlerCheck.h - clang-tidy ----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp index dfd3cbfcd664a..1041355a0caad 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp @@ -1,4 +1,4 @@ -//===--- SignedCharMisuseCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.h b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.h index 42d6080736d30..c735ac634c801 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.h @@ -1,4 +1,4 @@ -//===--- SignedCharMisuseCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.cpp index df2a3e26ea8dc..08ff82f57e3c2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.cpp @@ -1,4 +1,4 @@ -//===--- SizeofContainerCheck.cpp - clang-tidy-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h b/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h index f50ce99c6d4c0..8fc351b8c6cb2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h @@ -1,4 +1,4 @@ -//===--- SizeofContainerCheck.h - clang-tidy---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp index 8da6227e172cd..139213ed359ba 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp @@ -1,4 +1,4 @@ -//===--- SizeofExpressionCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h index e979b4723cf2e..6d7c33977db93 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h @@ -1,4 +1,4 @@ -//===--- SizeofExpressionCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp index fbdb676be68b0..ee797ecb694bd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp @@ -1,4 +1,4 @@ -//===--- SmartPtrArrayMismatchCheck.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.h b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.h index 7fcc4b6cfa0e6..b7703a7d61c03 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.h @@ -1,4 +1,4 @@ -//===--- SharedPtrArrayMismatchCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp index 9bdd167a7afe9..1e8058bc4abc9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp @@ -1,4 +1,4 @@ -//===--- SpuriouslyWakeUpFunctionsCheck.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h index 098299aea7dee..23bf8056c0f61 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h @@ -1,4 +1,4 @@ -//===--- SpuriouslyWakeUpFunctionsCheck.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp index 5d9e91e0b82c7..a7958cc229ffe 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp @@ -1,4 +1,4 @@ -//===--- StandaloneEmptyCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.h b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.h index 6eaf3e40cb782..85b25d8e25abc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.h @@ -1,4 +1,4 @@ -//===--- StandaloneEmptyCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp index d1902b658061b..e4f7a1778fd44 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp @@ -1,4 +1,4 @@ -//===--- StringConstructorCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h index e90cf44b07680..5ab05e119abe9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h @@ -1,4 +1,4 @@ -//===--- StringConstructorCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp index 4f93b3ef779f5..93a55ef549896 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp @@ -1,4 +1,4 @@ -//===--- StringIntegerAssignmentCheck.cpp - clang-tidy---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.h index 1e86fdfaa3a89..68783b7da53c6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.h @@ -1,4 +1,4 @@ -//===--- StringIntegerAssignmentCheck.h - clang-tidy-------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.cpp index 444f3081b704d..b3e0673ea6103 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.cpp @@ -1,4 +1,4 @@ -//===--- StringLiteralWithEmbeddedNulCheck.cpp - clang-tidy----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.h index 85d172d835d21..59aece123057a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.h @@ -1,4 +1,4 @@ -//===--- StringLiteralWithEmbeddedNulCheck.h - clang-tidy--------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.cpp index 20789b3123e2f..faa07fff5a369 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.cpp @@ -1,4 +1,4 @@ -//===--- StringviewNullptrCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.h index 20757cbbaaf7a..81a10101049c9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.h @@ -1,4 +1,4 @@ -//===--- StringviewNullptrCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp index f2067bec001cc..8dbe1c0153f35 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousEnumUsageCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.h index c8a70c5f07043..542bf7577f927 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousEnumUsageCheck.h - clang-tidy------------------*- C++-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp index 09ba79f055752..843368e723f1f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousIncludeCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.h index 1167b5a4593f7..03f569e5a483e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousIncludeCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp index 84957e0b8190c..d1df2a8634035 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousMemoryComparisonCheck.cpp - clang-tidy -----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.h index 5625739ef1327..c36d256242e19 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousMemoryComparisonCheck.h - clang-tidy ---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp index cc1bd622039bc..b1d12ba306814 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousMemsetUsageCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h index db2c09a86ddbd..41ef525c7f9dd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousMemsetUsageCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp index 5b1b28dbfbadd..a41f65083653a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousMissingCommaCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.h index 215344b6bfe53..3a26b0a4a317e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousMissingCommaCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp index 221cc832882f2..b5da8016f2cc8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousReallocUsageCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.h index 2dcbd348697b4..2517d5f7ae319 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousReallocUsageCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp index e93ba760f447e..543d31285af8c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousSemicolonCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.h index c97bfad665595..73131c7f9f12a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousSemicolonCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp index 33cf04dd56593..7519685418c8c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousStringCompareCheck.cpp - clang-tidy---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.h index c399c26786d89..6f01b1ad087bd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousStringCompareCheck.h - clang-tidy-------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp index 8f4b0c5e0dced..d239cbe1fd2cf 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousStringviewDataUsageCheck.cpp - clang-tidy --------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h index 31eca0a48722f..57cb164af8565 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousStringviewDataUsageCheck.h - clang-tidy -------//C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp index 7fdb67e9a7cd9..bcedff5ef5aa2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp @@ -1,4 +1,4 @@ -//===--- SwappedArgumentsCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.h b/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.h index dcf57eca956f5..e9e779c0cb3d9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.h @@ -1,4 +1,4 @@ -//===--- SwappedArgumentsCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.cpp index b8fc62e8c3292..d821c40f2760a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.cpp @@ -1,4 +1,4 @@ -//===--- SwitchMissingDefaultCaseCheck.cpp - clang-tidy -------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.h b/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.h index b0d6e2062b997..f5237775650ea 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.h @@ -1,4 +1,4 @@ -//===--- SwitchMissingDefaultCaseCheck.h - clang-tidy -----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.cpp index 02f4421efdbf4..a85a136b92e87 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.cpp @@ -1,4 +1,4 @@ -//===--- TaggedUnionMemberCountCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h index 8b9d677d00b40..0c337df405061 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h @@ -1,4 +1,4 @@ -//===--- TaggedUnionMemberCountCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.cpp index d31211b571734..c8ce77ed6a4ab 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.cpp @@ -1,4 +1,4 @@ -//===--- TerminatingContinueCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.h b/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.h index 0593c7433c94e..79a794de3819a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.h @@ -1,4 +1,4 @@ -//===--- TerminatingContinueCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp index 17d2e75e4f666..89eafb15f2652 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp @@ -1,4 +1,4 @@ -//===--- ThrowKeywordMissingCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.h b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.h index 018dceb010c26..ee1e7d20d39e0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.h @@ -1,4 +1,4 @@ -//===--- ThrowKeywordMissingCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp index 4ceeefb78ee82..536b6806c66e6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp @@ -1,4 +1,4 @@ -//===--- TooSmallLoopVariableCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.h index 03065e3a706a7..e2c1bb7b002e3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.h @@ -1,4 +1,4 @@ -//===--- TooSmallLoopVariableCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.cpp index 0b51d5677929c..e8f204128cae3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.cpp @@ -1,4 +1,4 @@ -//===--- UncheckedOptionalAccessCheck.cpp - clang-tidy --------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h index e2fcccbfefb26..3c0f261126823 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h @@ -1,4 +1,4 @@ -//===--- UncheckedOptionalAccessCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.cpp index 4f6bc18151789..c5a0b3d6d963b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.cpp @@ -1,4 +1,4 @@ -//===--- UndefinedMemoryManipulationCheck.cpp - clang-tidy-----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h index 5e2d7d8ce48ec..fd067c48a16e0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h @@ -1,4 +1,4 @@ -//===--- UndefinedMemoryManipulationCheck.h - clang-tidy---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.cpp index a4c1fd53dfbe2..c358a8e0378bd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.cpp @@ -1,4 +1,4 @@ -//===--- UndelegatedConstructorCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h b/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h index 03cf5606ef529..18465f7353b1d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h @@ -1,4 +1,4 @@ -//===--- UndelegatedConstructorCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp index 5e220017c97f4..bf30753f0e5ef 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnhandledExceptionAtNewCheck.cpp - clang-tidy --------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.h index 6e2c6aa373de7..0724b4ac6d3e9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.h @@ -1,4 +1,4 @@ -//===--- UnhandledExceptionAtNewCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp index c4c4267545b59..b696089c006c7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnhandledSelfAssignmentCheck.cpp - clang-tidy --------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.h index f666e6bfad2e6..61d33028aadc8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.h @@ -1,4 +1,4 @@ -//===--- UnhandledSelfAssignmentCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp index 57e1f744fcd7d..bce46572bdeb9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnintendedCharOstreamOutputCheck.cpp - clang-tidy ----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.h index 0759e3d1eb460..af53dc6158696 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.h @@ -1,4 +1,4 @@ -//===--- UnintendedCharOstreamOutputCheck.h - clang-tidy --------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UniquePtrArrayMismatchCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UniquePtrArrayMismatchCheck.cpp index 8d09b4b320c2c..34c2c6dd4642d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UniquePtrArrayMismatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UniquePtrArrayMismatchCheck.cpp @@ -1,4 +1,4 @@ -//===--- UniquePtrArrayMismatchCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UniquePtrArrayMismatchCheck.h b/clang-tools-extra/clang-tidy/bugprone/UniquePtrArrayMismatchCheck.h index fb7531ab146c6..36be247c409cb 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UniquePtrArrayMismatchCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UniquePtrArrayMismatchCheck.h @@ -1,4 +1,4 @@ -//===--- UniquePtrArrayMismatchCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp index 0f2c18ae02663..0399af2a673f4 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnsafeFunctionsCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h index 9b2ec990be01f..6495bd34f6c58 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h @@ -1,4 +1,4 @@ -//===--- UnsafeFunctionsCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp index 3b6969a57c2b8..2b7db2548cfb2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnusedLocalNonTrivialVariableCheck.cpp - clang-tidy --------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h index e79b803a2158b..92eaf290f2073 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h @@ -1,4 +1,4 @@ -//===--- UnusedLocalNonTrivialVariableCheck.h - clang-tidy ------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp index b17d3868dd76a..dae679baf14e5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnusedRaiiCheck.cpp - clang-tidy ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.h index 219fa07fe8265..376f664f74548 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.h @@ -1,4 +1,4 @@ -//===--- UnusedRaiiCheck.h - clang-tidy -------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp index 6f08c41b41887..c2fc4af86391d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnusedReturnValueCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.h index d65a567e1c468..f81603cadbe80 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.h @@ -1,4 +1,4 @@ -//===--- UnusedReturnValueCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp index f9906ebf6ea26..efb5ec64689cf 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseAfterMoveCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.h b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.h index c14e802847415..ac85c80ee0b5b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.h @@ -1,4 +1,4 @@ -//===--- UseAfterMoveCheck.h - clang-tidy ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp index 509fce3a38471..0c8d2b8ef40f9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp @@ -1,4 +1,4 @@ -//===--- VirtualNearMissCheck.cpp - clang-tidy-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h index 0d6b6db7f9a7f..b852dffa7c6ea 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h @@ -1,4 +1,4 @@ -//===--- VirtualNearMissCheck.h - clang-tidy---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index a0d0ac1007c3e..c9c150dc230b5 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- CERTTidyModule.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.cpp b/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.cpp index d377c24da1f46..d87396f5189b1 100644 --- a/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.cpp @@ -1,4 +1,4 @@ -//===-- CommandProcessorCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.h b/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.h index 1f9206cae73d7..94234f284c045 100644 --- a/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.h +++ b/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.h @@ -1,4 +1,4 @@ -//===--- CommandInterpreterCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp b/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp index 2c2248afb69e7..45c170ec20f4e 100644 --- a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp @@ -1,4 +1,4 @@ -//===--- DefaultOperatorNewCheck.cpp - clang-tidy --------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h b/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h index d38a9edb0f95d..f8cb4d6e32d69 100644 --- a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h +++ b/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h @@ -1,4 +1,4 @@ -//===--- DefaultOperatorNewCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp index e86ed6319a695..79fbc66b5f8a3 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp @@ -1,4 +1,4 @@ -//===--- DontModifyStdNamespaceCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h b/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h index d35affa0adc9c..cfcd878644ddb 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h @@ -1,4 +1,4 @@ -//===--- DontModifyStdNamespaceCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp b/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp index 46acc9f4716c5..01299e0e5ab48 100644 --- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp +++ b/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp @@ -1,4 +1,4 @@ -//===--- FloatLoopCounter.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h b/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h index 7bc4422aef29b..e9207385f0d20 100644 --- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h +++ b/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h @@ -1,4 +1,4 @@ -//===--- FloatLoopCounter.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.cpp b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.cpp index f1d2bcaa916d5..4fe9c6c22590b 100644 --- a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.cpp @@ -1,4 +1,4 @@ -//===--- LimitedRandomnessCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h index 051e100e2cec8..b024b9008d876 100644 --- a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h +++ b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h @@ -1,4 +1,4 @@ -//===--- LimitedRandomnessCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp b/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp index a97fd720df86a..fb9d72ce6bd31 100644 --- a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp @@ -1,4 +1,4 @@ -//===--- MutatingCopyCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h b/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h index 8cb7b8e5fa2ce..ecb3d164b5272 100644 --- a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h +++ b/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h @@ -1,4 +1,4 @@ -//===--- MutatingCopyCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp b/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp index afeef2a80ba54..e266cf995e8a7 100644 --- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp @@ -1,4 +1,4 @@ -//===--- NonTrivialTypesLibcMemoryCallsCheck.cpp - clang-tidy ----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h b/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h index aecb37dd7c739..221bdca0baae7 100644 --- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h +++ b/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h @@ -1,4 +1,4 @@ -//===--- NonTrivialTypesLibcMemoryCallsCheck.h - clang-tidy -----*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp index cf7b36e05dc4f..aa95fadb0290b 100644 --- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProperlySeededRandomGeneratorCheck.cpp - clang-tidy---------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h index 9f96c6124c6f0..d34b8e702f670 100644 --- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h +++ b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h @@ -1,4 +1,4 @@ -//===--- ProperlySeededRandomGeneratorCheck.h - clang-tidy-------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/SetLongJmpCheck.cpp b/clang-tools-extra/clang-tidy/cert/SetLongJmpCheck.cpp index e7d9342bf748f..4f282b2c6b344 100644 --- a/clang-tools-extra/clang-tidy/cert/SetLongJmpCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/SetLongJmpCheck.cpp @@ -1,4 +1,4 @@ -//===--- SetLongJmpCheck.cpp - clang-tidy----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/SetLongJmpCheck.h b/clang-tools-extra/clang-tidy/cert/SetLongJmpCheck.h index 6e5d8385d9428..ced3d8cd1b316 100644 --- a/clang-tools-extra/clang-tidy/cert/SetLongJmpCheck.h +++ b/clang-tools-extra/clang-tidy/cert/SetLongJmpCheck.h @@ -1,4 +1,4 @@ -//===--- SetLongJmpCheck.h - clang-tidy--------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cert/StaticObjectExceptionCheck.cpp b/clang-tools-extra/clang-tidy/cert/StaticObjectExceptionCheck.cpp index 12830a64bf23e..8f31851a63edc 100644 --- a/clang-tools-extra/clang-tidy/cert/StaticObjectExceptionCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/StaticObjectExceptionCheck.cpp @@ -1,4 +1,4 @@ -//===--- StaticObjectExceptionCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/StaticObjectExceptionCheck.h b/clang-tools-extra/clang-tidy/cert/StaticObjectExceptionCheck.h index 26ae6b478b44d..6de9929fb5cc7 100644 --- a/clang-tools-extra/clang-tidy/cert/StaticObjectExceptionCheck.h +++ b/clang-tools-extra/clang-tidy/cert/StaticObjectExceptionCheck.h @@ -1,4 +1,4 @@ -//===--- StaticObjectExceptionCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp b/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp index cbc2991ca6399..2225a90aeece1 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp @@ -1,4 +1,4 @@ -//===--- ThrownExceptionTypeCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h b/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h index 47469a1e82c7c..9b97feb7fe5f5 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h +++ b/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h @@ -1,4 +1,4 @@ -//===--- ThrownExceptionTypeCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/VariadicFunctionDefCheck.cpp b/clang-tools-extra/clang-tidy/cert/VariadicFunctionDefCheck.cpp index 5fba32417db42..c330d4691443f 100644 --- a/clang-tools-extra/clang-tidy/cert/VariadicFunctionDefCheck.cpp +++ b/clang-tools-extra/clang-tidy/cert/VariadicFunctionDefCheck.cpp @@ -1,4 +1,4 @@ -//===-- VariadicFunctionDefCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cert/VariadicFunctionDefCheck.h b/clang-tools-extra/clang-tidy/cert/VariadicFunctionDefCheck.h index a082e370c3228..a7f5f11974aa3 100644 --- a/clang-tools-extra/clang-tidy/cert/VariadicFunctionDefCheck.h +++ b/clang-tools-extra/clang-tidy/cert/VariadicFunctionDefCheck.h @@ -1,4 +1,4 @@ -//===--- VariadicFunctionDefCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp b/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp index 6c58c506dc903..135a54d4565cb 100644 --- a/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- ConcurrencyTidyModule.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.cpp b/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.cpp index 7e80471b12302..f8050bcfe3263 100644 --- a/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.cpp +++ b/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.cpp @@ -1,4 +1,4 @@ -//===--- MtUnsafeCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.h b/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.h index ccee5b3a748a3..c5c707778bc32 100644 --- a/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.h +++ b/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.h @@ -1,4 +1,4 @@ -//===--- MtUnsafeCheck.h - clang-tidy ---------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.cpp b/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.cpp index 130b56fb6cd04..9e9c908565497 100644 --- a/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.cpp +++ b/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.cpp @@ -1,4 +1,4 @@ -//===--- ThreadCanceltypeAsynchronousCheck.cpp - clang-tidy ---------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.h b/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.h index dbfcb265640c9..2d5d82dfd9285 100644 --- a/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.h +++ b/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.h @@ -1,4 +1,4 @@ -//===--- ThreadCanceltypeAsynchronousCheck.h - clang-tidy -------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidCapturingLambdaCoroutinesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidCapturingLambdaCoroutinesCheck.cpp index 3c99831f9d640..15fb53c5c57b7 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidCapturingLambdaCoroutinesCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidCapturingLambdaCoroutinesCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidCapturingLambdaCoroutinesCheck.cpp - clang-tidy -------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidCapturingLambdaCoroutinesCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidCapturingLambdaCoroutinesCheck.h index b32e2662b5fba..de59ff189c595 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidCapturingLambdaCoroutinesCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidCapturingLambdaCoroutinesCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidCapturingLambdaCoroutinesCheck.h - clang-tidy -----*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.cpp index dd913c92d60a0..78bcc3e7e0ecc 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidConstOrRefDataMembersCheck.cpp - clang-tidy -----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.h index de55e0049eaf7..9d458fe9a4d00 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidConstOrRefDataMembersCheck.h - clang-tidy ---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.cpp index d623e05e15cc0..5ecfd38e80918 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidDoWhileCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.h index 45259c23e26de..0756d0860f961 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidDoWhileCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.cpp index b14587ad7db83..4fb0029cc4323 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidGotoCheck.cpp - clang-tidy-----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h index 8eae409462c91..2b13df795d87c 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidGotoCheck.h - clang-tidy---------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp index a97ec9fe3fe3d..f0e66e44690b2 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidNonConstGlobalVariablesCheck.cpp - clang-tidy ---------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h index a912763489db9..9c40fa3e9d341 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidNonConstGlobalVariablesCheck.h - clang-tidy -------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.cpp index 3eca364d16c35..7ef1e2bc6178d 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidReferenceCoroutineParametersCheck.cpp - clang-tidy ----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.h index 0a4d5b33f2396..3469ea7a8efee 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidReferenceCoroutineParametersCheck.h - clang-tidy --*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp index cc1ae156eef3e..5f4c9b48e346a 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp @@ -1,4 +1,4 @@ -//===-- CppCoreGuidelinesTidyModule.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp index 3eef2fd12cc8e..ed595e1148dec 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp @@ -1,4 +1,4 @@ -//===--- InitVariablesCheck.cpp - clang-tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h index 901500ac4b915..a1476494b4046 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h @@ -1,4 +1,4 @@ -//===--- InitVariablesCheck.h - clang-tidy ----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.cpp index e9f0bd98cad16..788d0571ac7ff 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.cpp @@ -1,4 +1,4 @@ -//===--- InterfacesGlobalInitCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h index 4b04ec112486b..2141fc2423bdf 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h @@ -1,4 +1,4 @@ -//===--- InterfacesGlobalInitCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp index 11eb056e916d3..766cae45f15b5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp @@ -1,4 +1,4 @@ -//===--- MacroUsageCheck.cpp - clang-tidy----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h index 876a18256080e..dd553ba613f1e 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h @@ -1,4 +1,4 @@ -//===--- MacroUsageCheck.h - clang-tidy--------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp index 5dee7f91a9341..57d98ee1fd8b4 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp @@ -1,4 +1,4 @@ -//===--- MisleadingCaptureDefaultByValueCheck.cpp - clang-tidy-------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.h index dcf2ce9afc740..87187b3b70bcb 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.h @@ -1,4 +1,4 @@ -//===--- MisleadingCaptureDefaultByValueCheck.h - clang-tidy---------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp index 75da6de9b5f13..090ab2f0474c4 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp @@ -1,4 +1,4 @@ -//===--- MissingStdForwardCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.h index f833b8031f8af..247291076d939 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.h @@ -1,4 +1,4 @@ -//===--- MissingStdForwardCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.cpp index b81c6230b8941..22cd1e4e29a68 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoMallocCheck.cpp - clang-tidy------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h index 8dea1465c3de7..4e664197b5f72 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h @@ -1,4 +1,4 @@ -//===--- NoMallocCheck.h - clang-tidy----------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp index 29470b1f725fb..43df277927d8b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoSuspendWithLockCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.h index c7b7f476003fb..877a5173e7f10 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.h @@ -1,4 +1,4 @@ -//===--- NoSuspendWithLockCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp index 6cdd5bcac6370..f4e89470a80da 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp @@ -1,4 +1,4 @@ -//===--- OwningMemoryCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h index 3ab8f34b580f9..e191f09943710 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h @@ -1,4 +1,4 @@ -//===--- OwningMemoryCheck.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp index 79cd4bbcc9a60..9913671c6f74e 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp @@ -1,4 +1,4 @@ -//===--- PreferMemberInitializerCheck.cpp - clang-tidy -------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h index b3f8284b435af..6275aa61ba03d 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h @@ -1,4 +1,4 @@ -//===--- PreferMemberInitializerCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp index 6f67ab955baa3..f3237f4d7dae0 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProBoundsArrayToPointerDecayCheck.cpp - clang-tidy----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h index bcbe40b5dd14c..abd4e5a77009d 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h @@ -1,4 +1,4 @@ -//===--- ProBoundsArrayToPointerDecayCheck.h - clang-tidy--------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp index 35f432efa88ca..dd7b2b553b7a1 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp @@ -1,4 +1,4 @@ -//===--- ProBoundsAvoidUncheckedContainerAccess.cpp - clang-tidy ----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h index cfd52d69c0f58..2a89be4724037 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h @@ -1,4 +1,4 @@ -//===--- ProBoundsAvoidUncheckedContainerAccess.h - clang-tidy --*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp index 20f9a2e549fe2..634ec186616d5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProBoundsConstantArrayIndexCheck.cpp - clang-tidy-----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h index a583cc78b2c54..19d4ef8e25121 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h @@ -1,4 +1,4 @@ -//===--- ProBoundsConstantArrayIndexCheck.h - clang-tidy---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.cpp index 51995c5f64ef6..b1cf7152aacd4 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProBoundsPointerArithmeticCheck.cpp - clang-tidy------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h index 785f754055fb8..2bd113b38c4d4 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h @@ -1,4 +1,4 @@ -//===--- ProBoundsPointerArithmeticCheck.h - clang-tidy----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.cpp index b234c2a041d8c..0d038bfca60d5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProTypeConstCastCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h index 8d93633a321b5..e05adc966a496 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h @@ -1,4 +1,4 @@ -//===--- ProTypeConstCastCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp index 5e255dcaacd26..b9867c2393f0b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProTypeCstyleCastCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h index b7e3525e397b2..e6819c40a2bfc 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h @@ -1,4 +1,4 @@ -//===--- ProTypeCstyleCastCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index a79c5281d6054..5de4e33a1e16d 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProTypeMemberInitCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h index cfe7c8735a0e0..58125303fb59b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h @@ -1,4 +1,4 @@ -//===--- ProTypeMemberInitCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.cpp index 94cea79888555..1cd4bf7435be4 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProTypeReinterpretCastCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h index da001bfb85d78..63b04261ea436 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h @@ -1,4 +1,4 @@ -//===--- ProTypeReinterpretCast.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp index 14616ee8514f7..c200a79cb8c49 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProTypeStaticCastDowncastCheck.cpp - clang-tidy-------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h index b9e78a82a39f2..266441fd9144f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h @@ -1,4 +1,4 @@ -//===--- ProTypeStaticCastDowncastCheck.h - clang-tidy-----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.cpp index 2793dfbc0eb3f..4361177db4251 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProTypeUnionAccessCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h index e90d154f9630f..5127e652b6466 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h @@ -1,4 +1,4 @@ -//===--- ProTypeUnionAccessCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp index 3923df312791d..431b2a76feeea 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp @@ -1,4 +1,4 @@ -//===--- ProTypeVarargCheck.cpp - clang-tidy-------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h index acb3d274908da..f3b20e6e793e5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h @@ -1,4 +1,4 @@ -//===--- ProTypeVarargCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp index 272152644d7dd..c40ac7ab5102b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp @@ -1,4 +1,4 @@ -//===--- RvalueReferenceParamNotMovedCheck.cpp - clang-tidy ---------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.h index 950c0206745d7..739e1d706acc3 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.h @@ -1,4 +1,4 @@ -//===--- RvalueReferenceParamNotMovedCheck.h - clang-tidy -------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.cpp index 6508bfd5ca808..fe95dbba68118 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.cpp @@ -1,4 +1,4 @@ -//===--- SlicingCheck.cpp - clang-tidy-------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h index 317547f0a9c87..6d89a8a622a61 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h @@ -1,4 +1,4 @@ -//===--- SlicingCheck.h - clang-tidy-----------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp index 0b6b8d9c97135..b38a0c66eb582 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp @@ -1,4 +1,4 @@ -//===--- SpecialMemberFunctionsCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h index c18ed7db055ba..ffd072a7f6a98 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h @@ -1,4 +1,4 @@ -//===--- SpecialMemberFunctionsCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.cpp index ec7d9237afa3c..9e809e0bedb49 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseEnumClassCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h index dfa4b7e3fda62..c699f9116a120 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h @@ -1,4 +1,4 @@ -//===--- UseEnumClassCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp index e31d046565677..770088991419b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp @@ -1,4 +1,4 @@ -//===--- VirtualClassDestructorCheck.cpp - clang-tidy -----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.h index 11bd598f81ac3..2c9d92ddeb4a7 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.h @@ -1,4 +1,4 @@ -//===--- VirtualClassDestructorCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.cpp b/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.cpp index 2d3a7e50f242e..875a851586578 100644 --- a/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.cpp +++ b/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidSpinlockCheck.cpp - clang-tidy-------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.h b/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.h index 5b5285710c3b0..78cc968ba4efd 100644 --- a/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.h +++ b/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidSpinlockCheck.h - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/darwin/DarwinTidyModule.cpp b/clang-tools-extra/clang-tidy/darwin/DarwinTidyModule.cpp index bc8c91a9ed413..0330626a7cd58 100644 --- a/clang-tools-extra/clang-tidy/darwin/DarwinTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/darwin/DarwinTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- MiscTidyModule.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.cpp b/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.cpp index 75d38a9724950..194f4217f73e3 100644 --- a/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.cpp +++ b/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.cpp @@ -1,4 +1,4 @@ -//===--- DispatchOnceNonstaticCheck.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.h b/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.h index ddf6dfa22cf0a..484b4f93e75f5 100644 --- a/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.h +++ b/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.h @@ -1,4 +1,4 @@ -//===--- DispatchOnceNonstaticCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.cpp index 96cd30e0badac..88766d3e5e972 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.cpp @@ -1,4 +1,4 @@ -//===--- DefaultArgumentsCallsCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h index 120dc90b2cbc0..9ba311c04e679 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h @@ -1,4 +1,4 @@ -//===--- DefaultArgumentsCallsCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp index 05a663bf3d239..d80511eb626f5 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp @@ -1,4 +1,4 @@ -//===--- DefaultArgumentsDeclarationsCheck.cpp - clang-tidy ---------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h index da73fa4064cbd..b5a19c3b7c22e 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h @@ -1,4 +1,4 @@ -//===--- DefaultArgumentsDeclarationsCheck.h - clang-tidy -------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/FuchsiaTidyModule.cpp b/clang-tools-extra/clang-tidy/fuchsia/FuchsiaTidyModule.cpp index d7a70b39bdc55..f280a1b07bf39 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/FuchsiaTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/FuchsiaTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- FuchsiaTidyModule.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp index 4382f9df5336e..80de0282ee595 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp @@ -1,4 +1,4 @@ -//===--- MultipleInheritanceCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h index be5942c9520ae..838987d20014f 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h @@ -1,4 +1,4 @@ -//===--- MultipleInheritanceCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp index 85864006e4d7b..e202c288d6986 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp @@ -1,4 +1,4 @@ -//===--- OverloadedOperatorCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h index d26349d6e9afc..b974c6d7a4473 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h @@ -1,4 +1,4 @@ -//===--- OverloadedOperatorCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.cpp index ac55d01208b63..9e540e03d365b 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.cpp @@ -1,4 +1,4 @@ -//===--- StaticallyConstructedObjectsCheck.cpp - clang-tidy----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h index 6c65c8cfeb22f..6b4ef681ee188 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h @@ -1,4 +1,4 @@ -//===--- StaticallyConstructedObjectsCheck.h - clang-tidy--------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.cpp index b619812837753..b2c1acf358240 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.cpp @@ -1,4 +1,4 @@ -//===--- TrailingReturnCheck.cpp - clang-tidy------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h index 70551844898f1..db6bc33ca0e06 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h @@ -1,4 +1,4 @@ -//===--- TrailingReturnCheck.h - clang-tidy----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.cpp index 20bd036fb265f..b6fb22c66d374 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.cpp @@ -1,4 +1,4 @@ -//===--- VirtualInheritanceCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h index 1bdf19f9146fb..8a3182dd57df7 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h @@ -1,4 +1,4 @@ -//===--- VirtualInheritanceCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp index 14e11eb0bc697..174ecb0ed7b77 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidCStyleCastsCheck.cpp - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h index 4267b896b6992..dbd2034418762 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidCStyleCastsCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp index adcbf245ef7a3..daf49481bf3b0 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidNSObjectNewCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.h b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.h index 37b9440396948..fda0d5906a5cd 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidNSObjectNewCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp index 6322f63233590..73476571c252f 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidThrowingObjCExceptionCheck.cpp - clang-tidy------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h index 58b46e0a075a4..d32c02b9cfb4b 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidThrowingObjCExceptionCheck.h - clang-tidy----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp index f2507f0b60e71..b335463bc78bd 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidUnderscoreInGoogletestNameCheck.cpp - clang-tidy --*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.h b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.h index b53e6c45913d5..c2e39d3a7026d 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidUnderscoreInGoogletestNameCheck.h - clang-tidy ----*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.cpp b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.cpp index 0b14a51c12a8c..9831efe5384a2 100644 --- a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.cpp @@ -1,4 +1,4 @@ -//===--- DefaultArgumentsCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h index 49d95a5acd35c..c0e539598e00e 100644 --- a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h +++ b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h @@ -1,4 +1,4 @@ -//===--- DefaultArgumentsCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp index 68233ec6bd441..a038af4fa9543 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp @@ -1,4 +1,4 @@ -//===--- ExplicitConstructorCheck.cpp - clang-tidy ------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h index e4434ac71d786..4ed3671fd3951 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h @@ -1,4 +1,4 @@ -//===--- ExplicitConstructorCheck.h - clang-tidy ----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.cpp b/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.cpp index d911b58cb8b7e..ac56f5d920e21 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.cpp @@ -1,4 +1,4 @@ -//===--- ExplicitMakePairCheck.cpp - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.h b/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.h index 2c796a2811314..49d5172f932d2 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.h +++ b/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.h @@ -1,4 +1,4 @@ -//===--- ExplicitMakePairCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp index 6631030734d25..3d75f4dd25bd1 100644 --- a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp @@ -1,4 +1,4 @@ -//===--- FunctionNamingCheck.cpp - clang-tidy -----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h index 560bb52f15a00..1f4fe92d542a8 100644 --- a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h +++ b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h @@ -1,4 +1,4 @@ -//===--- FunctionNamingCheck.h - clang-tidy ---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.cpp b/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.cpp index 459dee1247525..aa8bc74e911b4 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.cpp @@ -1,4 +1,4 @@ -//===--- GlobalNamesInHeadersCheck.cpp - clang-tidy --------------*- C++-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.h b/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.h index 70a0a4c0cda00..4cc36630d3851 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.h @@ -1,4 +1,4 @@ -//===--- GlobalNamesInHeadersCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp index 9082c9368d87d..c0c3ffaee796f 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp @@ -1,4 +1,4 @@ -//===--- GlobalVariableDeclarationCheck.cpp - clang-tidy-------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h index 19e6c5dbc8e22..c6c32c3ff0884 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h @@ -1,4 +1,4 @@ -//===--- GlobalVariableDeclarationCheck.h - clang-tidy-----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp b/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp index eb5666be62bcf..aff8b45ff2f74 100644 --- a/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- GoogleTidyModule.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp index 711444fa9bcdc..047c7f99ae299 100644 --- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp @@ -1,4 +1,4 @@ -//===--- IntegerTypesCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.h b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.h index c62bda67ae2d9..be4989851f20a 100644 --- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.h +++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.h @@ -1,4 +1,4 @@ -//===--- IntegerTypesCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.cpp b/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.cpp index 6ec907ab1b696..63ca86266e27e 100644 --- a/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.cpp @@ -1,4 +1,4 @@ -//===--- OverloadedUnaryAndCheck.cpp - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.h b/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.h index 3c3b668754ac1..126f0fbc61b87 100644 --- a/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.h +++ b/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.h @@ -1,4 +1,4 @@ -//===--- OverloadedUnaryAndCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp index adad54aa24ba9..8554870287c81 100644 --- a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp @@ -1,4 +1,4 @@ -//===--- TodoCommentCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.h b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.h index de540d810afaa..d56036095bab9 100644 --- a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.h +++ b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.h @@ -1,4 +1,4 @@ -//===--- TodoCommentCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp index c1e421308d77d..3066dd0ff4595 100644 --- a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnnamedNamespaceInHeaderCheck.cpp - clang-tidy ---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.h b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.h index 55b735c0d141b..84f8ae56f2635 100644 --- a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.h +++ b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.h @@ -1,4 +1,4 @@ -//===--- UnnamedNamespaceInHeaderCheck.h - clang-tidy -----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp b/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp index c9b48e922ea57..9da1915affd91 100644 --- a/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp @@ -1,4 +1,4 @@ -//===--- UpgradeGoogletestCaseCheck.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.h b/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.h index 61b09b9a9f7de..43fff32d86215 100644 --- a/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.h +++ b/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.h @@ -1,4 +1,4 @@ -//===--- UpgradeGoogletestCaseCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp b/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp index 26d27c19f489e..fbfd5d3430519 100644 --- a/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp @@ -1,4 +1,4 @@ -//===--- UsingNamespaceDirectiveCheck.cpp - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.h b/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.h index b7abac1311045..bcdf6b6a4bcf1 100644 --- a/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.h +++ b/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.h @@ -1,4 +1,4 @@ -//===--- UsingNamespaceDirectiveCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp index ed39568ea554a..71b82875c09a0 100644 --- a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp @@ -1,4 +1,4 @@ -//===--- ExceptionBaseclassCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h index 79d8cf925d1b7..bc21249663af8 100644 --- a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h @@ -1,4 +1,4 @@ -//===--- ExceptionBaseclassCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp index 65a56be3e5a05..9695eab51062b 100644 --- a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp @@ -1,4 +1,4 @@ -//===------- HICPPTidyModule.cpp - clang-tidy -----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.cpp index b1a18485ce168..5321fd8d5b1c2 100644 --- a/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.cpp @@ -1,4 +1,4 @@ -//===--- IgnoredRemoveResultCheck.cpp - clang-tidy ------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.h b/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.h index 39c45fea9aae4..8cf58d5a6978a 100644 --- a/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.h @@ -1,4 +1,4 @@ -//===--- IgnoredRemoveResultCheck.h - clang-tidy ----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp index 3f5cd4b473903..e610d99007d4e 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp @@ -1,4 +1,4 @@ -//===--- MultiwayPathsCoveredCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h index c26fb3e72211d..2507f6cde338e 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h @@ -1,4 +1,4 @@ -//===--- MultiwayPathsCoveredCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp index 54ba6aebab1ba..a89a896b32981 100644 --- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoAssemblerCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h index 7ade7a33091f1..cf397df1578a4 100644 --- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h @@ -1,4 +1,4 @@ -//===--- NoAssemblerCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.cpp index bf09a6662d955..19c716e941271 100644 --- a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.cpp @@ -1,4 +1,4 @@ -//===--- SignedBitwiseCheck.cpp - clang-tidy-------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h index 170c249bc65e6..b3538e7e51f58 100644 --- a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h @@ -1,4 +1,4 @@ -//===--- SignedBitwiseCheck.h - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/linuxkernel/LinuxKernelTidyModule.cpp b/clang-tools-extra/clang-tidy/linuxkernel/LinuxKernelTidyModule.cpp index b8b75b7ccaefe..645d07426fee2 100644 --- a/clang-tools-extra/clang-tidy/linuxkernel/LinuxKernelTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/linuxkernel/LinuxKernelTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- LinuxKernelTidyModule.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.cpp b/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.cpp index ce501ac2acca2..14f54571885f2 100644 --- a/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.cpp +++ b/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.cpp @@ -1,4 +1,4 @@ -//===--- MustCheckErrsCheck.cpp - clang-tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.h b/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.h index 7406aaead836e..a450f50c30cb8 100644 --- a/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.h +++ b/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.h @@ -1,4 +1,4 @@ -//===--- MustCheckErrsCheck.h - clang-tidy ----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp index 3ea235b1fed7f..8737c1e5f4b05 100644 --- a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp @@ -1,4 +1,4 @@ -//===--- HeaderGuardCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.h b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.h index 508172c6b3a84..1eb307d8347df 100644 --- a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.h @@ -1,4 +1,4 @@ -//===--- HeaderGuardCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp index 4246c8c574c50..f34e3a67c03ab 100644 --- a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp @@ -1,4 +1,4 @@ -//===--- IncludeOrderCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.h b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.h index d992d151734ce..27c6798481866 100644 --- a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.h @@ -1,4 +1,4 @@ -//===--- IncludeOrderCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp b/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp index c1f78caf44d16..ed65cd1720457 100644 --- a/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- LLVMTidyModule.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp index cb289af46ea44..f4f3543b56e5c 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp @@ -1,5 +1,4 @@ -//===--- PreferIsaOrDynCastInConditionalsCheck.cpp - clang-tidy -//---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h index 5b611096c25fd..cf4b64ad21686 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h @@ -1,4 +1,4 @@ -//===--- PreferIsaOrDynCastInConditionalsCheck.h - clang-tidy ---*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.cpp b/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.cpp index f88122360aeb1..c5ee240b64ea8 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.cpp @@ -1,4 +1,4 @@ -//===--- PreferRegisterOverUnsignedCheck.cpp - clang-tidy -----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.h b/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.h index 07e018a6fc969..9a7a0c3f35857 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.h @@ -1,4 +1,4 @@ -//===--- PreferRegisterOverUnsignedCheck.h - clang-tidy ---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.cpp index ea79bfaef8876..ea81c7c10b7d9 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.cpp @@ -1,4 +1,4 @@ -//===--- PreferStaticOverAnonymousNamespaceCheck.cpp - clang-tidy ---------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.h b/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.h index ca0245e1d3031..a8738e6fbd70d 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.h @@ -1,4 +1,4 @@ -//===--- PreferStaticOverAnonymousNamespaceCheck.h - clang-tidy -*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp b/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp index 42e53ef6025d6..b8b7c41e970bb 100644 --- a/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp @@ -1,4 +1,4 @@ -//===--- TwineLocalCheck.cpp - clang-tidy ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.h b/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.h index b4550ecb226bf..7bde04c5f11ea 100644 --- a/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.h @@ -1,4 +1,4 @@ -//===--- TwineLocalCheck.h - clang-tidy -------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp index 4722199364cb5..0d81b9a9e38ca 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseNewMLIROpBuilderCheck.cpp - clang-tidy ------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.h b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.h index 813a23c564782..0842699823a65 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.h @@ -1,4 +1,4 @@ -//===--- UseNewMLIROpBuilderCheck.h - clang-tidy ----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.cpp index 4afab488b7dcc..49dc92456af39 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseRangesCheck.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.h b/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.h index e9904e11ced36..b985288ea0e4c 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.h @@ -1,4 +1,4 @@ -//===--- UseRangesCheck.h - clang-tidy --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.cpp index 4bc4d5a4691f0..dd1ef076c65e1 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.cpp @@ -1,4 +1,4 @@ -//===-- CalleeNamespaceCheck.cpp ------------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.h index e718c990c4baa..34c628ea5f6e4 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.h @@ -1,4 +1,4 @@ -//===-- CalleeNamespaceCheck.h ----------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.cpp index c2fbc4422e5d2..567ade5d9a08b 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.cpp @@ -1,4 +1,4 @@ -//===--- ImplementationInNamespaceCheck.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.h index 42da38f728bb8..da97443191b9f 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.h @@ -1,4 +1,4 @@ -//===--- ImplementationInNamespaceCheck.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp index 4d92b1f6b8d1c..9dae57a50bb52 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp @@ -1,4 +1,4 @@ -//===-- InlineFunctionDeclCheck.cpp ---------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h index 52516f776ad49..01a8df46ec666 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h @@ -1,4 +1,4 @@ -//===-- InlineFunctionDeclCheck.h -------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvmlibc/LLVMLibcTidyModule.cpp b/clang-tools-extra/clang-tidy/llvmlibc/LLVMLibcTidyModule.cpp index 562d71a0891c4..ded85939b75c8 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/LLVMLibcTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/LLVMLibcTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- LLVMLibcTidyModule.cpp - clang-tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h b/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h index 83908a7875d03..50669dc073291 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h @@ -1,4 +1,4 @@ -//===--- NamespaceConstants.h -----------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp index 7db648abcc882..129b8a9a30a59 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp @@ -1,4 +1,4 @@ -//===--- RestrictSystemLibcHeadersCheck.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h index f7e613cec5f0e..1c7b31037875d 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h @@ -1,4 +1,4 @@ -//===--- RestrictSystemLibcHeadersCheck.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp index 79ae5ee98182b..2d0323ac04515 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp @@ -1,4 +1,4 @@ -//===--- ConfusableIdentifierCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h index 9cce6cce67682..37337954822b7 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h @@ -1,4 +1,4 @@ -//===--- ConfusableIdentifierCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp index 18ac7e508165b..6a079024cfe1c 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp @@ -1,4 +1,4 @@ -//===--- BuildConfusableTable.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp index b32507d66cbac..b93f3d6a5a13b 100644 --- a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp @@ -1,4 +1,4 @@ -//===--- ConstCorrectnessCheck.cpp - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h index 8af59b7fee294..650f35b50e189 100644 --- a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h @@ -1,4 +1,4 @@ -//===--- ConstCorrectnessCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp index 360335b86c641..8ec7695aa842f 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp @@ -1,4 +1,4 @@ -//===--- CoroutineHostileRAII.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h index be925097692a4..95c2b04b82ea7 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h @@ -1,4 +1,4 @@ -//===--- CoroutineHostileRAIICheck.h - clang-tidy ----------------*- C++-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp index ee86925689898..714af111e7f7a 100644 --- a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp @@ -1,4 +1,4 @@ -//===--- DefinitionsInHeadersCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h index ebb55d5c0e55b..ce1293038078c 100644 --- a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h @@ -1,4 +1,4 @@ -//===--- DefinitionsInHeadersCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp index 1f6ceda9f5b9e..a0e7ac19ab2d5 100644 --- a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp @@ -1,4 +1,4 @@ -//===--- HeaderIncludeCycleCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.h b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.h index 9a1a72399f423..dcf538c4c2844 100644 --- a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.h +++ b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.h @@ -1,4 +1,4 @@ -//===--- HeaderIncludeCycleCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp index 813005b892ed7..1a5aa4b0758a6 100644 --- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp @@ -1,4 +1,4 @@ -//===--- IncludeCleanerCheck.cpp - clang-tidy -----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h index 8f05887efb776..941a2aad79856 100644 --- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h +++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h @@ -1,4 +1,4 @@ -//===--- IncludeCleanerCheck.h - clang-tidy ---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp index f675ca70deb9d..6f4af6c44dcb4 100644 --- a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- MiscTidyModule.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp index d7040e28984b0..f89c539423507 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp @@ -1,4 +1,4 @@ -//===--- MisleadingBidirectional.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h index 9ffb238aeee35..aa7e0432b9ceb 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h +++ b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h @@ -1,4 +1,4 @@ -//===--- MisleadingBidirectionalCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp index 1f5dc94755559..ce04fb6fa4096 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp @@ -1,4 +1,4 @@ -//===--- MisleadingIdentifier.cpp - clang-tidy-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h index 7278b741246a3..5e1a56ddc479a 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h +++ b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h @@ -1,4 +1,4 @@ -//===--- MisleadingIdentifierCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp index bb64a5618620c..afa59f31d7259 100644 --- a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp @@ -1,4 +1,4 @@ -//===--- MisplacedConstCheck.cpp - clang-tidy------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h index b2d88d41b5e31..1abacb4c16426 100644 --- a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h +++ b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h @@ -1,4 +1,4 @@ -//===--- MisplacedConstCheck.h - clang-tidy----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp index 2837f40bc49b8..5e0f32a900ea8 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp @@ -1,4 +1,4 @@ -//===--- NewDeleteOverloadsCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h index b11a57aebb107..93c39fc7005cf 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h @@ -1,4 +1,4 @@ -//===--- NewDeleteOverloadsCheck.h - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp index 712f390765957..0d7667ce53c0c 100644 --- a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoRecursionCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.h b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.h index fd82ffc6e5aba..b678137927351 100644 --- a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.h +++ b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.h @@ -1,4 +1,4 @@ -//===--- NoRecursionCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp b/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp index 6fb4a66d0bac6..b33e2667ef660 100644 --- a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp +++ b/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp @@ -1,4 +1,4 @@ -//===--- NonCopyableObjects.cpp - clang-tidy-------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h b/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h index b886ea948476d..2fcbf41dcf5e1 100644 --- a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h +++ b/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h @@ -1,4 +1,4 @@ -//===--- NonCopyableObjects.h - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.cpp b/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.cpp index 9d7d9d1f865ba..fffce2095d8d5 100644 --- a/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.cpp @@ -1,4 +1,4 @@ -//===--- NonPrivateMemberVariablesInClassesCheck.cpp - clang-tidy ---------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.h b/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.h index 5f0687abdd118..09077226eb5c5 100644 --- a/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.h +++ b/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.h @@ -1,4 +1,4 @@ -//===--- NonPrivateMemberVariablesInClassesCheck.h - clang-tidy -*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.cpp b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.cpp index 2fe0bcf67a3d7..09c52699a27ba 100644 --- a/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.cpp @@ -1,4 +1,4 @@ -//===--- OverrideWithDifferentVisibilityCheck.cpp - clang-tidy ------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h index 1f5222d99196b..6e0909524991d 100644 --- a/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h +++ b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h @@ -1,4 +1,4 @@ -//===--- OverrideWithDifferentVisibilityCheck.h - clang-tidy --*- C++ -*---===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp index 107eda2e98f27..17a8a50ff04ac 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantExpressionCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h index 7b3b84b5b32a3..784548355c164 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantExpressionCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp index 37fbd8c0d725f..5ac53005ad0fa 100644 --- a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp @@ -1,4 +1,4 @@ -//===--- StaticAssertCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.h b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.h index 7b378e016408a..8df3b71c5e6c1 100644 --- a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.h +++ b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.h @@ -1,4 +1,4 @@ -//===--- StaticAssertCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.cpp b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.cpp index fd2e6b7f39a6b..92ff1c8f72fa9 100644 --- a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.cpp @@ -1,4 +1,4 @@ -//===--- ThrowByValueCatchByReferenceCheck.cpp - clang-tidy----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h index e3cc4c5e6cd41..15c17e7fa8f65 100644 --- a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h @@ -1,4 +1,4 @@ -//===--- ThrowByValueCatchByReferenceCheck.h - clang-tidy--------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp index 8200239b982a0..8a85e79f5aa21 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnconventionalAssignOperatorCheck.cpp - clang-tidy -----*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h index c1aefaa8790e8..960c85eb89cbc 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h @@ -1,4 +1,4 @@ -//===--- UnconventionalAssignOperatorCheck.h - clang-tidy -------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp index 0e24b47f5055f..27ddb7cb9b71c 100644 --- a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp @@ -1,4 +1,4 @@ -//===--- UniqueptrResetReleaseCheck.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.h b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.h index a76cc77112c6e..cb83ac7b11985 100644 --- a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.h @@ -1,4 +1,4 @@ -//===--- UniqueptrResetReleaseCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp index 4fa679aa8dd88..8e54a21b49740 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnusedAliasDeclsCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h index 9f995d94c1989..ffe82ca989d17 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h @@ -1,4 +1,4 @@ -//===--- UnusedAliasDeclsCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp index 503f62f946e81..37e289cd9e497 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnusedParametersCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h index 90097ed415d37..6e09086d667f9 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h @@ -1,4 +1,4 @@ -//===--- UnusedParametersCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp index 49432073ce1d7..31524e41f12a3 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnusedUsingDeclsCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h index e5f766dbac56b..ce77acf443e2c 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h @@ -1,4 +1,4 @@ -//===--- UnusedUsingDeclsCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp index 05b470141f1f7..aa0cc1ecd5761 100644 --- a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseAnonymousNamespaceCheck.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.h b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.h index 3f73ce7c398af..10e10b0c32360 100644 --- a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.h @@ -1,4 +1,4 @@ -//===--- UseAnonymousNamespaceCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp index e2071b806b125..415852d6f14e9 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseInternalLinkageCheck.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h index 0d6c3e43aa945..8c82ac0b6b644 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h @@ -1,4 +1,4 @@ -//===--- UseInternalLinkageCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp index aa7836bcbf169..1c0043b423361 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidBindCheck.cpp - clang-tidy-----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h index a9054864c3c11..ba9e562324e55 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidBindCheck.h - clang-tidy---------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp index a5b535f7433bb..92900192957e5 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidCArraysCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.h index 719e88e4b3166..ff0809644050b 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidCArraysCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp index 5d11843fea65c..6e28cb223370a 100644 --- a/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp @@ -1,4 +1,4 @@ -//===--- ConcatNestedNamespacesCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.h b/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.h index a5724e169e48d..9886cb5a2d7d9 100644 --- a/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.h @@ -1,4 +1,4 @@ -//===--- ConcatNestedNamespacesCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp index 47a3ef987ebcf..9f4c215614287 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp @@ -1,4 +1,4 @@ -//===--- DeprecatedHeadersCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h index 68305efdb5d23..c9409cb641c54 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h @@ -1,4 +1,4 @@ -//===--- DeprecatedHeadersCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp index 2aca61021166d..5e254376c9796 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp @@ -1,4 +1,4 @@ -//===--- DeprecatedIosBaseAliasesCheck.cpp - clang-tidy--------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h index 09cfebef48d48..0a0b4deb5abba 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h @@ -1,4 +1,4 @@ -//===--- DeprecatedIosBaseAliasesCheck.h - clang-tidy------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp index ecf3a18199ffe..05cf51a430f3f 100644 --- a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp +++ b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp @@ -1,4 +1,4 @@ -//===--- IntegralLiteralExpressionMatcher.cpp - clang-tidy ----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h index 22893784b07f8..d495087f49491 100644 --- a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h +++ b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h @@ -1,4 +1,4 @@ -//===--- IntegralLiteralExpressionMatcher.h - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp index 3ce7b12f92f6b..37482583760f2 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp @@ -1,4 +1,4 @@ -//===--- LoopConvertCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h index b4f729d3ac538..55487828ca69e 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h @@ -1,4 +1,4 @@ -//===--- LoopConvertCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp index 3d0a1f01725fa..286c39be44ce4 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp @@ -1,4 +1,4 @@ -//===--- LoopConvertUtils.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h index ca9c1855038b5..306eca7140d1a 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h @@ -1,4 +1,4 @@ -//===--- LoopConvertUtils.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp index 118e96a6f34ae..2669aa2361ea1 100644 --- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp @@ -1,4 +1,4 @@ -//===--- MacroToEnumCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.h b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.h index b56d7ac3b22ef..3f339f364d722 100644 --- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.h @@ -1,4 +1,4 @@ -//===--- MacroToEnumCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.cpp index 69f7d9f69eeed..207195551883b 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.cpp @@ -1,4 +1,4 @@ -//===--- MakeSharedCheck.cpp - clang-tidy----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h index caaf4ae403c34..025ce757b3d5f 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h @@ -1,4 +1,4 @@ -//===--- MakeSharedCheck.h - clang-tidy--------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp index b3e29b3273a82..9d01e27fbab9c 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp @@ -1,4 +1,4 @@ -//===--- MakeSmartPtrCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h index e2f9abed8138a..28d5b459dd914 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h @@ -1,4 +1,4 @@ -//===--- MakeSmartPtrCheck.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.cpp index d4d9f700f12c9..b13d95633c12e 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.cpp @@ -1,4 +1,4 @@ -//===--- MakeUniqueCheck.cpp - clang-tidy----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h index 7b356823a8cfa..9c4f6bc746392 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h @@ -1,4 +1,4 @@ -//===--- MakeUniqueCheck.h - clang-tidy--------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp index 9861f4681db1b..b5a985b0ac5d4 100644 --- a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp @@ -1,4 +1,4 @@ -//===--- MinMaxUseInitializerListCheck.cpp - clang-tidy -------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.h b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.h index 577d126530761..45fc5089f7737 100644 --- a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.h @@ -1,4 +1,4 @@ -//===--- MinMaxUseInitializerListCheck.h - clang-tidy -----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp b/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp index fdf38bc4b6308..9b98ffdadba68 100644 --- a/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- ModernizeTidyModule.cpp - clang-tidy -----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp index a54d0721a5b7d..d5ccbb73735ec 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp @@ -1,4 +1,4 @@ -//===--- PassByValueCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h index b586b8d5fbf66..f27871c1a98b7 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h @@ -1,4 +1,4 @@ -//===--- PassByValueCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp index 0c9e909fea7f9..8e514e4bc9893 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp @@ -1,4 +1,4 @@ -//===--- RawStringLiteralCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h index 879255550dd5b..5af9f846db29b 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h @@ -1,4 +1,4 @@ -//===--- RawStringLiteralCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h index bda5f2c253ce9..53de74b68ff26 100644 --- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantVoidArgCheck.h - clang-tidy --------------------*- C++-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp index f2142b810a126..b562ae85aa266 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp @@ -1,4 +1,4 @@ -//===--- ReplaceAutoPtrCheck.cpp - clang-tidy------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h index c91f5f580c524..9a6e2bb0e074d 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h @@ -1,4 +1,4 @@ -//===--- ReplaceAutoPtrCheck.h - clang-tidy----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp index 42be7d7a7b78c..64b0029fc0e37 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp @@ -1,4 +1,4 @@ -//===--- ReplaceDisallowCopyAndAssignMacroCheck.cpp - clang-tidy ----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.h index 71e5ecafd6a6f..44ca787fa4fcc 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.h @@ -1,4 +1,4 @@ -//===--- ReplaceDisallowCopyAndAssignMacroCheck.h - clang-tidy --*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp index df20800a215da..3d7b3eae544b6 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp @@ -1,4 +1,4 @@ -//===--- ReplaceRandomShuffleCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h index 95927c2803e7b..23571dfa92175 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h @@ -1,4 +1,4 @@ -//===--- ReplaceRandomShuffleCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp index 472cc34be4378..eba2445c0aaea 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp @@ -1,4 +1,4 @@ -//===--- ReturnBracedInitListCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h index e9ea58b06d826..c023cb5c4c2ca 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h @@ -1,4 +1,4 @@ -//===--- ReturnBracedInitListCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/ShrinkToFitCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ShrinkToFitCheck.cpp index b971b82507644..e32ddbf87efe9 100644 --- a/clang-tools-extra/clang-tidy/modernize/ShrinkToFitCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ShrinkToFitCheck.cpp @@ -1,4 +1,4 @@ -//===--- ShrinkToFitCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/ShrinkToFitCheck.h b/clang-tools-extra/clang-tidy/modernize/ShrinkToFitCheck.h index 30c2ca25b27e5..d7070d63ca983 100644 --- a/clang-tools-extra/clang-tidy/modernize/ShrinkToFitCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ShrinkToFitCheck.h @@ -1,4 +1,4 @@ -//===--- ShrinkToFitCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp index 15bd0a6760ec1..6078013166d46 100644 --- a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp @@ -1,4 +1,4 @@ -//===--- TypeTraitsCheck.cpp - clang-tidy ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.h b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.h index a08b96fd9f13e..1f9ffc9b8b811 100644 --- a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.h @@ -1,4 +1,4 @@ -//===--- TypeTraitsCheck.h - clang-tidy -------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp index d4ca652838741..4e4817f2ec2e6 100644 --- a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnaryStaticAssertCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h index 6ec1a68a28fc1..94e78f01b06f9 100644 --- a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h @@ -1,4 +1,4 @@ -//===--- UnaryStaticAssertCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp index aedfda83838cd..c7fd0a9695952 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseAutoCheck.cpp - clang-tidy-------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h index 7a9bbbe1cdf77..dc39077d5ac99 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h @@ -1,4 +1,4 @@ -//===--- UseAutoCheck.h - clang-tidy-----------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp index dfcfc925b5231..8b5ffe86b1839 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseBoolLiteralsCheck.cpp - clang-tidy-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h index 67e6921554852..5b7b1e0cc3b6e 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h @@ -1,4 +1,4 @@ -//===--- UseBoolLiteralsCheck.h - clang-tidy---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp index c4a64be537a44..d5342a1664153 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseConstraintsCheck.cpp - clang-tidy -----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.h index 814160190e0f4..bf49f329baeab 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.h @@ -1,4 +1,4 @@ -//===--- UseConstraintsCheck.h - clang-tidy ---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp index e950fd1c77da5..d920af7fc477b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseDefaultMemberInitCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h index 099449a3167fa..7ae04b78006a1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h @@ -1,4 +1,4 @@ -//===--- UseDefaultMemberInitCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp index e9e750ad4e933..cc7c2d1e1dff5 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseDesignatedInitializersCheck.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h index 79095ade50371..e010509474287 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h @@ -1,4 +1,4 @@ -//===--- UseDesignatedInitializersCheck.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp index ee49d8a7cb0b0..ade0085267db3 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseEmplaceCheck.cpp - clang-tidy----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h index f51e51dc734a0..2e9e142894a47 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h @@ -1,4 +1,4 @@ -//===--- UseEmplaceCheck.h - clang-tidy--------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp index f0c541eaca0a0..d6ddbb69f7b0d 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseEqualsDefaultCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -204,7 +204,7 @@ static bool bodyEmpty(const ASTContext *Context, const CompoundStmt *Body) { CharSourceRange::getCharRange(Body->getLBracLoc().getLocWithOffset(1), Body->getRBracLoc()), Context->getSourceManager(), Context->getLangOpts(), &Invalid); - return !Invalid && std::strspn(Text.data(), " \t\r\n") == Text.size(); + return !Invalid && Text.ltrim(" \t\r\n").empty(); } UseEqualsDefaultCheck::UseEqualsDefaultCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h index 04c2177704fbe..51b386c2acaca 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h @@ -1,4 +1,4 @@ -//===--- UseEqualsDefaultCheck.h - clang-tidy---------------------*- C++-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp index cf4e4f09c6a90..ab2d41a52040e 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseEqualsDeleteCheck.cpp - clang-tidy-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h index dc3e712482c21..590aa900b8768 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h @@ -1,4 +1,4 @@ -//===--- UseEqualsDeleteCheck.h - clang-tidy----------------------*- C++-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp index 4726674be66fd..0003429c62890 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseIntegerSignComparisonCheck.cpp - clang-tidy -------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.h b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.h index 84bcba84c74b5..106796f0c8072 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.h @@ -1,4 +1,4 @@ -//===--- UseIntegerSignComparisonCheck.h - clang-tidy -----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp index 6de80dcb99c60..d22c99335d9bb 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseNodiscardCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.h index cbfe1089c03ca..cc46769900dd3 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.h @@ -1,4 +1,4 @@ -//===--- UseNodiscardCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp index 9ba9e6dd8d2c2..d1388dc6298e4 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseNoexceptCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h index 159aa97199534..3a915e1fe7238 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h @@ -1,4 +1,4 @@ -//===--- UseNoexceptCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp index c38fb3a01d287..4dc4baecddd50 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseNullptrCheck.cpp - clang-tidy----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h index 4c02f8ccdf303..7c7b5ae02f1cd 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h @@ -1,4 +1,4 @@ -//===--- UseNullptrCheck.h - clang-tidy--------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp index fd5bd9f0b181b..6a19183737119 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseOverrideCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.h b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.h index 2c624f48fcc85..90d941362a903 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.h @@ -1,4 +1,4 @@ -//===--- UseOverrideCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp index 604204e762c78..2e2f25fbb3f58 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseRangesCheck.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h index 51327dab53e3d..80ea6996afe55 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h @@ -1,4 +1,4 @@ -//===--- UseRangesCheck.h - clang-tidy --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp index 4041c81526d2f..aa1ee6db8917a 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseScopedLockCheck.cpp - clang-tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.h b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.h index a5697805c15ca..553031857e086 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.h @@ -1,4 +1,4 @@ -//===--- UseScopedLockCheck.h - clang-tidy ----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp index 2af67f7ccb4c1..eebd609cc84a8 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseStartsEndsWithCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.h index 17c2999bda84c..70df8b87cb6f4 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.h @@ -1,4 +1,4 @@ -//===--- UseStartsEndsWithCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp index 081ec305f3b2a..c95834faab7fc 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseStdFormatCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.h index 9ac2240212ebf..e369c17a0f733 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.h @@ -1,4 +1,4 @@ -//===--- UseStdFormatCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp index 934cc24817d73..a04f78c271d42 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseStdNumbersCheck.cpp - clang_tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.h index 05fc5ada14b87..f1bd3b4eee2ba 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.h @@ -1,4 +1,4 @@ -//===--- UseStdNumbersCheck.h - clang-tidy ----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp index b1e3ee6e2ba85..99ade046305c1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseStdPrintCheck.cpp - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h index 995c740389e73..1f7660991a275 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h @@ -1,4 +1,4 @@ -//===--- UseStdPrintCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp index 82f64096cbec1..3e27d8fa1fe42 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseTrailingReturnTypeCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h index 91369919c5d36..9050bd5eba5e2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h @@ -1,4 +1,4 @@ -//===--- UseTrailingReturnTypeCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp index 2373a26fe48b4..03ecec9bd175b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseTransparentFunctorsCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h index 80f022159c67b..dc9c76e8875a0 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h @@ -1,4 +1,4 @@ -//===--- UseTransparentFunctorsCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp index 1e0a0a551339a..eef9d39800360 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseUncaughtExceptionsCheck.cpp - clang-tidy--------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h index 48677521181f9..4c63efe0c6919 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h @@ -1,4 +1,4 @@ -//===--- UseUncaughtExceptionsCheck.h - clang-tidy------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp index 4037e8c1ea2fd..72673753e6c60 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseUsingCheck.cpp - clang-tidy------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h index 1e54bbf23c984..4ab1c4f6b9646 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h @@ -1,4 +1,4 @@ -//===--- UseUsingCheck.h - clang-tidy----------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.cpp b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.cpp index a144296c47b8c..00082c7034306 100644 --- a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.cpp +++ b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.cpp @@ -1,4 +1,4 @@ -//===--- BufferDerefCheck.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h index 69e7aa092ebbf..7922750c135ac 100644 --- a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h +++ b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h @@ -1,4 +1,4 @@ -//===--- BufferDerefCheck.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/mpi/MPITidyModule.cpp b/clang-tools-extra/clang-tidy/mpi/MPITidyModule.cpp index 67ae101c18cb1..f56cb29455007 100644 --- a/clang-tools-extra/clang-tidy/mpi/MPITidyModule.cpp +++ b/clang-tools-extra/clang-tidy/mpi/MPITidyModule.cpp @@ -1,4 +1,4 @@ -//===--- MPITidyModule.cpp - clang-tidy -----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp index 5abe4f77d6598..17c1283b4d414 100644 --- a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp @@ -1,4 +1,4 @@ -//===--- TypeMismatchCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h index 480684b0fac53..60bcb0f3cf70c 100644 --- a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h +++ b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h @@ -1,4 +1,4 @@ -//===--- TypeMismatchCheck.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/AssertEquals.cpp b/clang-tools-extra/clang-tidy/objc/AssertEquals.cpp index 3d9b9fa401910..3f1bc17926ba2 100644 --- a/clang-tools-extra/clang-tidy/objc/AssertEquals.cpp +++ b/clang-tools-extra/clang-tidy/objc/AssertEquals.cpp @@ -1,4 +1,4 @@ -//===--- AssertEquals.cpp - clang-tidy --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/objc/AssertEquals.h b/clang-tools-extra/clang-tidy/objc/AssertEquals.h index 59b55273aa118..0f4e303feea8b 100644 --- a/clang-tools-extra/clang-tidy/objc/AssertEquals.h +++ b/clang-tools-extra/clang-tidy/objc/AssertEquals.h @@ -1,4 +1,4 @@ -//===--- AssertEquals.h - clang-tidy ----------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.cpp b/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.cpp index 0de9584ad4806..650b67e77eeed 100644 --- a/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidNSErrorInitCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h b/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h index b343cb0c4fa80..2fd3d11559a39 100644 --- a/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h +++ b/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidNSErrorInitCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.cpp b/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.cpp index d18815358b837..3a3307e0ff18f 100644 --- a/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.cpp @@ -1,4 +1,4 @@ -//===--- DeallocInCategoryCheck.cpp - clang-tidy -------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.h b/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.h index aa34d5bff9665..f44a123055eee 100644 --- a/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.h +++ b/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.h @@ -1,4 +1,4 @@ -//===--- DeallocInCategoryCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.cpp b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.cpp index 089538d4c65a8..16c9e9b8b4a99 100644 --- a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.cpp @@ -1,4 +1,4 @@ -//===--- ForbiddenSubclassingCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h index 3410868ed42a8..1f345c1da5156 100644 --- a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h +++ b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h @@ -1,4 +1,4 @@ -//===--- ForbiddenSubclassingCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/MissingHashCheck.cpp b/clang-tools-extra/clang-tidy/objc/MissingHashCheck.cpp index 42f383edc67ed..7b48fd9f77bca 100644 --- a/clang-tools-extra/clang-tidy/objc/MissingHashCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/MissingHashCheck.cpp @@ -1,4 +1,4 @@ -//===--- MissingHashCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/MissingHashCheck.h b/clang-tools-extra/clang-tidy/objc/MissingHashCheck.h index fbb08dc249e60..cf0261e3cc38a 100644 --- a/clang-tools-extra/clang-tidy/objc/MissingHashCheck.h +++ b/clang-tools-extra/clang-tidy/objc/MissingHashCheck.h @@ -1,4 +1,4 @@ -//===--- MissingHashCheck.h - clang-tidy ------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp index 79e9d97d9594b..6a9adfe7d282d 100644 --- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp @@ -1,4 +1,4 @@ -//===--- NSDateFormatterCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.h b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.h index 48eb4eda192e0..dc0e89a08b680 100644 --- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.h +++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.h @@ -1,4 +1,4 @@ -//===--- NSDateFormatterCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp index 8e4ed41c5f501..8a32c38a04695 100644 --- a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp @@ -1,4 +1,4 @@ -//===--- NSInvocationArgumentLifetimeCheck.cpp - clang-tidy ---------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.h b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.h index ebb432f5fe32a..d09ea8cc10298 100644 --- a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.h +++ b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.h @@ -1,4 +1,4 @@ -//===--- NSInvocationArgumentLifetimeCheck.h - clang-tidy -------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp index 56ccf33a6362a..c21b459964692 100644 --- a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- ObjCTidyModule.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp index 01ee4d518b97c..f2bc6f10b9c58 100644 --- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp @@ -1,4 +1,4 @@ -//===--- PropertyDeclarationCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h index 9950f92b8a359..c883e59321124 100644 --- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h @@ -1,4 +1,4 @@ -//===--- PropertyDeclarationCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp index 951cbc52c9a99..3c133ad7dd96b 100644 --- a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuperSelfCheck.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.h b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.h index ec852e84397a8..baeba560a8fef 100644 --- a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.h +++ b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.h @@ -1,4 +1,4 @@ -//===--- SuperSelfCheck.h - clang-tidy --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp index 42fb95bf10527..f9becee92e148 100644 --- a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp +++ b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp @@ -1,4 +1,4 @@ -//===--- ExceptionEscapeCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h index 3590c0eacee7f..1703f55f902ba 100644 --- a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h @@ -1,4 +1,4 @@ -//===--- ExceptionEscapeCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/openmp/OpenMPTidyModule.cpp b/clang-tools-extra/clang-tidy/openmp/OpenMPTidyModule.cpp index d9c9d90673408..b48fce670a041 100644 --- a/clang-tools-extra/clang-tidy/openmp/OpenMPTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/openmp/OpenMPTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- OpenMPTidyModule.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.cpp b/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.cpp index e1c353fbe65e0..d02ab728547ae 100644 --- a/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.cpp +++ b/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseDefaultNoneCheck.cpp - clang-tidy -----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.h b/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.h index 3b74f9ad78aee..fb6b528df3ffb 100644 --- a/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.h +++ b/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.h @@ -1,4 +1,4 @@ -//===--- UseDefaultNoneCheck.h - clang-tidy ---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/AvoidEndlCheck.cpp b/clang-tools-extra/clang-tidy/performance/AvoidEndlCheck.cpp index a394f5c6efa2a..747994c9a3c7f 100644 --- a/clang-tools-extra/clang-tidy/performance/AvoidEndlCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/AvoidEndlCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidEndlCheck.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/performance/AvoidEndlCheck.h b/clang-tools-extra/clang-tidy/performance/AvoidEndlCheck.h index db75fbcf4e89f..860d832b807d0 100644 --- a/clang-tools-extra/clang-tidy/performance/AvoidEndlCheck.h +++ b/clang-tools-extra/clang-tidy/performance/AvoidEndlCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidEndlCheck.h - clang-tidy --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.cpp b/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.cpp index 0f3e9d3ef7591..edd3ded2e2858 100644 --- a/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.cpp @@ -1,4 +1,4 @@ -//===--- EnumSizeCheck.cpp - clang-tidy -----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.h b/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.h index 4d797602ede8b..d87e6b8ab9f5e 100644 --- a/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.h +++ b/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.h @@ -1,4 +1,4 @@ -//===--- EnumSizeCheck.h - clang-tidy ---------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp index 40ea915a33299..d26480fc9f60d 100644 --- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp @@ -1,4 +1,4 @@ -//===--- FasterStringFindCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h index 83af95cd69549..a7ab79a3809d4 100644 --- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h +++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h @@ -1,4 +1,4 @@ -//===--- FasterStringFindCheck.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.cpp b/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.cpp index f545a49dc184b..d0b399739bb48 100644 --- a/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.cpp @@ -1,4 +1,4 @@ -//===--- ForRangeCopyCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h b/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h index 8fabbfa2ae7ba..3ed05fecd015d 100644 --- a/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h +++ b/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h @@ -1,4 +1,4 @@ -//===--- ForRangeCopyCheck.h - clang-tidy------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp index 1ecf1e14957a1..a558954b3fe1d 100644 --- a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp @@ -1,4 +1,4 @@ -//===--- ImplicitConversionInLoopCheck.cpp - clang-tidy--------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h index d1764070bd4d7..786081a351070 100644 --- a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h +++ b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h @@ -1,4 +1,4 @@ -//===--- ImplicitConversionInLoopCheck.h - clang-tidy------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp index ad900fcec2dee..cd128c3556725 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp @@ -1,4 +1,4 @@ -//===--- InefficientAlgorithmCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.h index 5ab0513ea8f94..be8001a15667c 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.h @@ -1,4 +1,4 @@ -//===--- InefficientAlgorithmCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.cpp index a3f412d9e3415..92e3220fdb817 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.cpp @@ -1,4 +1,4 @@ -//===--- InefficientStringConcatenationCheck.cpp - clang-tidy--------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h index 1c15f0eb49ac7..810c0109574e9 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h @@ -1,5 +1,4 @@ -//===--- InefficientStringConcatenationCheck.h - clang-tidy-----------*- C++ -//-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp index d87e352b00073..3da1469a9f120 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp @@ -1,4 +1,4 @@ -//===--- InefficientVectorOperationCheck.cpp - clang-tidy------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h index 4f45ff490633a..9737d9d5ecb1a 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h @@ -1,4 +1,4 @@ -//===--- InefficientVectorOperationCheck.h - clang-tidy----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp index f458e26d964b0..854f09aeb0b51 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp @@ -1,4 +1,4 @@ -//===--- MoveConstArgCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h index 1c2c430d162c7..9f67f64857168 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h +++ b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h @@ -1,4 +1,4 @@ -//===--- MoveConstArgCheck.h - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp index 1585a0ae36f9c..44f6d20ac2be3 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp @@ -1,4 +1,4 @@ -//===--- MoveConstructorInitCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h index 9c1d20710f51e..7c5aec8c59fc8 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h +++ b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h @@ -1,4 +1,4 @@ -//===--- MoveConstructorInitCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.cpp b/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.cpp index 1c018999432e3..2469da978d0ae 100644 --- a/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoAutomaticMoveCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.h b/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.h index aed230d2f1c40..af80e74f3a5b4 100644 --- a/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.h @@ -1,4 +1,4 @@ -//===--- NoAutomaticMoveCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.cpp b/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.cpp index bf212595aceaf..115835ad3983e 100644 --- a/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoIntToPtrCheck.cpp - clang-tidy ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.h b/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.h index 322838da115cc..ed6f60f697da4 100644 --- a/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.h @@ -1,4 +1,4 @@ -//===--- NoIntToPtrCheck.h - clang-tidy -------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptDestructorCheck.cpp b/clang-tools-extra/clang-tidy/performance/NoexceptDestructorCheck.cpp index 4aa999ea5c0b7..dc293facb2ae1 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptDestructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/NoexceptDestructorCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoexceptDestructorCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptDestructorCheck.h b/clang-tools-extra/clang-tidy/performance/NoexceptDestructorCheck.h index ab3850f0970a8..ce2b1c9c17a19 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptDestructorCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoexceptDestructorCheck.h @@ -1,4 +1,4 @@ -//===--- NoexceptDestructorCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.cpp b/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.cpp index 911cd1b533367..895bd702d3834 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoexceptFunctionCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h b/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h index 4775219d7e439..075b4fe964d89 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h @@ -1,4 +1,4 @@ -//===--- NoexceptFunctionCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptMoveConstructorCheck.cpp b/clang-tools-extra/clang-tidy/performance/NoexceptMoveConstructorCheck.cpp index a77ca6aebb378..75bf8aa8734d5 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptMoveConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/NoexceptMoveConstructorCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoexceptMoveConstructorCheck.cpp - clang-tidy---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptMoveConstructorCheck.h b/clang-tools-extra/clang-tidy/performance/NoexceptMoveConstructorCheck.h index 51728d2ce0d8d..11a8068aebbc4 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptMoveConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoexceptMoveConstructorCheck.h @@ -1,4 +1,4 @@ -//===--- NoexceptMoveConstructorCheck.h - clang-tidy-------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptSwapCheck.cpp b/clang-tools-extra/clang-tidy/performance/NoexceptSwapCheck.cpp index e7cba6e54e86a..29faf9f2d476c 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptSwapCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/NoexceptSwapCheck.cpp @@ -1,4 +1,4 @@ -//===--- NoexceptSwapCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptSwapCheck.h b/clang-tools-extra/clang-tidy/performance/NoexceptSwapCheck.h index 0330de4a50b43..9466b3a127302 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptSwapCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoexceptSwapCheck.h @@ -1,4 +1,4 @@ -//===--- NoexceptSwapCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp b/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp index 10ad9ec6fef4c..ae15208ae3dc5 100644 --- a/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp @@ -1,4 +1,4 @@ -//===-- PerformanceTidyModule.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp index adfedb4e84c47..0db66c0d5803d 100644 --- a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp @@ -1,4 +1,4 @@ -//===--- TriviallyDestructibleCheck.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.h b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.h index 305844715726a..ae96359a544ce 100644 --- a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.h +++ b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.h @@ -1,4 +1,4 @@ -//===--- TriviallyDestructibleCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp index 29f9146e47786..096ca57ee8e22 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp @@ -1,4 +1,4 @@ -//===--- TypePromotionInMathFnCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h index 08a7eea580221..9d9b073c80400 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h @@ -1,4 +1,4 @@ -//===--- TypePromotionInMathFnCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp index c413090b3a0a4..591836667a2ba 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp @@ -1,4 +1,4 @@ -//===--- UnnecessaryCopyInitialization.cpp - clang-tidy--------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h index 38f756f9b452f..66231889b8014 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h @@ -1,4 +1,4 @@ -//===--- UnnecessaryCopyInitialization.h - clang-tidy------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index c1aa52bacf99f..3f5b43feca1ad 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -1,4 +1,4 @@ -//===--- UnnecessaryValueParamCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index b52043416e769..571857020cef4 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -1,4 +1,4 @@ -//===--- UnnecessaryValueParamCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.cpp b/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.cpp index d9569d0b5c603..a946ebf1650fc 100644 --- a/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidPragmaOnceCheck.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.h b/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.h index 203fdfd4bd33a..3638a9c46773e 100644 --- a/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.h +++ b/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidPragmaOnceCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/PortabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/portability/PortabilityTidyModule.cpp index 98853556588b3..e73e95455d3a5 100644 --- a/clang-tools-extra/clang-tidy/portability/PortabilityTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/portability/PortabilityTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- PortabilityTidyModule.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp index db5693e3b7cb7..5174f56207b54 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp @@ -1,4 +1,4 @@ -//===--- RestrictSystemIncludesCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h index 60fae5e73a602..5347ae9d68b02 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h @@ -1,4 +1,4 @@ -//===--- RestrictSystemIncludesCheck.h - clang-tidy --------------*- C++-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp index f4bd4c3d5657d..d90b09abb1be8 100644 --- a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp @@ -1,4 +1,4 @@ -//===--- SIMDIntrinsicsCheck.cpp - clang-tidy------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h index 92fc0af98a25b..ab0711335c920 100644 --- a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h +++ b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h @@ -1,4 +1,4 @@ -//===--- SIMDIntrinsicsCheck.h - clang-tidy----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.cpp b/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.cpp index 5a3c9a4203eb9..ff58505e8f87c 100644 --- a/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.cpp @@ -1,4 +1,4 @@ -//===-- StdAllocatorConstCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.h b/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.h index 87702af91bdb6..b2f5feac21918 100644 --- a/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.h +++ b/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.h @@ -1,4 +1,4 @@ -//===--- StdAllocatorConstT.h - clang-tidy -----------------------*- C++-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.cpp b/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.cpp index aaa23367a3825..bf3173dc993e2 100644 --- a/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.cpp @@ -1,4 +1,4 @@ -//===--- TemplateVirtualMemberFunctionCheck.cpp - clang-tidy --------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.h b/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.h index 41f92adadd6e8..01d5519d7e6fd 100644 --- a/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.h +++ b/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.h @@ -1,4 +1,4 @@ -//===--- TemplateVirtualMemberFunctionCheck.h - clang-tidy ------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.cpp b/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.cpp index 5f36c3976fc69..22ff5ce1545a5 100644 --- a/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.cpp @@ -1,4 +1,4 @@ -//===--- AmbiguousSmartptrResetCallCheck.cpp - clang-tidy -----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.h b/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.h index 05932e59e7928..763cd7f01f9c3 100644 --- a/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.h @@ -1,4 +1,4 @@ -//===--- AmbiguousSmartptrResetCallCheck.h - clang-tidy ---------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp index 24cbbd8bc60a2..554996730c2be 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp @@ -1,4 +1,4 @@ -//===--- AvoidConstParamsInDecls.cpp - clang-tidy--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h index 1fc57779111df..1dd28fde217ed 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h @@ -1,4 +1,4 @@ -//===--- AvoidConstParamsInDecls.h - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.cpp index 1b62f54d5557d..35e5462b55cce 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidNestedConditionalOperatorCheck.cpp - clang-tidy -------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h index 9010156de6ce2..b14af6a0cf1c7 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidNestedConditionalOperatorCheck.h - clang-tidy -----*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp index d283111a4de1a..40a4fa114681e 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidReturnWithVoidValueCheck.cpp - clang-tidy -------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.h index f8148db43cd95..93e6268fd5dd5 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidReturnWithVoidValueCheck.h - clang-tidy -----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp index ca5fc358ce290..c53c70667dbbc 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp @@ -1,4 +1,4 @@ -//===--- AvoidUnconditionalPreprocessorIfCheck.cpp - clang-tidy -----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.h index 50292fce9d8dc..2382a5e928972 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.h @@ -1,4 +1,4 @@ -//===--- AvoidUnconditionalPreprocessorIfCheck.h - clang-tidy ---*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp index 85bd9c1e4f9a0..1952e14d1fc3d 100644 --- a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp @@ -1,4 +1,4 @@ -//===--- BracesAroundStatementsCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.h b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.h index 4cd37a7b2dd6c..183f1fa8b8a8e 100644 --- a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.h +++ b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.h @@ -1,4 +1,4 @@ -//===--- BracesAroundStatementsCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp index c13a8010c2221..6ccd933ff4c21 100644 --- a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp @@ -1,4 +1,4 @@ -//===--- ConstReturnTypeCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h index a36c6f4b67e5a..e3d9713d430ce 100644 --- a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h @@ -1,4 +1,4 @@ -//===--- ConstReturnTypeCheck.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp index fb68c7d334b7f..04c1aa2fab8e6 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp @@ -1,4 +1,4 @@ -//===--- ContainerContainsCheck.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h index 753603ed82537..e419785060df0 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h @@ -1,4 +1,4 @@ -//===--- ContainerContainsCheck.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp index a05e228520c9e..11756d10a8221 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp @@ -1,4 +1,4 @@ -//===--- ContainerDataPointerCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.h b/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.h index 2a15b95095171..71fde87fbb093 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.h @@ -1,4 +1,4 @@ -//===--- ContainerDataPointerCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp index c3f8106c34dcb..11faf1622e4e8 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp @@ -1,4 +1,4 @@ -//===--- ContainerSizeEmptyCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h index e449686f77566..35ef18430378b 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h @@ -1,4 +1,4 @@ -//===--- ContainerSizeEmptyCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp index d6784d0e8fba8..6da4cf7c6bf94 100644 --- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp +++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp @@ -1,4 +1,4 @@ -//===--- ConvertMemberFunctionsToStatic.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h index 1b12fec972998..ee83d7b4784ff 100644 --- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h +++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h @@ -1,4 +1,4 @@ -//===--- ConvertMemberFunctionsToStatic.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.cpp b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.cpp index 12131cc078f0b..e96bfe7fe7271 100644 --- a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.cpp @@ -1,4 +1,4 @@ -//===--- DeleteNullPointerCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h index 6e746d803d3ee..dc88646f07afa 100644 --- a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h +++ b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h @@ -1,4 +1,4 @@ -//===--- DeleteNullPointerCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp index 229e5583846b9..570a109e55b14 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp @@ -1,4 +1,4 @@ -//===--- DuplicateIncludeCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h index 05395496d841b..297999cf4f921 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h @@ -1,4 +1,4 @@ -//===--- DuplicateIncludeCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp index f68e1f6926b84..6399e7d99a9c7 100644 --- a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp @@ -1,4 +1,4 @@ -//===--- ElseAfterReturnCheck.cpp - clang-tidy-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h index 34860c2853ea8..ab025032317c7 100644 --- a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h @@ -1,4 +1,4 @@ -//===--- ElseAfterReturnCheck.h - clang-tidy---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp index 9eef5c4db2d01..a2a5c3e10ee07 100644 --- a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp @@ -1,4 +1,4 @@ -//===--- EnumInitialValueCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.h b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.h index 66087e4ee170d..f070f867b6af8 100644 --- a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.h +++ b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.h @@ -1,4 +1,4 @@ -//===--- EnumInitialValueCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp index 2f59aaa86b157..f9d81212e2842 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp @@ -1,4 +1,4 @@ -//===--- FunctionCognitiveComplexityCheck.cpp - clang-tidy ------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h index bdb8550eeae23..455fbfd9fa56a 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h +++ b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h @@ -1,4 +1,4 @@ -//===--- FunctionCognitiveComplexityCheck.h - clang-tidy --------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp index 8e3a2e306dbf7..8c58346ede3fa 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp @@ -1,4 +1,4 @@ -//===-- FunctionSizeCheck.cpp - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.h b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.h index f668ab18fea52..0459db6abfe31 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.h +++ b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.h @@ -1,4 +1,4 @@ -//===--- FunctionSizeCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp index 50f8a6be06e46..877f0a45f9ea7 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp @@ -1,5 +1,4 @@ -//===--- IdentifierLengthCheck.cpp - clang-tidy -//-----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.h index 2a4b810264e96..9626e2251426d 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.h @@ -1,5 +1,4 @@ -//===--- IdentifierLengthCheck.h - clang-tidy ---------------------*- C++ -//-*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index c8b62211c4b2e..5178bee5c3374 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -1,4 +1,4 @@ -//===--- IdentifierNamingCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -337,8 +337,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( // Remove keywords for (StringRef Kw : Keywords) { - for (size_t Pos = 0; - (Pos = Type.find(Kw.data(), Pos)) != std::string::npos;) { + for (size_t Pos = 0; (Pos = Type.find(Kw, Pos)) != std::string::npos;) { Type.replace(Pos, Kw.size(), ""); } } @@ -373,7 +372,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( " int", " char", " double", " long", " short"}; bool RedundantRemoved = false; for (auto Kw : TailsOfMultiWordType) { - size_t Pos = Type.rfind(Kw.data()); + size_t Pos = Type.rfind(Kw); if (Pos != std::string::npos) { const size_t PtrCount = getAsteriskCount(Type, ND); Type = Type.substr(0, Pos + Kw.size() + PtrCount); @@ -602,9 +601,8 @@ std::string IdentifierNamingCheck::HungarianNotation::getDataTypePrefix( if (PtrCount > 0) { ModifiedTypeName = [&](std::string Str, StringRef From, StringRef To) { size_t StartPos = 0; - while ((StartPos = Str.find(From.data(), StartPos)) != - std::string::npos) { - Str.replace(StartPos, From.size(), To.data()); + while ((StartPos = Str.find(From, StartPos)) != std::string::npos) { + Str.replace(StartPos, From.size(), To); StartPos += To.size(); } return Str; diff --git 
a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h index 646ec0eac8dd1..3db9d23150af3 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h @@ -1,4 +1,4 @@ -//===--- IdentifierNamingCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp index 6b10e6b206a31..3fb856097a7e9 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp @@ -1,4 +1,4 @@ -//===--- ImplicitBoolConversionCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h index 5947f7316e67c..8028a31719644 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h @@ -1,4 +1,4 @@ -//===--- ImplicitBoolConversionCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp index 10aa779117bbd..2eb26fcf840cd 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp @@ -1,4 +1,4 @@ -//===--- InconsistentDeclarationParameterNameCheck.cpp - clang-tidy-------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp index ca6503753f6b4..bc5edecb8a65b 100644 --- a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp @@ -1,4 +1,4 @@ -//===--- IsolateDeclarationCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h index 63e37a48ca418..c7e1ea33a0d0d 100644 --- a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h @@ -1,4 +1,4 @@ -//===--- IsolateDeclarationCheck.h - clang-tidy -----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp index 6f91527c420e8..a38f7bc029e8b 100644 --- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp @@ -1,4 +1,4 @@ -//===--- MagicNumbersCheck.cpp - clang-tidy-------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h index 70a17889d244e..b703bd4ba984f 100644 --- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h @@ -1,4 +1,4 @@ -//===--- MagicNumbersCheck.h - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp index aace96f54c61c..bea68884e3bda 100644 --- a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp @@ -1,4 +1,4 @@ -//===--- MakeMemberFunctionConstCheck.cpp - clang-tidy --------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.h b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.h index dc8d98332793e..6be832260bd18 100644 --- a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.h @@ -1,4 +1,4 @@ -//===--- MakeMemberFunctionConstCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp index e0640f27f4e35..e15b2ecd8f5c0 100644 --- a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp @@ -1,4 +1,4 @@ -//===--- MathMissingParenthesesCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h index 9a9d2b3cfaaba..3381d6612a709 100644 --- a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h @@ -1,4 +1,4 @@ -//===--- MathMissingParenthesesCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp index e32f79589a059..0765d8d82ee04 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp @@ -1,4 +1,4 @@ -//===--- MisleadingIndentationCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h index 9c92fc1e18b6f..39bb4baba5141 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h @@ -1,4 +1,4 @@ -//===--- MisleadingIndentationCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.cpp b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.cpp index 328d1896ce9f8..0052af6f5d1d1 100644 --- a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.cpp @@ -1,4 +1,4 @@ -//===--- MisplacedArrayIndexCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h index 1ccd011b30fff..1b11b6bea108e 100644 --- a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h @@ -1,4 +1,4 @@ -//===--- MisplacedArrayIndexCheck.h - clang-tidy-----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp index 6bb8c394f75cc..7251d63edfd89 100644 --- a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp @@ -1,4 +1,4 @@ -//===--- NamedParameterCheck.cpp - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.h b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.h index f14a74d75eb49..ecd128d887f84 100644 --- a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.h @@ -1,4 +1,4 @@ -//===--- NamedParameterCheck.h - clang-tidy ---------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp index c04bf361c40ca..744d23a6fdbcd 100644 --- a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp @@ -1,4 +1,4 @@ -//===--- NamespaceCommentCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h index 8edd77213f779..883a2a44fee8d 100644 --- a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h @@ -1,4 +1,4 @@ -//===--- NamespaceCommentCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp index 07071a1f6d2fe..29fff3971599e 100644 --- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp @@ -1,4 +1,4 @@ -//===--- NonConstParameterCheck.cpp - clang-tidy---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h index e2598dd01d297..61d6ebd4c2f2a 100644 --- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h @@ -1,4 +1,4 @@ -//===--- NonConstParameterCheck.h - clang-tidy-------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp index ccaa686f85323..196fb31bd4b7a 100644 --- a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp @@ -1,5 +1,4 @@ -//===--- OperatorsRepresentationCheck.cpp - clang-tidy -//--------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.h b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.h index d315f3912a914..f1a9793481ada 100644 --- a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.h @@ -1,4 +1,4 @@ -//===--- OperatorsRepresentationCheck.h - clang-tidy ------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp index 44a784bc9f21a..dc9510d1dab62 100644 --- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp @@ -1,4 +1,4 @@ -//===--- QualifiedAutoCheck.cpp - clang-tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.h b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.h index b5b713f3db6cf..c63b426bda7c8 100644 --- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.h +++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.h @@ -1,4 +1,4 @@ -//===--- QualifiedAutoCheck.h - clang-tidy ----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp index 12f8cdb289dd2..d01882dfc9daa 100644 --- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- ReadabilityTidyModule.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp index c3464b2a83d15..e93aa16ebdb13 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantAccessSpecifiersCheck.cpp - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.h index 566e5ea637986..6359dafc0e4eb 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantAccessSpecifiersCheck.h - clang-tidy ----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp index acc834ae25c60..1ee75220b1c4e 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantCastingCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h index fdcfede05d436..97c87fb8b09a1 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantCastingCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp index d93077cc6884e..b3b84e2cc0ccd 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantControlFlowCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h index 7433005bb7a37..7698996d107e4 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantControlFlowCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp index e86e866209e9a..cf6e92d84e92a 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantDeclarationCheck.cpp - clang-tidy------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h index a14a8aa70f6cf..fff7827c6378a 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantDeclarationCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.cpp index a70719fd8a041..7f399997cfecf 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantFunctionPtrDereferenceCheck.cpp - clang-tidy-------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h index a04e9c165bc03..f4a3671b0f7d1 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantFunctionPtrDereferenceCheck.h - clang-tidy-----*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp index 7f1882c775c59..2053b89ada7e2 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantInlineSpecifierCheck.cpp - clang-tidy--------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h index 63b1b46bb7e09..d1134b307a909 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantInlineSpecifierCheck.h - clang-tidy ------------*-C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.cpp index 2373dde1618bc..1bbb9c86fee14 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantMemberInitCheck.cpp - clang-tidy-------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h index c0e0a6dac0dbc..2ce8c3f5f64f5 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantMemberInitCheck.h - clang-tidy----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp index 513687f03df0c..931126a154d1e 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantPreprocessorCheck.cpp - clang-tidy ----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.h index 8a6fb6fd98b33..ca34f9783c619 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantPreprocessorCheck.h - clang-tidy --------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp index 9774d93ff36fd..0598683bff6c2 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp @@ -1,4 +1,4 @@ -//===--- RedundantSmartptrGetCheck.cpp - clang-tidy -----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.h index 1f90e4fb4a8be..be9e916cc86be 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantSmartptrGetCheck.h - clang-tidy ---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.h index e2e6ab1fd939c..ac82778853747 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.h @@ -1,4 +1,4 @@ -//===--- RedundantStringCStrCheck.h - clang-tidy ----------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp index 587ae8ea30580..5d3fd14b92471 100644 --- a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp @@ -1,5 +1,4 @@ -//===--- ReferenceToConstructedTemporaryCheck.cpp - clang-tidy -//--------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.h b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.h index c1f4f1c4d47dd..c95f65a3ec691 100644 --- a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.h @@ -1,4 +1,4 @@ -//===--- ReferenceToConstructedTemporaryCheck.h - clang-tidy ----*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp index 499c88ef5d4e4..4184c295b5f0a 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp @@ -1,4 +1,4 @@ -//===-- SimplifyBooleanExprCheck.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h index 2ea6968798408..2ab074e5dca69 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h @@ -1,4 +1,4 @@ -//===--- SimplifyBooleanExpr.h clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp index 7d4698d27ed16..591ee1fbe067c 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp @@ -1,4 +1,4 @@ -//===--- SimplifySubscriptExprCheck.cpp - clang-tidy-----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h index deffb09f5db28..79ced95fd762c 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h @@ -1,4 +1,4 @@ -//===--- SimplifySubscriptExprCheck.h - clang-tidy---------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp index a7b3c4a1f7cf9..7ef8ef3d947f3 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp @@ -1,4 +1,4 @@ -//===--- StaticAccessedThroughInstanceCheck.cpp - clang-tidy---------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h index 9869855c17d6b..5b47bf7685bbf 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h @@ -1,4 +1,4 @@ -//===--- StaticAccessedThroughInstanceCheck.h - clang-tidy-------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp index a0f59dbcb4890..e9a2eae11bfde 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp @@ -1,4 +1,4 @@ -//===--- StaticDefinitionInAnonymousNamespaceCheck.cpp - clang-tidy--------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h index 620cd6e3f2f87..9207ba0075b5d 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h @@ -1,4 +1,4 @@ -//===--- StaticDefinitionInAnonymousNamespaceCheck.h - clang-tidy*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.cpp b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.cpp index 7c0bbef3ca087..229b5159d53d1 100644 --- a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.cpp @@ -1,4 +1,4 @@ -//===-- StringCompareCheck.cpp - clang-tidy--------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h index 150090901a6e9..9ff80b075f101 100644 --- a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h @@ -1,4 +1,4 @@ -//===--- StringCompareCheck.h - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp index a80637dee18f4..ad8b47aa96425 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp @@ -1,4 +1,4 @@ -//===--- SuspiciousCallArgumentCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.h b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.h index 38477d0800f15..43ae0f181302f 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.h @@ -1,4 +1,4 @@ -//===--- SuspiciousCallArgumentCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.cpp b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.cpp index 462085b023179..c9d70419af24b 100644 --- a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.cpp @@ -1,4 +1,4 @@ -//===--- UniqueptrDeleteReleaseCheck.cpp - clang-tidy----------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h index 2768955109d26..f7d6fe70058fc 100644 --- a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h @@ -1,4 +1,4 @@ -//===--- UniqueptrDeleteReleaseCheck.h - clang-tidy--------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp index dac4cb556aa75..c1dc209fd079d 100644 --- a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp @@ -1,4 +1,4 @@ -//===--- UppercaseLiteralSuffixCheck.cpp - clang-tidy ---------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.h b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.h index a8af08f5a8021..7c71fe064f3c9 100644 --- a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.h @@ -1,4 +1,4 @@ -//===--- UppercaseLiteralSuffixCheck.h - clang-tidy -------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp index 7cf0e0853f080..82eb6de8fa3dc 100644 --- a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseAnyOfAllOfCheck.cpp - clang-tidy-------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h index 4e53b3f5a8a91..4b7ffc1f36ace 100644 --- a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h @@ -1,4 +1,4 @@ -//===--- UseAnyOfAllOfCheck.h - clang-tidy-----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp index 05c0088e6b41b..40aaff4cb3893 100644 --- a/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseConcisePreprocessorDirectivesCheck.cpp - clang-tidy -----------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.h b/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.h index e65b16876a89a..762862dc00305 100644 --- a/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.h @@ -1,4 +1,4 @@ -//===--- UseConcisePreprocessorDirectivesCheck.h - clang-tidy ---*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp index 511256332cee9..8052e04c99f43 100644 --- a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseStdMinMaxCheck.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.h b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.h index b8d8b8c4fe894..573394361cbda 100644 --- a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.h @@ -1,4 +1,4 @@ -//===--- UseStdMinMaxCheck.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/rename_check.py b/clang-tools-extra/clang-tidy/rename_check.py index 5f3295b23ba72..b864bff814485 100755 --- a/clang-tools-extra/clang-tidy/rename_check.py +++ b/clang-tools-extra/clang-tidy/rename_check.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# ===- rename_check.py - clang-tidy check renamer ------------*- python -*--===# +# ===-----------------------------------------------------------------------===# # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
@@ -51,30 +51,6 @@ def replaceInFile(fileName: str, sFrom: str, sTo: str) -> None: f.write(txt) -def generateCommentLineHeader(filename: str) -> str: - return "".join( - [ - "//===--- ", - os.path.basename(filename), - " - clang-tidy ", - "-" * max(0, 42 - len(os.path.basename(filename))), - "*- C++ -*-===//", - ] - ) - - -def generateCommentLineSource(filename: str) -> str: - return "".join( - [ - "//===--- ", - os.path.basename(filename), - " - clang-tidy", - "-" * max(0, 52 - len(os.path.basename(filename))), - "-===//", - ] - ) - - def fileRename(fileName: str, sFrom: str, sTo: str) -> str: if sFrom not in fileName or sFrom == sTo: return fileName @@ -337,16 +313,6 @@ def main() -> None: ) filename = fileRename(filename, args.old_check_name, args.new_check_name) filename = fileRename(filename, check_name_camel, new_check_name_camel) - replaceInFile( - filename, - generateCommentLineHeader(originalName), - generateCommentLineHeader(filename), - ) - replaceInFile( - filename, - generateCommentLineSource(originalName), - generateCommentLineSource(filename), - ) for header_guard in header_guard_variants: replaceInFile(filename, header_guard, header_guard_new) diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index bef3b938b5afd..35ea1b5714b84 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -1,4 +1,4 @@ -//===--- tools/extra/clang-tidy/ClangTidyMain.cpp - Clang tidy tool -------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h index f3862f93d833b..35f75396828dd 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h @@ -1,4 +1,4 @@ -//===--- tools/extra/clang-tidy/ClangTidyMain.h - Clang tidy tool -------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyToolMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyToolMain.cpp index eb7fde7b8e07b..ea2897dfe1390 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyToolMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyToolMain.cpp @@ -1,4 +1,4 @@ -//===--- tools/extra/clang-tidy/ClangTidyToolMain.cpp - Clang tidy tool ---===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py index b4b4648e765cf..5daa93dca2a99 100755 --- a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py +++ b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# ===- clang-tidy-diff.py - ClangTidy Diff Checker -----------*- python -*--===# +# ===-----------------------------------------------------------------------===# # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index a722e20a81c68..eadf7194ab94f 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# ===- run-clang-tidy.py - Parallel clang-tidy runner --------*- python -*--===# +# ===-----------------------------------------------------------------------===# # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp index 0cdc7d08abc99..d5deb99a8442d 100644 --- a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp @@ -1,4 +1,4 @@ -//===---------- ASTUtils.cpp - clang-tidy ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.h b/clang-tools-extra/clang-tidy/utils/ASTUtils.h index 6c3e54facd020..c2127f0746986 100644 --- a/clang-tools-extra/clang-tidy/utils/ASTUtils.h +++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.h @@ -1,4 +1,4 @@ -//===---------- ASTUtils.h - clang-tidy -----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/Aliasing.cpp b/clang-tools-extra/clang-tidy/utils/Aliasing.cpp index cbe4873b5c022..a22d2358bc560 100644 --- a/clang-tools-extra/clang-tidy/utils/Aliasing.cpp +++ b/clang-tools-extra/clang-tidy/utils/Aliasing.cpp @@ -1,4 +1,4 @@ -//===------------- Aliasing.cpp - clang-tidy ------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/Aliasing.h b/clang-tools-extra/clang-tidy/utils/Aliasing.h index 6c0763b766805..2384534609366 100644 --- a/clang-tools-extra/clang-tidy/utils/Aliasing.h +++ b/clang-tools-extra/clang-tidy/utils/Aliasing.h @@ -1,4 +1,4 @@ -//===------------- Aliasing.h - clang-tidy --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp index 2a3b7bed08c1e..14770c49c2e25 100644 --- a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp +++ b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp @@ -1,4 +1,4 @@ -//===--- BracesAroundStatement.cpp - clang-tidy -------- ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h index cb1c06c7aa1a1..699d75435db7b 100644 --- a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h +++ b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h @@ -1,4 +1,4 @@ -//===--- BracesAroundStatement.h - clang-tidy ------- -----------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp index 106feb7fb4172..57453ad089a2c 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp @@ -1,4 +1,4 @@ -//===--- DeclRefExprUtils.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.h b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.h index 8361b9d89ed26..794adc04dc478 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.h +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.h @@ -1,4 +1,4 @@ -//===--- DeclRefExprUtils.h - clang-tidy-------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp index d43716e901e84..044f89be61342 100644 --- a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp +++ b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp @@ -1,4 +1,4 @@ -//===--- DesignatedInitializers.cpp - clang-tidy --------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h index a6cb2963faf72..910960137ddbb 100644 --- a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h +++ b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h @@ -1,4 +1,4 @@ -//===--- DesignatedInitializers.h - clang-tidy ------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp index 3fe8412e69675..bdde7249d2796 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp @@ -1,4 +1,4 @@ -//===--- ExceptionAnalyzer.cpp - clang-tidy -------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h index bd466c99c04bb..1ab6dcb2eb255 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h +++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h @@ -1,4 +1,4 @@ -//===--- ExceptionAnalyzer.h - clang-tidy -----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp index 4693c656a6602..b1d6b195f9470 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp @@ -1,4 +1,4 @@ -//===--- ExceptionSpecAnalyzer.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h index ddfb796d9c546..3fd6fe170c734 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h +++ b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h @@ -1,4 +1,4 @@ -//===--- ExceptionSpecAnalyzer.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp index fcbb5ecc7152d..393f935fc31e4 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp @@ -1,4 +1,4 @@ -//===---------- ExprSequence.cpp - clang-tidy -----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.h b/clang-tools-extra/clang-tidy/utils/ExprSequence.h index 6531e1876c4fe..9ef94e0e3bcde 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.h +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.h @@ -1,4 +1,4 @@ -//===------------- ExprSequence.h - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp index b40bba6d1f3ab..41d5131599ce6 100644 --- a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp @@ -1,4 +1,4 @@ -//===--- FileExtensionsUtils.cpp - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h index e23f6b79c6af4..dfab141e32417 100644 --- a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h +++ b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h @@ -1,4 +1,4 @@ -//===--- FileExtensionsUtils.h - clang-tidy --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp index a15589f9721c7..086c7f3a15d45 100644 --- a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp @@ -1,4 +1,4 @@ -//===--- FixItHintUtils.cpp - clang-tidy-----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.h b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.h index e690dbaefe642..74608d4ff268f 100644 --- a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.h +++ b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.h @@ -1,4 +1,4 @@ -//===--- FixItHintUtils.h - clang-tidy---------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp index 0d0834dc38fc6..f4945b2113c69 100644 --- a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp +++ b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp @@ -1,4 +1,4 @@ -//===--- FormatStringConverter.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.h b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.h index 15d1f597fe440..209741fac276c 100644 --- a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.h +++ b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.h @@ -1,4 +1,4 @@ -//===--- FormatStringConverter.h - clang-tidy--------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp index 53ce28e019f75..e1d13876d64a9 100644 --- a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp +++ b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp @@ -1,4 +1,4 @@ -//===--- HeaderGuard.cpp - clang-tidy -------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/HeaderGuard.h b/clang-tools-extra/clang-tidy/utils/HeaderGuard.h index eff75d6ff26a2..ce8acb07783b3 100644 --- a/clang-tools-extra/clang-tidy/utils/HeaderGuard.h +++ b/clang-tools-extra/clang-tidy/utils/HeaderGuard.h @@ -1,4 +1,4 @@ -//===--- HeaderGuard.h - clang-tidy -----------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp index b53016f331b79..0b67cba6ffb0a 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp @@ -1,4 +1,4 @@ -//===-------- IncludeInserter.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h index 5308f76bd2151..f6ca7d63632de 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h @@ -1,4 +1,4 @@ -//===---------- IncludeInserter.h - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp index db1ea1bb514f8..6a71a11c18754 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp @@ -1,4 +1,4 @@ -//===---------- IncludeSorter.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.h b/clang-tools-extra/clang-tidy/utils/IncludeSorter.h index 782fa6721bc03..ce752c45f2a77 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.h +++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.h @@ -1,4 +1,4 @@ -//===------------ IncludeSorter.h - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp index c14d341caf779..7222f64804f63 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp @@ -1,4 +1,4 @@ -//===--- LexerUtils.cpp - clang-tidy---------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.h b/clang-tools-extra/clang-tidy/utils/LexerUtils.h index afd63885e388c..b76a37874b514 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.h +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.h @@ -1,4 +1,4 @@ -//===--- LexerUtils.h - clang-tidy-------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/Matchers.cpp b/clang-tools-extra/clang-tidy/utils/Matchers.cpp index bd7b03eb39ad7..4382745c8bdc5 100644 --- a/clang-tools-extra/clang-tidy/utils/Matchers.cpp +++ b/clang-tools-extra/clang-tidy/utils/Matchers.cpp @@ -1,4 +1,4 @@ -//===---------- Matchers.cpp - clang-tidy ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/Matchers.h b/clang-tools-extra/clang-tidy/utils/Matchers.h index a7683024d69c4..6caa35de3c98f 100644 --- a/clang-tools-extra/clang-tidy/utils/Matchers.h +++ b/clang-tools-extra/clang-tidy/utils/Matchers.h @@ -1,4 +1,4 @@ -//===--- Matchers.h - clang-tidy-------------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp index f5949bab8f243..3af7f8dcf2ee5 100644 --- a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp +++ b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp @@ -1,4 +1,4 @@ -//===---------- NamespaceAliaser.cpp - clang-tidy -------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h index df4d4b95ba421..497b67e82a900 100644 --- a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h +++ b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h @@ -1,4 +1,4 @@ -//===---------- NamespaceAliaser.h - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/OptionsUtils.cpp b/clang-tools-extra/clang-tidy/utils/OptionsUtils.cpp index 1866ea3f5b58a..2f784360ac7ec 100644 --- a/clang-tools-extra/clang-tidy/utils/OptionsUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/OptionsUtils.cpp @@ -1,4 +1,4 @@ -//===-- OptionsUtils.cpp - clang-tidy -------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/OptionsUtils.h b/clang-tools-extra/clang-tidy/utils/OptionsUtils.h index f15c07fe47fad..aec24ab0a84b3 100644 --- a/clang-tools-extra/clang-tidy/utils/OptionsUtils.h +++ b/clang-tools-extra/clang-tidy/utils/OptionsUtils.h @@ -1,4 +1,4 @@ -//===--- DanglingHandleCheck.h - clang-tidy----------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 90539eaabbe03..70f6092a5e4bc 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -1,4 +1,4 @@ -//===--- RenamerClangTidyCheck.cpp - clang-tidy ---------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -350,6 +350,8 @@ class RenamerClangTidyVisitor const TemplateDecl *Decl = Loc.getTypePtr()->getTemplateName().getAsTemplateDecl( /*IgnoreDeduced=*/true); + if (!Decl) + return true; if (const auto *ClassDecl = dyn_cast(Decl)) if (const NamedDecl *TemplDecl = ClassDecl->getTemplatedDecl()) diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h index 3d5721b789ac2..68b3040895417 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h @@ -1,4 +1,4 @@ -//===--- RenamerClangTidyCheck.h - clang-tidy -------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp index 7d84a4a9331b1..87602d1187d59 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp @@ -1,4 +1,4 @@ -//===---------- TransformerClangTidyCheck.cpp - clang-tidy ----------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h index 3f5c4cac52b7b..ad20fbd475759 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h @@ -1,4 +1,4 @@ -//===---------- TransformerClangTidyCheck.h - clang-tidy ------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp index f944306171135..d4e079f1cf4c2 100644 --- a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp +++ b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp @@ -1,4 +1,4 @@ -//===--- TypeTraits.cpp - clang-tidy---------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/TypeTraits.h b/clang-tools-extra/clang-tidy/utils/TypeTraits.h index eb4dd0ff3a510..98a4a99bf8d4d 100644 --- a/clang-tools-extra/clang-tidy/utils/TypeTraits.h +++ b/clang-tools-extra/clang-tidy/utils/TypeTraits.h @@ -1,4 +1,4 @@ -//===--- TypeTraits.h - clang-tidy-------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp index 25601f9a01a48..cb1495163a2f9 100644 --- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp @@ -1,4 +1,4 @@ -//===--- UseRangesCheck.cpp - clang-tidy ----------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h index a5ba6802dd89e..b85a157ba2873 100644 --- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h @@ -1,4 +1,4 @@ -//===--- UseRangesCheck.h - clang-tidy --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp b/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp index 3a2c16ff05dae..e4c71aa60a7a2 100644 --- a/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp @@ -1,4 +1,4 @@ -//===---------- UsingInserter.cpp - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/utils/UsingInserter.h b/clang-tools-extra/clang-tidy/utils/UsingInserter.h index 7ff1f0b9792e1..23c317581c191 100644 --- a/clang-tools-extra/clang-tidy/utils/UsingInserter.h +++ b/clang-tools-extra/clang-tidy/utils/UsingInserter.h @@ -1,4 +1,4 @@ -//===---------- UsingInserter.h - clang-tidy ----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.cpp b/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.cpp index bb2c71913193b..96a36cba827e6 100644 --- a/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.cpp +++ b/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.cpp @@ -1,4 +1,4 @@ -//===--- TemporaryObjectsCheck.cpp - clang-tidy----------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.h b/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.h index b2d5ab61fb0dc..5ecf9c4172d18 100644 --- a/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.h +++ b/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.h @@ -1,4 +1,4 @@ -//===--- TemporaryObjectsCheck.h - clang-tidy------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/clang-tidy/zircon/ZirconTidyModule.cpp b/clang-tools-extra/clang-tidy/zircon/ZirconTidyModule.cpp index 0eb5683a94e41..86d7ce4e04e7b 100644 --- a/clang-tools-extra/clang-tidy/zircon/ZirconTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/zircon/ZirconTidyModule.cpp @@ -1,4 +1,4 @@ -//===--- ZirconTidyModule.cpp - clang-tidy---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang-tools-extra/clangd/AST.cpp b/clang-tools-extra/clangd/AST.cpp index 2f46ecc92576c..b96a84519e78c 100644 --- a/clang-tools-extra/clangd/AST.cpp +++ b/clang-tools-extra/clangd/AST.cpp @@ -29,6 +29,7 @@ #include "clang/Basic/SourceManager.h" #include "clang/Basic/Specifiers.h" #include "clang/Index/USRGeneration.h" +#include "clang/Sema/HeuristicResolver.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -479,10 +480,12 @@ namespace { /// a deduced type set. The AST should be improved to simplify this scenario. 
class DeducedTypeVisitor : public RecursiveASTVisitor { SourceLocation SearchedLocation; + const HeuristicResolver *Resolver; public: - DeducedTypeVisitor(SourceLocation SearchedLocation) - : SearchedLocation(SearchedLocation) {} + DeducedTypeVisitor(SourceLocation SearchedLocation, + const HeuristicResolver *Resolver) + : SearchedLocation(SearchedLocation), Resolver(Resolver) {} // Handle auto initializers: //- auto i = 1; @@ -499,6 +502,14 @@ class DeducedTypeVisitor : public RecursiveASTVisitor { return true; if (auto *AT = D->getType()->getContainedAutoType()) { + if (AT->isUndeducedAutoType()) { + if (const auto *VD = dyn_cast(D)) { + if (Resolver && VD->hasInit()) { + DeducedType = Resolver->resolveExprToType(VD->getInit()); + return true; + } + } + } DeducedType = AT->desugar(); } return true; @@ -608,10 +619,12 @@ class DeducedTypeVisitor : public RecursiveASTVisitor { }; } // namespace -std::optional getDeducedType(ASTContext &ASTCtx, SourceLocation Loc) { +std::optional getDeducedType(ASTContext &ASTCtx, + const HeuristicResolver *Resolver, + SourceLocation Loc) { if (!Loc.isValid()) return {}; - DeducedTypeVisitor V(Loc); + DeducedTypeVisitor V(Loc, Resolver); V.TraverseAST(ASTCtx); if (V.DeducedType.isNull()) return std::nullopt; diff --git a/clang-tools-extra/clangd/AST.h b/clang-tools-extra/clangd/AST.h index 1538d12172593..2b83595e5b8e9 100644 --- a/clang-tools-extra/clangd/AST.h +++ b/clang-tools-extra/clangd/AST.h @@ -31,6 +31,7 @@ namespace clang { class SourceManager; class Decl; class DynTypedNode; +class HeuristicResolver; namespace clangd { @@ -167,7 +168,8 @@ QualType declaredType(const TypeDecl *D); /// Retrieves the deduced type at a given location (auto, decltype). /// It will return the underlying type. /// If the type is an undeduced auto, returns the type itself. 
-std::optional getDeducedType(ASTContext &, SourceLocation Loc); +std::optional getDeducedType(ASTContext &, const HeuristicResolver *, + SourceLocation Loc); // Find the abbreviated-function-template `auto` within a type, or returns null. // Similar to getContainedAutoTypeLoc, but these `auto`s are diff --git a/clang-tools-extra/clangd/Compiler.cpp b/clang-tools-extra/clangd/Compiler.cpp index 8b3865c8a8e5c..6ebc2eac25745 100644 --- a/clang-tools-extra/clangd/Compiler.cpp +++ b/clang-tools-extra/clangd/Compiler.cpp @@ -147,13 +147,9 @@ prepareCompilerInstance(std::unique_ptr CI, } auto Clang = std::make_unique(std::move(CI)); - Clang->createDiagnostics(*VFS, &DiagsClient, false); - - if (auto VFSWithRemapping = createVFSFromCompilerInvocation( - Clang->getInvocation(), Clang->getDiagnostics(), VFS)) - VFS = VFSWithRemapping; - Clang->createFileManager(VFS); - + Clang->createVirtualFileSystem(VFS, &DiagsClient); + Clang->createDiagnostics(&DiagsClient, false); + Clang->createFileManager(); if (!Clang->createTarget()) return nullptr; diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index 32018d1bf3a84..8aae41420b83e 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -406,15 +406,6 @@ struct TargetFinder { } } } - void VisitDependentTemplateSpecializationType( - const DependentTemplateSpecializationType *DTST) { - if (Outer.Resolver) { - for (const NamedDecl *ND : - Outer.Resolver->resolveTemplateSpecializationType(DTST)) { - Outer.add(ND, Flags); - } - } - } void VisitTypedefType(const TypedefType *TT) { if (shouldSkipTypedef(TT->getDecl())) return; @@ -455,11 +446,13 @@ struct TargetFinder { // class template specializations have a (specialized) CXXRecordDecl. else if (const CXXRecordDecl *RD = TST->getAsCXXRecordDecl()) Outer.add(RD, Flags); // add(Decl) will despecialize if needed. 
- else { + else if (auto *TD = TST->getTemplateName().getAsTemplateDecl()) // fallback: the (un-specialized) declaration from primary template. - if (auto *TD = TST->getTemplateName().getAsTemplateDecl()) - Outer.add(TD->getTemplatedDecl(), Flags | Rel::TemplatePattern); - } + Outer.add(TD->getTemplatedDecl(), Flags | Rel::TemplatePattern); + else if (Outer.Resolver) + for (const NamedDecl *ND : + Outer.Resolver->resolveTemplateSpecializationType(TST)) + Outer.add(ND, Flags); } void VisitSubstTemplateTypeParmType(const SubstTemplateTypeParmType *STTPT) { @@ -900,15 +893,6 @@ refInTypeLoc(TypeLoc L, const HeuristicResolver *Resolver) { DeclRelation::Alias, Resolver)}); } - void VisitDependentTemplateSpecializationTypeLoc( - DependentTemplateSpecializationTypeLoc L) { - Refs.push_back( - ReferenceLoc{L.getQualifierLoc(), L.getTemplateNameLoc(), - /*IsDecl=*/false, - explicitReferenceTargets( - DynTypedNode::create(L.getType()), {}, Resolver)}); - } - void VisitDependentNameTypeLoc(DependentNameTypeLoc L) { Refs.push_back( ReferenceLoc{L.getQualifierLoc(), L.getNameLoc(), diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index 9eec322fe5963..138544dea99a1 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -1309,7 +1309,9 @@ std::optional getHover(ParsedAST &AST, Position Pos, } } else if (Tok.kind() == tok::kw_auto || Tok.kind() == tok::kw_decltype) { HoverCountMetric.record(1, "keyword"); - if (auto Deduced = getDeducedType(AST.getASTContext(), Tok.location())) { + if (auto Deduced = + getDeducedType(AST.getASTContext(), AST.getHeuristicResolver(), + Tok.location())) { HI = getDeducedTypeHoverContents(*Deduced, Tok, AST.getASTContext(), PP, Index); HighlightRange = Tok.range(SM).toCharRange(SM); diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index cd479e1b7c9bc..d56b93e5f36dc 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ 
b/clang-tools-extra/clangd/InlayHints.cpp @@ -633,13 +633,30 @@ class InlayHintVisitor : public RecursiveASTVisitor { } if (auto *AT = D->getType()->getContainedAutoType()) { - if (AT->isDeduced() && !D->getType()->isDependentType()) { - // Our current approach is to place the hint on the variable - // and accordingly print the full type - // (e.g. for `const auto& x = 42`, print `const int&`). - // Alternatively, we could place the hint on the `auto` - // (and then just print the type deduced for the `auto`). - addTypeHint(D->getLocation(), D->getType(), /*Prefix=*/": "); + if (AT->isDeduced()) { + QualType T; + // If the type is dependent, HeuristicResolver *may* be able to + // resolve it to something that's useful to print. In other + // cases, it can't, and the resultng type would just be printed + // as "", in which case don't hint it at all. + if (D->getType()->isDependentType()) { + if (D->hasInit()) { + QualType Resolved = Resolver->resolveExprToType(D->getInit()); + if (Resolved != AST.DependentTy) { + T = Resolved; + } + } + } else { + T = D->getType(); + } + if (!T.isNull()) { + // Our current approach is to place the hint on the variable + // and accordingly print the full type + // (e.g. for `const auto& x = 42`, print `const int&`). + // Alternatively, we could place the hint on the `auto` + // (and then just print the type deduced for the `auto`). 
+ addTypeHint(D->getLocation(), T, /*Prefix=*/": "); + } } } diff --git a/clang-tools-extra/clangd/SemanticHighlighting.cpp b/clang-tools-extra/clangd/SemanticHighlighting.cpp index 2b151b1274428..ab720ebe6b47f 100644 --- a/clang-tools-extra/clangd/SemanticHighlighting.cpp +++ b/clang-tools-extra/clangd/SemanticHighlighting.cpp @@ -728,11 +728,6 @@ class CollectExtraHighlightings return true; } - bool VisitTemplateSpecializationTypeLoc(TemplateSpecializationTypeLoc L) { - H.addAngleBracketTokens(L.getLAngleLoc(), L.getRAngleLoc()); - return true; - } - bool VisitFunctionDecl(FunctionDecl *D) { if (D->isOverloadedOperator()) { const auto AddOpDeclToken = [&](SourceLocation Loc) { @@ -1087,11 +1082,12 @@ class CollectExtraHighlightings return true; } - bool VisitDependentTemplateSpecializationTypeLoc( - DependentTemplateSpecializationTypeLoc L) { - H.addToken(L.getTemplateNameLoc(), HighlightingKind::Type) - .addModifier(HighlightingModifier::DependentName) - .addModifier(HighlightingModifier::ClassScope); + bool VisitTemplateSpecializationTypeLoc(TemplateSpecializationTypeLoc L) { + if (!L.getTypePtr()->getTemplateName().getAsTemplateDecl( + /*IgnoreDeduced=*/true)) + H.addToken(L.getTemplateNameLoc(), HighlightingKind::Type) + .addModifier(HighlightingModifier::DependentName) + .addModifier(HighlightingModifier::ClassScope); H.addAngleBracketTokens(L.getLAngleLoc(), L.getRAngleLoc()); return true; } diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index e1c50f906de08..05e04ac161e54 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -806,7 +806,9 @@ std::vector locateSymbolAt(ParsedAST &AST, Position Pos, if (Tok.kind() == tok::kw_auto || Tok.kind() == tok::kw_decltype) { // go-to-definition on auto should find the definition of the deduced // type, if possible - if (auto Deduced = getDeducedType(AST.getASTContext(), Tok.location())) { + if (auto Deduced = + 
getDeducedType(AST.getASTContext(), AST.getHeuristicResolver(), + Tok.location())) { auto LocSym = locateSymbolForType(AST, *Deduced, Index); if (!LocSym.empty()) return LocSym; @@ -1965,7 +1967,7 @@ std::vector findRecordTypeAt(ParsedAST &AST, // Return the type most associated with an AST node. // This isn't precisely defined: we want "go to type" to do something useful. -static QualType typeForNode(const ASTContext &Ctx, +static QualType typeForNode(const ASTContext &Ctx, const HeuristicResolver *H, const SelectionTree::Node *N) { // If we're looking at a namespace qualifier, walk up to what it's qualifying. // (If we're pointing at a *class* inside a NNS, N will be a TypeLoc). @@ -1978,7 +1980,7 @@ static QualType typeForNode(const ASTContext &Ctx, if (const TypeLoc *TL = N->ASTNode.get()) { if (llvm::isa(TL->getTypePtr())) if (auto Deduced = getDeducedType( - N->getDeclContext().getParentASTContext(), TL->getBeginLoc())) + N->getDeclContext().getParentASTContext(), H, TL->getBeginLoc())) return *Deduced; // Exception: an alias => underlying type. if (llvm::isa(TL->getTypePtr())) @@ -2161,7 +2163,8 @@ std::vector findType(ParsedAST &AST, Position Pos, // information about the type you may have not known before // (since unique_ptr> != unique_ptr). 
for (const QualType &Type : unwrapFindType( - typeForNode(AST.getASTContext(), N), AST.getHeuristicResolver())) + typeForNode(AST.getASTContext(), AST.getHeuristicResolver(), N), + AST.getHeuristicResolver())) llvm::copy(locateSymbolForType(AST, Type, Index), std::back_inserter(LocatedSymbols)); diff --git a/clang-tools-extra/clangd/refactor/tweaks/ExpandDeducedType.cpp b/clang-tools-extra/clangd/refactor/tweaks/ExpandDeducedType.cpp index fec5f5797cb62..52afda56a5028 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/ExpandDeducedType.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/ExpandDeducedType.cpp @@ -133,7 +133,8 @@ Expected ExpandDeducedType::apply(const Selection &Inputs) { auto &SrcMgr = Inputs.AST->getSourceManager(); std::optional DeducedType = - getDeducedType(Inputs.AST->getASTContext(), Range.getBegin()); + getDeducedType(Inputs.AST->getASTContext(), + Inputs.AST->getHeuristicResolver(), Range.getBegin()); // if we can't resolve the type, return an error message if (DeducedType == std::nullopt || (*DeducedType)->isUndeducedAutoType()) diff --git a/clang-tools-extra/clangd/unittests/ASTTests.cpp b/clang-tools-extra/clangd/unittests/ASTTests.cpp index 76d46bad82224..91ae727d8c944 100644 --- a/clang-tools-extra/clangd/unittests/ASTTests.cpp +++ b/clang-tools-extra/clangd/unittests/ASTTests.cpp @@ -244,7 +244,8 @@ TEST(GetDeducedType, KwAutoKwDecltypeExpansion) { for (Position Pos : File.points()) { auto Location = sourceLocationInMainFile(SM.get(), Pos); ASSERT_TRUE(!!Location) << llvm::toString(Location.takeError()); - auto DeducedType = getDeducedType(AST.getASTContext(), *Location); + auto DeducedType = getDeducedType(AST.getASTContext(), + AST.getHeuristicResolver(), *Location); if (T.DeducedType == nullptr) { EXPECT_FALSE(DeducedType); } else { diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index f369e1b0341e8..dd26182630ae1 100644 --- 
a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -1029,8 +1029,7 @@ TEST_F(TargetDeclTest, DependentTypes) { template void foo(typename A::template [[B]]); )cpp"; - EXPECT_DECLS("DependentTemplateSpecializationTypeLoc", - "template struct B"); + EXPECT_DECLS("TemplateSpecializationTypeLoc", "template struct B"); // Dependent name with recursive definition. We don't expect a // result, but we shouldn't get into a stack overflow either. diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 743c0dc0d0187..e9abf71e6d1b6 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -481,7 +481,7 @@ class Foo final {})cpp"; [](HoverInfo &HI) { HI.Name = "auto"; HI.Kind = index::SymbolKind::TypeAlias; - HI.Definition = "/* not deduced */"; + HI.Definition = "T"; }}, // constrained auto {R"cpp( @@ -2657,7 +2657,7 @@ TEST(Hover, All) { [](HoverInfo &HI) { HI.Name = "auto"; HI.Kind = index::SymbolKind::TypeAlias; - HI.Definition = "/* not deduced */"; + HI.Definition = "T"; }}, { R"cpp(// Undeduced auto return type diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp index 99e728c40063d..feb4404b3d2bf 100644 --- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp +++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp @@ -1441,7 +1441,8 @@ TEST(TypeHints, DependentType) { void bar(T arg) { auto [a, b] = arg; } - )cpp"); + )cpp", + ExpectedHint{": T", "var2"}); } TEST(TypeHints, LongTypeName) { diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp index 17204a47ba3bc..7ed08d7cce3d3 100644 --- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp +++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp @@ -924,11 +924,19 @@ TEST(LocateSymbol, All) 
{ } )cpp", + R"cpp(// auto with dependent type + template + struct [[A]] {}; + template + void foo(A a) { + ^auto copy = a; + } + )cpp", + R"cpp(// Override specifier jumps to overridden method class Y { virtual void $decl[[a]]() = 0; }; class X : Y { void a() ^override {} }; )cpp", - R"cpp(// Final specifier jumps to overridden method class Y { virtual void $decl[[a]]() = 0; }; class X : Y { void a() ^final {} }; diff --git a/clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp index 8da394d74b54d..3a53c11839c20 100644 --- a/clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp +++ b/clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp @@ -47,7 +47,7 @@ TEST_F(ExpandDeducedTypeTest, Test) { "namespace ns { void f() { Class C = Class(); } }"); // undefined functions should not be replaced EXPECT_THAT(apply("au^to x = doesnt_exist(); // error-ok"), - StartsWith("fail: Could not deduce type for 'auto' type")); + StartsWith("fail: Could not expand a dependent type")); // function pointers should not be replaced EXPECT_THAT(apply("au^to x = &ns::Func;"), StartsWith("fail: Could not expand type")); @@ -91,7 +91,7 @@ TEST_F(ExpandDeducedTypeTest, Test) { // unknown types in a template should not be replaced EXPECT_THAT(apply("template void x() { ^auto y = T::z(); }"), - StartsWith("fail: Could not deduce type for 'auto' type")); + StartsWith("fail: Could not expand a dependent type")); // check primitive type EXPECT_EQ(apply("decl^type(0) i;"), "int i;"); diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 23d757b5e6f2e..3f403c42a168a 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -46,6 +46,21 @@ infrastructure are described first, followed by tool-specific sections. 
Major New Features ------------------ +Potentially Breaking Changes +---------------------------- + +- Removed :program:`clang-tidy`'s global options `IgnoreMacros` and + `StrictMode`, which were documented as deprecated since + :program:`clang-tidy-20`. Users should use the check-specific options of the + same name instead. + +- Renamed :program:`clang-tidy`'s option name of check + :doc:`bugprone-easily-swappable-parameters + ` from + ``NamePrefixSuffixSilenceDissimilarityTreshold`` to + ``NamePrefixSuffixSilenceDissimilarityThreshold``, + correcting a spelling mistake. + Improvements to clangd ---------------------- @@ -135,7 +150,7 @@ Improvements to clang-tidy :program:`clang-tidy-20`. Users should use the check-specific options of the same name instead. -- Improved :program:`run-clang-tidy.py` and :program:`clang-tidy-diff.py` +- Improved :program:`run-clang-tidy.py` and :program:`clang-tidy-diff.py` scripts by adding the `-hide-progress` option to suppress progress and informational messages. @@ -148,6 +163,11 @@ New checks Detects default initialization (to 0) of variables with ``enum`` type where the enum has no enumerator with value of 0. +- New :doc:`bugprone-derived-method-shadowing-base-method + ` check. + + Finds derived class methods that shadow a (non-virtual) base class method. + - New :doc:`cppcoreguidelines-pro-bounds-avoid-unchecked-container-access ` check. @@ -190,6 +210,11 @@ New check aliases Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Improved :doc:`bugprone-easily-swappable-parameters + ` check by + correcting a spelling mistake on its option + ``NamePrefixSuffixSilenceDissimilarityTreshold``. + - Improved :doc:`bugprone-infinite-loop ` check by adding detection for variables introduced by structured bindings. @@ -213,8 +238,8 @@ Changes in existing checks tagged union respectively. 
- Improved :doc:`bugprone-unchecked-optional-access - ` check by supporting - ``NullableValue::makeValue`` and ``NullableValue::makeValueInplace`` to + ` check by supporting + ``NullableValue::makeValue`` and ``NullableValue::makeValueInplace`` to prevent false-positives for ``BloombergLP::bdlb::NullableValue`` type. - Improved :doc:`bugprone-unhandled-self-assignment diff --git a/clang-tools-extra/docs/ReleaseNotesTemplate.txt b/clang-tools-extra/docs/ReleaseNotesTemplate.txt index b17799b3b557d..69c3bcf67b8db 100644 --- a/clang-tools-extra/docs/ReleaseNotesTemplate.txt +++ b/clang-tools-extra/docs/ReleaseNotesTemplate.txt @@ -46,6 +46,9 @@ infrastructure are described first, followed by tool-specific sections. Major New Features ------------------ +Potentially Breaking Changes +---------------------------- + Improvements to clangd ---------------------- diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst new file mode 100644 index 0000000000000..f544abc14ffbf --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst @@ -0,0 +1,30 @@ +.. title:: clang-tidy - bugprone-derived-method-shadowing-base-method + +bugprone-derived-method-shadowing-base-method +============================================= + +Finds derived class methods that shadow a (non-virtual) base class method. + +In order to be considered "shadowing", methods must have the same signature +(i.e. the same name, same number of parameters, same parameter types, etc). +Only checks public, non-templated methods. + +The below example is bugprone because consumers of the ``Derived`` class will +expect the ``reset`` method to do the work of ``Base::reset()`` in addition to extra +work required to reset the ``Derived`` class. 
Common fixes include: + +- Making the ``reset`` method polymorphic +- Re-naming ``Derived::reset`` if it's not meant to intersect with ``Base::reset`` +- Using ``using Base::reset`` to change the access specifier + +This is also a violation of the Liskov Substitution Principle. + +.. code-block:: c++ + + struct Base { + void reset() {/* reset the base class */}; + }; + + struct Derived : public Base { + void reset() {/* reset the derived class, but not the base class */}; + }; \ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/easily-swappable-parameters.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/easily-swappable-parameters.rst index 47970bfbbc400..a96d7f6015bda 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/easily-swappable-parameters.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/easily-swappable-parameters.rst @@ -169,7 +169,7 @@ noisiness. * Separate ``return`` statements return either of the parameters on different code paths. -.. option:: NamePrefixSuffixSilenceDissimilarityTreshold +.. 
option:: NamePrefixSuffixSilenceDissimilarityThreshold The number of characters two parameter names might be different on *either* the head or the tail end with the rest of the name the same so that the diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index c490d2ece2e0a..e06849c419389 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -91,6 +91,7 @@ Clang-Tidy Checks :doc:`bugprone-copy-constructor-init `, "Yes" :doc:`bugprone-crtp-constructor-accessibility `, "Yes" :doc:`bugprone-dangling-handle `, + :doc:`bugprone-derived-method-shadowing-base-method `, :doc:`bugprone-dynamic-static-initializers `, :doc:`bugprone-easily-swappable-parameters `, :doc:`bugprone-empty-catch `, diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/make-shared.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/make-shared.rst index 982138fc5e781..cd953e7ee394d 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/make-shared.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/make-shared.rst @@ -37,7 +37,7 @@ Options .. option:: MakeSmartPtrFunctionHeader A string specifying the corresponding header of make-shared-ptr function. - Default is `memory`. + Default is ``. .. option:: IncludeStyle diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index 0cbf9a080a3ce..7bbdc8ba00dca 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -321,6 +321,8 @@ class ASTWalker : public RecursiveASTVisitor { // TypeLoc visitors. void reportType(SourceLocation RefLoc, NamedDecl *ND) { + if (!ND) + return; // Reporting explicit references to types nested inside classes can cause // issues, e.g. a type accessed through a derived class shouldn't require // inclusion of the base. 
diff --git a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp index c88848ed35580..3fb49796039f2 100644 --- a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp @@ -646,9 +646,10 @@ TEST_F(PragmaIncludeTest, ExportInUnnamedBuffer) { *Diags, "clang")); auto Clang = std::make_unique(std::move(Invocation)); - Clang->createDiagnostics(*VFS); + Clang->createVirtualFileSystem(VFS); + Clang->createDiagnostics(); - auto *FM = Clang->createFileManager(VFS); + auto *FM = Clang->createFileManager(); ASSERT_TRUE(Clang->ExecuteAction(*Inputs.MakeAction())); EXPECT_THAT( PI.getExporters(llvm::cantFail(FM->getFileRef("foo.h")), *FM), diff --git a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py index 3eaba0e9dff3a..26f8cbaeb9f31 100755 --- a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py +++ b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# ===- check_clang_tidy.py - ClangTidy Test Helper ------------*- python -*--===# +# ===-----------------------------------------------------------------------===# # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/derived-method-shadowing-base-method.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/derived-method-shadowing-base-method.cpp new file mode 100644 index 0000000000000..c22598d84d1b2 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/derived-method-shadowing-base-method.cpp @@ -0,0 +1,139 @@ +// RUN: %check_clang_tidy %s bugprone-derived-method-shadowing-base-method %t + +class Base +{ + void method(); + void methodWithArg(int I); + + virtual Base* getThis() = 0; +}; + +class A : public Base +{ +public: + void method(); +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'A::method' shadows method with the same name in class 'Base' [bugprone-derived-method-shadowing-base-method] +// CHECK-MESSAGES: :5:5: note: previous definition of 'method' is here +}; + +// only declaration should be checked +void A::method() +{ +} + +class B +{ +public: + void method(); +}; + +class D: public Base +{ + +}; + +// test indirect inheritance +class E : public D +{ +public: + void method(); +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'E::method' shadows method with the same name in class 'Base' [bugprone-derived-method-shadowing-base-method] +}; + +class H : public Base +{ +public: + Base* getThis() override; + Base const* getThis() const; +}; + +class I : public Base +{ +public: + // test with inline implementation + void method() +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'I::method' shadows method with the same name in class 'Base' [bugprone-derived-method-shadowing-base-method] + { + + } +}; + +class J : public Base +{ +public: + Base* getThis() final; +}; + +template +class TemplateBase +{ +public: + virtual void size() const = 0; +}; + +template +class K : public TemplateBase +{ +public: + void size() const final; +}; + +class L : public Base +{ +public: +// not same signature (take const ref) but still ambiguous + void methodWithArg(int const& I); +// CHECK-MESSAGES: :[[@LINE-1]]:5: 
warning: 'L::methodWithArg' shadows method with the same name in class 'Base' [bugprone-derived-method-shadowing-base-method] + + void methodWithArg(int const I); +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'L::methodWithArg' shadows method with the same name in class 'Base' [bugprone-derived-method-shadowing-base-method] + + void methodWithArg(int *I); + void methodWithArg(int const* I); +}; + +class M : public Base +{ +public: + static void method(); +}; + +class N : public Base +{ +public: + template + void methodWithArg(T I); + // TODO: Templates are not handled yet + template<> void methodWithArg(int I); +}; + +namespace std{ + struct thread{ + void join(); + }; +} + +struct O: public std::thread{ + void join(); +}; + +struct P: public std::thread, Base{ + void join(); + void method(); + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'P::method' shadows method with the same name in class 'Base' [bugprone-derived-method-shadowing-base-method] +}; + +class Q : public Base +{ +public: + typedef int MyInt; +// not same signature (take const ref) but still ambiguous + void methodWithArg(MyInt const& I); +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'Q::methodWithArg' shadows method with the same name in class 'Base' [bugprone-derived-method-shadowing-base-method] + + void methodWithArg(MyInt const I); +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'Q::methodWithArg' shadows method with the same name in class 'Base' [bugprone-derived-method-shadowing-base-method] + + void methodWithArg(MyInt *I); + void methodWithArg(MyInt const* I); +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-ignore.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-ignore.cpp index 27104b93da0ac..be4db2e95ffca 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-ignore.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-ignore.cpp @@ -6,7 
+6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- void ignoredUnnamed(int I, int, int) {} // NO-WARN: No >= 2 length of non-unnamed. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicit-qualifiers.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicit-qualifiers.cpp index dc89dc68f4538..cc8332635e107 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicit-qualifiers.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicit-qualifiers.cpp @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 1, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 1, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- void numericAndQualifierConversion(int I, const double CD) { numericAndQualifierConversion(CD, I); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicits.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicits.c index b9efc99c477b2..0779738004076 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicits.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicits.c @@ -1,4 
+1,4 @@ -// RUN: %check_clang_tidy %s bugprone-easily-swappable-parameters %t \ +// RUN: %check_clang_tidy %s --extra-arg=-Wno-error=incompatible-pointer-types bugprone-easily-swappable-parameters %t \ // RUN: -config='{CheckOptions: { \ // RUN: bugprone-easily-swappable-parameters.MinimumLength: 2, \ // RUN: bugprone-easily-swappable-parameters.IgnoredParameterNames: "", \ @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 1, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- void implicitDoesntBreakOtherStuff(int A, int B) {} diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicits.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicits.cpp index 44ba5d5d9f590..7518e3fb031f5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicits.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-implicits.cpp @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 1, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- void implicitDoesntBreakOtherStuff(int A, int B) {} diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-len2.cpp 
b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-len2.cpp index ac7cc7d9a7b7c..d933891072a54 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-len2.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-len2.cpp @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- namespace std { diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-len3.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-len3.cpp index 05900068e62f9..bf9ceb112a61f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-len3.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-len3.cpp @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- int add(int Left, int Right) { return Left + Right; } // NO-WARN: Only 2 parameters. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-prefixsuffixname.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-prefixsuffixname.cpp index 72ce54e517304..00e54d0df690e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-prefixsuffixname.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-prefixsuffixname.cpp @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 1 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 1 \ // RUN: }}' -- namespace std { diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-qualifiermixing.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-qualifiermixing.cpp index 2bfcefcc56fa0..61159bfa022fb 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-qualifiermixing.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-qualifiermixing.cpp @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 1, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- typedef int MyInt1; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-relatedness.c 
b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-relatedness.c index 45752de36a90c..0f325f0ab7ac5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-relatedness.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-relatedness.c @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 1, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- -Wno-strict-prototypes -x c // // RUN: %check_clang_tidy -std=c23-or-later %s bugprone-easily-swappable-parameters %t \ @@ -17,7 +17,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 1, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- -Wno-strict-prototypes -x c int add(int X, int Y); diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-relatedness.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-relatedness.cpp index 9ede3dc5f8b8b..9214522070c3f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-relatedness.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters-relatedness.cpp @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: 
bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 1, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- namespace std { diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters.c index be44cfc889ed0..25d27b3dba222 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/easily-swappable-parameters.c @@ -6,7 +6,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- -Wno-strict-prototypes // // RUN: %check_clang_tidy -std=c23-or-later %s bugprone-easily-swappable-parameters %t \ @@ -17,7 +17,7 @@ // RUN: bugprone-easily-swappable-parameters.QualifiersMix: 0, \ // RUN: bugprone-easily-swappable-parameters.ModelImplicitConversions: 0, \ // RUN: bugprone-easily-swappable-parameters.SuppressParametersUsedTogether: 0, \ -// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityTreshold: 0 \ +// RUN: bugprone-easily-swappable-parameters.NamePrefixSuffixSilenceDissimilarityThreshold: 0 \ // RUN: }}' -- -Wno-strict-prototypes #define bool _Bool diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/return-braced-init-list.cpp 
b/clang-tools-extra/test/clang-tidy/checkers/modernize/return-braced-init-list.cpp index 02e95e15499dc..ae33d25d49152 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/return-braced-init-list.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/return-braced-init-list.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++14-or-later %s modernize-return-braced-init-list %t +// RUN: %check_clang_tidy -std=c++11-or-later %s modernize-return-braced-init-list %t namespace std { typedef decltype(sizeof(int)) size_t; @@ -80,10 +80,12 @@ Foo f2() { return {b2}; } +#if __cplusplus >= 201402L auto f3() { Bar b3; return Foo(b3); } +#endif #define A(b) Foo(b) diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 3398823836e62..071bd76f95906 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -3150,7 +3150,7 @@ def string(self) -> CompletionString: class CCRStructure(Structure): - _fields_ = [("results", POINTER(CodeCompletionResult)), ("numResults", c_int)] + _fields_ = [("results", POINTER(CodeCompletionResult)), ("numResults", c_uint)] results: NoSliceSequence[CodeCompletionResult] numResults: int diff --git a/clang/cmake/caches/PGO.cmake b/clang/cmake/caches/PGO.cmake index d6471160037c1..15bc755d110d1 100644 --- a/clang/cmake/caches/PGO.cmake +++ b/clang/cmake/caches/PGO.cmake @@ -5,7 +5,7 @@ set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") -set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED IR CACHE BOOL "") +set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED ON CACHE BOOL "") set(CLANG_BOOTSTRAP_TARGETS generate-profdata stage2 diff --git a/clang/cmake/modules/ClangConfig.cmake.in b/clang/cmake/modules/ClangConfig.cmake.in index 5f67681649c66..68f723d050117 100644 --- a/clang/cmake/modules/ClangConfig.cmake.in +++ 
b/clang/cmake/modules/ClangConfig.cmake.in @@ -10,6 +10,7 @@ set(CLANG_EXPORTED_TARGETS "@CLANG_EXPORTS@") set(CLANG_CMAKE_DIR "@CLANG_CONFIG_CMAKE_DIR@") set(CLANG_INCLUDE_DIRS "@CLANG_CONFIG_INCLUDE_DIRS@") set(CLANG_LINK_CLANG_DYLIB "@CLANG_LINK_CLANG_DYLIB@") +set(CLANG_DEFAULT_LINKER "@CLANG_DEFAULT_LINKER@") # Provide all our library targets to users. @CLANG_CONFIG_INCLUDE_EXPORTS@ diff --git a/clang/docs/APINotes.rst b/clang/docs/APINotes.rst index dec4b186ff72f..e142cfa62e5a2 100644 --- a/clang/docs/APINotes.rst +++ b/clang/docs/APINotes.rst @@ -229,6 +229,20 @@ declaration kind), all of which are optional: - Name: vector SwiftConformsTo: Cxx.CxxSequence +:SwiftSafety: + + Import a declaration as ``@safe`` or ``@unsafe`` to Swift. + + :: + + Tags: + - Name: UnsafeType + SwiftSafety: unsafe + - Name: span + Methods: + - Name: size + SwiftSafety: safe + :Availability, AvailabilityMsg: A value of "nonswift" is equivalent to ``NS_SWIFT_UNAVAILABLE``. A value of diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 3ac9e3795cae7..9413b9a348b76 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -4426,6 +4426,21 @@ the configuration (without a prefix: ``Auto``). #endif #endif + * ``PPDIS_Leave`` (in configuration: ``Leave``) + Leaves indentation of directives as-is. + + .. note:: + + Ignores ``PPIndentWidth``. + + .. code-block:: c++ + + #if FOO + #if BAR + #include + #endif + #endif + .. _IndentRequiresClause: @@ -5079,6 +5094,113 @@ the configuration (without a prefix: ``Auto``). For example: TESTSUITE +.. _NumericLiteralCase: + +**NumericLiteralCase** (``NumericLiteralCaseStyle``) :versionbadge:`clang-format 22` :ref:`¶ ` + Capitalization style for numeric literals. + + Nested configuration flags: + + Separate control for each numeric literal component. 
+ + For example, the config below will leave exponent letters alone, reformat + hexadecimal digits in lowercase, reformat numeric literal prefixes in + uppercase, and reformat suffixes in lowercase. + + .. code-block:: c++ + + NumericLiteralCase: + ExponentLetter: Leave + HexDigit: Lower + Prefix: Upper + Suffix: Lower + + * ``NumericLiteralComponentStyle ExponentLetter`` + Format floating point exponent separator letter case. + + .. code-block:: c++ + + float a = 6.02e23 + 1.0E10; // Leave + float a = 6.02E23 + 1.0E10; // Upper + float a = 6.02e23 + 1.0e10; // Lower + + Possible values: + + * ``NLCS_Leave`` (in configuration: ``Leave``) + Leave this component of the literal as is. + + * ``NLCS_Upper`` (in configuration: ``Upper``) + Format this component with uppercase characters. + + * ``NLCS_Lower`` (in configuration: ``Lower``) + Format this component with lowercase characters. + + + * ``NumericLiteralComponentStyle HexDigit`` + Format hexadecimal digit case. + + .. code-block:: c++ + + a = 0xaBcDeF; // Leave + a = 0xABCDEF; // Upper + a = 0xabcdef; // Lower + + Possible values: + + * ``NLCS_Leave`` (in configuration: ``Leave``) + Leave this component of the literal as is. + + * ``NLCS_Upper`` (in configuration: ``Upper``) + Format this component with uppercase characters. + + * ``NLCS_Lower`` (in configuration: ``Lower``) + Format this component with lowercase characters. + + + * ``NumericLiteralComponentStyle Prefix`` + Format integer prefix case. + + .. code-block:: c++ + + a = 0XF0 | 0b1; // Leave + a = 0XF0 | 0B1; // Upper + a = 0xF0 | 0b1; // Lower + + Possible values: + + * ``NLCS_Leave`` (in configuration: ``Leave``) + Leave this component of the literal as is. + + * ``NLCS_Upper`` (in configuration: ``Upper``) + Format this component with uppercase characters. + + * ``NLCS_Lower`` (in configuration: ``Lower``) + Format this component with lowercase characters. + + + * ``NumericLiteralComponentStyle Suffix`` + Format suffix case. 
This option excludes case-sensitive reserved + suffixes, such as ``min`` in C++. + + .. code-block:: c++ + + a = 1uLL; // Leave + a = 1ULL; // Upper + a = 1ull; // Lower + + Possible values: + + * ``NLCS_Leave`` (in configuration: ``Leave``) + Leave this component of the literal as is. + + * ``NLCS_Upper`` (in configuration: ``Upper``) + Format this component with uppercase characters. + + * ``NLCS_Lower`` (in configuration: ``Lower``) + Format this component with lowercase characters. + + + .. _ObjCBinPackProtocolList: **ObjCBinPackProtocolList** (``BinPackStyle``) :versionbadge:`clang-format 7` :ref:`¶ ` diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index cb8ea5e511101..8a627d1f0b92d 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -348,17 +348,105 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +.. _OpenMP 5.2 implementation details: + +OpenMP 5.2 Implementation Details +================================= + +The following table provides a quick overview of various OpenMP 5.2 features +and their implementation status. Please post on the +`Discourse forums (Runtimes - OpenMP category)`_ for more +information or if you want to help with the +implementation. 
+ + ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +|Feature | C/C++ Status | Fortran Status | Reviews | ++=============================================================+===========================+===========================+==========================================================================+ +| omp_in_explicit_task() | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| semantics of explicit_task_var and implicit_task_var | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ompx sentinel for C/C++ directive extensions | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ompx prefix for clause extensions | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| if clause on teams construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| step modifier added | :none:`unclaimed` | :none:`unclaimed` | | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| declare mapper: Add iterator modifier on map clause | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| declare mapper: Add iterator modifier on map clause | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| memspace and traits modifiers to uses allocator i | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Add otherwise clause to metadirectives | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| doacross clause with support for omp_cur_iteration | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| position of interop_type in init clause on iterop | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| implicit map type for target enter/exit data | 
:none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| work OMPT type for work-sharing loop constructs | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| allocate and firstprivate on scope directive | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Change loop consistency for order clause | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Add memspace and traits modifiers to uses_allocators | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Keep original base pointer on map w/o matched candidate | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Pure procedure support for certain directives | :none:`N/A` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ALLOCATE 
statement support for allocators | :none:`N/A` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| dispatch construct extension to support end directive | :none:`N/A` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ + + ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +|OpenMP 5.2 Deprecations | C/C++ Status | Fortran Status | Reviews | ++=============================================================+===========================+===========================+==========================================================================+ +| Linear clause syntax | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The minus operator | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Map clause modifiers without commas | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The use of allocate directives with ALLOCATE statement | :good:`N/A` | :none:`unclaimed` | | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| uses_allocators list syntax | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The default clause on metadirectives | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The delimited form of the declare target directive | :none:`unclaimed` | :good:`N/A` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The use of the to clause on the declare target directive | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The syntax of the destroy clause on the depobj construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| keyword source and sink as task-dependence modifiers | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| interop types in any position on init clause of interop | 
:none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ompd prefix usage for some ICVs | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ + .. _OpenMP 6.0 implementation details: OpenMP 6.0 Implementation Details ================================= -The following table provides a quick overview over various OpenMP 6.0 features +The following table provides a quick overview of various OpenMP 6.0 features and their implementation status. Please post on the `Discourse forums (Runtimes - OpenMP category)`_ for more information or if you want to help with the implementation. + +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ |Feature | C/C++ Status | Fortran Status | Reviews | +=============================================================+===========================+===========================+==========================================================================+ @@ -491,11 +579,14 @@ implementation. 
| | | | Flang parser: https://github.com/llvm/llvm-project/pull/153807 | | | | | Flang sema: https://github.com/llvm/llvm-project/pull/154779 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| variable-category on default clause | :part:`In Progress` | :none:`unclaimed` | | +| variable-category on default clause | :good:`done` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Changes to omp_target_is_accessible | :part:`In Progress` | :part:`In Progress` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ - +| defaultmap implicit-behavior 'storage' | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/158336 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| defaultmap implicit-behavior 'private' | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/158712 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ .. _OpenMP 6.1 implementation details: diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e1e497ccdbccd..60dd2b4c0c6ef 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -52,12 +52,32 @@ Potentially Breaking Changes ``--gcc-install-dir`` command line argument. This will silence the warning. 
It can also be disabled using the ``-Wno-gcc-install-dir-libstdcxx`` command line flag. +- Scalar deleting destructor support has been aligned with MSVC when + targeting the MSVC ABI. Clang previously implemented support for + ``::delete`` by calling the complete object destructor and then the + appropriate global delete operator (as is done for the Itanium ABI). + The scalar deleting destructor is now called to destroy the object + and deallocate its storage. This is an ABI change that can result in + memory corruption when a program built for the MSVC ABI has + portions compiled with clang 21 or earlier and portions compiled + with a version of clang 22 (or MSVC). Consider a class ``X`` that + declares a virtual destructor and an ``operator delete`` member + with the destructor defined in library ``A`` and a call to ``::delete`` in + library ``B``. If library ``A`` is compiled with clang 21 and library ``B`` + is compiled with clang 22, the ``::delete`` call might dispatch to the + scalar deleting destructor emitted in library ``A`` which will erroneously + call the member ``operator delete`` instead of the expected global + delete operator. The old behavior is retained under the ``-fclang-abi-compat=21`` + flag. C/C++ Language Potentially Breaking Changes ------------------------------------------- - The ``__has_builtin`` function now only considers the currently active target when being used with target offloading. +- The ``-Wincompatible-pointer-types`` diagnostic now defaults to an error; + it can still be downgraded to a warning by passing ``-Wno-error=incompatible-pointer-types``. (#GH74605) + C++ Specific Potentially Breaking Changes ----------------------------------------- - For C++20 modules, the Reduced BMI mode will be the default option. 
This may introduce @@ -230,6 +250,9 @@ Removed Compiler Flags Attribute Changes in Clang -------------------------- +- The definition of a function declaration with ``[[clang::cfi_unchecked_callee]]`` inherits this + attribute, allowing the attribute to only be attached to the declaration. Prior, this would be + treated as an error where the definition and declaration would have differing types. Improvements to Clang's diagnostics ----------------------------------- @@ -283,6 +306,15 @@ Improvements to Clang's diagnostics pointers under ``-Wthread-safety-beta`` (still experimental), which reduces both false positives but also false negatives through more precise analysis. +- Clang now looks through parenthesis for ``-Wundefined-reinterpret-cast`` diagnostic. + +- Fixed a bug where the source location was missing when diagnosing ill-formed + placeholder constraints. + +- The two-element, unary mask variant of ``__builtin_shufflevector`` is now + properly being rejected when used at compile-time. It was not implemented + and caused assertion failures before (#GH158471). + Improvements to Clang's time-trace ---------------------------------- @@ -311,8 +343,16 @@ Bug Fixes in This Version - Builtin elementwise operators now accept vector arguments that have different qualifiers on their elements. For example, vector of 4 ``const float`` values and vector of 4 ``float`` values. (#GH155405) +- Fixed inconsistent shadow warnings for lambda capture of structured bindings. + Previously, ``[val = val]`` (regular parameter) produced no warnings with ``-Wshadow`` + while ``[a = a]`` (where ``a`` is from ``auto [a, b] = std::make_pair(1, 2)``) + incorrectly produced warnings. Both cases now consistently show no warnings with + ``-Wshadow`` and show uncaptured-local warnings with ``-Wshadow-all``. (#GH68605) - Fixed a failed assertion with a negative limit parameter value inside of ``__has_embed``. 
(#GH157842) +- Fixed an assertion when an improper use of the ``malloc`` attribute targeting + a function without arguments caused us to try to access a non-existent argument. + (#GH159080) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -355,11 +395,15 @@ Bug Fixes to C++ Support authentication enabled. (#GH152601) - Fix the check for narrowing int-to-float conversions, so that they are detected in cases where converting the float back to an integer is undefined behaviour (#GH157067). +- Stop rejecting C++11-style attributes on the first argument of constructors in older + standards. (#GH156809). - Fix a crash when applying binary or ternary operators to two same function types with different spellings, where at least one of the function parameters has an attribute which affects the function type. - Fix an assertion failure when a ``constexpr`` variable is only referenced through ``__builtin_addressof``, and related issues with builtin arguments. (#GH154034) +- Fix an assertion failure when taking the address on a non-type template parameter argument of + object type. (#GH151531) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -429,6 +473,8 @@ CUDA/HIP Language Changes CUDA Support ^^^^^^^^^^^^ +Support calling ``consteval`` functions between different targets. + AIX Support ^^^^^^^^^^^ @@ -456,7 +502,9 @@ AST Matchers following the corresponding changes in the clang AST. - Ensure ``hasBitWidth`` doesn't crash on bit widths that are dependent on template parameters. - +- Remove the ``dependentTemplateSpecializationType`` matcher, as the + corresponding AST node was removed. This matcher was never very useful, since + there was no way to match on its template name. - Add a boolean member ``IgnoreSystemHeaders`` to ``MatchFinderOptions``. This allows it to ignore nodes in system headers when traversing the AST. @@ -466,6 +514,9 @@ AST Matchers clang-format ------------ - Add ``SpaceInEmptyBraces`` option and set it to ``Always`` for WebKit style. 
+- Add ``NumericLiteralCase`` option for enforcing character case in numeric + literals. +- Add ``Leave`` suboption to ``IndentPPDirectives``. libclang -------- @@ -511,6 +562,9 @@ OpenMP Support - Allow array length to be omitted in array section subscript expression. - Fixed non-contiguous strided update in the ``omp target update`` directive with the ``from`` clause. - Properly handle array section/assumed-size array privatization in C/C++. +- Added support for ``variable-category`` modifier in ``default clause``. +- Added support for ``defaultmap`` directive implicit-behavior ``storage``. +- Added support for ``defaultmap`` directive implicit-behavior ``private``. Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h index 71625715bda19..fb2b91a3e1750 100644 --- a/clang/include/clang/APINotes/Types.h +++ b/clang/include/clang/APINotes/Types.h @@ -46,6 +46,8 @@ enum class SwiftNewTypeKind { Enum, }; +enum class SwiftSafetyKind { Unspecified, Safe, Unsafe, None }; + /// Describes API notes data for any entity. /// /// This is used as the base of all API notes. @@ -71,13 +73,19 @@ class CommonEntityInfo { LLVM_PREFERRED_TYPE(bool) unsigned SwiftPrivate : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned SwiftSafetyAudited : 1; + + LLVM_PREFERRED_TYPE(SwiftSafetyKind) + unsigned SwiftSafety : 2; + public: /// Swift name of this entity. std::string SwiftName; CommonEntityInfo() : Unavailable(0), UnavailableInSwift(0), SwiftPrivateSpecified(0), - SwiftPrivate(0) {} + SwiftPrivate(0), SwiftSafetyAudited(0), SwiftSafety(0) {} std::optional isSwiftPrivate() const { return SwiftPrivateSpecified ? std::optional(SwiftPrivate) @@ -89,6 +97,17 @@ class CommonEntityInfo { SwiftPrivate = Private.value_or(0); } + std::optional getSwiftSafety() const { + return SwiftSafetyAudited ? 
std::optional( + static_cast(SwiftSafety)) + : std::nullopt; + } + + void setSwiftSafety(SwiftSafetyKind Safety) { + SwiftSafetyAudited = 1; + SwiftSafety = static_cast(Safety); + } + friend bool operator==(const CommonEntityInfo &, const CommonEntityInfo &); CommonEntityInfo &operator|=(const CommonEntityInfo &RHS) { @@ -108,6 +127,9 @@ class CommonEntityInfo { if (!SwiftPrivateSpecified) setSwiftPrivate(RHS.isSwiftPrivate()); + if (!SwiftSafetyAudited && RHS.SwiftSafetyAudited) + setSwiftSafety(*RHS.getSwiftSafety()); + if (SwiftName.empty()) SwiftName = RHS.SwiftName; @@ -123,7 +145,9 @@ inline bool operator==(const CommonEntityInfo &LHS, LHS.Unavailable == RHS.Unavailable && LHS.UnavailableInSwift == RHS.UnavailableInSwift && LHS.SwiftPrivateSpecified == RHS.SwiftPrivateSpecified && - LHS.SwiftPrivate == RHS.SwiftPrivate && LHS.SwiftName == RHS.SwiftName; + LHS.SwiftPrivate == RHS.SwiftPrivate && + LHS.SwiftSafetyAudited == RHS.SwiftSafetyAudited && + LHS.SwiftSafety == RHS.SwiftSafety && LHS.SwiftName == RHS.SwiftName; } inline bool operator!=(const CommonEntityInfo &LHS, diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 1c17333b722f8..b8f6de69bbb98 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -241,9 +241,6 @@ class ASTContext : public RefCountedBase { mutable llvm::FoldingSet UsingTypes; mutable llvm::FoldingSet> TypedefTypes; mutable llvm::FoldingSet DependentNameTypes; - mutable llvm::DenseMap - DependentTemplateSpecializationTypes; mutable llvm::FoldingSet PackExpansionTypes; mutable llvm::FoldingSet ObjCObjectTypes; mutable llvm::FoldingSet ObjCObjectPointerTypes; @@ -1904,7 +1901,8 @@ class ASTContext : public RefCountedBase { TemplateTypeParmDecl *ParmDecl = nullptr) const; QualType getCanonicalTemplateSpecializationType( - TemplateName T, ArrayRef CanonicalArgs) const; + ElaboratedTypeKeyword Keyword, TemplateName T, + ArrayRef CanonicalArgs) const; QualType 
getTemplateSpecializationType(ElaboratedTypeKeyword Keyword, TemplateName T, @@ -1935,13 +1933,6 @@ class ASTContext : public RefCountedBase { NestedNameSpecifier NNS, const IdentifierInfo *Name) const; - QualType getDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, - ArrayRef Args) const; - QualType getDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, - ArrayRef Args, bool IsCanonical = false) const; - TemplateArgument getInjectedTemplateArg(NamedDecl *ParamDecl) const; /// Form a pack expansion type with the given pattern. diff --git a/clang/include/clang/AST/ASTMutationListener.h b/clang/include/clang/AST/ASTMutationListener.h index 2c4ec2ce67f36..ab0a539f84e42 100644 --- a/clang/include/clang/AST/ASTMutationListener.h +++ b/clang/include/clang/AST/ASTMutationListener.h @@ -86,6 +86,10 @@ class ASTMutationListener { const FunctionDecl *Delete, Expr *ThisArg) {} + /// A virtual destructor's operator global delete has been resolved. + virtual void ResolvedOperatorGlobDelete(const CXXDestructorDecl *DD, + const FunctionDecl *GlobDelete) {} + /// An implicit member got a definition. 
virtual void CompletedImplicitDefinition(const FunctionDecl *D) {} diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h index fe08d637a1e1d..ea68cc70f9131 100644 --- a/clang/include/clang/AST/ASTNodeTraverser.h +++ b/clang/include/clang/AST/ASTNodeTraverser.h @@ -533,11 +533,6 @@ class ASTNodeTraverser for (unsigned I=0, N=TL.getNumArgs(); I < N; ++I) dumpTemplateArgumentLoc(TL.getArgLoc(I)); } - void VisitDependentTemplateSpecializationTypeLoc( - DependentTemplateSpecializationTypeLoc TL) { - for (unsigned I=0, N=TL.getNumArgs(); I < N; ++I) - dumpTemplateArgumentLoc(TL.getArgLoc(I)); - } void VisitTypedefDecl(const TypedefDecl *D) { Visit(D->getUnderlyingType()); } diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 8802664031d37..898487bffec08 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -2873,6 +2873,7 @@ class CXXDestructorDecl : public CXXMethodDecl { // FIXME: Don't allocate storage for these except in the first declaration // of a virtual destructor. 
FunctionDecl *OperatorDelete = nullptr; + FunctionDecl *OperatorGlobalDelete = nullptr; Expr *OperatorDeleteThisArg = nullptr; CXXDestructorDecl(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, @@ -2898,11 +2899,16 @@ class CXXDestructorDecl : public CXXMethodDecl { static CXXDestructorDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg); + void setOperatorGlobalDelete(FunctionDecl *OD); const FunctionDecl *getOperatorDelete() const { return getCanonicalDecl()->OperatorDelete; } + const FunctionDecl *getOperatorGlobalDelete() const { + return getCanonicalDecl()->OperatorGlobalDelete; + } + Expr *getOperatorDeleteThisArg() const { return getCanonicalDecl()->OperatorDeleteThisArg; } diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 72effbc3e02fc..b2a6d4b9182b0 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -1269,6 +1269,12 @@ class OMPDefaultClause : public OMPClause { /// Start location of the kind in source code. SourceLocation KindKwLoc; + /// Variable-Category to indicate where Kind is applied + OpenMPDefaultClauseVariableCategory VC = OMPC_DEFAULT_VC_all; + + /// Start location of Variable-Category + SourceLocation VCLoc; + /// Set kind of the clauses. /// /// \param K Argument of clause. @@ -1279,6 +1285,15 @@ class OMPDefaultClause : public OMPClause { /// \param KLoc Argument location. void setDefaultKindKwLoc(SourceLocation KLoc) { KindKwLoc = KLoc; } + /// Set Variable Category used with the Kind Clause (Default Modifier) + void setDefaultVariableCategory(OpenMPDefaultClauseVariableCategory VC) { + this->VC = VC; + } + + void setDefaultVariableCategoryLocation(SourceLocation VCLoc) { + this->VCLoc = VCLoc; + } + public: /// Build 'default' clause with argument \a A ('none' or 'shared'). 
/// @@ -1288,10 +1303,11 @@ class OMPDefaultClause : public OMPClause { /// \param LParenLoc Location of '('. /// \param EndLoc Ending location of the clause. OMPDefaultClause(llvm::omp::DefaultKind A, SourceLocation ALoc, + OpenMPDefaultClauseVariableCategory VC, SourceLocation VCLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) : OMPClause(llvm::omp::OMPC_default, StartLoc, EndLoc), - LParenLoc(LParenLoc), Kind(A), KindKwLoc(ALoc) {} + LParenLoc(LParenLoc), Kind(A), KindKwLoc(ALoc), VC(VC), VCLoc(VCLoc) {} /// Build an empty clause. OMPDefaultClause() @@ -1310,6 +1326,10 @@ class OMPDefaultClause : public OMPClause { /// Returns location of clause kind. SourceLocation getDefaultKindKwLoc() const { return KindKwLoc; } + OpenMPDefaultClauseVariableCategory getDefaultVC() const { return VC; } + + SourceLocation getDefaultVCLoc() const { return VCLoc; } + child_range children() { return child_range(child_iterator(), child_iterator()); } diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 02581c8e73299..c1944487716de 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -1192,13 +1192,6 @@ DEF_TRAVERSE_TYPE(DependentNameType, { TRY_TO(TraverseNestedNameSpecifier(T->getQualifier())); }) -DEF_TRAVERSE_TYPE(DependentTemplateSpecializationType, { - const DependentTemplateStorage &S = T->getDependentTemplateName(); - if (TraverseQualifier) - TRY_TO(TraverseNestedNameSpecifier(S.getQualifier())); - TRY_TO(TraverseTemplateArguments(T->template_arguments())); -}) - DEF_TRAVERSE_TYPE(TemplateSpecializationType, { if (TraverseQualifier) { TRY_TO(TraverseTemplateName(T->getTemplateName())); @@ -1546,15 +1539,6 @@ DEF_TRAVERSE_TYPELOC(DependentNameType, { TRY_TO(TraverseNestedNameSpecifierLoc(TL.getQualifierLoc())); }) -DEF_TRAVERSE_TYPELOC(DependentTemplateSpecializationType, { - if (TraverseQualifier) - 
TRY_TO(TraverseNestedNameSpecifierLoc(TL.getQualifierLoc())); - - for (unsigned I = 0, E = TL.getNumArgs(); I != E; ++I) { - TRY_TO(TraverseTemplateArgumentLoc(TL.getArgLoc(I))); - } -}) - DEF_TRAVERSE_TYPELOC(TemplateSpecializationType, { if (TraverseQualifier) TRY_TO(TraverseNestedNameSpecifierLoc(TL.getQualifierLoc())); diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index a436676113921..d9f87f1e49b40 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -956,30 +956,46 @@ class OMPLoopBasedDirective : public OMPExecutableDirective { } }; +/// Common class of data shared between +/// OMPCanonicalLoopNestTransformationDirective and transformations over +/// canonical loop sequences. +class OMPLoopTransformationDirective { + /// Number of (top-level) generated loops. + /// This value is 1 for most transformations as they only map one loop nest + /// into another. + /// Some loop transformations (like a non-partial 'unroll') may not generate + /// a loop nest, so this would be 0. + /// Some loop transformations (like 'fuse' with looprange and 'split') may + /// generate more than one loop nest, so the value would be >= 1. + unsigned NumGeneratedTopLevelLoops = 1; + +protected: + void setNumGeneratedTopLevelLoops(unsigned N) { + NumGeneratedTopLevelLoops = N; + } + +public: + unsigned getNumGeneratedTopLevelLoops() const { + return NumGeneratedTopLevelLoops; + } +}; + /// The base class for all transformation directives of canonical loop nests. class OMPCanonicalLoopNestTransformationDirective - : public OMPLoopBasedDirective { + : public OMPLoopBasedDirective, + public OMPLoopTransformationDirective { friend class ASTStmtReader; - /// Number of loops generated by this loop transformation. 
- unsigned NumGeneratedLoops = 0; - protected: explicit OMPCanonicalLoopNestTransformationDirective( StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc, SourceLocation EndLoc, unsigned NumAssociatedLoops) : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops) {} - /// Set the number of loops generated by this loop transformation. - void setNumGeneratedLoops(unsigned Num) { NumGeneratedLoops = Num; } - public: /// Return the number of associated (consumed) loops. unsigned getNumAssociatedLoops() const { return getLoopsNumber(); } - /// Return the number of loops generated by this loop transformation. - unsigned getNumGeneratedLoops() const { return NumGeneratedLoops; } - /// Get the de-sugared statements after the loop transformation. /// /// Might be nullptr if either the directive generates no loops and is handled @@ -5560,9 +5576,7 @@ class OMPTileDirective final unsigned NumLoops) : OMPCanonicalLoopNestTransformationDirective( OMPTileDirectiveClass, llvm::omp::OMPD_tile, StartLoc, EndLoc, - NumLoops) { - setNumGeneratedLoops(2 * NumLoops); - } + NumLoops) {} void setPreInits(Stmt *PreInits) { Data->getChildren()[PreInitsOffset] = PreInits; @@ -5638,9 +5652,7 @@ class OMPStripeDirective final unsigned NumLoops) : OMPCanonicalLoopNestTransformationDirective( OMPStripeDirectiveClass, llvm::omp::OMPD_stripe, StartLoc, EndLoc, - NumLoops) { - setNumGeneratedLoops(2 * NumLoops); - } + NumLoops) {} void setPreInits(Stmt *PreInits) { Data->getChildren()[PreInitsOffset] = PreInits; @@ -5744,7 +5756,8 @@ class OMPUnrollDirective final static OMPUnrollDirective * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, ArrayRef Clauses, Stmt *AssociatedStmt, - unsigned NumGeneratedLoops, Stmt *TransformedStmt, Stmt *PreInits); + unsigned NumGeneratedTopLevelLoops, Stmt *TransformedStmt, + Stmt *PreInits); /// Build an empty '#pragma omp unroll' AST node for deserialization. 
/// @@ -5794,9 +5807,7 @@ class OMPReverseDirective final unsigned NumLoops) : OMPCanonicalLoopNestTransformationDirective( OMPReverseDirectiveClass, llvm::omp::OMPD_reverse, StartLoc, EndLoc, - NumLoops) { - setNumGeneratedLoops(NumLoops); - } + NumLoops) {} void setPreInits(Stmt *PreInits) { Data->getChildren()[PreInitsOffset] = PreInits; @@ -5867,9 +5878,7 @@ class OMPInterchangeDirective final SourceLocation EndLoc, unsigned NumLoops) : OMPCanonicalLoopNestTransformationDirective( OMPInterchangeDirectiveClass, llvm::omp::OMPD_interchange, StartLoc, - EndLoc, NumLoops) { - setNumGeneratedLoops(NumLoops); - } + EndLoc, NumLoops) {} void setPreInits(Stmt *PreInits) { Data->getChildren()[PreInitsOffset] = PreInits; diff --git a/clang/include/clang/AST/TemplateName.h b/clang/include/clang/AST/TemplateName.h index abb0669bff378..b6999a1b4e9b9 100644 --- a/clang/include/clang/AST/TemplateName.h +++ b/clang/include/clang/AST/TemplateName.h @@ -297,10 +297,10 @@ class TemplateName { /// set of function templates, returns NULL. TemplateDecl *getAsTemplateDecl(bool IgnoreDeduced = false) const; - /// Retrieves the underlying template declaration that + /// Retrieves the underlying template name that /// this template name refers to, along with the /// deduced default arguments, if any. 
- std::pair + std::pair getTemplateDeclAndDefaultArgs() const; /// Retrieve the underlying, overloaded function template diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 48575c1b19395..df106d5b12c8d 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -19,7 +19,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/DeclCXX.h" -#include "clang/AST/TypeBase.h" +#include "clang/AST/TypeBase.h" // IWYU pragma: export namespace clang { diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h index db2ab04e4471c..b02d9c7499fe5 100644 --- a/clang/include/clang/AST/TypeBase.h +++ b/clang/include/clang/AST/TypeBase.h @@ -2250,22 +2250,6 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned NumArgs; }; - class DependentTemplateSpecializationTypeBitfields { - friend class DependentTemplateSpecializationType; - - LLVM_PREFERRED_TYPE(KeywordWrapperBitfields) - unsigned : NumTypeWithKeywordBits; - - /// The number of template arguments named in this class template - /// specialization, which is expected to be able to hold at least 1024 - /// according to [implimits]. However, as this limit is somewhat easy to - /// hit with template metaprogramming we'd prefer to keep it as large - /// as possible. At the moment it has been left as a non-bitfield since - /// this type safely fits in 64 bits as an unsigned, so there is no reason - /// to introduce the performance impact of a bitfield. 
- unsigned NumArgs; - }; - class PackExpansionTypeBitfields { friend class PackExpansionType; @@ -2346,8 +2330,6 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { SubstTemplateTypeParmTypeBitfields SubstTemplateTypeParmTypeBits; SubstPackTypeBitfields SubstPackTypeBits; TemplateSpecializationTypeBitfields TemplateSpecializationTypeBits; - DependentTemplateSpecializationTypeBitfields - DependentTemplateSpecializationTypeBits; PackExpansionTypeBitfields PackExpansionTypeBits; CountAttributedTypeBitfields CountAttributedTypeBits; PresefinedSugarTypeBitfields PredefinedSugarTypeBits; @@ -7366,9 +7348,9 @@ class TemplateSpecializationType : public TypeWithKeyword, } void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Ctx); - static void Profile(llvm::FoldingSetNodeID &ID, TemplateName T, - ArrayRef Args, QualType Underlying, - const ASTContext &Context); + static void Profile(llvm::FoldingSetNodeID &ID, ElaboratedTypeKeyword Keyword, + TemplateName T, ArrayRef Args, + QualType Underlying, const ASTContext &Context); static bool classof(const Type *T) { return T->getTypeClass() == TemplateSpecialization; @@ -7459,46 +7441,6 @@ class DependentNameType : public TypeWithKeyword, public llvm::FoldingSetNode { } }; -/// Represents a template specialization type whose template cannot be -/// resolved, e.g. 
-/// A::template B -class DependentTemplateSpecializationType : public TypeWithKeyword { - friend class ASTContext; // ASTContext creates these - - DependentTemplateStorage Name; - - DependentTemplateSpecializationType(ElaboratedTypeKeyword Keyword, - const DependentTemplateStorage &Name, - ArrayRef Args, - QualType Canon); - -public: - const DependentTemplateStorage &getDependentTemplateName() const { - return Name; - } - - ArrayRef template_arguments() const { - return {reinterpret_cast(this + 1), - DependentTemplateSpecializationTypeBits.NumArgs}; - } - - bool isSugared() const { return false; } - QualType desugar() const { return QualType(this, 0); } - - void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) { - Profile(ID, Context, getKeyword(), Name, template_arguments()); - } - - static void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, - ElaboratedTypeKeyword Keyword, - const DependentTemplateStorage &Name, - ArrayRef Args); - - static bool classof(const Type *T) { - return T->getTypeClass() == DependentTemplateSpecialization; - } -}; - /// Represents a pack expansion of types. /// /// Pack expansions are part of C++11 variadic templates. A pack @@ -9150,10 +9092,7 @@ inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &PD, // Helper class template that is used by Type::getAs to ensure that one does // not try to look through a qualified type to get to an array type. -template -using TypeIsArrayType = - std::integral_constant::value || - std::is_base_of::value>; +template using TypeIsArrayType = std::is_base_of; // Member-template getAs'. 
template const T *Type::getAs() const { diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index d52e10419e97a..38e8fba569396 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -2598,134 +2598,6 @@ class DependentNameTypeLoc : public ConcreteTypeLoc { -public: - SourceLocation getElaboratedKeywordLoc() const { - return this->getLocalData()->ElaboratedKWLoc; - } - - void setElaboratedKeywordLoc(SourceLocation Loc) { - this->getLocalData()->ElaboratedKWLoc = Loc; - } - - NestedNameSpecifierLoc getQualifierLoc() const { - if (!getLocalData()->QualifierData) - return NestedNameSpecifierLoc(); - - return NestedNameSpecifierLoc( - getTypePtr()->getDependentTemplateName().getQualifier(), - getLocalData()->QualifierData); - } - - void setQualifierLoc(NestedNameSpecifierLoc QualifierLoc) { - if (!QualifierLoc) { - // Even if we have a nested-name-specifier in the dependent - // template specialization type, we won't record the nested-name-specifier - // location information when this type-source location information is - // part of a nested-name-specifier. 
- getLocalData()->QualifierData = nullptr; - return; - } - - assert(QualifierLoc.getNestedNameSpecifier() == - getTypePtr()->getDependentTemplateName().getQualifier() && - "Inconsistent nested-name-specifier pointer"); - getLocalData()->QualifierData = QualifierLoc.getOpaqueData(); - } - - SourceLocation getTemplateKeywordLoc() const { - return getLocalData()->TemplateKWLoc; - } - - void setTemplateKeywordLoc(SourceLocation Loc) { - getLocalData()->TemplateKWLoc = Loc; - } - - SourceLocation getTemplateNameLoc() const { - return this->getLocalData()->NameLoc; - } - - void setTemplateNameLoc(SourceLocation Loc) { - this->getLocalData()->NameLoc = Loc; - } - - SourceLocation getLAngleLoc() const { - return this->getLocalData()->LAngleLoc; - } - - void setLAngleLoc(SourceLocation Loc) { - this->getLocalData()->LAngleLoc = Loc; - } - - SourceLocation getRAngleLoc() const { - return this->getLocalData()->RAngleLoc; - } - - void setRAngleLoc(SourceLocation Loc) { - this->getLocalData()->RAngleLoc = Loc; - } - - unsigned getNumArgs() const { - return getTypePtr()->template_arguments().size(); - } - - void setArgLocInfo(unsigned i, TemplateArgumentLocInfo AI) { - getArgInfos()[i] = AI; - } - - TemplateArgumentLocInfo getArgLocInfo(unsigned i) const { - return getArgInfos()[i]; - } - - TemplateArgumentLoc getArgLoc(unsigned i) const { - return TemplateArgumentLoc(getTypePtr()->template_arguments()[i], - getArgLocInfo(i)); - } - - SourceRange getLocalSourceRange() const { - if (getElaboratedKeywordLoc().isValid()) - return SourceRange(getElaboratedKeywordLoc(), getRAngleLoc()); - else if (getQualifierLoc()) - return SourceRange(getQualifierLoc().getBeginLoc(), getRAngleLoc()); - else if (getTemplateKeywordLoc().isValid()) - return SourceRange(getTemplateKeywordLoc(), getRAngleLoc()); - else - return SourceRange(getTemplateNameLoc(), getRAngleLoc()); - } - - void copy(DependentTemplateSpecializationTypeLoc Loc) { - unsigned size = getFullDataSize(); - assert(size == 
Loc.getFullDataSize()); - memcpy(Data, Loc.Data, size); - } - - void initializeLocal(ASTContext &Context, SourceLocation Loc); - - unsigned getExtraLocalDataSize() const { - return getNumArgs() * sizeof(TemplateArgumentLocInfo); - } - - unsigned getExtraLocalDataAlignment() const { - return alignof(TemplateArgumentLocInfo); - } - -private: - TemplateArgumentLocInfo *getArgInfos() const { - return static_cast(getExtraLocalData()); - } -}; - struct PackExpansionTypeLocInfo { SourceLocation EllipsisLoc; }; diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 185a968217f97..b3932a67db69d 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -729,41 +729,6 @@ let Class = TemplateSpecializationType in { }]>; } -let Class = DependentTemplateSpecializationType in { - def : ReadHelper<[{ - const auto &dtn = node->getDependentTemplateName(); - auto name = dtn.getName(); - }]>; - - def : Property<"qualifier", NestedNameSpecifier> { - let Read = [{ dtn.getQualifier() }]; - } - def : Property<"identifier", Optional> { - let Read = [{ makeOptionalFromPointer(name.getIdentifier()) }]; - } - def : Property<"operatorKind", OverloadedOperatorKind> { - let Conditional = [{ !identifier }]; - let Read = [{ name.getOperator() }]; - } - def : Property<"HasTemplateKeyword", Bool> { - let Read = [{ dtn.hasTemplateKeyword() }]; - } - - def : Property<"keyword", ElaboratedTypeKeyword> { - let Read = [{ node->getKeyword() }]; - } - def : Property<"templateArguments", Array> { - let Read = [{ node->template_arguments() }]; - } - - def : Creator<[{ - DependentTemplateStorage S(qualifier, identifier ? 
IdentifierOrOverloadedOperator(*identifier) : - IdentifierOrOverloadedOperator(*operatorKind), - HasTemplateKeyword); - return ctx.getDependentTemplateSpecializationType(keyword, S, templateArguments); - }]>; -} - let Class = TemplateTypeParmType in { def : Property<"depth", UInt32> { let Read = [{ node->getDepth() }]; diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index f1d88a9523838..492863ddfc4a1 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -7712,18 +7712,6 @@ AST_MATCHER_P(DecayedType, hasDecayedType, internal::Matcher, /// \endcode extern const AstTypeMatcher dependentNameType; -/// Matches a dependent template specialization type -/// -/// Example matches A::template B -/// \code -/// template struct A; -/// template struct declToImport { -/// typename A::template B a; -/// }; -/// \endcode -extern const AstTypeMatcher - dependentTemplateSpecializationType; - /// Matches declarations whose declaration context, interpreted as a /// Decl, matches \c InnerMatcher. /// diff --git a/clang/include/clang/Analysis/PathDiagnostic.h b/clang/include/clang/Analysis/PathDiagnostic.h index 5907df022e449..197920d4cd100 100644 --- a/clang/include/clang/Analysis/PathDiagnostic.h +++ b/clang/include/clang/Analysis/PathDiagnostic.h @@ -885,6 +885,10 @@ class PathDiagnostic : public llvm::FoldingSetNode { return UniqueingDecl; } + /// Get a hash that identifies the issue. 
+ SmallString<32> getIssueHash(const SourceManager &SrcMgr, + const LangOptions &LangOpts) const; + void flattenLocations() { Loc.flatten(); for (const auto &I : pathImpl) diff --git a/clang/include/clang/Basic/ABIVersions.def b/clang/include/clang/Basic/ABIVersions.def index f6524bc3bafb9..92edcd830f031 100644 --- a/clang/include/clang/Basic/ABIVersions.def +++ b/clang/include/clang/Basic/ABIVersions.def @@ -127,6 +127,14 @@ ABI_VER_MAJOR(19) /// - Incorrectly return C++ records in AVX registers on x86_64. ABI_VER_MAJOR(20) +/// Attempt to be ABI-compatible with code generated by Clang 21.0.x. +/// This causes clang to: +/// - When targeting Windows emit scalar deleting destructors that are not +/// compatible with scalar deleting destructors emitted by MSVC for the +/// cases when the class whose destructor is being emitted defines +/// operator delete. +ABI_VER_MAJOR(21) + /// Conform to the underlying platform's C and C++ ABIs as closely as we can. ABI_VER_LATEST(Latest) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index fda16e42d2c6b..32b5aa5ac1377 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -696,6 +696,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16 //===----------------------------------------------------------------------===// // GFX1250+ only builtins. 
//===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_s_cluster_barrier, "v", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts") TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts") diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 1a8645f99e281..aac502091b57e 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -277,8 +277,8 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">; - def pslldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">; - def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">; + def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; + def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; } let Features = "sse2", @@ -312,9 +312,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; - def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">; - def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; - def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def packusdw128 : 
X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; @@ -333,6 +330,9 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] } let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">; + def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; + def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">; def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">; def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">; @@ -469,8 +469,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">; def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">; def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; - def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; - def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">; @@ -495,6 
+493,8 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in } let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; + def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">; def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">; } @@ -575,7 +575,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">; - def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">; def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; @@ -595,17 +594,15 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; - def pslldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; + def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, 
int>, _Vector<4, int>)">; def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; - def psrldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; + def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; - def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">; - def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">; def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">; def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; @@ -619,6 +616,10 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">; def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">; + def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">; + def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">; + def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">; + def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">; def 
pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; @@ -2051,8 +2052,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; def psrlw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; - def pslldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">; - def psrldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">; + def pslldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; + def psrldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; } let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index fda0da99b60c0..872f73ebf3810 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -322,6 +322,7 @@ CODEGENOPT(TimeTrace , 1, 0, Benign) ///< Set when -ftime-trace is enabl VALUE_CODEGENOPT(TimeTraceGranularity, 32, 500, Benign) ///< Minimum time granularity (in microseconds), ///< traced by time profiler CODEGENOPT(InterchangeLoops , 1, 0, Benign) ///< Run loop-interchange. +CODEGENOPT(FuseLoops , 1, 0, Benign) ///< Run loop-fusion. CODEGENOPT(UnrollLoops , 1, 0, Benign) ///< Control whether loops are unrolled. CODEGENOPT(RerollLoops , 1, 0, Benign) ///< Control whether loops are rerolled. CODEGENOPT(NoUseJumpTables , 1, 0, Benign) ///< Set when -fno-jump-tables is enabled. 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index b0e669cd3560d..409a8202d8a09 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9048,7 +9048,7 @@ def ext_typecheck_convert_incompatible_pointer : ExtWarn< "; take the address with &|" "; remove *|" "; remove &}3">, - InGroup; + InGroup, DefaultError; def err_typecheck_convert_incompatible_pointer : Error< "incompatible pointer types " "%select{%diff{assigning to $ from $|assigning to different types}0,1" @@ -11714,6 +11714,8 @@ def note_omp_default_dsa_none : Note< "explicit data sharing attribute requested here">; def note_omp_defaultmap_attr_none : Note< "explicit data sharing attribute, data mapping attribute, or is_device_ptr clause requested here">; +def err_omp_default_vc : Error< + "wrong variable category specified with modifier %0 in the default clause">; def err_omp_wrong_dsa : Error< "%0 variable cannot be %1">; def err_omp_variably_modified_type_not_supported : Error< diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index 9d6f816eea91f..202d06fa1fcaa 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -35,6 +35,9 @@ #ifndef OPENMP_DIST_SCHEDULE_KIND #define OPENMP_DIST_SCHEDULE_KIND(Name) #endif +#ifndef OPENMP_DEFAULT_VARIABLE_CATEGORY +#define OPENMP_DEFAULT_VARIABLE_CATEGORY(Name) +#endif #ifndef OPENMP_DEFAULTMAP_KIND #define OPENMP_DEFAULTMAP_KIND(Name) #endif @@ -112,6 +115,13 @@ OPENMP_SCHEDULE_MODIFIER(simd) OPENMP_DEVICE_MODIFIER(ancestor) OPENMP_DEVICE_MODIFIER(device_num) +// Variable-category attributes for 'default' clause. 
+OPENMP_DEFAULT_VARIABLE_CATEGORY(aggregate) +OPENMP_DEFAULT_VARIABLE_CATEGORY(all) +OPENMP_DEFAULT_VARIABLE_CATEGORY(allocatable) +OPENMP_DEFAULT_VARIABLE_CATEGORY(pointer) +OPENMP_DEFAULT_VARIABLE_CATEGORY(scalar) + // Static attributes for 'defaultmap' clause. OPENMP_DEFAULTMAP_KIND(scalar) OPENMP_DEFAULTMAP_KIND(aggregate) @@ -127,6 +137,8 @@ OPENMP_DEFAULTMAP_MODIFIER(firstprivate) OPENMP_DEFAULTMAP_MODIFIER(none) OPENMP_DEFAULTMAP_MODIFIER(default) OPENMP_DEFAULTMAP_MODIFIER(present) +OPENMP_DEFAULTMAP_MODIFIER(storage) +OPENMP_DEFAULTMAP_MODIFIER(private) // Static attributes for 'depend' clause. OPENMP_DEPEND_KIND(in) @@ -267,6 +279,7 @@ OPENMP_DOACROSS_MODIFIER(source_omp_cur_iteration) #undef OPENMP_MAP_MODIFIER_KIND #undef OPENMP_MOTION_MODIFIER_KIND #undef OPENMP_DIST_SCHEDULE_KIND +#undef OPENMP_DEFAULT_VARIABLE_CATEGORY #undef OPENMP_DEFAULTMAP_KIND #undef OPENMP_DEFAULTMAP_MODIFIER #undef OPENMP_DOACROSS_MODIFIER diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index d3285cd9c6a14..115af7b19d6e4 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -107,6 +107,13 @@ enum OpenMPDistScheduleClauseKind { OMPC_DIST_SCHEDULE_unknown }; +/// OpenMP variable-category for 'default' clause. +enum OpenMPDefaultClauseVariableCategory { +#define OPENMP_DEFAULT_VARIABLE_CATEGORY(Name) OMPC_DEFAULT_VC_##Name, +#include "clang/Basic/OpenMPKinds.def" + OMPC_DEFAULT_VC_unknown +}; + /// OpenMP attributes for 'defaultmap' clause. 
enum OpenMPDefaultmapClauseKind { #define OPENMP_DEFAULTMAP_KIND(Name) \ @@ -257,6 +264,10 @@ struct OMPInteropInfo final { llvm::SmallVector PreferTypes; }; +OpenMPDefaultClauseVariableCategory +getOpenMPDefaultVariableCategory(StringRef Str, const LangOptions &LangOpts); +const char *getOpenMPDefaultVariableCategoryName(unsigned VC); + unsigned getOpenMPSimpleClauseType(OpenMPClauseKind Kind, llvm::StringRef Str, const LangOptions &LangOpts); const char *getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, unsigned Type); diff --git a/clang/include/clang/Basic/Sarif.h b/clang/include/clang/Basic/Sarif.h index e6c46224b316d..a88d1ee2965a9 100644 --- a/clang/include/clang/Basic/Sarif.h +++ b/clang/include/clang/Basic/Sarif.h @@ -322,6 +322,8 @@ class SarifResult { uint32_t RuleIdx; std::string RuleId; std::string DiagnosticMessage; + std::string HostedViewerURI; + llvm::SmallDenseMap PartialFingerprints; llvm::SmallVector Locations; llvm::SmallVector ThreadFlows; std::optional LevelOverride; @@ -347,6 +349,11 @@ class SarifResult { return *this; } + SarifResult setHostedViewerURI(llvm::StringRef URI) { + HostedViewerURI = URI.str(); + return *this; + } + SarifResult setLocations(llvm::ArrayRef DiagLocs) { #ifndef NDEBUG for (const auto &Loc : DiagLocs) { @@ -366,6 +373,12 @@ class SarifResult { LevelOverride = TheLevel; return *this; } + + SarifResult addPartialFingerprint(llvm::StringRef key, + llvm::StringRef value) { + PartialFingerprints[key] = value; + return *this; + } }; /// This class handles creating a valid SARIF document given various input @@ -475,6 +488,8 @@ class SarifDocumentWriter { /// reported diagnostics, resulting in an expensive call. llvm::json::Object createDocument(); + static std::string fileNameToURI(llvm::StringRef Filename); + private: /// Source Manager to use for the current SARIF document. 
const SourceManager &SourceMgr; diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 25b68622656fa..e5c5ada3b0858 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1762,6 +1762,14 @@ class TargetInfo : public TransferrableTargetInfo, /// Clang backwards compatibility rather than GCC/Itanium ABI compatibility. virtual bool areDefaultedSMFStillPOD(const LangOptions&) const; + /// Controls whether global operator delete is called by the deleting + /// destructor or at the point where ::delete was called. Historically Clang + /// called global operator delete outside of the deleting destructor for both + /// Microsoft and Itanium ABI. In Clang 21 support for ::delete was aligned + /// with Microsoft ABI, so it will call global operator delete in the deleting + /// destructor body. + virtual bool callGlobalDeleteInDeletingDtor(const LangOptions &) const; + /// Controls if __builtin_longjmp / __builtin_setjmp can be lowered to /// llvm.eh.sjlj.longjmp / llvm.eh.sjlj.setjmp. virtual bool hasSjLjLowering() const { diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index fb6862b90987f..db43a8529f02b 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -5,10 +5,11 @@ class TypeNode : ASTNode { bit Abstract = abstract; } -/// A type node that is only used to represent dependent types in C++. For -/// example, DependentTemplateSpecializationType is used to represent types -/// where the base template-id is dependent (such as `T::foo`). Code -/// that only works with non-dependent types can ignore these type nodes. +/// A type node that is only used to represent dependent types in C++. +/// For example, DependentSizedArrayType is used to represent types where the +/// size expression is dependent (such as `T[V]`, where V is a constant template +/// parameter). 
Code that only works with non-dependent types can ignore these +/// type nodes. class AlwaysDependent {} /// A type node that is never used to represent a canonical type, which is to @@ -96,7 +97,6 @@ def DeducedType : TypeNode; def AutoType : TypeNode; def DeducedTemplateSpecializationType : TypeNode; def DependentNameType : TypeNode, AlwaysDependent; -def DependentTemplateSpecializationType : TypeNode, AlwaysDependent; def PackExpansionType : TypeNode, AlwaysDependent; def PackIndexingType : TypeNode, NeverCanonicalUnlessDependent; def ObjCTypeParamType : TypeNode, NeverCanonical; diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index a4eb92e76968c..5f6a6eaab80a3 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -156,16 +156,10 @@ let SMETargetGuard = "sme2p1" in { //////////////////////////////////////////////////////////////////////////////// // SME - Counting elements in a streaming vector -multiclass ZACount { - def NAME : SInst<"sv" # n_suffix, "nv", "", MergeNone, - "aarch64_sme_" # n_suffix, - [IsOverloadNone, IsStreamingCompatible]>; -} - -defm SVCNTSB : ZACount<"cntsb">; -defm SVCNTSH : ZACount<"cntsh">; -defm SVCNTSW : ZACount<"cntsw">; -defm SVCNTSD : ZACount<"cntsd">; +def SVCNTSB : SInst<"svcntsb", "nv", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTSH : SInst<"svcntsh", "nv", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTSW : SInst<"svcntsw", "nv", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTSD : SInst<"svcntsd", "nv", "", MergeNone, "aarch64_sme_cntsd", [IsOverloadNone, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // SME - ADDHA/ADDVA diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index b3c435cc59140..38c4a87f69d6d 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td 
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -3847,6 +3847,16 @@ def CIR_ATanOp : CIR_UnaryFPToFPBuiltinOp<"atan", "ATanOp"> { }]; } +def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> { + let summary = "Computes the floating-point cosine value"; + let description = [{ + `cir.cos` computes the cosine of a floating-point operand and returns + a result of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. + }]; +} + def CIR_FAbsOp : CIR_UnaryFPToFPBuiltinOp<"fabs", "FAbsOp"> { let summary = "Computes the floating-point absolute value"; let description = [{ diff --git a/clang/include/clang/CIR/Interfaces/CIRLoopOpInterface.h b/clang/include/clang/CIR/Interfaces/CIRLoopOpInterface.h index 3722c5e4a195c..90fd07919dc01 100644 --- a/clang/include/clang/CIR/Interfaces/CIRLoopOpInterface.h +++ b/clang/include/clang/CIR/Interfaces/CIRLoopOpInterface.h @@ -13,12 +13,14 @@ #ifndef CLANG_CIR_INTERFACES_CIRLOOPOPINTERFACE_H #define CLANG_CIR_INTERFACES_CIRLOOPOPINTERFACE_H +#include "llvm/ADT/APInt.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/Operation.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/LoopLikeInterface.h" +using llvm::APInt; namespace cir { namespace detail { diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 52d5f8a2ded2c..60e0aa163dc04 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -296,8 +296,6 @@ struct MissingFeatures { // Future CIR operations static bool awaitOp() { return false; } static bool callOp() { return false; } - static bool complexImagOp() { return false; } - static bool complexRealOp() { return false; } static bool ifOp() { return false; } static bool invokeOp() { return false; } static bool labelOp() { return false; } diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td 
index a7c514e809aa9..ef6665a87d2cc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4304,6 +4304,10 @@ def floop_interchange : Flag<["-"], "floop-interchange">, Group, HelpText<"Enable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_loop_interchange: Flag<["-"], "fno-loop-interchange">, Group, HelpText<"Disable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; +defm experimental_loop_fusion + : OptInCC1FFlag<"experimental-loop-fusion", "Enable", "Disable", + "Enable the loop fusion pass", + [ClangOption, FlangOption, FC1Option]>; def funroll_loops : Flag<["-"], "funroll-loops">, Group, HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group, @@ -4678,8 +4682,10 @@ def gdbx : Flag<["-"], "gdbx">, Group; // Equivalent to our default dwarf version. Forces usual dwarf emission when // CodeView is enabled. 
def gdwarf : Flag<["-"], "gdwarf">, Group, - Visibility<[ClangOption, CLOption, DXCOption]>, + Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, HelpText<"Generate source-level debug information with the default dwarf version">; + +let Visibility = [ClangOption, FlangOption] in { def gdwarf_2 : Flag<["-"], "gdwarf-2">, Group, HelpText<"Generate source-level debug information with dwarf version 2">; def gdwarf_3 : Flag<["-"], "gdwarf-3">, Group, @@ -4688,6 +4694,7 @@ def gdwarf_4 : Flag<["-"], "gdwarf-4">, Group, HelpText<"Generate source-level debug information with dwarf version 4">; def gdwarf_5 : Flag<["-"], "gdwarf-5">, Group, HelpText<"Generate source-level debug information with dwarf version 5">; +} def gdwarf64 : Flag<["-"], "gdwarf64">, Group, Visibility<[ClangOption, CC1Option, CC1AsOption]>, HelpText<"Enables DWARF64 format for ELF binaries, if debug information emission is enabled.">, @@ -4799,7 +4806,7 @@ defm structor_decl_linkage_names "declarations in DWARF." "Implies -g.">, BothFlags<[], [ClangOption, CLOption, CC1Option]>>, - DocBrief<[{On some ABIs (e.g., Itanium), constructors and destructors may have multiple variants. Historically, when generating DWARF, Clang did not attach ``DW_AT_linkage_name``s to structor DIEs because there were multiple possible manglings (depending on the structor variant) that could be used. With ``-gstructor-decl-linkage-names``, for ABIs with structor variants, we attach a "unified" mangled name to structor declarations DIEs which debuggers can use to look up all the definitions for a structor declaration. E.g., a "unified" mangled name ``_ZN3FooC4Ev`` may have multiple definitions associated with it such as ``_ZN3FooC1Ev`` and ``_ZN3FooC2Ev``. + DocBrief<[{On some ABIs (e.g., Itanium), constructors and destructors may have multiple variants. 
Historically, when generating DWARF, Clang did not attach ``DW_AT_linkage_name`` to structor DIEs because there were multiple possible manglings (depending on the structor variant) that could be used. With ``-gstructor-decl-linkage-names``, for ABIs with structor variants, we attach a "unified" mangled name to structor declarations DIEs which debuggers can use to look up all the definitions for a structor declaration. E.g., a "unified" mangled name ``_ZN3FooC4Ev`` may have multiple definitions associated with it such as ``_ZN3FooC1Ev`` and ``_ZN3FooC2Ev``. Enabling this flag results in a better interactive debugging experience (both GDB and LLDB have support for understanding these "unified" linkage names). However, it comes with a significant increase in debug-info size (particularly the `.debug_str` section). As an escape hatch, users can disable this feature using ``-gno-structor-decl-linkage-names``.}]>; defm key_instructions : BoolGOption<"key-instructions", @@ -7629,6 +7636,8 @@ def debug_info_kind_EQ : Joined<["-"], "debug-info-kind=">; def record_command_line : Separate<["-"], "record-command-line">, HelpText<"The string to embed in the .LLVM.command.line section.">, MarshallingInfoString>; +def dwarf_version_EQ : Joined<["-"], "dwarf-version=">, + MarshallingInfoInt>; } // let Visibility = [CC1Option, CC1AsOption, FC1Option] @@ -7640,8 +7649,6 @@ def debug_info_macro : Flag<["-"], "debug-info-macro">, def default_function_attr : Separate<["-"], "default-function-attr">, HelpText<"Apply given attribute to all functions">, MarshallingInfoStringVector>; -def dwarf_version_EQ : Joined<["-"], "dwarf-version=">, - MarshallingInfoInt>; def debugger_tuning_EQ : Joined<["-"], "debugger-tuning=">, Values<"gdb,lldb,sce,dbx">, NormalizedValuesScope<"llvm::DebuggerKind">, NormalizedValues<["GDB", "LLDB", "SCE", "DBX"]>, diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 5dfdb23594610..342fefcfc408c 100644 --- 
a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2976,7 +2976,19 @@ struct FormatStyle { /// #endif /// #endif /// \endcode - PPDIS_BeforeHash + PPDIS_BeforeHash, + /// Leaves indentation of directives as-is. + /// \note + /// Ignores ``PPIndentWidth``. + /// \endnote + /// \code + /// #if FOO + /// #if BAR + /// #include + /// #endif + /// #endif + /// \endcode + PPDIS_Leave }; /// The preprocessor directive indenting style to use. @@ -3558,6 +3570,73 @@ struct FormatStyle { /// \version 9 std::vector NamespaceMacros; + /// Control over each component in a numeric literal. + enum NumericLiteralComponentStyle : int8_t { + /// Leave this component of the literal as is. + NLCS_Leave, + /// Format this component with uppercase characters. + NLCS_Upper, + /// Format this component with lowercase characters. + NLCS_Lower, + }; + + /// Separate control for each numeric literal component. + /// + /// For example, the config below will leave exponent letters alone, reformat + /// hexadecimal digits in lowercase, reformat numeric literal prefixes in + /// uppercase, and reformat suffixes in lowercase. + /// \code + /// NumericLiteralCase: + /// ExponentLetter: Leave + /// HexDigit: Lower + /// Prefix: Upper + /// Suffix: Lower + /// \endcode + struct NumericLiteralCaseStyle { + /// Format floating point exponent separator letter case. + /// \code + /// float a = 6.02e23 + 1.0E10; // Leave + /// float a = 6.02E23 + 1.0E10; // Upper + /// float a = 6.02e23 + 1.0e10; // Lower + /// \endcode + NumericLiteralComponentStyle ExponentLetter; + /// Format hexadecimal digit case. + /// \code + /// a = 0xaBcDeF; // Leave + /// a = 0xABCDEF; // Upper + /// a = 0xabcdef; // Lower + /// \endcode + NumericLiteralComponentStyle HexDigit; + /// Format integer prefix case. + /// \code + /// a = 0XF0 | 0b1; // Leave + /// a = 0XF0 | 0B1; // Upper + /// a = 0xF0 | 0b1; // Lower + /// \endcode + NumericLiteralComponentStyle Prefix; + /// Format suffix case. 
This option excludes case-sensitive reserved + /// suffixes, such as ``min`` in C++. + /// \code + /// a = 1uLL; // Leave + /// a = 1ULL; // Upper + /// a = 1ull; // Lower + /// \endcode + NumericLiteralComponentStyle Suffix; + + bool operator==(const NumericLiteralCaseStyle &R) const { + return ExponentLetter == R.ExponentLetter && HexDigit == R.HexDigit && + Prefix == R.Prefix && Suffix == R.Suffix; + } + + bool operator!=(const NumericLiteralCaseStyle &R) const { + return !(*this == R); + } + }; + + /// Capitalization style for numeric literals. + /// \version 22 + NumericLiteralCaseStyle NumericLiteralCase; + /// Controls bin-packing Objective-C protocol conformance list /// items into as few lines as possible when they go over ``ColumnLimit``. /// @@ -5469,6 +5548,7 @@ struct FormatStyle { MaxEmptyLinesToKeep == R.MaxEmptyLinesToKeep && NamespaceIndentation == R.NamespaceIndentation && NamespaceMacros == R.NamespaceMacros && + NumericLiteralCase == R.NumericLiteralCase && ObjCBinPackProtocolList == R.ObjCBinPackProtocolList && ObjCBlockIndentWidth == R.ObjCBlockIndentWidth && ObjCBreakBeforeNestedBlockParam == diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index 9f3d5f97cdff4..a6b6993b708d0 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -83,6 +83,9 @@ class CompilerInstance : public ModuleLoader { /// The options used in this compiler instance. std::shared_ptr Invocation; + /// The virtual file system instance. + IntrusiveRefCntPtr VFS; + /// The diagnostics engine instance. 
IntrusiveRefCntPtr Diagnostics; @@ -409,9 +412,31 @@ class CompilerInstance : public ModuleLoader { /// @name Virtual File System /// @{ - llvm::vfs::FileSystem &getVirtualFileSystem() const; - llvm::IntrusiveRefCntPtr - getVirtualFileSystemPtr() const; + bool hasVirtualFileSystem() const { return VFS != nullptr; } + + /// Create a virtual file system instance based on the invocation. + /// + /// @param BaseFS The file system that may be used when configuring the final + /// file system, and act as the underlying file system. Must not + /// be NULL. + /// @param DC If non-NULL, the diagnostic consumer to be used in case + /// configuring the file system emits diagnostics. Note that the + /// DiagnosticsEngine using the consumer won't obey the + /// --warning-suppression-mappings= flag. + void createVirtualFileSystem(IntrusiveRefCntPtr + BaseFS = llvm::vfs::getRealFileSystem(), + DiagnosticConsumer *DC = nullptr); + + /// Use the given file system. + void setVirtualFileSystem(IntrusiveRefCntPtr FS) { + VFS = std::move(FS); + } + + llvm::vfs::FileSystem &getVirtualFileSystem() const { return *VFS; } + + IntrusiveRefCntPtr getVirtualFileSystemPtr() const { + return VFS; + } /// @} /// @name File Manager @@ -650,32 +675,31 @@ class CompilerInstance : public ModuleLoader { /// Note that this routine also replaces the diagnostic client, /// allocating one if one is not provided. /// - /// \param VFS is used for any IO needed when creating DiagnosticsEngine. It - /// doesn't replace VFS in the CompilerInstance (if any). - /// /// \param Client If non-NULL, a diagnostic client that will be /// attached to (and, then, owned by) the DiagnosticsEngine inside this AST /// unit. /// /// \param ShouldOwnClient If Client is non-NULL, specifies whether /// the diagnostic object should take ownership of the client. 
- void createDiagnostics(llvm::vfs::FileSystem &VFS, - DiagnosticConsumer *Client = nullptr, + void createDiagnostics(DiagnosticConsumer *Client = nullptr, bool ShouldOwnClient = true); - /// Create a DiagnosticsEngine object with a the TextDiagnosticPrinter. + /// Create a DiagnosticsEngine object. /// /// If no diagnostic client is provided, this creates a /// DiagnosticConsumer that is owned by the returned diagnostic /// object, if using directly the caller is responsible for /// releasing the returned DiagnosticsEngine's client eventually. /// + /// \param VFS The file system used to load the suppression mappings file. + /// /// \param Opts - The diagnostic options; note that the created text /// diagnostic object contains a reference to these options. /// /// \param Client If non-NULL, a diagnostic client that will be /// attached to (and, then, owned by) the returned DiagnosticsEngine - /// object. + /// object. If NULL, the returned DiagnosticsEngine will own a newly-created + /// client. /// /// \param CodeGenOpts If non-NULL, the code gen options in use, which may be /// used by some diagnostics printers (for logging purposes only). @@ -690,8 +714,7 @@ class CompilerInstance : public ModuleLoader { /// Create the file manager and replace any existing one with it. /// /// \return The new file manager on success, or null on failure. - FileManager * - createFileManager(IntrusiveRefCntPtr VFS = nullptr); + FileManager *createFileManager(); /// Create the source manager and replace any existing one with it. void createSourceManager(FileManager &FileMgr); diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h index 61af7bf762d5e..078d70b3b1749 100644 --- a/clang/include/clang/Interpreter/Interpreter.h +++ b/clang/include/clang/Interpreter/Interpreter.h @@ -135,11 +135,15 @@ class Interpreter { std::string OrcRuntimePath = ""; /// PID of the out-of-process JIT executor. 
uint32_t ExecutorPID = 0; + /// Custom lambda to be executed inside child process/executor + std::function CustomizeFork = nullptr; + /// An optional code model to provide to the JITTargetMachineBuilder + std::optional CM = std::nullopt; JITConfig() : IsOutOfProcess(false), OOPExecutor(""), OOPExecutorConnect(""), UseSharedMemory(false), SlabAllocateSize(0), OrcRuntimePath(""), - ExecutorPID(0) {} + ExecutorPID(0), CustomizeFork(nullptr), CM(std::nullopt) {} }; protected: diff --git a/clang/include/clang/Sema/HeuristicResolver.h b/clang/include/clang/Sema/HeuristicResolver.h index 71588bee92d16..15357152ce964 100644 --- a/clang/include/clang/Sema/HeuristicResolver.h +++ b/clang/include/clang/Sema/HeuristicResolver.h @@ -54,15 +54,13 @@ class HeuristicResolver { std::vector resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE) const; std::vector - resolveTypeOfCallExpr(const CallExpr *CE) const; - std::vector resolveCalleeOfCallExpr(const CallExpr *CE) const; std::vector resolveUsingValueDecl(const UnresolvedUsingValueDecl *UUVD) const; std::vector resolveDependentNameType(const DependentNameType *DNT) const; std::vector resolveTemplateSpecializationType( - const DependentTemplateSpecializationType *DTST) const; + const TemplateSpecializationType *TST) const; // Try to heuristically resolve a dependent nested name specifier // to the type it likely denotes. Note that *dependent* name specifiers always @@ -93,6 +91,10 @@ class HeuristicResolver { // during simplification, and the operation fails if no pointer type is found. QualType simplifyType(QualType Type, const Expr *E, bool UnwrapPointer); + // Try to heuristically resolve the type of a possibly-dependent expression + // `E`. 
+ QualType resolveExprToType(const Expr *E) const; + // Given an expression `Fn` representing the callee in a function call, // if the call is through a function pointer, try to find the declaration of // the corresponding function pointer type, so that we can recover argument diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a7600ab88febe..d017d1f829015 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2819,26 +2819,27 @@ class Sema final : public SemaBase { /// BuiltinConstantArg - Handle a check if argument ArgNum of CallExpr /// TheCall is a constant expression. - bool BuiltinConstantArg(CallExpr *TheCall, int ArgNum, llvm::APSInt &Result); + bool BuiltinConstantArg(CallExpr *TheCall, unsigned ArgNum, + llvm::APSInt &Result); /// BuiltinConstantArgRange - Handle a check if argument ArgNum of CallExpr /// TheCall is a constant expression in the range [Low, High]. - bool BuiltinConstantArgRange(CallExpr *TheCall, int ArgNum, int Low, int High, - bool RangeIsError = true); + bool BuiltinConstantArgRange(CallExpr *TheCall, unsigned ArgNum, int Low, + int High, bool RangeIsError = true); /// BuiltinConstantArgMultiple - Handle a check if argument ArgNum of CallExpr /// TheCall is a constant expression is a multiple of Num.. - bool BuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum, + bool BuiltinConstantArgMultiple(CallExpr *TheCall, unsigned ArgNum, unsigned Multiple); /// BuiltinConstantArgPower2 - Check if argument ArgNum of TheCall is a /// constant expression representing a power of 2. - bool BuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum); + bool BuiltinConstantArgPower2(CallExpr *TheCall, unsigned ArgNum); /// BuiltinConstantArgShiftedByte - Check if argument ArgNum of TheCall is /// a constant expression representing an arbitrary byte value shifted left by /// a multiple of 8 bits. 
- bool BuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum, + bool BuiltinConstantArgShiftedByte(CallExpr *TheCall, unsigned ArgNum, unsigned ArgBits); /// BuiltinConstantArgShiftedByteOr0xFF - Check if argument ArgNum of @@ -2846,7 +2847,7 @@ class Sema final : public SemaBase { /// or a value of the form 0x??FF (i.e. a member of the arithmetic progression /// 0x00FF, 0x01FF, ..., 0xFFFF). This strange range check is needed for some /// Arm MVE intrinsics. - bool BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum, + bool BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, unsigned ArgNum, unsigned ArgBits); /// Checks that a call expression's argument count is at least the desired @@ -8553,10 +8554,12 @@ class Sema final : public SemaBase { bool Diagnose = true); FunctionDecl *FindUsualDeallocationFunction(SourceLocation StartLoc, ImplicitDeallocationParameters, - DeclarationName Name); + DeclarationName Name, + bool Diagnose = true); FunctionDecl *FindDeallocationFunctionForDestructor(SourceLocation StartLoc, CXXRecordDecl *RD, - bool Diagnose = true); + bool Diagnose, + bool LookForGlobal); /// ActOnCXXDelete - Parsed a C++ 'delete' expression (C++ 5.3.5), as in: /// @code ::delete ptr; @endcode @@ -11399,10 +11402,6 @@ class Sema final : public SemaBase { SourceLocation NameLoc, IdentifierInfo *&II); - bool resolveAssumedTemplateNameAsType(Scope *S, TemplateName &Name, - SourceLocation NameLoc, - bool Diagnose = true); - /// Determine whether a particular identifier might be the name in a C++1z /// deduction-guide declaration. 
bool isDeductionGuideName(Scope *S, const IdentifierInfo &Name, @@ -11643,7 +11642,8 @@ class Sema final : public SemaBase { QualType CheckTemplateIdType(ElaboratedTypeKeyword Keyword, TemplateName Template, SourceLocation TemplateLoc, - TemplateArgumentListInfo &TemplateArgs); + TemplateArgumentListInfo &TemplateArgs, + Scope *Scope, bool ForNestedNameSpecifier); TypeResult ActOnTemplateIdType(Scope *S, ElaboratedTypeKeyword ElaboratedKeyword, diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index b5ddca0fe2ca5..4df330ed87120 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -260,11 +260,6 @@ class SemaHLSL : public SemaBase { bool initGlobalResourceDecl(VarDecl *VD); bool initGlobalResourceArrayDecl(VarDecl *VD); - void createResourceRecordCtorArgs(const Type *ResourceTy, StringRef VarName, - HLSLResourceBindingAttr *RBA, - HLSLVkBindingAttr *VkBinding, - uint32_t ArrayIndex, - llvm::SmallVectorImpl &Args); }; } // namespace clang diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 91c3d4bd5210e..23827051ed724 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -951,11 +951,11 @@ class SemaOpenMP : public SemaBase { SourceLocation LParenLoc, SourceLocation EndLoc); /// Called on well-formed 'default' clause. - OMPClause *ActOnOpenMPDefaultClause(llvm::omp::DefaultKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); + OMPClause * + ActOnOpenMPDefaultClause(llvm::omp::DefaultKind M, SourceLocation MLoc, + OpenMPDefaultClauseVariableCategory VCKind, + SourceLocation VCKindLoc, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation EndLoc); /// Called on well-formed 'proc_bind' clause. 
OMPClause *ActOnOpenMPProcBindClause(llvm::omp::ProcBindKind Kind, SourceLocation KindLoc, diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index edf5bbaddf1aa..28c3e55864057 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -949,6 +949,8 @@ class ASTWriter : public ASTDeserializationListener, void ResolvedOperatorDelete(const CXXDestructorDecl *DD, const FunctionDecl *Delete, Expr *ThisArg) override; + void ResolvedOperatorGlobDelete(const CXXDestructorDecl *DD, + const FunctionDecl *Delete) override; void CompletedImplicitDefinition(const FunctionDecl *D) override; void InstantiationRequested(const ValueDecl *D) override; void VariableDefinitionInstantiated(const VarDecl *D) override; diff --git a/clang/include/clang/Serialization/TypeBitCodes.def b/clang/include/clang/Serialization/TypeBitCodes.def index bea15254922c1..d6c484563409c 100644 --- a/clang/include/clang/Serialization/TypeBitCodes.def +++ b/clang/include/clang/Serialization/TypeBitCodes.def @@ -39,7 +39,6 @@ TYPE_BIT_CODE(ObjCObject, OBJC_OBJECT, 28) TYPE_BIT_CODE(TemplateTypeParm, TEMPLATE_TYPE_PARM, 29) TYPE_BIT_CODE(TemplateSpecialization, TEMPLATE_SPECIALIZATION, 30) TYPE_BIT_CODE(DependentName, DEPENDENT_NAME, 31) -TYPE_BIT_CODE(DependentTemplateSpecialization, DEPENDENT_TEMPLATE_SPECIALIZATION, 32) TYPE_BIT_CODE(DependentSizedArray, DEPENDENT_SIZED_ARRAY, 33) TYPE_BIT_CODE(Paren, PAREN, 34) TYPE_BIT_CODE(PackExpansion, PACK_EXPANSION, 35) diff --git a/clang/lib/APINotes/APINotesFormat.h b/clang/lib/APINotes/APINotesFormat.h index 69d180e7b3eb5..bb423ccb2bfaf 100644 --- a/clang/lib/APINotes/APINotesFormat.h +++ b/clang/lib/APINotes/APINotesFormat.h @@ -24,7 +24,7 @@ const uint16_t VERSION_MAJOR = 0; /// API notes file minor version number. /// /// When the format changes IN ANY WAY, this number should be incremented. 
-const uint16_t VERSION_MINOR = 37; // SwiftDestroyOp +const uint16_t VERSION_MINOR = 38; // SwiftSafety const uint8_t kSwiftConforms = 1; const uint8_t kSwiftDoesNotConform = 2; diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp index 573356f97ff73..7f9bb5f12cda7 100644 --- a/clang/lib/APINotes/APINotesReader.cpp +++ b/clang/lib/APINotes/APINotesReader.cpp @@ -94,11 +94,14 @@ class VersionedTableInfo { /// Read serialized CommonEntityInfo. void ReadCommonEntityInfo(const uint8_t *&Data, CommonEntityInfo &Info) { - uint8_t UnavailableBits = *Data++; - Info.Unavailable = (UnavailableBits >> 1) & 0x01; - Info.UnavailableInSwift = UnavailableBits & 0x01; - if ((UnavailableBits >> 2) & 0x01) - Info.setSwiftPrivate(static_cast((UnavailableBits >> 3) & 0x01)); + uint8_t EncodedBits = *Data++; + Info.Unavailable = (EncodedBits >> 1) & 0x01; + Info.UnavailableInSwift = EncodedBits & 0x01; + if ((EncodedBits >> 2) & 0x01) + Info.setSwiftPrivate(static_cast((EncodedBits >> 3) & 0x01)); + if ((EncodedBits >> 4) & 0x01) + Info.setSwiftSafety( + static_cast((EncodedBits >> 5) & 0x03)); unsigned MsgLength = endian::readNext(Data); diff --git a/clang/lib/APINotes/APINotesTypes.cpp b/clang/lib/APINotes/APINotesTypes.cpp index f726faa832bcc..bff4be104c6c8 100644 --- a/clang/lib/APINotes/APINotesTypes.cpp +++ b/clang/lib/APINotes/APINotesTypes.cpp @@ -18,6 +18,21 @@ LLVM_DUMP_METHOD void CommonEntityInfo::dump(llvm::raw_ostream &OS) const { OS << "[UnavailableInSwift] "; if (SwiftPrivateSpecified) OS << (SwiftPrivate ? 
"[SwiftPrivate] " : ""); + if (SwiftSafetyAudited) { + switch (*getSwiftSafety()) { + case SwiftSafetyKind::Safe: + OS << "[Safe] "; + break; + case SwiftSafetyKind::Unsafe: + OS << "[Unsafe] "; + break; + case SwiftSafetyKind::Unspecified: + OS << "[Unspecified] "; + break; + case SwiftSafetyKind::None: + break; + } + } if (!SwiftName.empty()) OS << "Swift Name: " << SwiftName << ' '; OS << '\n'; diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp index cf88d118d0979..47ed93a567c0e 100644 --- a/clang/lib/APINotes/APINotesWriter.cpp +++ b/clang/lib/APINotes/APINotesWriter.cpp @@ -507,6 +507,12 @@ void emitCommonEntityInfo(raw_ostream &OS, const CommonEntityInfo &CEI) { llvm::support::endian::Writer writer(OS, llvm::endianness::little); uint8_t payload = 0; + if (auto safety = CEI.getSwiftSafety()) { + payload = static_cast(*safety); + payload <<= 1; + payload |= 0x01; + } + payload <<= 2; if (auto swiftPrivate = CEI.isSwiftPrivate()) { payload |= 0x01; if (*swiftPrivate) diff --git a/clang/lib/APINotes/APINotesYAMLCompiler.cpp b/clang/lib/APINotes/APINotesYAMLCompiler.cpp index a91a1eea03d81..8e91d48b4ba62 100644 --- a/clang/lib/APINotes/APINotesYAMLCompiler.cpp +++ b/clang/lib/APINotes/APINotesYAMLCompiler.cpp @@ -29,6 +29,18 @@ using namespace clang; using namespace api_notes; +namespace llvm { +namespace yaml { +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, SwiftSafetyKind &SK) { + IO.enumCase(SK, "unspecified", SwiftSafetyKind::Unspecified); + IO.enumCase(SK, "safe", SwiftSafetyKind::Safe); + IO.enumCase(SK, "unsafe", SwiftSafetyKind::Unsafe); + } +}; +} // namespace yaml +} // namespace llvm + namespace { enum class APIAvailability { Available = 0, @@ -163,6 +175,7 @@ struct Method { bool Required = false; StringRef ResultType; StringRef SwiftReturnOwnership; + SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; typedef std::vector MethodsSeq; @@ -199,6 +212,7 @@ template <> struct 
MappingTraits { IO.mapOptional("ResultType", M.ResultType, StringRef("")); IO.mapOptional("SwiftReturnOwnership", M.SwiftReturnOwnership, StringRef("")); + IO.mapOptional("SwiftSafety", M.SafetyKind, SwiftSafetyKind::None); } }; } // namespace yaml @@ -214,6 +228,7 @@ struct Property { StringRef SwiftName; std::optional SwiftImportAsAccessors; StringRef Type; + SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; typedef std::vector PropertiesSeq; @@ -235,6 +250,7 @@ template <> struct MappingTraits { IO.mapOptional("SwiftName", P.SwiftName, StringRef("")); IO.mapOptional("SwiftImportAsAccessors", P.SwiftImportAsAccessors); IO.mapOptional("Type", P.Type, StringRef("")); + IO.mapOptional("SwiftSafety", P.SafetyKind, SwiftSafetyKind::None); } }; } // namespace yaml @@ -254,6 +270,7 @@ struct Class { std::optional SwiftConformance; MethodsSeq Methods; PropertiesSeq Properties; + SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; typedef std::vector ClassesSeq; @@ -279,6 +296,7 @@ template <> struct MappingTraits { IO.mapOptional("SwiftConformsTo", C.SwiftConformance); IO.mapOptional("Methods", C.Methods); IO.mapOptional("Properties", C.Properties); + IO.mapOptional("SwiftSafety", C.SafetyKind, SwiftSafetyKind::None); } }; } // namespace yaml @@ -297,6 +315,7 @@ struct Function { StringRef Type; StringRef ResultType; StringRef SwiftReturnOwnership; + SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; typedef std::vector FunctionsSeq; @@ -321,6 +340,7 @@ template <> struct MappingTraits { IO.mapOptional("ResultType", F.ResultType, StringRef("")); IO.mapOptional("SwiftReturnOwnership", F.SwiftReturnOwnership, StringRef("")); + IO.mapOptional("SwiftSafety", F.SafetyKind, SwiftSafetyKind::None); } }; } // namespace yaml @@ -334,6 +354,7 @@ struct GlobalVariable { std::optional SwiftPrivate; StringRef SwiftName; StringRef Type; + SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; typedef std::vector GlobalVariablesSeq; @@ -353,6 +374,7 @@ template <> struct 
MappingTraits { IO.mapOptional("SwiftPrivate", GV.SwiftPrivate); IO.mapOptional("SwiftName", GV.SwiftName, StringRef("")); IO.mapOptional("Type", GV.Type, StringRef("")); + IO.mapOptional("SwiftSafety", GV.SafetyKind, SwiftSafetyKind::None); } }; } // namespace yaml @@ -364,6 +386,7 @@ struct EnumConstant { AvailabilityItem Availability; std::optional SwiftPrivate; StringRef SwiftName; + SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; typedef std::vector EnumConstantsSeq; @@ -381,6 +404,7 @@ template <> struct MappingTraits { IO.mapOptional("AvailabilityMsg", EC.Availability.Msg, StringRef("")); IO.mapOptional("SwiftPrivate", EC.SwiftPrivate); IO.mapOptional("SwiftName", EC.SwiftName, StringRef("")); + IO.mapOptional("SwiftSafety", EC.SafetyKind, SwiftSafetyKind::None); } }; } // namespace yaml @@ -424,6 +448,7 @@ struct Field { std::optional SwiftPrivate; StringRef SwiftName; StringRef Type; + SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; typedef std::vector FieldsSeq; @@ -443,6 +468,7 @@ template <> struct MappingTraits { IO.mapOptional("SwiftPrivate", F.SwiftPrivate); IO.mapOptional("SwiftName", F.SwiftName, StringRef("")); IO.mapOptional("Type", F.Type, StringRef("")); + IO.mapOptional("SwiftSafety", F.SafetyKind, SwiftSafetyKind::None); } }; } // namespace yaml @@ -470,6 +496,7 @@ struct Tag { std::optional EnumConvenienceKind; std::optional SwiftCopyable; std::optional SwiftEscapable; + SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; FunctionsSeq Methods; FieldsSeq Fields; @@ -515,6 +542,7 @@ template <> struct MappingTraits { IO.mapOptional("Methods", T.Methods); IO.mapOptional("Fields", T.Fields); IO.mapOptional("Tags", T.Tags); + IO.mapOptional("SwiftSafety", T.SafetyKind, SwiftSafetyKind::None); } }; } // namespace yaml @@ -530,6 +558,7 @@ struct Typedef { std::optional NSErrorDomain; std::optional SwiftType; std::optional SwiftConformance; + const SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; typedef std::vector TypedefsSeq; 
@@ -602,6 +631,7 @@ struct Namespace { StringRef SwiftName; std::optional SwiftPrivate; TopLevelItems Items; + const SwiftSafetyKind SafetyKind = SwiftSafetyKind::None; }; } // namespace @@ -797,6 +827,8 @@ class YAMLConverter { StringRef APIName) { convertAvailability(Common.Availability, Info, APIName); Info.setSwiftPrivate(Common.SwiftPrivate); + if (Common.SafetyKind != SwiftSafetyKind::None) + Info.setSwiftSafety(Common.SafetyKind); Info.SwiftName = std::string(Common.SwiftName); } @@ -956,6 +988,8 @@ class YAMLConverter { void convertFunction(const Function &Function, FuncOrMethodInfo &FI) { convertAvailability(Function.Availability, FI, Function.Name); FI.setSwiftPrivate(Function.SwiftPrivate); + if (Function.SafetyKind != SwiftSafetyKind::None) + FI.setSwiftSafety(Function.SafetyKind); FI.SwiftName = std::string(Function.SwiftName); std::optional This; convertParams(Function.Params, FI, This); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index ed4c6b0e38be3..5240054c2f36b 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -4286,7 +4286,6 @@ QualType ASTContext::getVariableArrayDecayedType(QualType type) const { case Type::DependentName: case Type::InjectedClassName: case Type::TemplateSpecialization: - case Type::DependentTemplateSpecialization: case Type::TemplateTypeParm: case Type::SubstTemplateTypeParmPack: case Type::SubstBuiltinTemplatePack: @@ -5932,6 +5931,30 @@ QualType ASTContext::getTemplateTypeParmType(unsigned Depth, unsigned Index, return QualType(TypeParm, 0); } +static ElaboratedTypeKeyword +getCanonicalElaboratedTypeKeyword(ElaboratedTypeKeyword Keyword) { + switch (Keyword) { + // These are just themselves. + case ElaboratedTypeKeyword::None: + case ElaboratedTypeKeyword::Struct: + case ElaboratedTypeKeyword::Union: + case ElaboratedTypeKeyword::Enum: + case ElaboratedTypeKeyword::Interface: + return Keyword; + + // These are equivalent. 
+ case ElaboratedTypeKeyword::Typename: + return ElaboratedTypeKeyword::None; + + // These are functionally equivalent, so relying on their equivalence is + // IFNDR. By making them equivalent, we disallow overloading, which at least + // can produce a diagnostic. + case ElaboratedTypeKeyword::Class: + return ElaboratedTypeKeyword::Struct; + } + llvm_unreachable("unexpected keyword kind"); +} + TypeSourceInfo *ASTContext::getTemplateSpecializationTypeInfo( ElaboratedTypeKeyword Keyword, SourceLocation ElaboratedKeywordLoc, NestedNameSpecifierLoc QualifierLoc, SourceLocation TemplateKeywordLoc, @@ -5970,17 +5993,20 @@ hasAnyPackExpansions(ArrayRef Args) { } QualType ASTContext::getCanonicalTemplateSpecializationType( - TemplateName Template, ArrayRef Args) const { + ElaboratedTypeKeyword Keyword, TemplateName Template, + ArrayRef Args) const { assert(Template == getCanonicalTemplateName(Template, /*IgnoreDeduced=*/true)); - assert(!Args.empty()); + assert((Keyword == ElaboratedTypeKeyword::None || + Template.getAsDependentTemplateName())); #ifndef NDEBUG for (const auto &Arg : Args) assert(Arg.structurallyEquals(getCanonicalTemplateArgument(Arg))); #endif llvm::FoldingSetNodeID ID; - TemplateSpecializationType::Profile(ID, Template, Args, QualType(), *this); + TemplateSpecializationType::Profile(ID, Keyword, Template, Args, QualType(), + *this); void *InsertPos = nullptr; if (auto *T = TemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos)) return QualType(T, 0); @@ -5988,9 +6014,9 @@ QualType ASTContext::getCanonicalTemplateSpecializationType( void *Mem = Allocate(sizeof(TemplateSpecializationType) + sizeof(TemplateArgument) * Args.size(), alignof(TemplateSpecializationType)); - auto *Spec = new (Mem) - TemplateSpecializationType(ElaboratedTypeKeyword::None, Template, - /*IsAlias=*/false, Args, QualType()); + auto *Spec = + new (Mem) TemplateSpecializationType(Keyword, Template, + /*IsAlias=*/false, Args, QualType()); assert(Spec->isDependentType() && 
"canonical template specialization must be dependent"); Types.push_back(Spec); @@ -6002,16 +6028,16 @@ QualType ASTContext::getTemplateSpecializationType( ElaboratedTypeKeyword Keyword, TemplateName Template, ArrayRef SpecifiedArgs, ArrayRef CanonicalArgs, QualType Underlying) const { - assert(!Template.getUnderlying().getAsDependentTemplateName() && - "No dependent template names here!"); - const auto *TD = Template.getAsTemplateDecl(/*IgnoreDeduced=*/true); bool IsTypeAlias = TD && TD->isTypeAlias(); if (Underlying.isNull()) { TemplateName CanonTemplate = getCanonicalTemplateName(Template, /*IgnoreDeduced=*/true); - bool NonCanonical = - Template != CanonTemplate || Keyword != ElaboratedTypeKeyword::None; + ElaboratedTypeKeyword CanonKeyword = + CanonTemplate.getAsDependentTemplateName() + ? getCanonicalElaboratedTypeKeyword(Keyword) + : ElaboratedTypeKeyword::None; + bool NonCanonical = Template != CanonTemplate || Keyword != CanonKeyword; SmallVector CanonArgsVec; if (CanonicalArgs.empty()) { CanonArgsVec = SmallVector(SpecifiedArgs); @@ -6033,8 +6059,8 @@ QualType ASTContext::getTemplateSpecializationType( "Caller must compute aliased type"); IsTypeAlias = false; - Underlying = - getCanonicalTemplateSpecializationType(CanonTemplate, CanonicalArgs); + Underlying = getCanonicalTemplateSpecializationType( + CanonKeyword, CanonTemplate, CanonicalArgs); if (!NonCanonical) return Underlying; } @@ -6085,30 +6111,6 @@ ASTContext::getMacroQualifiedType(QualType UnderlyingTy, return QualType(newType, 0); } -static ElaboratedTypeKeyword -getCanonicalElaboratedTypeKeyword(ElaboratedTypeKeyword Keyword) { - switch (Keyword) { - // These are just themselves. - case ElaboratedTypeKeyword::None: - case ElaboratedTypeKeyword::Struct: - case ElaboratedTypeKeyword::Union: - case ElaboratedTypeKeyword::Enum: - case ElaboratedTypeKeyword::Interface: - return Keyword; - - // These are equivalent. 
- case ElaboratedTypeKeyword::Typename: - return ElaboratedTypeKeyword::None; - - // These are functionally equivalent, so relying on their equivalence is - // IFNDR. By making them equivalent, we disallow overloading, which at least - // can produce a diagnostic. - case ElaboratedTypeKeyword::Class: - return ElaboratedTypeKeyword::Struct; - } - llvm_unreachable("unexpected keyword kind"); -} - QualType ASTContext::getDependentNameType(ElaboratedTypeKeyword Keyword, NestedNameSpecifier NNS, const IdentifierInfo *Name) const { @@ -6140,68 +6142,6 @@ QualType ASTContext::getDependentNameType(ElaboratedTypeKeyword Keyword, return QualType(T, 0); } -QualType ASTContext::getDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, - ArrayRef Args) const { - // TODO: avoid this copy - SmallVector ArgCopy; - for (unsigned I = 0, E = Args.size(); I != E; ++I) - ArgCopy.push_back(Args[I].getArgument()); - return getDependentTemplateSpecializationType(Keyword, Name, ArgCopy); -} - -QualType ASTContext::getDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, - ArrayRef Args, bool IsCanonical) const { - llvm::FoldingSetNodeID ID; - DependentTemplateSpecializationType::Profile(ID, *this, Keyword, Name, Args); - - if (auto const T_iter = DependentTemplateSpecializationTypes.find(ID); - T_iter != DependentTemplateSpecializationTypes.end()) - return QualType(T_iter->getSecond(), 0); - - NestedNameSpecifier NNS = Name.getQualifier(); - - QualType Canon; - if (!IsCanonical) { - ElaboratedTypeKeyword CanonKeyword = - getCanonicalElaboratedTypeKeyword(Keyword); - NestedNameSpecifier CanonNNS = NNS.getCanonical(); - bool AnyNonCanonArgs = false; - auto CanonArgs = - ::getCanonicalTemplateArguments(*this, Args, AnyNonCanonArgs); - - if (CanonKeyword != Keyword || AnyNonCanonArgs || CanonNNS != NNS || - !Name.hasTemplateKeyword()) { - Canon = getDependentTemplateSpecializationType( - 
CanonKeyword, {CanonNNS, Name.getName(), /*HasTemplateKeyword=*/true}, - CanonArgs, - /*IsCanonical=*/true); - } - } else { - assert(Keyword == getCanonicalElaboratedTypeKeyword(Keyword)); - assert(Name.hasTemplateKeyword()); - assert(NNS.isCanonical()); -#ifndef NDEBUG - for (const auto &Arg : Args) - assert(Arg.structurallyEquals(getCanonicalTemplateArgument(Arg))); -#endif - } - void *Mem = Allocate((sizeof(DependentTemplateSpecializationType) + - sizeof(TemplateArgument) * Args.size()), - alignof(DependentTemplateSpecializationType)); - auto *T = - new (Mem) DependentTemplateSpecializationType(Keyword, Name, Args, Canon); -#ifndef NDEBUG - llvm::FoldingSetNodeID InsertedID; - T->Profile(InsertedID, *this); - assert(InsertedID == ID && "ID does not match"); -#endif - Types.push_back(T); - DependentTemplateSpecializationTypes.try_emplace(ID, T); - return QualType(T, 0); -} - TemplateArgument ASTContext::getInjectedTemplateArg(NamedDecl *Param) const { TemplateArgument Arg; if (const auto *TTP = dyn_cast(Param)) { @@ -14327,21 +14267,6 @@ static QualType getCommonNonSugarTypeNode(const ASTContext &Ctx, const Type *X, getCommonTypeKeyword(NX, NY, /*IsSame=*/true), getCommonQualifier(Ctx, NX, NY, /*IsSame=*/true), NX->getIdentifier()); } - case Type::DependentTemplateSpecialization: { - const auto *TX = cast(X), - *TY = cast(Y); - auto As = getCommonTemplateArguments(Ctx, TX->template_arguments(), - TY->template_arguments()); - const DependentTemplateStorage &SX = TX->getDependentTemplateName(), - &SY = TY->getDependentTemplateName(); - assert(SX.getName() == SY.getName()); - DependentTemplateStorage Name( - getCommonNNS(Ctx, SX.getQualifier(), SY.getQualifier(), - /*IsSame=*/true), - SX.getName(), SX.hasTemplateKeyword() || SY.hasTemplateKeyword()); - return Ctx.getDependentTemplateSpecializationType( - getCommonTypeKeyword(TX, TY, /*IsSame=*/true), Name, As); - } case Type::UnaryTransform: { const auto *TX = cast(X), *TY = cast(Y); diff --git 
a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index fe7f1e5eb0310..1c8fd83feb7f8 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1745,15 +1745,13 @@ ExpectedType ASTNodeImporter::VisitTagType(const TagType *T) { if (!ToDeclOrErr) return ToDeclOrErr.takeError(); - if (DeclForType->isUsed()) { - // If there is a definition of the 'OriginalDecl', it should be imported to - // have all information for the type in the "To" AST. (In some cases no - // other reference may exist to the definition decl and it would not be - // imported otherwise.) - Expected ToDefDeclOrErr = import(DeclForType->getDefinition()); - if (!ToDefDeclOrErr) - return ToDefDeclOrErr.takeError(); - } + // If there is a definition of the 'OriginalDecl', it should be imported to + // have all information for the type in the "To" AST. (In some cases no + // other reference may exist to the definition decl and it would not be + // imported otherwise.) + Expected ToDefDeclOrErr = import(DeclForType->getDefinition()); + if (!ToDefDeclOrErr) + return ToDefDeclOrErr.takeError(); if (T->isCanonicalUnqualified()) return Importer.getToContext().getCanonicalTagType(*ToDeclOrErr); @@ -1892,25 +1890,6 @@ ASTNodeImporter::VisitPackExpansionType(const PackExpansionType *T) { /*ExpactPack=*/false); } -ExpectedType ASTNodeImporter::VisitDependentTemplateSpecializationType( - const DependentTemplateSpecializationType *T) { - const DependentTemplateStorage &DTN = T->getDependentTemplateName(); - auto QualifierOrErr = import(DTN.getQualifier()); - if (!QualifierOrErr) - return QualifierOrErr.takeError(); - - SmallVector ToPack; - ToPack.reserve(T->template_arguments().size()); - if (Error Err = ImportTemplateArguments(T->template_arguments(), ToPack)) - return std::move(Err); - - return Importer.getToContext().getDependentTemplateSpecializationType( - T->getKeyword(), - {*QualifierOrErr, Importer.Import(DTN.getName()), - DTN.hasTemplateKeyword()}, - ToPack); -} - 
ExpectedType ASTNodeImporter::VisitDependentNameType(const DependentNameType *T) { auto ToQualifierOrErr = import(T->getQualifier()); diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 1292c30d47589..155734679b2da 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -1384,20 +1384,6 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, break; } - case Type::DependentTemplateSpecialization: { - const auto *Spec1 = cast(T1); - const auto *Spec2 = cast(T2); - if (Spec1->getKeyword() != Spec2->getKeyword()) - return false; - if (!IsStructurallyEquivalent(Context, Spec1->getDependentTemplateName(), - Spec2->getDependentTemplateName())) - return false; - if (!IsStructurallyEquivalent(Context, Spec1->template_arguments(), - Spec2->template_arguments())) - return false; - break; - } - case Type::PackExpansion: if (!IsStructurallyEquivalent(Context, cast(T1)->getPattern(), diff --git a/clang/lib/AST/ASTTypeTraits.cpp b/clang/lib/AST/ASTTypeTraits.cpp index d2f7fdbbad04d..84eb77730b1cb 100644 --- a/clang/lib/AST/ASTTypeTraits.cpp +++ b/clang/lib/AST/ASTTypeTraits.cpp @@ -249,10 +249,6 @@ SourceRange DynTypedNode::getSourceRange(bool IncludeQualifier) const { auto T = TL->castAs(); return SourceRange(T.getTemplateNameLoc(), T.getEndLoc()); } - case TypeLoc::DependentTemplateSpecialization: { - auto T = TL->castAs(); - return SourceRange(T.getTemplateNameLoc(), T.getEndLoc()); - } case TypeLoc::Enum: case TypeLoc::Record: case TypeLoc::InjectedClassName: diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp index 1d71708799518..274efccac79dc 100644 --- a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp +++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp @@ -24,15 +24,13 @@ void ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl, Function *Func) { assert(FuncDecl); assert(Func); + 
assert(FuncDecl->isThisDeclarationADefinition()); // Manually created functions that haven't been assigned proper // parameters yet. if (!FuncDecl->param_empty() && !FuncDecl->param_begin()) return; - if (!FuncDecl->isDefined()) - return; - // Set up lambda captures. if (const auto *MD = dyn_cast(FuncDecl); MD && isLambdaCallOperator(MD)) { @@ -87,7 +85,7 @@ void ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl, } // Set the function's code. - Func->setCode(NextLocalOffset, std::move(Code), std::move(SrcMap), + Func->setCode(FuncDecl, NextLocalOffset, std::move(Code), std::move(SrcMap), std::move(Scopes), FuncDecl->hasBody()); Func->setIsFullyCompiled(true); } diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.h b/clang/lib/AST/ByteCode/ByteCodeEmitter.h index d29db66325412..c050b299d8f61 100644 --- a/clang/lib/AST/ByteCode/ByteCodeEmitter.h +++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.h @@ -46,7 +46,8 @@ class ByteCodeEmitter { /// Methods implemented by the compiler. virtual bool visitFunc(const FunctionDecl *E) = 0; virtual bool visitExpr(const Expr *E, bool DestroyToplevelScope) = 0; - virtual bool visitDeclAndReturn(const VarDecl *E, bool ConstantContext) = 0; + virtual bool visitDeclAndReturn(const VarDecl *VD, const Expr *Init, + bool ConstantContext) = 0; virtual bool visit(const Expr *E) = 0; virtual bool emitBool(bool V, const Expr *E) = 0; diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 3f7db39281358..1340a84a7d44d 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2429,7 +2429,7 @@ bool Compiler::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E) { // and the RHS is our SubExpr. 
for (size_t I = 0; I != Size; ++I) { ArrayIndexScope IndexScope(this, I); - BlockScope BS(this); + LocalScope BS(this); if (!this->visitArrayElemInit(I, SubExpr, SubExprT)) return false; @@ -3986,6 +3986,10 @@ bool Compiler::VisitConvertVectorExpr(const ConvertVectorExpr *E) { template bool Compiler::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) { + // FIXME: Unary shuffle with mask not currently supported. + if (E->getNumSubExprs() == 2) + return this->emitInvalid(E); + assert(Initializing); assert(E->getNumSubExprs() > 2); @@ -4140,7 +4144,7 @@ bool Compiler::VisitCXXStdInitializerListExpr( template bool Compiler::VisitStmtExpr(const StmtExpr *E) { - BlockScope BS(this); + LocalScope BS(this); StmtExprScope SS(this); const CompoundStmt *CS = E->getSubStmt(); @@ -4714,7 +4718,8 @@ template VarCreationState Compiler::visitDecl(const VarDecl *VD, bool IsConstexprUnknown) { - auto R = this->visitVarDecl(VD, /*Toplevel=*/true, IsConstexprUnknown); + auto R = this->visitVarDecl(VD, VD->getInit(), /*Toplevel=*/true, + IsConstexprUnknown); if (R.notCreated()) return R; @@ -4740,14 +4745,12 @@ VarCreationState Compiler::visitDecl(const VarDecl *VD, /// We get here from evaluateAsInitializer(). /// We need to evaluate the initializer and return its value. template -bool Compiler::visitDeclAndReturn(const VarDecl *VD, +bool Compiler::visitDeclAndReturn(const VarDecl *VD, const Expr *Init, bool ConstantContext) { - // We only create variables if we're evaluating in a constant context. // Otherwise, just evaluate the initializer and return it. 
if (!ConstantContext) { DeclScope LS(this, VD); - const Expr *Init = VD->getInit(); if (!this->visit(Init)) return false; return this->emitRet(classify(Init).value_or(PT_Ptr), VD) && @@ -4755,7 +4758,7 @@ bool Compiler::visitDeclAndReturn(const VarDecl *VD, } LocalScope VDScope(this, VD); - if (!this->visitVarDecl(VD, /*Toplevel=*/true)) + if (!this->visitVarDecl(VD, Init, /*Toplevel=*/true)) return false; OptPrimType VarT = classify(VD->getType()); @@ -4802,9 +4805,9 @@ bool Compiler::visitDeclAndReturn(const VarDecl *VD, } template -VarCreationState Compiler::visitVarDecl(const VarDecl *VD, - bool Toplevel, - bool IsConstexprUnknown) { +VarCreationState +Compiler::visitVarDecl(const VarDecl *VD, const Expr *Init, + bool Toplevel, bool IsConstexprUnknown) { // We don't know what to do with these, so just return false. if (VD->getType().isNull()) return false; @@ -4814,7 +4817,6 @@ VarCreationState Compiler::visitVarDecl(const VarDecl *VD, if (!this->isActive()) return VarCreationState::NotCreated(); - const Expr *Init = VD->getInit(); OptPrimType VarT = classify(VD->getType()); if (Init && Init->isValueDependent()) @@ -5113,7 +5115,7 @@ bool Compiler::VisitCallExpr(const CallExpr *E) { } } - BlockScope CallScope(this, ScopeKind::Call); + LocalScope CallScope(this, ScopeKind::Call); QualType ReturnType = E->getCallReturnType(Ctx.getASTContext()); OptPrimType T = classify(ReturnType); @@ -5477,7 +5479,7 @@ template bool Compiler::visitStmt(const Stmt *S) { template bool Compiler::visitCompoundStmt(const CompoundStmt *S) { - BlockScope Scope(this); + LocalScope Scope(this); for (const auto *InnerStmt : S->body()) if (!visitStmt(InnerStmt)) return false; @@ -5488,7 +5490,8 @@ template bool Compiler::maybeEmitDeferredVarInit(const VarDecl *VD) { if (auto *DD = dyn_cast_if_present(VD)) { for (auto *BD : DD->flat_bindings()) - if (auto *KD = BD->getHoldingVar(); KD && !this->visitVarDecl(KD)) + if (auto *KD = BD->getHoldingVar(); + KD && !this->visitVarDecl(KD, 
KD->getInit())) return false; } return true; @@ -5552,7 +5555,7 @@ bool Compiler::visitDeclStmt(const DeclStmt *DS, const auto *VD = dyn_cast(D); if (!VD) return false; - if (!this->visitVarDecl(VD)) + if (!this->visitVarDecl(VD, VD->getInit())) return false; // Register decomposition decl holding vars. @@ -6212,7 +6215,7 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { InitLinkScope InitScope(this, InitLink::This()); for (const auto *Init : Ctor->inits()) { // Scope needed for the initializers. - BlockScope Scope(this); + LocalScope Scope(this); const Expr *InitExpr = Init->getInit(); if (const FieldDecl *Member = Init->getMember()) { @@ -7395,7 +7398,8 @@ bool Compiler::emitBuiltinBitCast(const CastExpr *E) { uint32_t ResultBitWidth = std::max(Ctx.getBitWidth(ToType), 8u); if (!this->emitBitCastPrim(*ToT, ToTypeIsUChar || ToType->isStdByteType(), - ResultBitWidth, TargetSemantics, E)) + ResultBitWidth, TargetSemantics, + ToType.getTypePtr(), E)) return false; if (DiscardResult) diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index c97dc18656ce4..09599b3547888 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -251,7 +251,8 @@ class Compiler : public ConstStmtVisitor, bool>, bool visitExpr(const Expr *E, bool DestroyToplevelScope) override; bool visitFunc(const FunctionDecl *F) override; - bool visitDeclAndReturn(const VarDecl *VD, bool ConstantContext) override; + bool visitDeclAndReturn(const VarDecl *VD, const Expr *Init, + bool ConstantContext) override; protected: /// Emits scope cleanup instructions. @@ -303,7 +304,8 @@ class Compiler : public ConstStmtVisitor, bool>, /// intact. bool delegate(const Expr *E); /// Creates and initializes a variable from the given decl. 
- VarCreationState visitVarDecl(const VarDecl *VD, bool Toplevel = false, + VarCreationState visitVarDecl(const VarDecl *VD, const Expr *Init, + bool Toplevel = false, bool IsConstexprUnknown = false); VarCreationState visitDecl(const VarDecl *VD, bool IsConstexprUnknown = false); @@ -622,14 +624,6 @@ template class LocalScope : public VariableScope { UnsignedOrNone Idx = std::nullopt; }; -/// Scope for storage declared in a compound statement. -// FIXME: Remove? -template class BlockScope final : public LocalScope { -public: - BlockScope(Compiler *Ctx, ScopeKind Kind = ScopeKind::Block) - : LocalScope(Ctx, Kind) {} -}; - template class ArrayIndexScope final { public: ArrayIndexScope(Compiler *Ctx, uint64_t Index) : Ctx(Ctx) { diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index 8598996681466..6e6c60925a70f 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -126,7 +126,7 @@ bool Context::evaluate(State &Parent, const Expr *E, APValue &Result, } bool Context::evaluateAsInitializer(State &Parent, const VarDecl *VD, - APValue &Result) { + const Expr *Init, APValue &Result) { ++EvalID; bool Recursing = !Stk.empty(); size_t StackSizeBefore = Stk.size(); @@ -135,7 +135,7 @@ bool Context::evaluateAsInitializer(State &Parent, const VarDecl *VD, bool CheckGlobalInitialized = shouldBeGloballyIndexed(VD) && (VD->getType()->isRecordType() || VD->getType()->isArrayType()); - auto Res = C.interpretDecl(VD, CheckGlobalInitialized); + auto Res = C.interpretDecl(VD, Init, CheckGlobalInitialized); if (Res.isInvalid()) { C.cleanup(); Stk.clearTo(StackSizeBefore); diff --git a/clang/lib/AST/ByteCode/Context.h b/clang/lib/AST/ByteCode/Context.h index fa98498dbe8fa..280a31725555f 100644 --- a/clang/lib/AST/ByteCode/Context.h +++ b/clang/lib/AST/ByteCode/Context.h @@ -59,7 +59,8 @@ class Context final { ConstantExprKind Kind); /// Evaluates a toplevel initializer. 
- bool evaluateAsInitializer(State &Parent, const VarDecl *VD, APValue &Result); + bool evaluateAsInitializer(State &Parent, const VarDecl *VD, const Expr *Init, + APValue &Result); bool evaluateCharRange(State &Parent, const Expr *SizeExpr, const Expr *PtrExpr, APValue &Result); diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index e349397078aa3..c7287999dd9c0 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -49,23 +49,21 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E, return std::move(this->EvalResult); } -EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD, +EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD, const Expr *Init, bool CheckFullyInitialized) { + assert(VD); + assert(Init); this->CheckFullyInitialized = CheckFullyInitialized; S.EvaluatingDecl = VD; S.setEvalLocation(VD->getLocation()); EvalResult.setSource(VD); - if (const Expr *Init = VD->getAnyInitializer()) { - QualType T = VD->getType(); - this->ConvertResultToRValue = !Init->isGLValue() && !T->isPointerType() && - !T->isObjCObjectPointerType(); - } else - this->ConvertResultToRValue = false; - + QualType T = VD->getType(); + this->ConvertResultToRValue = !Init->isGLValue() && !T->isPointerType() && + !T->isObjCObjectPointerType(); EvalResult.setSource(VD); - if (!this->visitDeclAndReturn(VD, S.inConstantContext())) + if (!this->visitDeclAndReturn(VD, Init, S.inConstantContext())) EvalResult.setInvalid(); S.EvaluatingDecl = nullptr; diff --git a/clang/lib/AST/ByteCode/EvalEmitter.h b/clang/lib/AST/ByteCode/EvalEmitter.h index 85a0a99fbb4b0..e81ea67adf97a 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.h +++ b/clang/lib/AST/ByteCode/EvalEmitter.h @@ -37,7 +37,8 @@ class EvalEmitter : public SourceMapper { EvaluationResult interpretExpr(const Expr *E, bool ConvertResultToRValue = false, bool DestroyToplevelScope = false); - EvaluationResult interpretDecl(const VarDecl 
*VD, bool CheckFullyInitialized); + EvaluationResult interpretDecl(const VarDecl *VD, const Expr *Init, + bool CheckFullyInitialized); /// Interpret the given Expr to a Pointer. EvaluationResult interpretAsPointer(const Expr *E, PtrCallback PtrCB); /// Interpret the given expression as if it was in the body of the given @@ -59,7 +60,8 @@ class EvalEmitter : public SourceMapper { /// Methods implemented by the compiler. virtual bool visitExpr(const Expr *E, bool DestroyToplevelScope) = 0; - virtual bool visitDeclAndReturn(const VarDecl *VD, bool ConstantContext) = 0; + virtual bool visitDeclAndReturn(const VarDecl *VD, const Expr *Init, + bool ConstantContext) = 0; virtual bool visitFunc(const FunctionDecl *F) = 0; virtual bool visit(const Expr *E) = 0; virtual bool emitBool(bool V, const Expr *E) = 0; diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h index af429b7849e88..95add5809afcc 100644 --- a/clang/lib/AST/ByteCode/Function.h +++ b/clang/lib/AST/ByteCode/Function.h @@ -236,9 +236,10 @@ class Function final { bool HasRVO, bool IsLambdaStaticInvoker); /// Sets the code of a function. 
- void setCode(unsigned NewFrameSize, llvm::SmallVector &&NewCode, - SourceMap &&NewSrcMap, llvm::SmallVector &&NewScopes, - bool NewHasBody) { + void setCode(FunctionDeclTy Source, unsigned NewFrameSize, + llvm::SmallVector &&NewCode, SourceMap &&NewSrcMap, + llvm::SmallVector &&NewScopes, bool NewHasBody) { + this->Source = Source; FrameSize = NewFrameSize; Code = std::move(NewCode); SrcMap = std::move(NewSrcMap); diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index b961a413fbe78..d5e75a0c90469 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1493,9 +1493,12 @@ bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { } static void compileFunction(InterpState &S, const Function *Func) { + const FunctionDecl *Definition = Func->getDecl()->getDefinition(); + if (!Definition) + return; + Compiler(S.getContext(), S.P) - .compileFunc(Func->getDecl()->getMostRecentDecl(), - const_cast(Func)); + .compileFunc(Definition, const_cast(Func)); } bool CallVar(InterpState &S, CodePtr OpPC, const Function *Func, diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 9a7bd03bea077..b3b4b998439cc 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2127,19 +2127,28 @@ bool InitElem(InterpState &S, CodePtr OpPC, uint32_t Idx) { if (Ptr.isUnknownSizeArray()) return false; + const Descriptor *Desc = Ptr.getFieldDesc(); // In the unlikely event that we're initializing the first item of // a non-array, skip the atIndex(). - if (Idx == 0 && !Ptr.getFieldDesc()->isArray()) { + if (Idx == 0 && !Desc->isArray()) { Ptr.initialize(); new (&Ptr.deref()) T(Value); return true; } - const Pointer &ElemPtr = Ptr.atIndex(Idx); - if (!CheckInit(S, OpPC, ElemPtr)) + if (!CheckLive(S, OpPC, Ptr, AK_Assign)) return false; - ElemPtr.initialize(); - new (&ElemPtr.deref()) T(Value); + if (Idx >= Desc->getNumElems()) { + // CheckRange. 
+ if (S.getLangOpts().CPlusPlus) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_access_past_end) + << AK_Assign << S.Current->getRange(OpPC); + } + return false; + } + Ptr.initializeElement(Idx); + new (&Ptr.elem(Idx)) T(Value); return true; } @@ -2148,22 +2157,32 @@ template ::T> bool InitElemPop(InterpState &S, CodePtr OpPC, uint32_t Idx) { const T &Value = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); + if (Ptr.isUnknownSizeArray()) return false; + const Descriptor *Desc = Ptr.getFieldDesc(); // In the unlikely event that we're initializing the first item of // a non-array, skip the atIndex(). - if (Idx == 0 && !Ptr.getFieldDesc()->isArray()) { + if (Idx == 0 && !Desc->isArray()) { Ptr.initialize(); new (&Ptr.deref()) T(Value); return true; } - const Pointer &ElemPtr = Ptr.atIndex(Idx); - if (!CheckInit(S, OpPC, ElemPtr)) + if (!CheckLive(S, OpPC, Ptr, AK_Assign)) + return false; + if (Idx >= Desc->getNumElems()) { + // CheckRange. + if (S.getLangOpts().CPlusPlus) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_access_past_end) + << AK_Assign << S.Current->getRange(OpPC); + } return false; - ElemPtr.initialize(); - new (&ElemPtr.deref()) T(Value); + } + Ptr.initializeElement(Idx); + new (&Ptr.elem(Idx)) T(Value); return true; } @@ -3156,6 +3175,9 @@ inline bool CopyArray(InterpState &S, CodePtr OpPC, uint32_t SrcIndex, const auto &SrcPtr = S.Stk.pop(); const auto &DestPtr = S.Stk.peek(); + if (SrcPtr.isDummy() || DestPtr.isDummy()) + return false; + for (uint32_t I = 0; I != Size; ++I) { const Pointer &SP = SrcPtr.atIndex(SrcIndex + I); @@ -3566,17 +3588,28 @@ bool InvalidNewDeleteExpr(InterpState &S, CodePtr OpPC, const Expr *E); template ::T> inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte, - uint32_t ResultBitWidth, - const llvm::fltSemantics *Sem) { + uint32_t ResultBitWidth, const llvm::fltSemantics *Sem, + const Type *TargetType) { 
const Pointer &FromPtr = S.Stk.pop(); if (!CheckLoad(S, OpPC, FromPtr)) return false; if constexpr (std::is_same_v) { + if (!TargetType->isNullPtrType()) { + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_bit_cast_invalid_type) + << /*IsToType=*/true << /*IsReference=*/false << 1 /*Pointer*/; + return false; + } // The only pointer type we can validly bitcast to is nullptr_t. S.Stk.push(); return true; + } else if constexpr (std::is_same_v) { + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_bit_cast_invalid_type) + << /*IsToType=*/true << /*IsReference=*/false << 2 /*MemberPointer*/; + return false; } else { size_t BuffSize = ResultBitWidth / 8; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 4461731c25648..b7b6d65c38e97 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -49,6 +49,13 @@ static APSInt popToAPSInt(InterpStack &Stk, PrimType T) { INT_TYPE_SWITCH(T, return Stk.pop().toAPSInt()); } +static APSInt popToAPSInt(InterpState &S, const Expr *E) { + return popToAPSInt(S.Stk, *S.getContext().classify(E->getType())); +} +static APSInt popToAPSInt(InterpState &S, QualType T) { + return popToAPSInt(S.Stk, *S.getContext().classify(T)); +} + /// Pushes \p Val on the stack as the type given by \p QT. 
static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) { assert(QT->isSignedIntegerOrEnumerationType() || @@ -1350,11 +1357,8 @@ static bool interp__builtin_ia32_bzhi(InterpState &S, CodePtr OpPC, !CallType->isIntegerType()) return false; - PrimType ValT = *S.Ctx.classify(Call->getArg(0)); - PrimType IndexT = *S.Ctx.classify(Call->getArg(1)); - - APSInt Idx = popToAPSInt(S.Stk, IndexT); - APSInt Val = popToAPSInt(S.Stk, ValT); + APSInt Idx = popToAPSInt(S, Call->getArg(1)); + APSInt Val = popToAPSInt(S, Call->getArg(0)); unsigned BitWidth = Val.getBitWidth(); uint64_t Index = Idx.extractBitsAsZExtValue(8, 0); @@ -1374,7 +1378,7 @@ static bool interp__builtin_ia32_lzcnt(InterpState &S, CodePtr OpPC, !Call->getArg(0)->getType()->isIntegerType()) return false; - APSInt Val = popToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(0))); + APSInt Val = popToAPSInt(S, Call->getArg(0)); pushInteger(S, Val.countLeadingZeros(), CallType); return true; } @@ -1387,7 +1391,7 @@ static bool interp__builtin_ia32_tzcnt(InterpState &S, CodePtr OpPC, !Call->getArg(0)->getType()->isIntegerType()) return false; - APSInt Val = popToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(0))); + APSInt Val = popToAPSInt(S, Call->getArg(0)); pushInteger(S, Val.countTrailingZeros(), CallType); return true; } @@ -1399,11 +1403,8 @@ static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC, !Call->getArg(1)->getType()->isIntegerType()) return false; - PrimType ValT = *S.Ctx.classify(Call->getArg(0)); - PrimType MaskT = *S.Ctx.classify(Call->getArg(1)); - - APSInt Mask = popToAPSInt(S.Stk, MaskT); - APSInt Val = popToAPSInt(S.Stk, ValT); + APSInt Mask = popToAPSInt(S, Call->getArg(1)); + APSInt Val = popToAPSInt(S, Call->getArg(0)); unsigned BitWidth = Val.getBitWidth(); APInt Result = APInt::getZero(BitWidth); @@ -1422,11 +1423,8 @@ static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC, !Call->getArg(1)->getType()->isIntegerType()) return false; - PrimType ValT = 
*S.Ctx.classify(Call->getArg(0)); - PrimType MaskT = *S.Ctx.classify(Call->getArg(1)); - - APSInt Mask = popToAPSInt(S.Stk, MaskT); - APSInt Val = popToAPSInt(S.Stk, ValT); + APSInt Mask = popToAPSInt(S, Call->getArg(1)); + APSInt Val = popToAPSInt(S, Call->getArg(0)); unsigned BitWidth = Val.getBitWidth(); APInt Result = APInt::getZero(BitWidth); @@ -1451,12 +1449,9 @@ static bool interp__builtin_ia32_addcarry_subborrow(InterpState &S, const Pointer &CarryOutPtr = S.Stk.pop(); - PrimType CarryInT = *S.getContext().classify(Call->getArg(0)); - PrimType LHST = *S.getContext().classify(Call->getArg(1)); - PrimType RHST = *S.getContext().classify(Call->getArg(2)); - APSInt RHS = popToAPSInt(S.Stk, RHST); - APSInt LHS = popToAPSInt(S.Stk, LHST); - APSInt CarryIn = popToAPSInt(S.Stk, CarryInT); + APSInt RHS = popToAPSInt(S, Call->getArg(2)); + APSInt LHS = popToAPSInt(S, Call->getArg(1)); + APSInt CarryIn = popToAPSInt(S, Call->getArg(0)); bool IsAdd = BuiltinOp == clang::X86::BI__builtin_ia32_addcarryx_u32 || BuiltinOp == clang::X86::BI__builtin_ia32_addcarryx_u64; @@ -1546,7 +1541,7 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC, discard(S.Stk, *S.getContext().classify(Arg)); } - APSInt Bytes = popToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(0))); + APSInt Bytes = popToAPSInt(S, Call->getArg(0)); CharUnits ElemSize = S.getASTContext().getTypeSizeInChars(ElemType); assert(!ElemSize.isZero()); // Divide the number of bytes by sizeof(ElemType), so we get the number of @@ -1740,9 +1735,7 @@ static bool interp__builtin_elementwise_abs(InterpState &S, CodePtr OpPC, assert(Call->getNumArgs() == 1); QualType Ty = Call->getArg(0)->getType(); if (Ty->isIntegerType()) { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - APSInt Val = popToAPSInt(S.Stk, ArgT); - + APSInt Val = popToAPSInt(S, Call->getArg(0)); pushInteger(S, Val.abs(), Call->getType()); return true; } @@ -1791,8 +1784,7 @@ static bool 
interp__builtin_elementwise_popcount(InterpState &S, CodePtr OpPC, unsigned BuiltinID) { assert(Call->getNumArgs() == 1); if (Call->getArg(0)->getType()->isIntegerType()) { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - APSInt Val = popToAPSInt(S.Stk, ArgT); + APSInt Val = popToAPSInt(S, Call->getArg(0)); if (BuiltinID == Builtin::BI__builtin_elementwise_popcount) { pushInteger(S, Val.popcount(), Call->getType()); @@ -1923,8 +1915,7 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID) { assert(Call->getNumArgs() == 3); const ASTContext &ASTCtx = S.getASTContext(); - PrimType SizeT = *S.getContext().classify(Call->getArg(2)); - APSInt Size = popToAPSInt(S.Stk, SizeT); + APSInt Size = popToAPSInt(S, Call->getArg(2)); const Pointer SrcPtr = S.Stk.pop(); const Pointer DestPtr = S.Stk.pop(); @@ -2090,8 +2081,7 @@ static bool interp__builtin_memcmp(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call, unsigned ID) { assert(Call->getNumArgs() == 3); - PrimType SizeT = *S.getContext().classify(Call->getArg(2)); - const APSInt &Size = popToAPSInt(S.Stk, SizeT); + const APSInt &Size = popToAPSInt(S, Call->getArg(2)); const Pointer &PtrB = S.Stk.pop(); const Pointer &PtrA = S.Stk.pop(); @@ -2206,12 +2196,10 @@ static bool interp__builtin_memchr(InterpState &S, CodePtr OpPC, diagnoseNonConstexprBuiltin(S, OpPC, ID); std::optional MaxLength; - PrimType DesiredT = *S.getContext().classify(Call->getArg(1)); - if (Call->getNumArgs() == 3) { - PrimType MaxT = *S.getContext().classify(Call->getArg(2)); - MaxLength = popToAPSInt(S.Stk, MaxT); - } - APSInt Desired = popToAPSInt(S.Stk, DesiredT); + if (Call->getNumArgs() == 3) + MaxLength = popToAPSInt(S, Call->getArg(2)); + + APSInt Desired = popToAPSInt(S, Call->getArg(1)); const Pointer &Ptr = S.Stk.pop(); if (MaxLength && MaxLength->isZero()) { @@ -2428,13 +2416,12 @@ static bool interp__builtin_object_size(InterpState &S, CodePtr 
OpPC, const InterpFrame *Frame, const CallExpr *Call) { const ASTContext &ASTCtx = S.getASTContext(); - PrimType KindT = *S.getContext().classify(Call->getArg(1)); // From the GCC docs: // Kind is an integer constant from 0 to 3. If the least significant bit is // clear, objects are whole variables. If it is set, a closest surrounding // subobject is considered the object a pointer points to. The second bit // determines if maximum or minimum of remaining bytes is computed. - unsigned Kind = popToAPSInt(S.Stk, KindT).getZExtValue(); + unsigned Kind = popToAPSInt(S, Call->getArg(1)).getZExtValue(); assert(Kind <= 3 && "unexpected kind"); bool UseFieldDesc = (Kind & 1u); bool ReportMinimum = (Kind & 2u); @@ -2562,10 +2549,8 @@ static bool interp__builtin_elementwise_int_binop( // Single integer case. if (!Call->getArg(0)->getType()->isVectorType()) { assert(!Call->getArg(1)->getType()->isVectorType()); - APSInt RHS = popToAPSInt( - S.Stk, *S.getContext().classify(Call->getArg(1)->getType())); - APSInt LHS = popToAPSInt( - S.Stk, *S.getContext().classify(Call->getArg(0)->getType())); + APSInt RHS = popToAPSInt(S, Call->getArg(1)); + APSInt LHS = popToAPSInt(S, Call->getArg(0)); APInt Result = Fn(LHS, RHS); pushInteger(S, APSInt(std::move(Result), !LHS.isSigned()), Call->getType()); return true; @@ -2581,8 +2566,7 @@ static bool interp__builtin_elementwise_int_binop( if (!Call->getArg(1)->getType()->isVectorType()) { assert(Call->getArg(1)->getType()->isIntegralOrEnumerationType()); - APSInt RHS = popToAPSInt( - S.Stk, *S.getContext().classify(Call->getArg(1)->getType())); + APSInt RHS = popToAPSInt(S, Call->getArg(1)); const Pointer &LHS = S.Stk.pop(); const Pointer &Dst = S.Stk.peek(); @@ -2635,10 +2619,8 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC, if (!Arg0Type->isVectorType()) { assert(!Call->getArg(1)->getType()->isVectorType()); - APSInt RHS = popToAPSInt( - S.Stk, *S.getContext().classify(Call->getArg(1)->getType())); - APSInt 
LHS = popToAPSInt( - S.Stk, *S.getContext().classify(Call->getArg(0)->getType())); + APSInt RHS = popToAPSInt(S, Call->getArg(1)); + APSInt LHS = popToAPSInt(S, Arg0Type); APInt Result; if (BuiltinID == Builtin::BI__builtin_elementwise_max) { Result = std::max(LHS, RHS); @@ -2808,8 +2790,7 @@ static bool interp__builtin_select(InterpState &S, CodePtr OpPC, const CallExpr *Call) { const Pointer &RHS = S.Stk.pop(); const Pointer &LHS = S.Stk.pop(); - PrimType MaskT = *S.getContext().classify(Call->getArg(0)); - APSInt Mask = popToAPSInt(S.Stk, MaskT); + APSInt Mask = popToAPSInt(S, Call->getArg(0)); const Pointer &Dst = S.Stk.peek(); assert(LHS.getNumElems() == RHS.getNumElems()); @@ -2837,6 +2818,39 @@ static bool interp__builtin_select(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + APSInt Mask = popToAPSInt(S, Call->getArg(2)); + const Pointer &TrueVec = S.Stk.pop(); + const Pointer &FalseVec = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + assert(FalseVec.getNumElems() == TrueVec.getNumElems()); + assert(FalseVec.getNumElems() == Dst.getNumElems()); + unsigned NumElems = FalseVec.getNumElems(); + PrimType ElemT = FalseVec.getFieldDesc()->getPrimType(); + PrimType DstElemT = Dst.getFieldDesc()->getPrimType(); + + for (unsigned I = 0; I != NumElems; ++I) { + bool MaskBit = Mask[I % 8]; + if (ElemT == PT_Float) { + assert(DstElemT == PT_Float); + Dst.elem(I) = + MaskBit ? TrueVec.elem(I) : FalseVec.elem(I); + } else { + assert(DstElemT == ElemT); + INT_TYPE_SWITCH_NO_BOOL(DstElemT, { + Dst.elem(I) = + static_cast(MaskBit ? 
TrueVec.elem(I).toAPSInt() + : FalseVec.elem(I).toAPSInt()); + }); + } + } + Dst.initializeAllElements(); + + return true; +} + static bool interp__builtin_elementwise_triop( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref @@ -2844,14 +2858,12 @@ static bool interp__builtin_elementwise_triop( assert(Call->getNumArgs() == 3); QualType Arg0Type = Call->getArg(0)->getType(); - QualType Arg1Type = Call->getArg(1)->getType(); QualType Arg2Type = Call->getArg(2)->getType(); - // Non-vector integer types. if (!Arg0Type->isVectorType()) { - const APSInt &Op2 = popToAPSInt(S.Stk, *S.getContext().classify(Arg2Type)); - const APSInt &Op1 = popToAPSInt(S.Stk, *S.getContext().classify(Arg1Type)); - const APSInt &Op0 = popToAPSInt(S.Stk, *S.getContext().classify(Arg0Type)); + const APSInt &Op2 = popToAPSInt(S, Arg2Type); + const APSInt &Op1 = popToAPSInt(S, Call->getArg(1)); + const APSInt &Op0 = popToAPSInt(S, Arg0Type); APSInt Result = APSInt(Fn(Op0, Op1, Op2), Op0.isUnsigned()); pushInteger(S, Result, Call->getType()); return true; @@ -2864,8 +2876,7 @@ static bool interp__builtin_elementwise_triop( // Vector + Vector + Scalar case. 
if (!Arg2Type->isVectorType()) { - APSInt Op2 = popToAPSInt( - S.Stk, *S.getContext().classify(Call->getArg(2)->getType())); + APSInt Op2 = popToAPSInt(S, Arg2Type); const Pointer &Op1 = S.Stk.pop(); const Pointer &Op0 = S.Stk.pop(); @@ -3502,6 +3513,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return llvm::APIntOps::fshr(Hi, Lo, Amt); }); + case clang::X86::BI__builtin_ia32_blendpd: + case clang::X86::BI__builtin_ia32_blendpd256: + case clang::X86::BI__builtin_ia32_blendps: + case clang::X86::BI__builtin_ia32_blendps256: + case clang::X86::BI__builtin_ia32_pblendw128: + case clang::X86::BI__builtin_ia32_pblendw256: + case clang::X86::BI__builtin_ia32_pblendd128: + case clang::X86::BI__builtin_ia32_pblendd256: + return interp__builtin_blend(S, OpPC, Call); + case clang::X86::BI__builtin_ia32_blendvpd: case clang::X86::BI__builtin_ia32_blendvpd256: case clang::X86::BI__builtin_ia32_blendvps: diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp index feac97d4b1a69..4bd9c66fc9974 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp @@ -441,13 +441,27 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC, if (llvm::sys::IsBigEndianHost) swapBytes(Memory.get(), FullBitWidth.roundToBytes()); - BITCAST_TYPE_SWITCH_FIXED_SIZE(T, { - if (BitWidth.nonZero()) - P.deref() = T::bitcastFromMemory(Memory.get(), T::bitWidth()) - .truncate(BitWidth.getQuantity()); - else - P.deref() = T::zero(); - }); + if (T == PT_IntAPS) { + P.deref>() = + S.allocAP>(FullBitWidth.getQuantity()); + IntegralAP::bitcastFromMemory(Memory.get(), + FullBitWidth.getQuantity(), + &P.deref>()); + } else if (T == PT_IntAP) { + P.deref>() = + S.allocAP>(FullBitWidth.getQuantity()); + IntegralAP::bitcastFromMemory(Memory.get(), + FullBitWidth.getQuantity(), + &P.deref>()); + } else { + BITCAST_TYPE_SWITCH_FIXED_SIZE(T, { + if 
(BitWidth.nonZero()) + P.deref() = T::bitcastFromMemory(Memory.get(), T::bitWidth()) + .truncate(BitWidth.getQuantity()); + else + P.deref() = T::zero(); + }); + } P.initialize(); return true; }); diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index b9dc2aed23113..c411a371282ef 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -24,13 +24,13 @@ using namespace clang::interp; InterpFrame::InterpFrame(InterpState &S) : Caller(nullptr), S(S), Depth(0), Func(nullptr), RetPC(CodePtr()), - ArgSize(0), Args(nullptr), FrameOffset(0), IsBottom(true) {} + ArgSize(0), Args(nullptr), FrameOffset(0) {} InterpFrame::InterpFrame(InterpState &S, const Function *Func, InterpFrame *Caller, CodePtr RetPC, unsigned ArgSize) : Caller(Caller), S(S), Depth(Caller ? Caller->Depth + 1 : 0), Func(Func), RetPC(RetPC), ArgSize(ArgSize), Args(static_cast(S.Stk.top())), - FrameOffset(S.Stk.size()), IsBottom(!Caller) { + FrameOffset(S.Stk.size()) { if (!Func) return; diff --git a/clang/lib/AST/ByteCode/InterpFrame.h b/clang/lib/AST/ByteCode/InterpFrame.h index cf4d27d341e91..129851155bd86 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.h +++ b/clang/lib/AST/ByteCode/InterpFrame.h @@ -129,7 +129,7 @@ class InterpFrame final : public Frame { bool isStdFunction() const; - bool isBottomFrame() const { return IsBottom; } + bool isBottomFrame() const { return !Caller; } void dump() const { dump(llvm::errs(), 0); } void dump(llvm::raw_ostream &OS, unsigned Indent = 0) const; @@ -179,7 +179,6 @@ class InterpFrame final : public Frame { const size_t FrameOffset; /// Mapping from arg offsets to their argument blocks. 
llvm::DenseMap> Params; - bool IsBottom = false; }; } // namespace interp diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h index e095908bce986..a13244bf383ae 100644 --- a/clang/lib/AST/ByteCode/InterpState.h +++ b/clang/lib/AST/ByteCode/InterpState.h @@ -67,7 +67,10 @@ class InterpState final : public State, public SourceMapper { Expr::EvalStatus &getEvalStatus() const override { return Parent.getEvalStatus(); } - ASTContext &getASTContext() const override { return Parent.getASTContext(); } + ASTContext &getASTContext() const override { return Ctx.getASTContext(); } + const LangOptions &getLangOpts() const { + return Ctx.getASTContext().getLangOpts(); + } // Forward status checks and updates to the walker. bool keepEvaluatingAfterFailure() const override { @@ -122,7 +125,9 @@ class InterpState final : public State, public SourceMapper { StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const; void *allocate(size_t Size, unsigned Align = 8) const { - return Allocator.Allocate(Size, Align); + if (!Allocator) + Allocator.emplace(); + return Allocator->Allocate(Size, Align); } template T *allocate(size_t Num = 1) const { return static_cast(allocate(Num * sizeof(T), alignof(T))); @@ -188,7 +193,7 @@ class InterpState final : public State, public SourceMapper { /// for. 
llvm::SmallVector InitializingBlocks; - mutable llvm::BumpPtrAllocator Allocator; + mutable std::optional Allocator; }; class InterpStateCCOverride final { diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 95a44333e8e04..7af2df5318106 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -872,12 +872,12 @@ def CheckNull : Opcode; def BitCastTypeClass : TypeClass { let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64, - IntAP, IntAPS, Bool, Float, Ptr]; + IntAP, IntAPS, Bool, Float, Ptr, MemberPtr]; } def BitCastPrim : Opcode { let Types = [BitCastTypeClass]; - let Args = [ArgBool, ArgUint32, ArgFltSemantics]; + let Args = [ArgBool, ArgUint32, ArgFltSemantics, ArgTypePtr]; let HasGroup = 1; } diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index ef75b0ded4f1f..81d4ce14f9310 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -433,7 +433,8 @@ bool Pointer::isInitialized() const { if (!isBlockPointer()) return true; - if (isRoot() && BS.Base == sizeof(GlobalInlineDescriptor)) { + if (isRoot() && BS.Base == sizeof(GlobalInlineDescriptor) && + Offset == BS.Base) { const GlobalInlineDescriptor &GD = *reinterpret_cast(block()->rawData()); return GD.InitState == GlobalInitState::Initialized; @@ -461,7 +462,8 @@ bool Pointer::isElementInitialized(unsigned Index) const { if (isStatic() && BS.Base == 0) return true; - if (isRoot() && BS.Base == sizeof(GlobalInlineDescriptor)) { + if (isRoot() && BS.Base == sizeof(GlobalInlineDescriptor) && + Offset == BS.Base) { const GlobalInlineDescriptor &GD = *reinterpret_cast(block()->rawData()); return GD.InitState == GlobalInitState::Initialized; @@ -486,7 +488,8 @@ void Pointer::initialize() const { assert(BS.Pointee && "Cannot initialize null pointer"); - if (isRoot() && BS.Base == sizeof(GlobalInlineDescriptor)) { + if (isRoot() && BS.Base == 
sizeof(GlobalInlineDescriptor) && + Offset == BS.Base) { GlobalInlineDescriptor &GD = *reinterpret_cast( asBlockPointer().Pointee->rawData()); GD.InitState = GlobalInitState::Initialized; @@ -496,35 +499,39 @@ void Pointer::initialize() const { const Descriptor *Desc = getFieldDesc(); assert(Desc); if (Desc->isPrimitiveArray()) { - // Primitive global arrays don't have an initmap. - if (isStatic() && BS.Base == 0) - return; + if (Desc->getNumElems() != 0) + initializeElement(getIndex()); + return; + } - // Nothing to do for these. - if (Desc->getNumElems() == 0) - return; + // Field has its bit in an inline descriptor. + assert(BS.Base != 0 && "Only composite fields can be initialised"); + getInlineDesc()->IsInitialized = true; +} - InitMapPtr &IM = getInitMap(); - if (!IM) - IM = - std::make_pair(false, std::make_shared(Desc->getNumElems())); +void Pointer::initializeElement(unsigned Index) const { + // Primitive global arrays don't have an initmap. + if (isStatic() && BS.Base == 0) + return; - assert(IM); + assert(Index < getFieldDesc()->getNumElems()); - // All initialized. - if (IM->first) - return; + InitMapPtr &IM = getInitMap(); + if (!IM) { + const Descriptor *Desc = getFieldDesc(); + IM = std::make_pair(false, std::make_shared(Desc->getNumElems())); + } - if (IM->second->initializeElement(getIndex())) { - IM->first = true; - IM->second.reset(); - } + assert(IM); + + // All initialized. + if (IM->first) return; - } - // Field has its bit in an inline descriptor. 
- assert(BS.Base != 0 && "Only composite fields can be initialised"); - getInlineDesc()->IsInitialized = true; + if (IM->second->initializeElement(Index)) { + IM->first = true; + IM->second.reset(); + } } void Pointer::initializeAllElements() const { @@ -547,7 +554,8 @@ bool Pointer::allElementsInitialized() const { if (isStatic() && BS.Base == 0) return true; - if (isRoot() && BS.Base == sizeof(GlobalInlineDescriptor)) { + if (isRoot() && BS.Base == sizeof(GlobalInlineDescriptor) && + Offset == BS.Base) { const GlobalInlineDescriptor &GD = *reinterpret_cast(block()->rawData()); return GD.InitState == GlobalInitState::Initialized; diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index 49d701c3e27b6..bbf20801ce923 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -75,7 +75,7 @@ enum class Storage { Block, Int, Fn, Typeid }; /// data the pointer decribes can be found at /// Pointee->rawData() + Pointer.Offset. /// -/// +/// \verbatim /// Pointee Offset /// │ │ /// │ │ @@ -87,6 +87,7 @@ enum class Storage { Block, Int, Fn, Typeid }; /// │ /// │ /// Base +/// \endverbatim class Pointer { private: static constexpr unsigned PastEndMark = ~0u; @@ -701,6 +702,8 @@ class Pointer { /// Initializes a field. void initialize() const; + /// Initialized the given element of a primitive array. + void initializeElement(unsigned Index) const; /// Initialize all elements of a primitive array at once. This can be /// used in situations where we *know* we have initialized *all* elements /// of a primtive array. 
diff --git a/clang/lib/AST/ByteCode/State.cpp b/clang/lib/AST/ByteCode/State.cpp index dc3d0da7a4a46..323231fbf8236 100644 --- a/clang/lib/AST/ByteCode/State.cpp +++ b/clang/lib/AST/ByteCode/State.cpp @@ -112,10 +112,6 @@ OptionalDiagnostic State::diag(SourceLocation Loc, diag::kind DiagId, return OptionalDiagnostic(); } -const LangOptions &State::getLangOpts() const { - return getASTContext().getLangOpts(); -} - void State::addCallStack(unsigned Limit) { // Determine which calls to skip, if any. unsigned ActiveCalls = getCallStackDepth() - 1; diff --git a/clang/lib/AST/ByteCode/State.h b/clang/lib/AST/ByteCode/State.h index a834eed142de0..0695c61c07a05 100644 --- a/clang/lib/AST/ByteCode/State.h +++ b/clang/lib/AST/ByteCode/State.h @@ -151,8 +151,6 @@ class State { /// Directly reports a diagnostic message. DiagnosticBuilder report(SourceLocation Loc, diag::kind DiagId); - const LangOptions &getLangOpts() const; - /// Whether or not we're in a context where the front end requires a /// constant value. bool InConstantContext = false; diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index aa1f5a1146599..43264f835122f 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -3119,6 +3119,22 @@ void CXXDestructorDecl::setOperatorDelete(FunctionDecl *OD, Expr *ThisArg) { } } +void CXXDestructorDecl::setOperatorGlobalDelete(FunctionDecl *OD) { + // FIXME: C++23 [expr.delete] specifies that the delete operator will be + // a usual deallocation function declared at global scope. A convenient + // function to assert that is lacking; Sema::isUsualDeallocationFunction() + // only works for CXXMethodDecl. 
+ assert(!OD || + (OD->getDeclName().getCXXOverloadedOperator() == OO_Delete && + OD->getDeclContext()->getRedeclContext()->isTranslationUnit())); + auto *Canonical = cast(getCanonicalDecl()); + if (!Canonical->OperatorGlobalDelete) { + Canonical->OperatorGlobalDelete = OD; + if (auto *L = getASTMutationListener()) + L->ResolvedOperatorGlobDelete(Canonical, OD); + } +} + bool CXXDestructorDecl::isCalledByDelete(const FunctionDecl *OpDel) const { // C++20 [expr.delete]p6: If the value of the operand of the delete- // expression is not a null pointer value and the selected deallocation diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 3162857aac5d0..b6bb6117d42af 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -663,6 +663,7 @@ CanQualType ClassTemplateDecl::getCanonicalInjectedSpecializationType( Ctx.canonicalizeTemplateArguments(CanonicalArgs); CommonPtr->CanonInjectedTST = CanQualType::CreateUnsafe(Ctx.getCanonicalTemplateSpecializationType( + ElaboratedTypeKeyword::None, TemplateName(const_cast(getCanonicalDecl())), CanonicalArgs)); } @@ -1209,6 +1210,7 @@ ClassTemplatePartialSpecializationDecl::getCanonicalInjectedSpecializationType( if (CanonInjectedTST.isNull()) { CanonInjectedTST = CanQualType::CreateUnsafe(Ctx.getCanonicalTemplateSpecializationType( + ElaboratedTypeKeyword::None, TemplateName(getSpecializedTemplate()->getCanonicalDecl()), getTemplateArgs().asArray())); } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 5145896930153..b2cb9e2b3c347 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -945,6 +945,7 @@ namespace { } ASTContext &getASTContext() const override { return Ctx; } + const LangOptions &getLangOpts() const { return Ctx.getLangOpts(); } void setEvaluatingDecl(APValue::LValueBase Base, APValue &Value, EvaluatingDeclKind EDK = EvaluatingDeclKind::Ctor) { @@ -11926,6 +11927,33 @@ bool 
VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_blendpd: + case X86::BI__builtin_ia32_blendpd256: + case X86::BI__builtin_ia32_blendps: + case X86::BI__builtin_ia32_blendps256: + case X86::BI__builtin_ia32_pblendw128: + case X86::BI__builtin_ia32_pblendw256: + case X86::BI__builtin_ia32_pblendd128: + case X86::BI__builtin_ia32_pblendd256: { + APValue SourceF, SourceT, SourceC; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceF) || + !EvaluateAsRValue(Info, E->getArg(1), SourceT) || + !EvaluateAsRValue(Info, E->getArg(2), SourceC)) + return false; + + const APInt &C = SourceC.getInt(); + unsigned SourceLen = SourceF.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(SourceLen); + for (unsigned EltNum = 0; EltNum != SourceLen; ++EltNum) { + const APValue &F = SourceF.getVectorElt(EltNum); + const APValue &T = SourceT.getVectorElt(EltNum); + ResultElements.push_back(C[EltNum % 8] ? T : F); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case X86::BI__builtin_ia32_blendvpd: case X86::BI__builtin_ia32_blendvpd256: case X86::BI__builtin_ia32_blendvps: @@ -12160,6 +12188,9 @@ static bool handleVectorShuffle(EvalInfo &Info, const ShuffleVectorExpr *E, } bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) { + // FIXME: Unary shuffle with mask not currently supported. 
+ if (E->getNumSubExprs() == 2) + return Error(E); APValue VecVal1; const Expr *Vec1 = E->getExpr(0); if (!EvaluateAsRValue(Info, Vec1, VecVal1)) @@ -14071,6 +14102,7 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return Success(Result, E); } } + llvm_unreachable("Fully covered switch above"); } case Builtin::BIstrlen: case Builtin::BIwcslen: @@ -17731,6 +17763,7 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx, bool IsConstantInitialization) const { assert(!isValueDependent() && "Expression evaluator can't be called on a dependent expression."); + assert(VD && "Need a valid VarDecl"); llvm::TimeTraceScope TimeScope("EvaluateAsInitializer", [&] { std::string Name; @@ -17755,7 +17788,7 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx, if (Info.EnableNewConstInterp) { auto &InterpCtx = const_cast(Ctx).getInterpContext(); - if (!InterpCtx.evaluateAsInitializer(Info, VD, Value)) + if (!InterpCtx.evaluateAsInitializer(Info, VD, this, Value)) return false; return CheckConstantExpression(Info, DeclLoc, DeclTy, Value, diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 163cd43abd45a..2173aed5b45af 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1311,19 +1311,6 @@ void CXXNameMangler::manglePrefix(QualType type) { mangleTemplateArgs(TST->getTemplateName(), TST->template_arguments()); addSubstitution(QualType(TST, 0)); } - } else if (const auto *DTST = - type->getAs()) { - if (!mangleSubstitution(QualType(DTST, 0))) { - TemplateName Template = getASTContext().getDependentTemplateName( - DTST->getDependentTemplateName()); - mangleTemplatePrefix(Template); - - // FIXME: GCC does not appear to mangle the template arguments when - // the template in question is a dependent template name. Should we - // emulate that badness? 
- mangleTemplateArgs(Template, DTST->template_arguments()); - addSubstitution(QualType(DTST, 0)); - } } else if (const auto *DNT = type->getAs()) { // Clang 14 and before did not consider this substitutable. bool Clang14Compat = isCompatibleWith(LangOptions::ClangABI::Ver14); @@ -2525,10 +2512,14 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, mangleSourceNameWithAbiTags(TD); break; } + case TemplateName::DependentTemplate: { + const DependentTemplateStorage *S = TN.getAsDependentTemplateName(); + mangleSourceName(S->getName().getIdentifier()); + break; + } case TemplateName::OverloadedTemplate: case TemplateName::AssumedTemplate: - case TemplateName::DependentTemplate: case TemplateName::DeducedTemplate: llvm_unreachable("invalid base for a template specialization type"); @@ -2574,17 +2565,6 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, mangleSourceName(cast(Ty)->getIdentifier()); break; - case Type::DependentTemplateSpecialization: { - const DependentTemplateSpecializationType *DTST = - cast(Ty); - TemplateName Template = getASTContext().getDependentTemplateName( - DTST->getDependentTemplateName()); - const DependentTemplateStorage &S = DTST->getDependentTemplateName(); - mangleSourceName(S.getName().getIdentifier()); - mangleTemplateArgs(Template, DTST->template_arguments()); - break; - } - case Type::Using: return mangleUnresolvedTypeOrSimpleId(cast(Ty)->desugar(), Prefix); @@ -4458,16 +4438,14 @@ void CXXNameMangler::mangleType(const TemplateSpecializationType *T) { if (TemplateDecl *TD = T->getTemplateName().getAsTemplateDecl()) { mangleTemplateName(TD, T->template_arguments()); } else { - if (mangleSubstitution(QualType(T, 0))) - return; - + Out << 'N'; mangleTemplatePrefix(T->getTemplateName()); // FIXME: GCC does not appear to mangle the template arguments when // the template in question is a dependent template name. Should we // emulate that badness? 
mangleTemplateArgs(T->getTemplateName(), T->template_arguments()); - addSubstitution(QualType(T, 0)); + Out << 'E'; } } @@ -4505,21 +4483,6 @@ void CXXNameMangler::mangleType(const DependentNameType *T) { Out << 'E'; } -void CXXNameMangler::mangleType(const DependentTemplateSpecializationType *T) { - // Dependently-scoped template types are nested if they have a prefix. - Out << 'N'; - - TemplateName Prefix = - getASTContext().getDependentTemplateName(T->getDependentTemplateName()); - mangleTemplatePrefix(Prefix); - - // FIXME: GCC does not appear to mangle the template arguments when - // the template in question is a dependent template name. Should we - // emulate that badness? - mangleTemplateArgs(Prefix, T->template_arguments()); - Out << 'E'; -} - void CXXNameMangler::mangleType(const TypeOfType *T) { // FIXME: this is pretty unsatisfactory, but there isn't an obvious // "extension with parameters" mangling. diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index d96472e393f68..8cbc72b1db735 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -3655,12 +3655,6 @@ void MicrosoftCXXNameMangler::mangleType(const DependentNameType *T, Qualifiers, Error(Range.getBegin(), "dependent name type") << Range; } -void MicrosoftCXXNameMangler::mangleType( - const DependentTemplateSpecializationType *T, Qualifiers, - SourceRange Range) { - Error(Range.getBegin(), "dependent template specialization type") << Range; -} - void MicrosoftCXXNameMangler::mangleType(const PackExpansionType *T, Qualifiers, SourceRange Range) { Error(Range.getBegin(), "pack expansion") << Range; diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index fb95f58092c49..6842038b7eb57 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -1213,16 +1213,6 @@ class ODRTypeVisitor : public TypeVisitor { VisitTypeWithKeyword(T); } - void VisitDependentTemplateSpecializationType( - const 
DependentTemplateSpecializationType *T) { - Hash.AddDependentTemplateName(T->getDependentTemplateName()); - ID.AddInteger(T->template_arguments().size()); - for (const auto &TA : T->template_arguments()) { - Hash.AddTemplateArgument(TA); - } - VisitTypeWithKeyword(T); - } - void VisitUnaryTransformType(const UnaryTransformType *T) { AddQualType(T->getUnderlyingType()); AddQualType(T->getBaseType()); diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 0930ca27c29f8..69d33019c0952 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1911,8 +1911,13 @@ void OMPClausePrinter::VisitOMPDetachClause(OMPDetachClause *Node) { void OMPClausePrinter::VisitOMPDefaultClause(OMPDefaultClause *Node) { OS << "default(" << getOpenMPSimpleClauseTypeName(OMPC_default, - unsigned(Node->getDefaultKind())) - << ")"; + unsigned(Node->getDefaultKind())); + if (Version >= 60 && Node->getDefaultVC() != OMPC_DEFAULT_VC_all) { + OS << ":" + << getOpenMPDefaultVariableCategoryName(unsigned(Node->getDefaultVC())); + } + + OS << ")"; } void OMPClausePrinter::VisitOMPProcBindClause(OMPProcBindClause *Node) { diff --git a/clang/lib/AST/QualTypeNames.cpp b/clang/lib/AST/QualTypeNames.cpp index ee7fec3372fcf..a2f930911bfe5 100644 --- a/clang/lib/AST/QualTypeNames.cpp +++ b/clang/lib/AST/QualTypeNames.cpp @@ -58,9 +58,9 @@ static bool getFullyQualifiedTemplateName(const ASTContext &Ctx, NestedNameSpecifier NNS = std::nullopt; TemplateDecl *ArgTDecl = TName.getAsTemplateDecl(); - // ArgTDecl won't be NULL because we asserted that this isn't a - // dependent context very early in the call chain. - assert(ArgTDecl != nullptr); + if (!ArgTDecl) // ArgTDecl can be null in dependent contexts. 
+ return false; + QualifiedTemplateName *QTName = TName.getAsQualifiedTemplateName(); if (QTName && @@ -252,6 +252,9 @@ createNestedNameSpecifierForScopeOf(const ASTContext &Ctx, const Decl *Decl, bool WithGlobalNsPrefix) { assert(Decl); + // Some declaration cannot be qualified. + if (Decl->isTemplateParameter()) + return std::nullopt; const DeclContext *DC = Decl->getDeclContext()->getRedeclContext(); const auto *Outer = dyn_cast(DC); const auto *OuterNS = dyn_cast(DC); diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index 36ecaf6489ef0..1f6586f95a9f8 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -139,13 +139,14 @@ bool OMPLoopBasedDirective::doForAllLoops( Stmt *TransformedStmt = Dir->getTransformedStmt(); if (!TransformedStmt) { - unsigned NumGeneratedLoops = Dir->getNumGeneratedLoops(); - if (NumGeneratedLoops == 0) { + unsigned NumGeneratedTopLevelLoops = + Dir->getNumGeneratedTopLevelLoops(); + if (NumGeneratedTopLevelLoops == 0) { // May happen if the loop transformation does not result in a // generated loop (such as full unrolling). break; } - if (NumGeneratedLoops > 0) { + if (NumGeneratedTopLevelLoops > 0) { // The loop transformation construct has generated loops, but these // may not have been generated yet due to being in a dependent // context. 
@@ -447,16 +448,16 @@ OMPStripeDirective *OMPStripeDirective::CreateEmpty(const ASTContext &C, SourceLocation(), SourceLocation(), NumLoops); } -OMPUnrollDirective * -OMPUnrollDirective::Create(const ASTContext &C, SourceLocation StartLoc, - SourceLocation EndLoc, ArrayRef Clauses, - Stmt *AssociatedStmt, unsigned NumGeneratedLoops, - Stmt *TransformedStmt, Stmt *PreInits) { - assert(NumGeneratedLoops <= 1 && "Unrolling generates at most one loop"); +OMPUnrollDirective *OMPUnrollDirective::Create( + const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *AssociatedStmt, + unsigned NumGeneratedTopLevelLoops, Stmt *TransformedStmt, Stmt *PreInits) { + assert(NumGeneratedTopLevelLoops <= 1 && + "Unrolling generates at most one loop"); auto *Dir = createDirective( C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc); - Dir->setNumGeneratedLoops(NumGeneratedLoops); + Dir->setNumGeneratedTopLevelLoops(NumGeneratedTopLevelLoops); Dir->setTransformedStmt(TransformedStmt); Dir->setPreInits(PreInits); return Dir; diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp index f2cb15dbc43dd..2b8044e4188cd 100644 --- a/clang/lib/AST/TemplateName.cpp +++ b/clang/lib/AST/TemplateName.cpp @@ -213,25 +213,25 @@ TemplateDecl *TemplateName::getAsTemplateDecl(bool IgnoreDeduced) const { dyn_cast_if_present(Name.Storage)); } -std::pair +std::pair TemplateName::getTemplateDeclAndDefaultArgs() const { + DefaultArguments DefArgs; for (TemplateName Name = *this; /**/; /**/) { - if (Name.getKind() == TemplateName::DeducedTemplate) { - DeducedTemplateStorage *DTS = Name.getAsDeducedTemplateName(); - TemplateDecl *TD = - DTS->getUnderlying().getAsTemplateDecl(/*IgnoreDeduced=*/true); - DefaultArguments DefArgs = DTS->getDefaultArguments(); - if (TD && DefArgs) + if (DeducedTemplateStorage *DTS = Name.getAsDeducedTemplateName()) { + assert(!DefArgs && "multiple default args?"); + DefArgs = 
DTS->getDefaultArguments(); + if (TemplateDecl *TD = DTS->getUnderlying().getAsTemplateDecl(); + TD && DefArgs) assert(DefArgs.StartPos + DefArgs.Args.size() <= TD->getTemplateParameters()->size()); - return {TD, DTS->getDefaultArguments()}; + Name = DTS->getUnderlying(); } if (std::optional UnderlyingOrNone = Name.desugar(/*IgnoreDeduced=*/false)) { Name = *UnderlyingOrNone; continue; } - return {cast_if_present(Name.Storage.dyn_cast()), {}}; + return {Name, DefArgs}; } } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 86621795d81e6..9794314a98f81 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -1933,10 +1933,6 @@ NestedNameSpecifier Type::getPrefix() const { return cast(this) ->getTemplateName() .getQualifier(); - case Type::DependentTemplateSpecialization: - return cast(this) - ->getDependentTemplateName() - .getQualifier(); case Type::Enum: case Type::Record: case Type::InjectedClassName: @@ -3215,7 +3211,6 @@ bool Type::isSpecifierType() const { case SubstTemplateTypeParm: case TemplateSpecialization: case DependentName: - case DependentTemplateSpecialization: case ObjCInterface: case ObjCObject: return true; @@ -3333,42 +3328,12 @@ StringRef KeywordHelpers::getKeywordName(ElaboratedTypeKeyword Keyword) { llvm_unreachable("Unknown elaborated type keyword."); } -DependentTemplateSpecializationType::DependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, - ArrayRef Args, QualType Canon) - : TypeWithKeyword(Keyword, DependentTemplateSpecialization, Canon, - - toTypeDependence(Name.getDependence())), - Name(Name) { - DependentTemplateSpecializationTypeBits.NumArgs = Args.size(); - auto *ArgBuffer = const_cast(template_arguments().data()); - for (const TemplateArgument &Arg : Args) { - addDependence(toTypeDependence(Arg.getDependence() & - TemplateArgumentDependence::UnexpandedPack)); - - new (ArgBuffer++) TemplateArgument(Arg); - } -} - -void 
DependentTemplateSpecializationType::Profile( - llvm::FoldingSetNodeID &ID, const ASTContext &Context, - ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, - ArrayRef Args) { - ID.AddInteger(llvm::to_underlying(Keyword)); - Name.Profile(ID); - for (const TemplateArgument &Arg : Args) - Arg.Profile(ID, Context); -} - bool Type::isElaboratedTypeSpecifier() const { ElaboratedTypeKeyword Keyword; if (const auto *TST = dyn_cast(this)) Keyword = TST->getKeyword(); else if (const auto *DepName = dyn_cast(this)) Keyword = DepName->getKeyword(); - else if (const auto *DepTST = - dyn_cast(this)) - Keyword = DepTST->getKeyword(); else if (const auto *T = dyn_cast(this)) Keyword = T->getKeyword(); else if (const auto *T = dyn_cast(this)) @@ -4641,17 +4606,6 @@ TemplateSpecializationType::TemplateSpecializationType( TemplateSpecializationTypeBits.NumArgs = Args.size(); TemplateSpecializationTypeBits.TypeAlias = IsAlias; - assert(!T.getAsDependentTemplateName() && - "Use DependentTemplateSpecializationType for dependent template-name"); - assert((T.getKind() == TemplateName::Template || - T.getKind() == TemplateName::SubstTemplateTemplateParm || - T.getKind() == TemplateName::SubstTemplateTemplateParmPack || - T.getKind() == TemplateName::UsingTemplate || - T.getKind() == TemplateName::QualifiedTemplate || - T.getKind() == TemplateName::DeducedTemplate || - T.getKind() == TemplateName::AssumedTemplate) && - "Unexpected template name for TemplateSpecializationType"); - auto *TemplateArgs = const_cast(template_arguments().data()); for (const TemplateArgument &Arg : Args) { @@ -4690,15 +4644,17 @@ bool clang::TemplateSpecializationType::isSugared() const { void TemplateSpecializationType::Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Ctx) { - Profile(ID, Template, template_arguments(), + Profile(ID, getKeyword(), Template, template_arguments(), isSugared() ? 
desugar() : QualType(), Ctx); } void TemplateSpecializationType::Profile(llvm::FoldingSetNodeID &ID, + ElaboratedTypeKeyword Keyword, TemplateName T, ArrayRef Args, QualType Underlying, const ASTContext &Context) { + ID.AddInteger(llvm::to_underlying(Keyword)); T.Profile(ID); Underlying.Profile(ID); @@ -5105,7 +5061,6 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { case Type::SubstTemplateTypeParmPack: case Type::SubstBuiltinTemplatePack: case Type::DependentName: - case Type::DependentTemplateSpecialization: case Type::Auto: return ResultIfUnknown; diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp index 3e9597fc4d471..55476e2175a1f 100644 --- a/clang/lib/AST/TypeLoc.cpp +++ b/clang/lib/AST/TypeLoc.cpp @@ -477,8 +477,6 @@ NestedNameSpecifierLoc TypeLoc::getPrefix() const { return castAs().getQualifierLoc(); case TypeLoc::TemplateSpecialization: return castAs().getQualifierLoc(); - case TypeLoc::DependentTemplateSpecialization: - return castAs().getQualifierLoc(); case TypeLoc::DeducedTemplateSpecialization: return castAs().getQualifierLoc(); case TypeLoc::Enum: @@ -505,13 +503,6 @@ SourceLocation TypeLoc::getNonPrefixBeginLoc() const { Loc = TL.getTemplateNameLoc(); return Loc; } - case TypeLoc::DependentTemplateSpecialization: { - auto TL = castAs(); - SourceLocation Loc = TL.getTemplateKeywordLoc(); - if (!Loc.isValid()) - Loc = TL.getTemplateNameLoc(); - return Loc; - } case TypeLoc::DeducedTemplateSpecialization: { auto TL = castAs(); SourceLocation Loc = TL.getTemplateKeywordLoc(); @@ -550,12 +541,6 @@ SourceLocation TypeLoc::getNonElaboratedBeginLoc() const { return QualifierLoc.getBeginLoc(); return T.getTemplateNameLoc(); } - case TypeLoc::DependentTemplateSpecialization: { - auto T = castAs(); - if (NestedNameSpecifierLoc QualifierLoc = T.getQualifierLoc()) - return QualifierLoc.getBeginLoc(); - return T.getTemplateNameLoc(); - } case TypeLoc::DeducedTemplateSpecialization: { auto T = castAs(); if (NestedNameSpecifierLoc 
QualifierLoc = T.getQualifierLoc()) @@ -690,20 +675,6 @@ void DependentNameTypeLoc::initializeLocal(ASTContext &Context, setNameLoc(Loc); } -void -DependentTemplateSpecializationTypeLoc::initializeLocal(ASTContext &Context, - SourceLocation Loc) { - initializeElaboratedKeyword(*this, Loc); - setQualifierLoc(initializeQualifier( - Context, getTypePtr()->getDependentTemplateName().getQualifier(), Loc)); - setTemplateKeywordLoc(Loc); - setTemplateNameLoc(Loc); - setLAngleLoc(Loc); - setRAngleLoc(Loc); - TemplateSpecializationTypeLoc::initializeArgLocs( - Context, getTypePtr()->template_arguments(), getArgInfos(), Loc); -} - void TemplateSpecializationTypeLoc::set(SourceLocation ElaboratedKeywordLoc, NestedNameSpecifierLoc QualifierLoc, SourceLocation TemplateKeywordLoc, @@ -949,8 +920,5 @@ AutoTypeLoc TypeLoc::getContainedAutoTypeLoc() const { SourceLocation TypeLoc::getTemplateKeywordLoc() const { if (const auto TSTL = getAsAdjusted()) return TSTL.getTemplateKeywordLoc(); - if (const auto DTSTL = - getAsAdjusted()) - return DTSTL.getTemplateKeywordLoc(); return SourceLocation(); } diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 54ca42d2035ad..cd59678d67f2f 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -237,7 +237,6 @@ bool TypePrinter::canPrefixQualifiers(const Type *T, case Type::TemplateSpecialization: case Type::InjectedClassName: case Type::DependentName: - case Type::DependentTemplateSpecialization: case Type::ObjCObject: case Type::ObjCTypeParam: case Type::ObjCInterface: @@ -1836,22 +1835,6 @@ void TypePrinter::printDependentNameBefore(const DependentNameType *T, void TypePrinter::printDependentNameAfter(const DependentNameType *T, raw_ostream &OS) {} -void TypePrinter::printDependentTemplateSpecializationBefore( - const DependentTemplateSpecializationType *T, raw_ostream &OS) { - IncludeStrongLifetimeRAII Strong(Policy); - - OS << TypeWithKeyword::getKeywordName(T->getKeyword()); - if 
(T->getKeyword() != ElaboratedTypeKeyword::None) - OS << " "; - - T->getDependentTemplateName().print(OS, Policy); - printTemplateArgumentList(OS, T->template_arguments(), Policy); - spaceBeforePlaceHolder(OS); -} - -void TypePrinter::printDependentTemplateSpecializationAfter( - const DependentTemplateSpecializationType *T, raw_ostream &OS) {} - void TypePrinter::printPackExpansionBefore(const PackExpansionType *T, raw_ostream &OS) { printBefore(T->getPattern(), OS); diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp index 653b3810cb68b..1f0e007dafc65 100644 --- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp +++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp @@ -1109,8 +1109,6 @@ const AstTypeMatcher templateTypeParmType; const AstTypeMatcher injectedClassNameType; const AstTypeMatcher decayedType; const AstTypeMatcher dependentNameType; -const AstTypeMatcher - dependentTemplateSpecializationType; AST_TYPELOC_TRAVERSE_MATCHER_DEF(hasElementType, AST_POLYMORPHIC_SUPPORTED_TYPES(ArrayType, ComplexType)); diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp index 48a7b91969aef..01c03f309a77b 100644 --- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp +++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp @@ -222,7 +222,6 @@ RegistryMaps::RegistryMaps() { REGISTER_MATCHER(declRefExpr); REGISTER_MATCHER(dependentNameType); REGISTER_MATCHER(dependentScopeDeclRefExpr); - REGISTER_MATCHER(dependentTemplateSpecializationType); REGISTER_MATCHER(declStmt); REGISTER_MATCHER(declaratorDecl); REGISTER_MATCHER(decltypeType); diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index e687e5419c50a..0dd5716d93fb6 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -478,6 +478,25 @@ class FactGenerator : public ConstStmtVisitor { } } + void VisitCXXConstructExpr(const CXXConstructExpr *CCE) { + if 
(isGslPointerType(CCE->getType())) { + handleGSLPointerConstruction(CCE); + return; + } + } + + void VisitCXXMemberCallExpr(const CXXMemberCallExpr *MCE) { + // Specifically for conversion operators, + // like `std::string_view p = std::string{};` + if (isGslPointerType(MCE->getType()) && + isa(MCE->getCalleeDecl())) { + // The argument is the implicit object itself. + handleFunctionCall(MCE, MCE->getMethodDecl(), + {MCE->getImplicitObjectArgument()}); + } + // FIXME: A more general VisitCallExpr could also be used here. + } + void VisitCXXNullPtrLiteralExpr(const CXXNullPtrLiteralExpr *N) { /// TODO: Handle nullptr expr as a special 'null' loan. Uninitialized /// pointers can use the same type of loan. @@ -530,8 +549,27 @@ class FactGenerator : public ConstStmtVisitor { void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *FCE) { // Check if this is a test point marker. If so, we are done with this // expression. - if (VisitTestPoint(FCE)) + if (handleTestPoint(FCE)) return; + if (isGslPointerType(FCE->getType())) + addAssignOriginFact(*FCE, *FCE->getSubExpr()); + } + + void VisitInitListExpr(const InitListExpr *ILE) { + if (!hasOrigin(ILE)) + return; + // For list initialization with a single element, like `View{...}`, the + // origin of the list itself is the origin of its single element. + if (ILE->getNumInits() == 1) + addAssignOriginFact(*ILE, *ILE->getInit(0)); + } + + void VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *MTE) { + if (!hasOrigin(MTE)) + return; + // A temporary object's origin is the same as the origin of the + // expression that initializes it. 
+ addAssignOriginFact(*MTE, *MTE->getSubExpr()); } void handleDestructor(const CFGAutomaticObjDtor &DtorOpt) { @@ -557,10 +595,21 @@ class FactGenerator : public ConstStmtVisitor { } private: - static bool isPointerType(QualType QT) { - return QT->isPointerOrReferenceType(); + static bool isGslPointerType(QualType QT) { + if (const auto *RD = QT->getAsCXXRecordDecl()) { + // We need to check the template definition for specializations. + if (auto *CTSD = dyn_cast(RD)) + return CTSD->getSpecializedTemplate() + ->getTemplatedDecl() + ->hasAttr(); + return RD->hasAttr(); + } + return false; } + static bool isPointerType(QualType QT) { + return QT->isPointerOrReferenceType() || isGslPointerType(QT); + } // Check if a type has an origin. static bool hasOrigin(const Expr *E) { return E->isGLValue() || isPointerType(E->getType()); @@ -570,6 +619,41 @@ class FactGenerator : public ConstStmtVisitor { return isPointerType(VD->getType()); } + void handleGSLPointerConstruction(const CXXConstructExpr *CCE) { + assert(isGslPointerType(CCE->getType())); + if (CCE->getNumArgs() != 1) + return; + if (hasOrigin(CCE->getArg(0))) + addAssignOriginFact(*CCE, *CCE->getArg(0)); + else + // This could be a new borrow. + handleFunctionCall(CCE, CCE->getConstructor(), + {CCE->getArgs(), CCE->getNumArgs()}); + } + + /// Checks if a call-like expression creates a borrow by passing a value to a + /// reference parameter, creating an IssueFact if it does. + void handleFunctionCall(const Expr *Call, const FunctionDecl *FD, + ArrayRef Args) { + if (!FD) + return; + // TODO: Handle more than one arguments. + for (unsigned I = 0; I <= 0 /*Args.size()*/; ++I) { + const Expr *ArgExpr = Args[I]; + + // Propagate origins for CXX this. + if (FD->isCXXClassMember() && I == 0) { + addAssignOriginFact(*Call, *ArgExpr); + continue; + } + // The parameter is a pointer, reference, or gsl::Pointer. + // This is a borrow. 
We propagate the origin from the argument expression + // at the call site to the parameter declaration in the callee. + if (hasOrigin(ArgExpr)) + addAssignOriginFact(*Call, *ArgExpr); + } + } + /// Creates a loan for the storage path of a given declaration reference. /// This function should be called whenever a DeclRefExpr represents a borrow. /// \param DRE The declaration reference expression that initiates the borrow. @@ -593,7 +677,7 @@ class FactGenerator : public ConstStmtVisitor { /// Checks if the expression is a `void("__lifetime_test_point_...")` cast. /// If so, creates a `TestPointFact` and returns true. - bool VisitTestPoint(const CXXFunctionalCastExpr *FCE) { + bool handleTestPoint(const CXXFunctionalCastExpr *FCE) { if (!FCE->getType()->isVoidType()) return false; @@ -641,6 +725,8 @@ class FactGenerator : public ConstStmtVisitor { } void markUseAsWrite(const DeclRefExpr *DRE) { + if (!isPointerType(DRE->getType())) + return; assert(UseFacts.contains(DRE)); UseFacts[DRE]->markAsWritten(); } diff --git a/clang/lib/Analysis/PathDiagnostic.cpp b/clang/lib/Analysis/PathDiagnostic.cpp index ef24efd3c4bd0..e42731b93bfb2 100644 --- a/clang/lib/Analysis/PathDiagnostic.cpp +++ b/clang/lib/Analysis/PathDiagnostic.cpp @@ -24,6 +24,7 @@ #include "clang/AST/Type.h" #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/IssueHash.h" #include "clang/Analysis/ProgramPoint.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -1075,6 +1076,19 @@ unsigned PathDiagnostic::full_size() { return size; } +SmallString<32> +PathDiagnostic::getIssueHash(const SourceManager &SrcMgr, + const LangOptions &LangOpts) const { + PathDiagnosticLocation UPDLoc = getUniqueingLoc(); + FullSourceLoc FullLoc( + SrcMgr.getExpansionLoc(UPDLoc.isValid() ? 
UPDLoc.asLocation() + : getLocation().asLocation()), + SrcMgr); + + return clang::getIssueHash(FullLoc, getCheckerName(), getBugType(), + getDeclWithIssue(), LangOpts); +} + //===----------------------------------------------------------------------===// // FoldingSet profiling methods. //===----------------------------------------------------------------------===// diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 1d7b8722103aa..ad3d2346d18be 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -900,22 +900,22 @@ static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, const Expr *Fmt = Call->getArg(FmtArgIdx); if (auto *SL = dyn_cast(Fmt->IgnoreParenImpCasts())) { - StringRef FmtStr; + if (SL->getCharByteWidth() == 1) { + StringRef FmtStr = SL->getString(); + StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg, Ctx); - if (SL->getCharByteWidth() == 1) - FmtStr = SL->getString(); - else if (auto EvaledFmtStr = SL->tryEvaluateString(Ctx)) - FmtStr = *EvaledFmtStr; - else - goto CHECK_UNSAFE_PTR; - - StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg, Ctx); + return analyze_format_string::ParsePrintfString( + Handler, FmtStr.begin(), FmtStr.end(), Ctx.getLangOpts(), + Ctx.getTargetInfo(), isKprintf); + } - return analyze_format_string::ParsePrintfString( - Handler, FmtStr.begin(), FmtStr.end(), Ctx.getLangOpts(), - Ctx.getTargetInfo(), isKprintf); + if (auto FmtStr = SL->tryEvaluateString(Ctx)) { + StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg, Ctx); + return analyze_format_string::ParsePrintfString( + Handler, FmtStr->data(), FmtStr->data() + FmtStr->size(), + Ctx.getLangOpts(), Ctx.getTargetInfo(), isKprintf); + } } -CHECK_UNSAFE_PTR: // If format is not a string literal, we cannot analyze the format string. 
// In this case, this call is considered unsafe if at least one argument // (including the format argument) is unsafe pointer. diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 3f8f64df8702e..ea913d766ba57 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -20,6 +20,26 @@ using namespace clang; using namespace llvm::omp; +OpenMPDefaultClauseVariableCategory +clang::getOpenMPDefaultVariableCategory(StringRef Str, + const LangOptions &LangOpts) { + return llvm::StringSwitch(Str) +#define OPENMP_DEFAULT_VARIABLE_CATEGORY(Name) \ + .Case(#Name, OMPC_DEFAULT_VC_##Name) +#include "clang/Basic/OpenMPKinds.def" + .Default(OMPC_DEFAULT_VC_unknown); +} + +const char *clang::getOpenMPDefaultVariableCategoryName(unsigned VC) { + switch (VC) { +#define OPENMP_DEFAULT_VARIABLE_CATEGORY(Name) \ + case OMPC_DEFAULT_VC_##Name: \ + return #Name; +#include "clang/Basic/OpenMPKinds.def" + } + llvm_unreachable("Invalid Variable Category in the default clause"); +} + unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str, const LangOptions &LangOpts) { switch (Kind) { @@ -90,14 +110,19 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str, #define OPENMP_DIST_SCHEDULE_KIND(Name) .Case(#Name, OMPC_DIST_SCHEDULE_##Name) #include "clang/Basic/OpenMPKinds.def" .Default(OMPC_DIST_SCHEDULE_unknown); - case OMPC_defaultmap: - return llvm::StringSwitch(Str) + case OMPC_defaultmap: { + unsigned Type = llvm::StringSwitch(Str) #define OPENMP_DEFAULTMAP_KIND(Name) \ .Case(#Name, static_cast(OMPC_DEFAULTMAP_##Name)) #define OPENMP_DEFAULTMAP_MODIFIER(Name) \ .Case(#Name, static_cast(OMPC_DEFAULTMAP_MODIFIER_##Name)) #include "clang/Basic/OpenMPKinds.def" - .Default(OMPC_DEFAULTMAP_unknown); + .Default(OMPC_DEFAULTMAP_unknown); + if (LangOpts.OpenMP < 60 && (Type == OMPC_DEFAULTMAP_MODIFIER_storage || + Type == OMPC_DEFAULTMAP_MODIFIER_private)) + return 
OMPC_DEFAULTMAP_MODIFIER_unknown; + return Type; + } case OMPC_atomic_default_mem_order: return llvm::StringSwitch(Str) #define OPENMP_ATOMIC_DEFAULT_MEM_ORDER_KIND(Name) \ @@ -902,4 +927,3 @@ bool clang::checkFailClauseParameter(OpenMPClauseKind FailClauseParameter) { FailClauseParameter == llvm::omp::OMPC_relaxed || FailClauseParameter == llvm::omp::OMPC_seq_cst; } - diff --git a/clang/lib/Basic/Sarif.cpp b/clang/lib/Basic/Sarif.cpp index 69862b73febd7..b3fb9a21249e9 100644 --- a/clang/lib/Basic/Sarif.cpp +++ b/clang/lib/Basic/Sarif.cpp @@ -67,7 +67,7 @@ static std::string percentEncodeURICharacter(char C) { /// \param Filename The filename to be represented as URI. /// /// \return RFC3986 URI representing the input file name. -static std::string fileNameToURI(StringRef Filename) { +std::string SarifDocumentWriter::fileNameToURI(StringRef Filename) { SmallString<32> Ret = StringRef("file://"); // Get the root name to see if it has a URI authority. @@ -391,6 +391,11 @@ void SarifDocumentWriter::appendResult(const SarifResult &Result) { json::Object Ret{{"message", createMessage(Result.DiagnosticMessage)}, {"ruleIndex", static_cast(RuleIdx)}, {"ruleId", Rule.Id}}; + + if (!Result.HostedViewerURI.empty()) { + Ret["hostedViewerUri"] = Result.HostedViewerURI; + } + if (!Result.Locations.empty()) { json::Array Locs; for (auto &Range : Result.Locations) { @@ -398,6 +403,15 @@ void SarifDocumentWriter::appendResult(const SarifResult &Result) { } Ret["locations"] = std::move(Locs); } + + if (!Result.PartialFingerprints.empty()) { + json::Object fingerprints = {}; + for (auto &pair : Result.PartialFingerprints) { + fingerprints[pair.first] = pair.second; + } + Ret["partialFingerprints"] = std::move(fingerprints); + } + if (!Result.ThreadFlows.empty()) Ret["codeFlows"] = json::Array{createCodeFlow(Result.ThreadFlows)}; diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 2fbf1ee39b789..72ee09d209e02 100644 --- a/clang/lib/Basic/TargetInfo.cpp 
+++ b/clang/lib/Basic/TargetInfo.cpp @@ -626,6 +626,14 @@ TargetInfo::getCallingConvKind(bool ClangABICompat4) const { return CCK_Default; } +bool TargetInfo::callGlobalDeleteInDeletingDtor( + const LangOptions &LangOpts) const { + if (getCXXABI() == TargetCXXABI::Microsoft && + LangOpts.getClangABICompat() > LangOptions::ClangABI::Ver21) + return true; + return false; +} + bool TargetInfo::areDefaultedSMFStillPOD(const LangOptions &LangOpts) const { return LangOpts.getClangABICompat() > LangOptions::ClangABI::Ver15; } diff --git a/clang/lib/Basic/Targets/AMDGPU.h b/clang/lib/Basic/Targets/AMDGPU.h index 8b7fab3d439e0..552698a680d3e 100644 --- a/clang/lib/Basic/Targets/AMDGPU.h +++ b/clang/lib/Basic/Targets/AMDGPU.h @@ -400,15 +400,12 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUTargetInfo final : public TargetInfo { /// in the DWARF. std::optional getDWARFAddressSpace(unsigned AddressSpace) const override { - const unsigned DWARF_Private = 1; - const unsigned DWARF_Local = 2; - if (AddressSpace == llvm::AMDGPUAS::PRIVATE_ADDRESS) { - return DWARF_Private; - } else if (AddressSpace == llvm::AMDGPUAS::LOCAL_ADDRESS) { - return DWARF_Local; - } else { + int DWARFAS = llvm::AMDGPU::mapToDWARFAddrSpace(AddressSpace); + // If there is no corresponding address space identifier, or it would be + // the default, then don't emit the attribute. 
+ if (DWARFAS == -1 || DWARFAS == llvm::AMDGPU::DWARFAS::DEFAULT) return std::nullopt; - } + return DWARFAS; } CallingConvCheckResult checkCallingConvention(CallingConv CC) const override { diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 6e013c95dbf01..be3a473174370 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -646,6 +646,7 @@ class LLVM_LIBRARY_VISIBILITY CygwinX86_32TargetInfo : public X86_32TargetInfo { : X86_32TargetInfo(Triple, Opts) { this->WCharType = TargetInfo::UnsignedShort; this->WIntType = TargetInfo::UnsignedInt; + this->UseMicrosoftManglingForC = true; DoubleAlign = LongLongAlign = 64; resetDataLayout("e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-" "i128:128-f80:32-n8:16:32-a:0:32-S32", @@ -983,6 +984,7 @@ class LLVM_LIBRARY_VISIBILITY CygwinX86_64TargetInfo : public X86_64TargetInfo { : X86_64TargetInfo(Triple, Opts) { this->WCharType = TargetInfo::UnsignedShort; this->WIntType = TargetInfo::UnsignedInt; + this->UseMicrosoftManglingForC = true; } void getTargetDefines(const LangOptions &Opts, @@ -997,6 +999,29 @@ class LLVM_LIBRARY_VISIBILITY CygwinX86_64TargetInfo : public X86_64TargetInfo { Builder.defineMacro("_GNU_SOURCE"); } + CallingConvCheckResult checkCallingConvention(CallingConv CC) const override { + switch (CC) { + case CC_X86StdCall: + case CC_X86ThisCall: + case CC_X86FastCall: + return CCCR_Ignore; + case CC_C: + case CC_X86VectorCall: + case CC_IntelOclBicc: + case CC_PreserveMost: + case CC_PreserveAll: + case CC_PreserveNone: + case CC_X86_64SysV: + case CC_Swift: + case CC_SwiftAsync: + case CC_X86RegCall: + case CC_DeviceKernel: + return CCCR_OK; + default: + return CCCR_Warning; + } + } + BuiltinVaListKind getBuiltinVaListKind() const override { return TargetInfo::CharPtrBuiltinVaList; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 8892e62accb74..cf17de144f4d9 100644 --- 
a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -200,6 +200,17 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, builder.createBitcast(allocaAddr, builder.getVoidPtrTy())); } + case Builtin::BIcos: + case Builtin::BIcosf: + case Builtin::BIcosl: + case Builtin::BI__builtin_cos: + case Builtin::BI__builtin_cosf: + case Builtin::BI__builtin_cosf16: + case Builtin::BI__builtin_cosl: + case Builtin::BI__builtin_cosf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin(*this, *e); + case Builtin::BIfabs: case Builtin::BIfabsf: case Builtin::BIfabsl: @@ -415,6 +426,8 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return emitUnaryFPBuiltin(*this, *e); case Builtin::BI__builtin_elementwise_atan: return emitUnaryFPBuiltin(*this, *e); + case Builtin::BI__builtin_elementwise_cos: + return emitUnaryFPBuiltin(*this, *e); } // If this is an alias for a lib function (e.g. 
__builtin_sin), emit diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index aab7e2745f30f..4f2bafd986292 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1376,6 +1376,30 @@ LValue CIRGenFunction::emitMaterializeTemporaryExpr( return makeAddrLValue(object, m->getType(), AlignmentSource::Decl); } +LValue +CIRGenFunction::getOrCreateOpaqueLValueMapping(const OpaqueValueExpr *e) { + assert(OpaqueValueMapping::shouldBindAsLValue(e)); + + auto it = opaqueLValues.find(e); + if (it != opaqueLValues.end()) + return it->second; + + assert(e->isUnique() && "LValue for a nonunique OVE hasn't been emitted"); + return emitLValue(e->getSourceExpr()); +} + +RValue +CIRGenFunction::getOrCreateOpaqueRValueMapping(const OpaqueValueExpr *e) { + assert(!OpaqueValueMapping::shouldBindAsLValue(e)); + + auto it = opaqueRValues.find(e); + if (it != opaqueRValues.end()) + return it->second; + + assert(e->isUnique() && "RValue for a nonunique OVE hasn't been emitted"); + return emitAnyExpr(e->getSourceExpr()); +} + LValue CIRGenFunction::emitCompoundLiteralLValue(const CompoundLiteralExpr *e) { if (e->isFileScope()) { cgm.errorNYI(e->getSourceRange(), "emitCompoundLiteralLValue: FileScope"); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index d678ea0212aa5..614c915a3a93d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -128,9 +128,12 @@ class ComplexExprEmitter : public StmtVisitor { return emitLoadOfLValue(me); } mlir::Value VisitOpaqueValueExpr(OpaqueValueExpr *e) { - cgf.cgm.errorNYI(e->getExprLoc(), - "ComplexExprEmitter VisitOpaqueValueExpr"); - return {}; + if (e->isGLValue()) + return emitLoadOfLValue(cgf.getOrCreateOpaqueLValueMapping(e), + e->getExprLoc()); + + // Otherwise, assume the mapping is the scalar directly. 
+ return cgf.getOrCreateOpaqueRValueMapping(e).getValue(); } mlir::Value VisitPseudoObjectExpr(PseudoObjectExpr *e) { @@ -960,21 +963,32 @@ mlir::Value ComplexExprEmitter::VisitBinComma(const BinaryOperator *e) { mlir::Value ComplexExprEmitter::VisitAbstractConditionalOperator( const AbstractConditionalOperator *e) { - mlir::Value condValue = Visit(e->getCond()); mlir::Location loc = cgf.getLoc(e->getSourceRange()); + // Bind the common expression if necessary. + CIRGenFunction::OpaqueValueMapping binding(cgf, e); + + CIRGenFunction::ConditionalEvaluation eval(cgf); + + Expr *cond = e->getCond()->IgnoreParens(); + mlir::Value condValue = cgf.evaluateExprAsBool(cond); + return builder .create( loc, condValue, /*thenBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { + eval.beginEvaluation(); mlir::Value trueValue = Visit(e->getTrueExpr()); b.create(loc, trueValue); + eval.endEvaluation(); }, /*elseBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { + eval.beginEvaluation(); mlir::Value falseValue = Visit(e->getFalseExpr()); b.create(loc, falseValue); + eval.endEvaluation(); }) .getResult(); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 754ef79392916..2261e24fe44c2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -193,6 +193,15 @@ class ScalarExprEmitter : public StmtVisitor { return emitNullValue(e->getType(), cgf.getLoc(e->getSourceRange())); } + mlir::Value VisitOpaqueValueExpr(OpaqueValueExpr *e) { + if (e->isGLValue()) + return emitLoadOfLValue(cgf.getOrCreateOpaqueLValueMapping(e), + e->getExprLoc()); + + // Otherwise, assume the mapping is the scalar directly. 
+ return cgf.getOrCreateOpaqueRValueMapping(e).getValue(); + } + mlir::Value VisitCastExpr(CastExpr *e); mlir::Value VisitCallExpr(const CallExpr *e); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 42f7f401555ca..30f06dffc0769 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -706,6 +706,14 @@ class CIRGenFunction : public CIRGenTypeCache { Address getAddrOfBitFieldStorage(LValue base, const clang::FieldDecl *field, mlir::Type fieldType, unsigned index); + /// Given an opaque value expression, return its LValue mapping if it exists, + /// otherwise create one. + LValue getOrCreateOpaqueLValueMapping(const OpaqueValueExpr *e); + + /// Given an opaque value expression, return its RValue mapping if it exists, + /// otherwise create one. + RValue getOrCreateOpaqueRValueMapping(const OpaqueValueExpr *e); + /// Load the value for 'this'. This function is only valid while generating /// code for an C++ member function. /// FIXME(cir): this should return a mlir::Value! 
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 24aef693024f7..8918eb4cbb1ad 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1754,9 +1754,6 @@ LogicalResult cir::BinOp::verify() { return emitError() << "The nsw/nuw flags and the saturated flag are " "mutually exclusive"; - assert(!cir::MissingFeatures::complexType()); - // TODO(cir): verify for complex binops - return mlir::success(); } diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp index d41ea0af58938..fbecab9774f5b 100644 --- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp +++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp @@ -134,8 +134,6 @@ void CIRCanonicalizePass::runOnOperation() { getOperation()->walk([&](Operation *op) { assert(!cir::MissingFeatures::switchOp()); assert(!cir::MissingFeatures::tryOp()); - assert(!cir::MissingFeatures::complexRealOp()); - assert(!cir::MissingFeatures::complexImagOp()); assert(!cir::MissingFeatures::callOp()); // Many operations are here to perform a manual `fold` in diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index d9097b0b9e03d..1d7e3df1430ac 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -185,6 +185,14 @@ mlir::LogicalResult CIRToLLVMCopyOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMCosOpLowering::matchAndRewrite( + cir::CosOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resTy = typeConverter->convertType(op.getType()); + rewriter.replaceOpWithNewOp(op, resTy, adaptor.getSrc()); + return mlir::success(); +} + static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter, mlir::Value llvmSrc, mlir::Type llvmDstIntTy, bool isUnsigned, 
uint64_t cirSrcWidth, @@ -2498,6 +2506,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMComplexRealPtrOpLowering, CIRToLLVMComplexSubOpLowering, CIRToLLVMCopyOpLowering, + CIRToLLVMCosOpLowering, CIRToLLVMConstantOpLowering, CIRToLLVMExpectOpLowering, CIRToLLVMFAbsOpLowering, diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index dd1dd0aaec7d8..09ff7a0901c69 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -189,6 +189,15 @@ class CIRToLLVMCopyOpLowering : public mlir::OpConversionPattern { mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMCosOpLowering : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::CosOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMExpectOpLowering : public mlir::OpConversionPattern { public: diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 3f095c03397fd..8c99af2bdff83 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -896,6 +896,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PipelineTuningOptions PTO; PTO.LoopUnrolling = CodeGenOpts.UnrollLoops; PTO.LoopInterchange = CodeGenOpts.InterchangeLoops; + PTO.LoopFusion = CodeGenOpts.FuseLoops; // For historical reasons, loop interleaving is set to mirror setting for loop // unrolling. PTO.LoopInterleaving = CodeGenOpts.UnrollLoops; @@ -1331,6 +1332,7 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex, Conf.SampleProfile = std::move(SampleProfile); Conf.PTO.LoopUnrolling = CGOpts.UnrollLoops; Conf.PTO.LoopInterchange = CGOpts.InterchangeLoops; + Conf.PTO.LoopFusion = CGOpts.FuseLoops; // For historical reasons, loop interleaving is set to mirror setting for loop // unrolling. 
Conf.PTO.LoopInterleaving = CGOpts.UnrollLoops; diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 8346ee3aa6a8d..f31f0a2c382d8 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -1599,29 +1599,85 @@ namespace { } }; + // This function implements generation of scalar deleting destructor body for + // the case when the destructor also accepts an implicit flag. Right now only + // Microsoft ABI requires deleting destructors to accept implicit flags. + // The flag indicates whether an operator delete should be called and whether + // it should be a class-specific operator delete or a global one. void EmitConditionalDtorDeleteCall(CodeGenFunction &CGF, llvm::Value *ShouldDeleteCondition, bool ReturnAfterDelete) { + const CXXDestructorDecl *Dtor = cast(CGF.CurCodeDecl); + const CXXRecordDecl *ClassDecl = Dtor->getParent(); + const FunctionDecl *OD = Dtor->getOperatorDelete(); + assert(OD->isDestroyingOperatorDelete() == ReturnAfterDelete && + "unexpected value for ReturnAfterDelete"); + auto *CondTy = cast(ShouldDeleteCondition->getType()); + // MSVC calls global operator delete inside of the dtor body, but clang + // aligned with this behavior only after a particular version. This is not + // ABI-compatible with previous versions. + ASTContext &Context = CGF.getContext(); + bool CallGlobDelete = + Context.getTargetInfo().callGlobalDeleteInDeletingDtor( + Context.getLangOpts()); + if (CallGlobDelete && OD->isDestroyingOperatorDelete()) { + llvm::BasicBlock *CallDtor = CGF.createBasicBlock("dtor.call_dtor"); + llvm::BasicBlock *DontCallDtor = CGF.createBasicBlock("dtor.entry_cont"); + // Third bit set signals that global operator delete is called. That means + // despite class having destroying operator delete which is responsible + // for calling dtor, we need to call dtor because global operator delete + // won't do that. 
+ llvm::Value *Check3rdBit = CGF.Builder.CreateAnd( + ShouldDeleteCondition, llvm::ConstantInt::get(CondTy, 4)); + llvm::Value *ShouldCallDtor = CGF.Builder.CreateIsNull(Check3rdBit); + CGF.Builder.CreateCondBr(ShouldCallDtor, DontCallDtor, CallDtor); + CGF.EmitBlock(CallDtor); + QualType ThisTy = Dtor->getFunctionObjectParameterType(); + CGF.EmitCXXDestructorCall(Dtor, Dtor_Complete, /*ForVirtualBase=*/false, + /*Delegating=*/false, CGF.LoadCXXThisAddress(), + ThisTy); + CGF.Builder.CreateBr(DontCallDtor); + CGF.EmitBlock(DontCallDtor); + } llvm::BasicBlock *callDeleteBB = CGF.createBasicBlock("dtor.call_delete"); llvm::BasicBlock *continueBB = CGF.createBasicBlock("dtor.continue"); - llvm::Value *ShouldCallDelete - = CGF.Builder.CreateIsNull(ShouldDeleteCondition); + // First bit set signals that operator delete must be called. + llvm::Value *Check1stBit = CGF.Builder.CreateAnd( + ShouldDeleteCondition, llvm::ConstantInt::get(CondTy, 1)); + llvm::Value *ShouldCallDelete = CGF.Builder.CreateIsNull(Check1stBit); CGF.Builder.CreateCondBr(ShouldCallDelete, continueBB, callDeleteBB); CGF.EmitBlock(callDeleteBB); - const CXXDestructorDecl *Dtor = cast(CGF.CurCodeDecl); - const CXXRecordDecl *ClassDecl = Dtor->getParent(); - CGF.EmitDeleteCall(Dtor->getOperatorDelete(), - LoadThisForDtorDelete(CGF, Dtor), - CGF.getContext().getCanonicalTagType(ClassDecl)); - assert(Dtor->getOperatorDelete()->isDestroyingOperatorDelete() == - ReturnAfterDelete && - "unexpected value for ReturnAfterDelete"); - if (ReturnAfterDelete) - CGF.EmitBranchThroughCleanup(CGF.ReturnBlock); - else - CGF.Builder.CreateBr(continueBB); - + auto EmitDeleteAndGoToEnd = [&](const FunctionDecl *DeleteOp) { + CGF.EmitDeleteCall(DeleteOp, LoadThisForDtorDelete(CGF, Dtor), + Context.getCanonicalTagType(ClassDecl)); + if (ReturnAfterDelete) + CGF.EmitBranchThroughCleanup(CGF.ReturnBlock); + else + CGF.Builder.CreateBr(continueBB); + }; + // If Sema only found a global operator delete previously, the dtor can 
+ // always call it. Otherwise we need to check the third bit and call the + // appropriate operator delete, i.e. global or class-specific. + if (const FunctionDecl *GlobOD = Dtor->getOperatorGlobalDelete(); + isa(OD) && GlobOD && CallGlobDelete) { + // Third bit set signals that global operator delete is called, i.e. + // ::delete appears on the callsite. + llvm::Value *CheckTheBitForGlobDeleteCall = CGF.Builder.CreateAnd( + ShouldDeleteCondition, llvm::ConstantInt::get(CondTy, 4)); + llvm::Value *ShouldCallGlobDelete = + CGF.Builder.CreateIsNull(CheckTheBitForGlobDeleteCall); + llvm::BasicBlock *GlobDelete = + CGF.createBasicBlock("dtor.call_glob_delete"); + llvm::BasicBlock *ClassDelete = + CGF.createBasicBlock("dtor.call_class_delete"); + CGF.Builder.CreateCondBr(ShouldCallGlobDelete, ClassDelete, GlobDelete); + CGF.EmitBlock(GlobDelete); + + EmitDeleteAndGoToEnd(GlobOD); + CGF.EmitBlock(ClassDelete); + } + EmitDeleteAndGoToEnd(OD); CGF.EmitBlock(continueBB); } diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 29193e0c541b9..4e735f6d28f34 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1251,8 +1251,7 @@ void CodeGenFunction::emitStoresForConstant(const VarDecl &D, Address Loc, LangOptions::TrivialAutoVarInitKind::Pattern; if (shouldSplitConstantStore(CGM, ConstantSize)) { if (auto *STy = dyn_cast(Ty)) { - if (STy == Loc.getElementType() || - (STy != Loc.getElementType() && IsTrivialAutoVarInitPattern)) { + if (STy == Loc.getElementType() || IsTrivialAutoVarInitPattern) { const llvm::StructLayout *Layout = CGM.getDataLayout().getStructLayout(STy); for (unsigned i = 0; i != constant->getNumOperands(); i++) { @@ -1266,8 +1265,7 @@ void CodeGenFunction::emitStoresForConstant(const VarDecl &D, Address Loc, return; } } else if (auto *ATy = dyn_cast(Ty)) { - if (ATy == Loc.getElementType() || - (ATy != Loc.getElementType() && IsTrivialAutoVarInitPattern)) { + if (ATy == Loc.getElementType() || 
IsTrivialAutoVarInitPattern) { for (unsigned i = 0; i != ATy->getNumElements(); i++) { Address EltPtr = Builder.CreateConstGEP( Loc.withElementType(ATy->getElementType()), i); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index e8456a44f8367..e6e4947882544 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -6496,11 +6496,8 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, SanitizerDebugLocation SanScope(this, {CheckOrdinal}, CheckHandler); EmitSanitizerStatReport(llvm::SanStat_CFI_ICall); - llvm::Metadata *MD; - if (CGM.getCodeGenOpts().SanitizeCfiICallGeneralizePointers) - MD = CGM.CreateMetadataIdentifierGeneralized(QualType(FnType, 0)); - else - MD = CGM.CreateMetadataIdentifierForType(QualType(FnType, 0)); + llvm::Metadata *MD = + CGM.CreateMetadataIdentifierForFnType(QualType(FnType, 0)); llvm::Value *TypeId = llvm::MetadataAsValue::get(getLLVMContext(), MD); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index ce483c5cc4e45..4fa25c5d66669 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2142,9 +2142,9 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { bool Ignore = TestAndClearIgnoreResultAssign(); (void)Ignore; unsigned NumInitElements = E->getNumInits(); - assert(Ignore == false || - (NumInitElements == 0 && E->getType()->isVoidType()) && - "init list ignored"); + assert((Ignore == false || + (NumInitElements == 0 && E->getType()->isVoidType())) && + "init list ignored"); // HLSL initialization lists in the AST are an expansion which can contain // side-effecting expressions wrapped in opaque value expressions. 
To properly diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 5004c09e0d5cf..7b5b924b1fe82 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -604,12 +604,12 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Value *OpTrue = RValTrue.isScalar() ? RValTrue.getScalarVal() - : RValTrue.getAggregatePointer(E->getArg(1)->getType(), *this); + : Builder.CreateLoad(RValTrue.getAggregateAddress(), "true_val"); RValue RValFalse = EmitAnyExpr(E->getArg(2)); Value *OpFalse = RValFalse.isScalar() ? RValFalse.getScalarVal() - : RValFalse.getAggregatePointer(E->getArg(2)->getType(), *this); + : Builder.CreateLoad(RValFalse.getAggregateAddress(), "false_val"); if (auto *VTy = E->getType()->getAs()) { if (!OpTrue->getType()->isVectorTy()) OpTrue = diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index afee1198e0988..cf018c8c7de2a 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "CGHLSLRuntime.h" +#include "Address.h" #include "CGDebugInfo.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" @@ -39,6 +40,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include +#include using namespace clang; using namespace CodeGen; @@ -111,37 +113,6 @@ static int getTotalArraySize(ASTContext &AST, const clang::Type *Ty) { return AST.getConstantArrayElementCount(cast(Ty)); } -// Find constructor decl for a specific resource record type and binding -// (implicit vs. explicit). The constructor has 5 parameters. -// For explicit binding the signature is: -// void(unsigned, unsigned, int, unsigned, const char *). -// For implicit binding the signature is: -// void(unsigned, int, unsigned, unsigned, const char *). 
-static CXXConstructorDecl *findResourceConstructorDecl(ASTContext &AST, - QualType ResTy, - bool ExplicitBinding) { - std::array ExpParmTypes = { - AST.UnsignedIntTy, AST.UnsignedIntTy, AST.UnsignedIntTy, - AST.UnsignedIntTy, AST.getPointerType(AST.CharTy.withConst())}; - ExpParmTypes[ExplicitBinding ? 2 : 1] = AST.IntTy; - - CXXRecordDecl *ResDecl = ResTy->getAsCXXRecordDecl(); - for (auto *Ctor : ResDecl->ctors()) { - if (Ctor->getNumParams() != ExpParmTypes.size()) - continue; - auto *ParmIt = Ctor->param_begin(); - auto ExpTyIt = ExpParmTypes.begin(); - for (; ParmIt != Ctor->param_end() && ExpTyIt != ExpParmTypes.end(); - ++ParmIt, ++ExpTyIt) { - if ((*ParmIt)->getType() != *ExpTyIt) - break; - } - if (ParmIt == Ctor->param_end()) - return Ctor; - } - llvm_unreachable("did not find constructor for resource class"); -} - static Value *buildNameForResource(llvm::StringRef BaseName, CodeGenModule &CGM) { llvm::SmallString<64> GlobalName = {BaseName, ".str"}; @@ -149,14 +120,22 @@ static Value *buildNameForResource(llvm::StringRef BaseName, .getPointer(); } -static void createResourceCtorArgs(CodeGenModule &CGM, CXXConstructorDecl *CD, - llvm::Value *ThisPtr, llvm::Value *Range, - llvm::Value *Index, StringRef Name, - HLSLResourceBindingAttr *RBA, - HLSLVkBindingAttr *VkBinding, - CallArgList &Args) { +static CXXMethodDecl *lookupMethod(CXXRecordDecl *Record, StringRef Name, + StorageClass SC = SC_None) { + for (auto *Method : Record->methods()) { + if (Method->getStorageClass() == SC && Method->getName() == Name) + return Method; + } + return nullptr; +} + +static CXXMethodDecl *lookupResourceInitMethodAndSetupArgs( + CodeGenModule &CGM, CXXRecordDecl *ResourceDecl, llvm::Value *Range, + llvm::Value *Index, StringRef Name, HLSLResourceBindingAttr *RBA, + HLSLVkBindingAttr *VkBinding, CallArgList &Args) { assert((VkBinding || RBA) && "at least one a binding attribute expected"); + ASTContext &AST = CGM.getContext(); std::optional RegisterSlot; uint32_t SpaceNo = 
0; if (VkBinding) { @@ -168,44 +147,57 @@ static void createResourceCtorArgs(CodeGenModule &CGM, CXXConstructorDecl *CD, SpaceNo = RBA->getSpaceNumber(); } - ASTContext &AST = CD->getASTContext(); + CXXMethodDecl *CreateMethod = nullptr; Value *NameStr = buildNameForResource(Name, CGM); Value *Space = llvm::ConstantInt::get(CGM.IntTy, SpaceNo); - Args.add(RValue::get(ThisPtr), CD->getThisType()); if (RegisterSlot.has_value()) { // explicit binding auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, RegisterSlot.value()); Args.add(RValue::get(RegSlot), AST.UnsignedIntTy); - Args.add(RValue::get(Space), AST.UnsignedIntTy); - Args.add(RValue::get(Range), AST.IntTy); - Args.add(RValue::get(Index), AST.UnsignedIntTy); - + CreateMethod = lookupMethod(ResourceDecl, "__createFromBinding", SC_Static); } else { // implicit binding - assert(RBA && "missing implicit binding attribute"); auto *OrderID = llvm::ConstantInt::get(CGM.IntTy, RBA->getImplicitBindingOrderID()); - Args.add(RValue::get(Space), AST.UnsignedIntTy); - Args.add(RValue::get(Range), AST.IntTy); - Args.add(RValue::get(Index), AST.UnsignedIntTy); Args.add(RValue::get(OrderID), AST.UnsignedIntTy); + CreateMethod = + lookupMethod(ResourceDecl, "__createFromImplicitBinding", SC_Static); } + Args.add(RValue::get(Space), AST.UnsignedIntTy); + Args.add(RValue::get(Range), AST.IntTy); + Args.add(RValue::get(Index), AST.UnsignedIntTy); Args.add(RValue::get(NameStr), AST.getPointerType(AST.CharTy.withConst())); + + return CreateMethod; +} + +static void callResourceInitMethod(CodeGenFunction &CGF, + CXXMethodDecl *CreateMethod, + CallArgList &Args, Address ReturnAddress) { + llvm::Constant *CalleeFn = CGF.CGM.GetAddrOfFunction(CreateMethod); + const FunctionProtoType *Proto = + CreateMethod->getType()->getAs(); + const CGFunctionInfo &FnInfo = + CGF.CGM.getTypes().arrangeFreeFunctionCall(Args, Proto, false); + ReturnValueSlot ReturnValue(ReturnAddress, false); + CGCallee Callee(CGCalleeInfo(Proto), CalleeFn); + 
CGF.EmitCall(FnInfo, Callee, ReturnValue, Args, nullptr); } // Initializes local resource array variable. For multi-dimensional arrays it // calls itself recursively to initialize its sub-arrays. The Index used in the // resource constructor calls will begin at StartIndex and will be incremented // for each array element. The last used resource Index is returned to the -// caller. -static Value *initializeLocalResourceArray( - CodeGenFunction &CGF, AggValueSlot &ValueSlot, - const ConstantArrayType *ArrayTy, CXXConstructorDecl *CD, +// caller. If the function returns std::nullopt, it indicates an error. +static std::optional initializeLocalResourceArray( + CodeGenFunction &CGF, CXXRecordDecl *ResourceDecl, + const ConstantArrayType *ArrayTy, AggValueSlot &ValueSlot, llvm::Value *Range, llvm::Value *StartIndex, StringRef ResourceName, HLSLResourceBindingAttr *RBA, HLSLVkBindingAttr *VkBinding, ArrayRef PrevGEPIndices, SourceLocation ArraySubsExprLoc) { + ASTContext &AST = CGF.getContext(); llvm::IntegerType *IntTy = CGF.CGM.IntTy; llvm::Value *Index = StartIndex; llvm::Value *One = llvm::ConstantInt::get(IntTy, 1); @@ -226,16 +218,19 @@ static Value *initializeLocalResourceArray( Index = CGF.Builder.CreateAdd(Index, One); GEPIndices.back() = llvm::ConstantInt::get(IntTy, I); } - Index = initializeLocalResourceArray( - CGF, ValueSlot, SubArrayTy, CD, Range, Index, ResourceName, RBA, - VkBinding, GEPIndices, ArraySubsExprLoc); + std::optional MaybeIndex = initializeLocalResourceArray( + CGF, ResourceDecl, SubArrayTy, ValueSlot, Range, Index, ResourceName, + RBA, VkBinding, GEPIndices, ArraySubsExprLoc); + if (!MaybeIndex) + return std::nullopt; + Index = *MaybeIndex; } return Index; } // For array of resources, initialize each resource in the array. 
llvm::Type *Ty = CGF.ConvertTypeForMem(ElemType); - CharUnits ElemSize = CD->getASTContext().getTypeSizeInChars(ElemType); + CharUnits ElemSize = AST.getTypeSizeInChars(ElemType); CharUnits Align = TmpArrayAddr.getAlignment().alignmentOfArrayElement(ElemSize); @@ -244,16 +239,21 @@ static Value *initializeLocalResourceArray( Index = CGF.Builder.CreateAdd(Index, One); GEPIndices.back() = llvm::ConstantInt::get(IntTy, I); } - Address ThisAddress = + Address ReturnAddress = CGF.Builder.CreateGEP(TmpArrayAddr, GEPIndices, Ty, Align); - llvm::Value *ThisPtr = CGF.getAsNaturalPointerTo(ThisAddress, ElemType); CallArgList Args; - createResourceCtorArgs(CGF.CGM, CD, ThisPtr, Range, Index, ResourceName, - RBA, VkBinding, Args); - CGF.EmitCXXConstructorCall(CD, Ctor_Complete, false, false, ThisAddress, - Args, ValueSlot.mayOverlap(), ArraySubsExprLoc, - ValueSlot.isSanitizerChecked()); + CXXMethodDecl *CreateMethod = lookupResourceInitMethodAndSetupArgs( + CGF.CGM, ResourceDecl, Range, Index, ResourceName, RBA, VkBinding, + Args); + + if (!CreateMethod) + // This can happen if someone creates an array of structs that looks like + // an HLSL resource record array but it does not have the required static + // create method. No binding will be generated for it. 
+ return std::nullopt; + + callResourceInitMethod(CGF, CreateMethod, Args, ReturnAddress); } return Index; } @@ -922,9 +922,9 @@ void CGHLSLRuntime::emitInitListOpaqueValues(CodeGenFunction &CGF, std::optional CGHLSLRuntime::emitResourceArraySubscriptExpr( const ArraySubscriptExpr *ArraySubsExpr, CodeGenFunction &CGF) { - assert(ArraySubsExpr->getType()->isHLSLResourceRecord() || - ArraySubsExpr->getType()->isHLSLResourceRecordArray() && - "expected resource array subscript expression"); + assert((ArraySubsExpr->getType()->isHLSLResourceRecord() || + ArraySubsExpr->getType()->isHLSLResourceRecordArray()) && + "expected resource array subscript expression"); // Let clang codegen handle local resource array subscripts, // or when the subscript references on opaque expression (as part of @@ -969,11 +969,6 @@ std::optional CGHLSLRuntime::emitResourceArraySubscriptExpr( QualType ResourceTy = ResultTy->isArrayType() ? AST.getBaseElementType(ResultTy) : ResultTy; - // Lookup the resource class constructor based on the resource type and - // binding. - CXXConstructorDecl *CD = findResourceConstructorDecl( - AST, ResourceTy, VkBinding || RBA->hasRegisterSlot()); - // Create a temporary variable for the result, which is either going // to be a single resource instance or a local array of resources (we need to // return an LValue). @@ -986,7 +981,6 @@ std::optional CGHLSLRuntime::emitResourceArraySubscriptExpr( TmpVar, Qualifiers(), AggValueSlot::IsDestructed_t(true), AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsAliased_t(false), AggValueSlot::DoesNotOverlap); - Address TmpVarAddress = ValueSlot.getAddress(); // Calculate total array size (= range size). llvm::Value *Range = @@ -995,27 +989,30 @@ std::optional CGHLSLRuntime::emitResourceArraySubscriptExpr( // If the result of the subscript operation is a single resource, call the // constructor. 
if (ResultTy == ResourceTy) { - QualType ThisType = CD->getThisType()->getPointeeType(); - llvm::Value *ThisPtr = CGF.getAsNaturalPointerTo(TmpVarAddress, ThisType); - - // Assemble the constructor parameters. CallArgList Args; - createResourceCtorArgs(CGM, CD, ThisPtr, Range, Index, ArrayDecl->getName(), - RBA, VkBinding, Args); - // Call the constructor. - CGF.EmitCXXConstructorCall(CD, Ctor_Complete, false, false, TmpVarAddress, - Args, ValueSlot.mayOverlap(), - ArraySubsExpr->getExprLoc(), - ValueSlot.isSanitizerChecked()); + CXXMethodDecl *CreateMethod = lookupResourceInitMethodAndSetupArgs( + CGF.CGM, ResourceTy->getAsCXXRecordDecl(), Range, Index, + ArrayDecl->getName(), RBA, VkBinding, Args); + + if (!CreateMethod) + // This can happen if someone creates an array of structs that looks like + // an HLSL resource record array but it does not have the required static + // create method. No binding will be generated for it. + return std::nullopt; + + callResourceInitMethod(CGF, CreateMethod, Args, ValueSlot.getAddress()); + } else { // The result of the subscript operation is a local resource array which // needs to be initialized. 
const ConstantArrayType *ArrayTy = cast(ResultTy.getTypePtr()); - initializeLocalResourceArray(CGF, ValueSlot, ArrayTy, CD, Range, Index, - ArrayDecl->getName(), RBA, VkBinding, - {llvm::ConstantInt::get(CGM.IntTy, 0)}, - ArraySubsExpr->getExprLoc()); + std::optional EndIndex = initializeLocalResourceArray( + CGF, ResourceTy->getAsCXXRecordDecl(), ArrayTy, ValueSlot, Range, Index, + ArrayDecl->getName(), RBA, VkBinding, + {llvm::ConstantInt::get(CGM.IntTy, 0)}, ArraySubsExpr->getExprLoc()); + if (!EndIndex) + return std::nullopt; } return CGF.MakeAddrLValue(TmpVar, ResultTy, AlignmentSource::Decl); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index e80aa1592f252..a503aaf613e30 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1037,12 +1037,15 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) CGM.getLangOpts().OpenMPOffloadMandatory, /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false, hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false); + Config.setDefaultTargetAS( + CGM.getContext().getTargetInfo().getTargetAddressSpace(LangAS::Default)); + + OMPBuilder.setConfig(Config); OMPBuilder.initialize(); OMPBuilder.loadOffloadInfoMetadata(*CGM.getFileSystem(), CGM.getLangOpts().OpenMPIsTargetDevice ? CGM.getLangOpts().OMPHostIRFile : StringRef{}); - OMPBuilder.setConfig(Config); // The user forces the compiler to behave as if omp requires // unified_shared_memory was given. 
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index a16dfb52f4d90..0b660e3daaf81 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2339,12 +2339,28 @@ llvm::ConstantInt *CodeGenModule::CreateCrossDsoCfiTypeId(llvm::Metadata *MD) { return llvm::ConstantInt::get(Int64Ty, llvm::MD5Hash(MDS->getString())); } -// Generalize pointer types to a void pointer with the qualifiers of the -// originally pointed-to type, e.g. 'const char *' and 'char * const *' -// generalize to 'const void *' while 'char *' and 'const char **' generalize to -// 'void *'. -static QualType GeneralizeType(ASTContext &Ctx, QualType Ty) { - if (!Ty->isPointerType()) +static QualType GeneralizeTransparentUnion(QualType Ty) { + const RecordType *UT = Ty->getAsUnionType(); + if (!UT) + return Ty; + const RecordDecl *UD = UT->getOriginalDecl()->getDefinitionOrSelf(); + if (!UD->hasAttr()) + return Ty; + for (const auto *it : UD->fields()) { + return it->getType(); + } + return Ty; +} + +// If `GeneralizePointers` is true, generalizes types to a void pointer with the +// qualifiers of the originally pointed-to type, e.g. 'const char *' and 'char * +// const *' generalize to 'const void *' while 'char *' and 'const char **' +// generalize to 'void *'. 
+static QualType GeneralizeType(ASTContext &Ctx, QualType Ty, + bool GeneralizePointers) { + Ty = GeneralizeTransparentUnion(Ty); + + if (!GeneralizePointers || !Ty->isPointerType()) return Ty; return Ctx.getPointerType( @@ -2353,26 +2369,29 @@ static QualType GeneralizeType(ASTContext &Ctx, QualType Ty) { } // Apply type generalization to a FunctionType's return and argument types -static QualType GeneralizeFunctionType(ASTContext &Ctx, QualType Ty) { +static QualType GeneralizeFunctionType(ASTContext &Ctx, QualType Ty, + bool GeneralizePointers) { if (auto *FnType = Ty->getAs()) { SmallVector GeneralizedParams; for (auto &Param : FnType->param_types()) - GeneralizedParams.push_back(GeneralizeType(Ctx, Param)); + GeneralizedParams.push_back( + GeneralizeType(Ctx, Param, GeneralizePointers)); - return Ctx.getFunctionType(GeneralizeType(Ctx, FnType->getReturnType()), - GeneralizedParams, FnType->getExtProtoInfo()); + return Ctx.getFunctionType( + GeneralizeType(Ctx, FnType->getReturnType(), GeneralizePointers), + GeneralizedParams, FnType->getExtProtoInfo()); } if (auto *FnType = Ty->getAs()) return Ctx.getFunctionNoProtoType( - GeneralizeType(Ctx, FnType->getReturnType())); + GeneralizeType(Ctx, FnType->getReturnType(), GeneralizePointers)); llvm_unreachable("Encountered unknown FunctionType"); } llvm::ConstantInt *CodeGenModule::CreateKCFITypeId(QualType T, StringRef Salt) { - if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers) - T = GeneralizeFunctionType(getContext(), T); + T = GeneralizeFunctionType( + getContext(), T, getCodeGenOpts().SanitizeCfiICallGeneralizePointers); if (auto *FnType = T->getAs()) T = getContext().getFunctionType( FnType->getReturnType(), FnType->getParamTypes(), @@ -3041,9 +3060,14 @@ void CodeGenModule::createFunctionTypeMetadataForIcall(const FunctionDecl *FD, if (isa(FD) && !cast(FD)->isStatic()) return; - llvm::Metadata *MD = CreateMetadataIdentifierForType(FD->getType()); + QualType FnType = GeneralizeFunctionType(getContext(), 
FD->getType(), + /*GeneralizePointers=*/false); + llvm::Metadata *MD = CreateMetadataIdentifierForType(FnType); F->addTypeMetadata(0, MD); - F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(FD->getType())); + + QualType GenPtrFnType = GeneralizeFunctionType(getContext(), FD->getType(), + /*GeneralizePointers=*/true); + F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(GenPtrFnType)); // Emit a hash-based bit set entry for cross-DSO calls. if (CodeGenOpts.SanitizeCfiCrossDso) @@ -4586,12 +4610,6 @@ void CodeGenModule::emitMultiVersionFunctions() { } llvm::Function *ResolverFunc = cast(ResolverConstant); - ResolverFunc->setLinkage(getMultiversionLinkage(*this, GD)); - - if (!ResolverFunc->hasLocalLinkage() && supportsCOMDAT()) - ResolverFunc->setComdat( - getModule().getOrInsertComdat(ResolverFunc->getName())); - const TargetInfo &TI = getTarget(); llvm::stable_sort( Options, [&TI](const CodeGenFunction::FMVResolverOption &LHS, @@ -4600,6 +4618,11 @@ void CodeGenModule::emitMultiVersionFunctions() { }); CodeGenFunction CGF(*this); CGF.EmitMultiVersionResolver(ResolverFunc, Options); + + setMultiVersionResolverAttributes(ResolverFunc, GD); + if (!ResolverFunc->hasLocalLinkage() && supportsCOMDAT()) + ResolverFunc->setComdat( + getModule().getOrInsertComdat(ResolverFunc->getName())); } // Ensure that any additions to the deferred decls list caused by emitting a @@ -4650,7 +4673,7 @@ void CodeGenModule::emitCPUDispatchDefinition(GlobalDecl GD) { auto *ResolverFunc = cast(GetOrCreateLLVMFunction( ResolverName, ResolverType, ResolverGD, /*ForVTable=*/false)); - ResolverFunc->setLinkage(getMultiversionLinkage(*this, GD)); + if (supportsCOMDAT()) ResolverFunc->setComdat( getModule().getOrInsertComdat(ResolverFunc->getName())); @@ -4716,6 +4739,7 @@ void CodeGenModule::emitCPUDispatchDefinition(GlobalDecl GD) { CodeGenFunction CGF(*this); CGF.EmitMultiVersionResolver(ResolverFunc, Options); + setMultiVersionResolverAttributes(ResolverFunc, GD); if 
(getTarget().supportsIFunc()) { llvm::GlobalValue::LinkageTypes Linkage = getMultiversionLinkage(*this, GD); @@ -4834,6 +4858,26 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { return Resolver; } +void CodeGenModule::setMultiVersionResolverAttributes(llvm::Function *Resolver, + GlobalDecl GD) { + const NamedDecl *D = dyn_cast_or_null(GD.getDecl()); + Resolver->setLinkage(getMultiversionLinkage(*this, GD)); + + // Function body has to be emitted before calling setGlobalVisibility + // for Resolver to be considered as definition. + setGlobalVisibility(Resolver, D); + + setDSOLocal(Resolver); + + // Set the default target-specific attributes, such as PAC and BTI ones on + // AArch64. Not passing Decl to prevent setting unrelated attributes, + // as Resolver can be shared by multiple declarations. + // FIXME Some targets may require a non-null D to set some attributes + // (such as "stackrealign" on X86, even when it is requested via + // "-mstackrealign" command line option). 
+ getTargetCodeGenInfo().setTargetAttributes(/*D=*/nullptr, Resolver, *this); +} + bool CodeGenModule::shouldDropDLLAttribute(const Decl *D, const llvm::GlobalValue *GV) const { auto SC = GV->getDLLStorageClass(); @@ -7934,6 +7978,15 @@ CodeGenModule::CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map, return InternalId; } +llvm::Metadata *CodeGenModule::CreateMetadataIdentifierForFnType(QualType T) { + assert(isa(T)); + T = GeneralizeFunctionType( + getContext(), T, getCodeGenOpts().SanitizeCfiICallGeneralizePointers); + if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers) + return CreateMetadataIdentifierGeneralized(T); + return CreateMetadataIdentifierForType(T); +} + llvm::Metadata *CodeGenModule::CreateMetadataIdentifierForType(QualType T) { return CreateMetadataIdentifierImpl(T, MetadataIdMap, ""); } @@ -7944,8 +7997,8 @@ CodeGenModule::CreateMetadataIdentifierForVirtualMemPtrType(QualType T) { } llvm::Metadata *CodeGenModule::CreateMetadataIdentifierGeneralized(QualType T) { - return CreateMetadataIdentifierImpl(GeneralizeFunctionType(getContext(), T), - GeneralizedMetadataIdMap, ".generalized"); + return CreateMetadataIdentifierImpl(T, GeneralizedMetadataIdMap, + ".generalized"); } /// Returns whether this module needs the "all-vtables" type identifier. diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index f62350fd8d378..3971b296b3f80 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1623,6 +1623,9 @@ class CodeGenModule : public CodeGenTypeCache { /// Generate a KCFI type identifier for T. llvm::ConstantInt *CreateKCFITypeId(QualType T, StringRef Salt); + /// Create a metadata identifier for the given function type. + llvm::Metadata *CreateMetadataIdentifierForFnType(QualType T); + /// Create a metadata identifier for the given type. This may either be an /// MDString (for external identifiers) or a distinct unnamed MDNode (for /// internal identifiers). 
@@ -1848,6 +1851,15 @@ class CodeGenModule : public CodeGenTypeCache { // that feature and for a regular function (llvm::GlobalValue) otherwise. llvm::Constant *GetOrCreateMultiVersionResolver(GlobalDecl GD); + // Set attributes to a resolver function generated by Clang. + // GD is either the cpu_dispatch declaration or an arbitrarily chosen + // function declaration that triggered the implicit generation of this + // resolver function. + // + /// NOTE: This should only be called for definitions. + void setMultiVersionResolverAttributes(llvm::Function *Resolver, + GlobalDecl GD); + // In scenarios where a function is not known to be a multiversion function // until a later declaration, it is sometimes necessary to change the // previously created mangled name to align with requirements of whatever diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 94190a149e859..19d9265247119 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -903,12 +903,19 @@ void MicrosoftCXXABI::emitVirtualObjectDelete(CodeGenFunction &CGF, const CXXDestructorDecl *Dtor) { // FIXME: Provide a source location here even though there's no // CXXMemberCallExpr for dtor call. - bool UseGlobalDelete = DE->isGlobalDelete(); - CXXDtorType DtorType = UseGlobalDelete ? Dtor_Complete : Dtor_Deleting; - llvm::Value *MDThis = EmitVirtualDestructorCall(CGF, Dtor, DtorType, Ptr, DE, - /*CallOrInvoke=*/nullptr); - if (UseGlobalDelete) - CGF.EmitDeleteCall(DE->getOperatorDelete(), MDThis, ElementType); + if (!getContext().getTargetInfo().callGlobalDeleteInDeletingDtor( + getContext().getLangOpts())) { + bool UseGlobalDelete = DE->isGlobalDelete(); + CXXDtorType DtorType = UseGlobalDelete ? 
Dtor_Complete : Dtor_Deleting; + llvm::Value *MDThis = + EmitVirtualDestructorCall(CGF, Dtor, DtorType, Ptr, DE, + /*CallOrInvoke=*/nullptr); + if (UseGlobalDelete) + CGF.EmitDeleteCall(DE->getOperatorDelete(), MDThis, ElementType); + } else { + EmitVirtualDestructorCall(CGF, Dtor, Dtor_Deleting, Ptr, DE, + /*CallOrInvoke=*/nullptr); + } } void MicrosoftCXXABI::emitRethrow(CodeGenFunction &CGF, bool isNoReturn) { @@ -2023,9 +2030,12 @@ llvm::Value *MicrosoftCXXABI::EmitVirtualDestructorCall( CGCallee Callee = CGCallee::forVirtual(CE, GD, This, Ty); ASTContext &Context = getContext(); - llvm::Value *ImplicitParam = llvm::ConstantInt::get( - llvm::IntegerType::getInt32Ty(CGF.getLLVMContext()), - DtorType == Dtor_Deleting); + bool IsDeleting = DtorType == Dtor_Deleting; + bool IsGlobalDelete = D && D->isGlobalDelete() && + Context.getTargetInfo().callGlobalDeleteInDeletingDtor( + Context.getLangOpts()); + llvm::Value *ImplicitParam = + CGF.Builder.getInt32((IsDeleting ? 1 : 0) | (IsGlobalDelete ? 4 : 0)); QualType ThisTy; if (CE) { diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index f4baf8c7f0dde..82b71e398dcc9 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -4304,9 +4304,11 @@ Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags, // size in bytes. if (Ops.size() == 5) { Function *StreamingVectorLength = - CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); + CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsd); llvm::Value *StreamingVectorLengthCall = - Builder.CreateCall(StreamingVectorLength); + Builder.CreateMul(Builder.CreateCall(StreamingVectorLength), + llvm::ConstantInt::get(Int64Ty, 8), "svl", + /* HasNUW */ true, /* HasNSW */ true); llvm::Value *Mulvl = Builder.CreateMul(StreamingVectorLengthCall, Ops[4], "mulvl"); // The type of the ptr parameter is void *, so use Int8Ty here. 
@@ -4918,6 +4920,26 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, // Handle builtins which require their multi-vector operands to be swapped swapCommutativeSMEOperands(BuiltinID, Ops); + auto isCntsBuiltin = [&]() { + switch (BuiltinID) { + default: + return 0; + case SME::BI__builtin_sme_svcntsb: + return 8; + case SME::BI__builtin_sme_svcntsh: + return 4; + case SME::BI__builtin_sme_svcntsw: + return 2; + } + }; + + if (auto Mul = isCntsBuiltin()) { + llvm::Value *Cntd = + Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsd)); + return Builder.CreateMul(Cntd, llvm::ConstantInt::get(Int64Ty, Mul), + "mulsvl", /* HasNUW */ true, /* HasNSW */ true); + } + // Should not happen! if (Builtin->LLVMIntrinsic == 0) return nullptr; diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index a4974e45caf10..b924407b6ddd7 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -1814,59 +1814,53 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); - // Builtin type is vXi64 so multiply by 8 to get bytes. - unsigned NumElts = ResultType->getNumElements() * 8; + auto *VecTy = cast(Ops[0]->getType()); + // Builtin type is vXi8. + unsigned NumElts = VecTy->getNumElements(); + Value *Zero = llvm::Constant::getNullValue(VecTy); // If pslldq is shifting the vector more than 15 bytes, emit zero. 
if (ShiftVal >= 16) - return llvm::Constant::getNullValue(ResultType); + return Zero; int Indices[64]; // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that for (unsigned l = 0; l != NumElts; l += 16) { for (unsigned i = 0; i != 16; ++i) { unsigned Idx = NumElts + i - ShiftVal; - if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand. + if (Idx < NumElts) + Idx -= NumElts - 16; // end of lane, switch operand. Indices[l + i] = Idx + l; } } - - auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); - Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - Value *SV = Builder.CreateShuffleVector( - Zero, Cast, ArrayRef(Indices, NumElts), "pslldq"); - return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast"); + return Builder.CreateShuffleVector(Zero, Ops[0], ArrayRef(Indices, NumElts), + "pslldq"); } case X86::BI__builtin_ia32_psrldqi128_byteshift: case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); - // Builtin type is vXi64 so multiply by 8 to get bytes. - unsigned NumElts = ResultType->getNumElements() * 8; + auto *VecTy = cast(Ops[0]->getType()); + // Builtin type is vXi8. + unsigned NumElts = VecTy->getNumElements(); + Value *Zero = llvm::Constant::getNullValue(VecTy); // If psrldq is shifting the vector more than 15 bytes, emit zero. if (ShiftVal >= 16) - return llvm::Constant::getNullValue(ResultType); + return Zero; int Indices[64]; // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that for (unsigned l = 0; l != NumElts; l += 16) { for (unsigned i = 0; i != 16; ++i) { unsigned Idx = i + ShiftVal; - if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand. + if (Idx >= 16) + Idx += NumElts - 16; // end of lane, switch operand. 
Indices[l + i] = Idx + l; } } - - auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); - Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - Value *SV = Builder.CreateShuffleVector( - Cast, Zero, ArrayRef(Indices, NumElts), "psrldq"); - return Builder.CreateBitCast(SV, ResultType, "cast"); + return Builder.CreateShuffleVector(Ops[0], Zero, ArrayRef(Indices, NumElts), + "psrldq"); } case X86::BI__builtin_ia32_kshiftliqi: case X86::BI__builtin_ia32_kshiftlihi: diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 946b1e39af3b9..e7aabee273a34 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1370,6 +1370,11 @@ static void handlePAuthABI(const ArgList &DriverArgs, ArgStringList &CC1Args) { options::OPT_fno_ptrauth_vtable_pointer_type_discrimination)) CC1Args.push_back("-fptrauth-vtable-pointer-type-discrimination"); + if (!DriverArgs.hasArg( + options::OPT_fptrauth_type_info_vtable_pointer_discrimination, + options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination)) + CC1Args.push_back("-fptrauth-type-info-vtable-pointer-discrimination"); + if (!DriverArgs.hasArg(options::OPT_fptrauth_indirect_gotos, options::OPT_fno_ptrauth_indirect_gotos)) CC1Args.push_back("-fptrauth-indirect-gotos"); @@ -1377,6 +1382,15 @@ static void handlePAuthABI(const ArgList &DriverArgs, ArgStringList &CC1Args) { if (!DriverArgs.hasArg(options::OPT_fptrauth_init_fini, options::OPT_fno_ptrauth_init_fini)) CC1Args.push_back("-fptrauth-init-fini"); + + if (!DriverArgs.hasArg( + options::OPT_fptrauth_init_fini_address_discrimination, + options::OPT_fno_ptrauth_init_fini_address_discrimination)) + CC1Args.push_back("-fptrauth-init-fini-address-discrimination"); + + if (!DriverArgs.hasArg(options::OPT_faarch64_jump_table_hardening, + options::OPT_fno_aarch64_jump_table_hardening)) + CC1Args.push_back("-faarch64-jump-table-hardening"); 
} static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, @@ -4393,6 +4407,13 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T, // object file generation and no IR generation, -gN should not be needed. So // allow -gsplit-dwarf with either -gN or IR input. if (IRInput || Args.hasArg(options::OPT_g_Group)) { + // FIXME: -gsplit-dwarf on AIX is currently unimplemented. + if (TC.getTriple().isOSAIX() && Args.hasArg(options::OPT_gsplit_dwarf)) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << Args.getLastArg(options::OPT_gsplit_dwarf)->getSpelling() + << TC.getTriple().str(); + return; + } Arg *SplitDWARFArg; DwarfFission = getDebugFissionKind(D, Args, SplitDWARFArg); if (DwarfFission != DwarfFissionKind::None && @@ -6854,6 +6875,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_unroll_loops); Args.AddLastArg(CmdArgs, options::OPT_floop_interchange, options::OPT_fno_loop_interchange); + Args.addOptInFlag(CmdArgs, options::OPT_fexperimental_loop_fusion, + options::OPT_fno_experimental_loop_fusion); Args.AddLastArg(CmdArgs, options::OPT_fstrict_flex_arrays_EQ); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index b50549219e4e8..08cd98fd04df0 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -3520,6 +3520,7 @@ std::string tools::complexRangeKindToStr(LangOptions::ComplexRangeKind Range) { return "none"; break; } + llvm_unreachable("Fully covered switch above"); } std::string diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 1535f4cebf436..12e510ab1562d 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -134,12 +134,17 @@ void Flang::addOtherOptions(const ArgList &Args, ArgStringList &CmdArgs) const { if (Args.hasArg(options::OPT_gN_Group)) { Arg *gNArg = 
Args.getLastArg(options::OPT_gN_Group); DebugInfoKind = debugLevelToInfoKind(*gNArg); - } else if (Args.hasArg(options::OPT_g_Flag)) { + } else if (Args.hasArg(options::OPT_g_Group)) { DebugInfoKind = llvm::codegenoptions::FullDebugInfo; } else { DebugInfoKind = llvm::codegenoptions::NoDebugInfo; } addDebugInfoKind(CmdArgs, DebugInfoKind); + if (getDwarfNArg(Args)) { + const unsigned DwarfVersion = getDwarfVersion(getToolChain(), Args); + CmdArgs.push_back( + Args.MakeArgString("-dwarf-version=" + Twine(DwarfVersion))); + } } void Flang::addCodegenOptions(const ArgList &Args, @@ -151,6 +156,9 @@ void Flang::addCodegenOptions(const ArgList &Args, !stackArrays->getOption().matches(options::OPT_fno_stack_arrays)) CmdArgs.push_back("-fstack-arrays"); + Args.addOptInFlag(CmdArgs, options::OPT_fexperimental_loop_fusion, + options::OPT_fno_experimental_loop_fusion); + handleInterchangeLoopsArgs(Args, CmdArgs); handleVectorizeLoopsArgs(Args, CmdArgs); handleVectorizeSLPArgs(Args, CmdArgs); diff --git a/clang/lib/Driver/ToolChains/FreeBSD.h b/clang/lib/Driver/ToolChains/FreeBSD.h index 7ab63905ed4f9..7d090ba682b30 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.h +++ b/clang/lib/Driver/ToolChains/FreeBSD.h @@ -78,6 +78,11 @@ class LLVM_LIBRARY_VISIBILITY FreeBSD : public Generic_ELF { void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + bool IsAArch64OutlineAtomicsDefault( + const llvm::opt::ArgList &Args) const override { + return true; + } + UnwindTableLevel getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override; bool isPIEDefault(const llvm::opt::ArgList &Args) const override; diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 21e23d486f9d4..61afc61a53dfd 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -343,6 +343,18 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction 
&JA, // whether or not that will be the case at this point. So, unconditionally // pass LTO options to ensure proper codegen, metadata production, etc if // LTO indeed occurs. + + if (const Arg *A = Args.getLastArg(options::OPT_fthinlto_distributor_EQ)) { + CmdArgs.push_back( + Args.MakeArgString("--thinlto-distributor=" + Twine(A->getValue()))); + CmdArgs.push_back(Args.MakeArgString("--thinlto-remote-compiler=" + + Twine(D.getClangProgramPath()))); + + for (const auto &A : + Args.getAllArgValues(options::OPT_Xthinlto_distributor_EQ)) + CmdArgs.push_back(Args.MakeArgString("--thinlto-distributor-arg=" + A)); + } + if (Args.hasFlag(options::OPT_funified_lto, options::OPT_fno_unified_lto, true)) CmdArgs.push_back(D.getLTOMode() == LTOK_Thin ? "--lto=thin" diff --git a/clang/lib/Format/CMakeLists.txt b/clang/lib/Format/CMakeLists.txt index 24f435d2caee1..50c0683dc9b7f 100644 --- a/clang/lib/Format/CMakeLists.txt +++ b/clang/lib/Format/CMakeLists.txt @@ -13,6 +13,7 @@ add_clang_library(clangFormat MacroExpander.cpp MatchFilePath.cpp NamespaceEndCommentsFixer.cpp + NumericLiteralCaseFixer.cpp NumericLiteralInfo.cpp ObjCPropertyAttributeOrderFixer.cpp QualifierAlignmentFixer.cpp diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 888d0faf80931..9413c13a4137e 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -780,19 +780,21 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // Indent preprocessor directives after the hash if required. 
int PPColumnCorrection = 0; - if (Style.IndentPPDirectives == FormatStyle::PPDIS_AfterHash && - Previous.is(tok::hash) && State.FirstIndent > 0 && - &Previous == State.Line->First && + if (&Previous == State.Line->First && Previous.is(tok::hash) && (State.Line->Type == LT_PreprocessorDirective || State.Line->Type == LT_ImportStatement)) { - Spaces += State.FirstIndent; - - // For preprocessor indent with tabs, State.Column will be 1 because of the - // hash. This causes second-level indents onward to have an extra space - // after the tabs. We avoid this misalignment by subtracting 1 from the - // column value passed to replaceWhitespace(). - if (Style.UseTab != FormatStyle::UT_Never) - PPColumnCorrection = -1; + if (Style.IndentPPDirectives == FormatStyle::PPDIS_AfterHash) { + Spaces += State.FirstIndent; + + // For preprocessor indent with tabs, State.Column will be 1 because of + // the hash. This causes second-level indents onward to have an extra + // space after the tabs. We avoid this misalignment by subtracting 1 from + // the column value passed to replaceWhitespace(). 
+ if (Style.UseTab != FormatStyle::UT_Never) + PPColumnCorrection = -1; + } else if (Style.IndentPPDirectives == FormatStyle::PPDIS_Leave) { + Spaces += Current.OriginalColumn - Previous.OriginalColumn - 1; + } } if (!DryRun) { diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index e3b22cdabaccd..68e9618432035 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -16,6 +16,7 @@ #include "DefinitionBlockSeparator.h" #include "IntegerLiteralSeparatorFixer.h" #include "NamespaceEndCommentsFixer.h" +#include "NumericLiteralCaseFixer.h" #include "ObjCPropertyAttributeOrderFixer.h" #include "QualifierAlignmentFixer.h" #include "SortJavaScriptImports.h" @@ -472,6 +473,25 @@ struct ScalarEnumerationTraits { } }; +template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, + FormatStyle::NumericLiteralComponentStyle &Value) { + IO.enumCase(Value, "Leave", FormatStyle::NLCS_Leave); + IO.enumCase(Value, "Upper", FormatStyle::NLCS_Upper); + IO.enumCase(Value, "Lower", FormatStyle::NLCS_Lower); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, FormatStyle::NumericLiteralCaseStyle &Value) { + IO.mapOptional("ExponentLetter", Value.ExponentLetter); + IO.mapOptional("HexDigit", Value.HexDigit); + IO.mapOptional("Prefix", Value.Prefix); + IO.mapOptional("Suffix", Value.Suffix); + } +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, FormatStyle::OperandAlignmentStyle &Value) { IO.enumCase(Value, "DontAlign", FormatStyle::OAS_DontAlign); @@ -515,6 +535,7 @@ struct ScalarEnumerationTraits { IO.enumCase(Value, "None", FormatStyle::PPDIS_None); IO.enumCase(Value, "AfterHash", FormatStyle::PPDIS_AfterHash); IO.enumCase(Value, "BeforeHash", FormatStyle::PPDIS_BeforeHash); + IO.enumCase(Value, "Leave", FormatStyle::PPDIS_Leave); } }; @@ -1121,6 +1142,7 @@ template <> struct MappingTraits { IO.mapOptional("MaxEmptyLinesToKeep", Style.MaxEmptyLinesToKeep); 
IO.mapOptional("NamespaceIndentation", Style.NamespaceIndentation); IO.mapOptional("NamespaceMacros", Style.NamespaceMacros); + IO.mapOptional("NumericLiteralCase", Style.NumericLiteralCase); IO.mapOptional("ObjCBinPackProtocolList", Style.ObjCBinPackProtocolList); IO.mapOptional("ObjCBlockIndentWidth", Style.ObjCBlockIndentWidth); IO.mapOptional("ObjCBreakBeforeNestedBlockParam", @@ -1653,6 +1675,10 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.LineEnding = FormatStyle::LE_DeriveLF; LLVMStyle.MaxEmptyLinesToKeep = 1; LLVMStyle.NamespaceIndentation = FormatStyle::NI_None; + LLVMStyle.NumericLiteralCase = {/*ExponentLetter=*/FormatStyle::NLCS_Leave, + /*HexDigit=*/FormatStyle::NLCS_Leave, + /*Prefix=*/FormatStyle::NLCS_Leave, + /*Suffix=*/FormatStyle::NLCS_Leave}; LLVMStyle.ObjCBinPackProtocolList = FormatStyle::BPS_Auto; LLVMStyle.ObjCBlockIndentWidth = 2; LLVMStyle.ObjCBreakBeforeNestedBlockParam = true; @@ -3890,6 +3916,10 @@ reformat(const FormatStyle &Style, StringRef Code, return IntegerLiteralSeparatorFixer().process(Env, Expanded); }); + Passes.emplace_back([&](const Environment &Env) { + return NumericLiteralCaseFixer().process(Env, Expanded); + }); + if (Style.isCpp()) { if (Style.QualifierAlignment != FormatStyle::QAS_Leave) addQualifierAlignmentFixerPasses(Expanded, Passes); @@ -4087,6 +4117,7 @@ LangOptions getFormattingLangOpts(const FormatStyle &Style) { switch (Style.Language) { case FormatStyle::LK_C: LangOpts.C11 = 1; + LangOpts.C23 = 1; break; case FormatStyle::LK_Cpp: case FormatStyle::LK_ObjC: diff --git a/clang/lib/Format/NumericLiteralCaseFixer.cpp b/clang/lib/Format/NumericLiteralCaseFixer.cpp new file mode 100644 index 0000000000000..b58b3c7ee0189 --- /dev/null +++ b/clang/lib/Format/NumericLiteralCaseFixer.cpp @@ -0,0 +1,177 @@ +//===--- NumericLiteralCaseFixer.cpp ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements NumericLiteralCaseFixer that standardizes character +/// case within numeric literals. +/// +//===----------------------------------------------------------------------===// + +#include "NumericLiteralCaseFixer.h" +#include "NumericLiteralInfo.h" + +#include "llvm/ADT/StringExtras.h" + +#include + +namespace clang { +namespace format { + +static bool isNumericLiteralCaseFixerNeeded(const FormatStyle &Style) { + // Check if language is supported. + switch (Style.Language) { + case FormatStyle::LK_C: + case FormatStyle::LK_Cpp: + case FormatStyle::LK_ObjC: + case FormatStyle::LK_CSharp: + case FormatStyle::LK_Java: + case FormatStyle::LK_JavaScript: + break; + default: + return false; + } + + // Check if style options are set. + const auto &Option = Style.NumericLiteralCase; + const auto Leave = FormatStyle::NLCS_Leave; + return Option.Prefix != Leave || Option.HexDigit != Leave || + Option.ExponentLetter != Leave || Option.Suffix != Leave; +} + +static std::string +transformComponent(StringRef Component, + FormatStyle::NumericLiteralComponentStyle ConfigValue) { + switch (ConfigValue) { + case FormatStyle::NLCS_Upper: + return Component.upper(); + case FormatStyle::NLCS_Lower: + return Component.lower(); + default: + // Covers FormatStyle::NLCS_Leave. + return Component.str(); + } +} + +/// Test if Suffix matches a C++ literal reserved by the library. +/// Matches against all suffixes reserved in the C++23 standard. +static bool matchesReservedSuffix(StringRef Suffix) { + static constexpr std::array SortedReservedSuffixes = { + "d", "h", "i", "if", "il", "min", "ms", "ns", "s", "us", "y", + }; + + // This can be static_assert when we have access to constexpr is_sorted in + // C++ 20. 
+ assert(llvm::is_sorted(SortedReservedSuffixes) && + "Must be sorted as precondition for lower_bound()."); + + auto entry = llvm::lower_bound(SortedReservedSuffixes, Suffix); + if (entry == SortedReservedSuffixes.cend()) + return false; + return *entry == Suffix; +} + +static std::string format(StringRef NumericLiteral, const FormatStyle &Style) { + const char Separator = Style.isCpp() ? '\'' : '_'; + const NumericLiteralInfo Info(NumericLiteral, Separator); + const bool HasBaseLetter = Info.BaseLetterPos != StringRef::npos; + const bool HasExponent = Info.ExponentLetterPos != StringRef::npos; + const bool HasSuffix = Info.SuffixPos != StringRef::npos; + + std::string Formatted; + + if (HasBaseLetter) { + Formatted += + transformComponent(NumericLiteral.take_front(1 + Info.BaseLetterPos), + Style.NumericLiteralCase.Prefix); + } + // Reformat this slice as HexDigit whether or not the digit has hexadecimal + // characters because binary/decimal/octal digits are unchanged. + Formatted += transformComponent( + NumericLiteral.slice(HasBaseLetter ? 1 + Info.BaseLetterPos : 0, + HasExponent ? Info.ExponentLetterPos + : HasSuffix ? Info.SuffixPos + : NumericLiteral.size()), + Style.NumericLiteralCase.HexDigit); + + if (HasExponent) { + Formatted += transformComponent( + NumericLiteral.slice(Info.ExponentLetterPos, + HasSuffix ? Info.SuffixPos + : NumericLiteral.size()), + Style.NumericLiteralCase.ExponentLetter); + } + + if (HasSuffix) { + StringRef Suffix = NumericLiteral.drop_front(Info.SuffixPos); + if (matchesReservedSuffix(Suffix) || Suffix.front() == '_') { + // In C++, it is idiomatic, but NOT standardized to define user-defined + // literals with a leading '_'. Omit user defined literals and standard + // reserved suffixes from transformation. 
+ Formatted += Suffix.str(); + } else { + Formatted += transformComponent(Suffix, Style.NumericLiteralCase.Suffix); + } + } + + return Formatted; +} + +std::pair +NumericLiteralCaseFixer::process(const Environment &Env, + const FormatStyle &Style) { + if (!isNumericLiteralCaseFixerNeeded(Style)) + return {}; + + const auto &SourceMgr = Env.getSourceManager(); + AffectedRangeManager AffectedRangeMgr(SourceMgr, Env.getCharRanges()); + + const auto ID = Env.getFileID(); + const auto LangOpts = getFormattingLangOpts(Style); + Lexer Lex(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts); + Lex.SetCommentRetentionState(true); + + Token Tok; + tooling::Replacements Result; + + for (bool Skip = false; !Lex.LexFromRawLexer(Tok);) { + // Skip tokens that are too small to contain a formattable literal. + // Size=2 is the smallest possible literal that could contain formattable + // components, for example "1u". + auto Length = Tok.getLength(); + if (Length < 2) + continue; + + // Service clang-format off/on comments. 
+ auto Location = Tok.getLocation(); + auto Text = StringRef(SourceMgr.getCharacterData(Location), Length); + if (Tok.is(tok::comment)) { + if (isClangFormatOff(Text)) + Skip = true; + else if (isClangFormatOn(Text)) + Skip = false; + continue; + } + + if (Skip || Tok.isNot(tok::numeric_constant) || + !AffectedRangeMgr.affectsCharSourceRange( + CharSourceRange::getCharRange(Location, Tok.getEndLoc()))) { + continue; + } + + const auto Formatted = format(Text, Style); + if (Formatted != Text) { + cantFail(Result.add( + tooling::Replacement(SourceMgr, Location, Length, Formatted))); + } + } + + return {Result, 0}; +} + +} // namespace format +} // namespace clang diff --git a/clang/lib/Format/NumericLiteralCaseFixer.h b/clang/lib/Format/NumericLiteralCaseFixer.h new file mode 100644 index 0000000000000..ac3ac30d1d19a --- /dev/null +++ b/clang/lib/Format/NumericLiteralCaseFixer.h @@ -0,0 +1,32 @@ +//===--- NumericLiteralCaseFixer.h ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares NumericLiteralCaseFixer that standardizes character case +/// within numeric literals. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_FORMAT_NUMERICLITERALCASEFIXER_H +#define LLVM_CLANG_LIB_FORMAT_NUMERICLITERALCASEFIXER_H + +#include "TokenAnalyzer.h" + +namespace clang { +namespace format { + +class NumericLiteralCaseFixer { +public: + std::pair process(const Environment &Env, + const FormatStyle &Style); +}; + +} // end namespace format +} // end namespace clang + +#endif diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index bbb7ef2c337d6..d97f56751ea69 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3627,7 +3627,7 @@ void TokenAnnotator::setCommentLineLevels( // Align comments for preprocessor lines with the # in column 0 if // preprocessor lines are not indented. Otherwise, align with the next // line. - Line->Level = Style.IndentPPDirectives != FormatStyle::PPDIS_BeforeHash && + Line->Level = Style.IndentPPDirectives < FormatStyle::PPDIS_BeforeHash && PPDirectiveOrImportStmt ? 0 : NextNonCommentLine->Level; diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 2a7bfd1a7dc5b..ac9d147defc13 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -62,10 +62,16 @@ class LevelIndentTracker { // having the right size in adjustToUnmodifiedline. if (Line.Level >= IndentForLevel.size()) IndentForLevel.resize(Line.Level + 1, -1); - if (Style.IndentPPDirectives != FormatStyle::PPDIS_None && - (Line.InPPDirective || - (Style.IndentPPDirectives == FormatStyle::PPDIS_BeforeHash && - Line.Type == LT_CommentAbovePPDirective))) { + if (Style.IndentPPDirectives == FormatStyle::PPDIS_Leave && + (Line.InPPDirective || Line.Type == LT_CommentAbovePPDirective)) { + Indent = Line.InMacroBody + ? 
(Line.Level - Line.PPLevel) * Style.IndentWidth + + AdditionalIndent + : Line.First->OriginalColumn; + } else if (Style.IndentPPDirectives != FormatStyle::PPDIS_None && + (Line.InPPDirective || + (Style.IndentPPDirectives == FormatStyle::PPDIS_BeforeHash && + Line.Type == LT_CommentAbovePPDirective))) { unsigned PPIndentWidth = (Style.PPIndentWidth >= 0) ? Style.PPIndentWidth : Style.IndentWidth; Indent = Line.InMacroBody @@ -1656,7 +1662,7 @@ void UnwrappedLineFormatter::formatFirstToken( // Preprocessor directives get indented before the hash only if specified. In // Javascript import statements are indented like normal statements. if (!Style.isJavaScript() && - Style.IndentPPDirectives != FormatStyle::PPDIS_BeforeHash && + Style.IndentPPDirectives < FormatStyle::PPDIS_BeforeHash && (Line.Type == LT_PreprocessorDirective || Line.Type == LT_ImportStatement)) { Indent = 0; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index f4bbfcf8461bc..2c9766c9b7bc0 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -162,17 +162,13 @@ UnwrappedLineParser::UnwrappedLineParser( LangOpts(getFormattingLangOpts(Style)), Keywords(Keywords), CommentPragmasRegex(Style.CommentPragmas), Tokens(nullptr), Callback(Callback), AllTokens(Tokens), PPBranchLevel(-1), - IncludeGuard(Style.IndentPPDirectives == FormatStyle::PPDIS_None - ? IG_Rejected - : IG_Inited), + IncludeGuard(getIncludeGuardState(Style.IndentPPDirectives)), IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn), Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {} void UnwrappedLineParser::reset() { PPBranchLevel = -1; - IncludeGuard = Style.IndentPPDirectives == FormatStyle::PPDIS_None - ? 
IG_Rejected - : IG_Inited; + IncludeGuard = getIncludeGuardState(Style.IndentPPDirectives); IncludeGuardToken = nullptr; Line.reset(new UnwrappedLine); CommentsBeforeNextToken.clear(); @@ -1140,7 +1136,7 @@ void UnwrappedLineParser::parsePPEndIf() { // If the #endif of a potential include guard is the last thing in the file, // then we found an include guard. if (IncludeGuard == IG_Defined && PPBranchLevel == -1 && Tokens->isEOF() && - Style.IndentPPDirectives != FormatStyle::PPDIS_None) { + getIncludeGuardState(Style.IndentPPDirectives) == IG_Inited) { IncludeGuard = IG_Found; } } diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index 8e29680ff244b..8b8ad84896f1a 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -397,6 +397,13 @@ class UnwrappedLineParser { // Current state of include guard search. IncludeGuardState IncludeGuard; + IncludeGuardState + getIncludeGuardState(FormatStyle::PPDirectiveIndentStyle Style) const { + return Style == FormatStyle::PPDIS_None || Style == FormatStyle::PPDIS_Leave + ? IG_Rejected + : IG_Inited; + } + // Points to the #ifndef condition for a potential include guard. Null unless // IncludeGuardState == IG_IfNdefed. FormatToken *IncludeGuardToken; diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index 03b08cdabe39e..8b35af152cbc8 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -1189,10 +1189,12 @@ bool ASTUnit::Parse(std::shared_ptr PCHContainerOps, // Ensure that Clang has a FileManager with the right VFS, which may have // changed above in AddImplicitPreamble. If VFS is nullptr, rely on // createFileManager to create one. 
- if (VFS && FileMgr && &FileMgr->getVirtualFileSystem() == VFS) + if (VFS && FileMgr && &FileMgr->getVirtualFileSystem() == VFS) { + Clang->setVirtualFileSystem(std::move(VFS)); Clang->setFileManager(FileMgr); - else { - Clang->createFileManager(std::move(VFS)); + } else { + Clang->setVirtualFileSystem(std::move(VFS)); + Clang->createFileManager(); FileMgr = Clang->getFileManagerPtr(); } diff --git a/clang/lib/Frontend/ChainedIncludesSource.cpp b/clang/lib/Frontend/ChainedIncludesSource.cpp index 013814a738a36..82249f893a795 100644 --- a/clang/lib/Frontend/ChainedIncludesSource.cpp +++ b/clang/lib/Frontend/ChainedIncludesSource.cpp @@ -124,6 +124,7 @@ clang::createChainedIncludesSource(CompilerInstance &CI, auto Clang = std::make_unique( std::move(CInvok), CI.getPCHContainerOperations()); + Clang->createVirtualFileSystem(); Clang->setDiagnostics(Diags); Clang->setTarget(TargetInfo::CreateTargetInfo( Clang->getDiagnostics(), Clang->getInvocation().getTargetOpts())); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 31a8d75fec4bd..d6f3aec981336 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -159,17 +159,11 @@ bool CompilerInstance::createTarget() { return true; } -llvm::vfs::FileSystem &CompilerInstance::getVirtualFileSystem() const { - return getFileManager().getVirtualFileSystem(); -} - -llvm::IntrusiveRefCntPtr -CompilerInstance::getVirtualFileSystemPtr() const { - return getFileManager().getVirtualFileSystemPtr(); -} - -void CompilerInstance::setFileManager( - llvm::IntrusiveRefCntPtr Value) { +void CompilerInstance::setFileManager(IntrusiveRefCntPtr Value) { + if (!hasVirtualFileSystem()) + setVirtualFileSystem(Value->getVirtualFileSystemPtr()); + assert(Value == nullptr || + getVirtualFileSystemPtr() == Value->getVirtualFileSystemPtr()); FileMgr = std::move(Value); } @@ -271,24 +265,31 @@ static void collectIncludePCH(CompilerInstance &CI, static void 
collectVFSEntries(CompilerInstance &CI, std::shared_ptr MDC) { - if (CI.getHeaderSearchOpts().VFSOverlayFiles.empty()) - return; - // Collect all VFS found. SmallVector VFSEntries; - for (const std::string &VFSFile : CI.getHeaderSearchOpts().VFSOverlayFiles) { - llvm::ErrorOr> Buffer = - llvm::MemoryBuffer::getFile(VFSFile); - if (!Buffer) - return; - llvm::vfs::collectVFSFromYAML(std::move(Buffer.get()), - /*DiagHandler*/ nullptr, VFSFile, VFSEntries); - } + CI.getVirtualFileSystem().visit([&](llvm::vfs::FileSystem &VFS) { + if (auto *RedirectingVFS = dyn_cast(&VFS)) + llvm::vfs::collectVFSEntries(*RedirectingVFS, VFSEntries); + }); for (auto &E : VFSEntries) MDC->addFile(E.VPath, E.RPath); } +void CompilerInstance::createVirtualFileSystem( + IntrusiveRefCntPtr BaseFS, DiagnosticConsumer *DC) { + DiagnosticOptions DiagOpts; + DiagnosticsEngine Diags(DiagnosticIDs::create(), DiagOpts, DC, + /*ShouldOwnClient=*/false); + + VFS = createVFSFromCompilerInvocation(getInvocation(), Diags, + std::move(BaseFS)); + // FIXME: Should this go into createVFSFromCompilerInvocation? 
+ if (getFrontendOpts().ShowStats) + VFS = + llvm::makeIntrusiveRefCnt(std::move(VFS)); +} + // Diagnostics static void SetUpDiagnosticLog(DiagnosticOptions &DiagOpts, const CodeGenOptions *CodeGenOpts, @@ -340,11 +341,10 @@ static void SetupSerializedDiagnostics(DiagnosticOptions &DiagOpts, } } -void CompilerInstance::createDiagnostics(llvm::vfs::FileSystem &VFS, - DiagnosticConsumer *Client, +void CompilerInstance::createDiagnostics(DiagnosticConsumer *Client, bool ShouldOwnClient) { - Diagnostics = createDiagnostics(VFS, getDiagnosticOpts(), Client, - ShouldOwnClient, &getCodeGenOpts()); + Diagnostics = createDiagnostics(getVirtualFileSystem(), getDiagnosticOpts(), + Client, ShouldOwnClient, &getCodeGenOpts()); } IntrusiveRefCntPtr CompilerInstance::createDiagnostics( @@ -382,18 +382,9 @@ IntrusiveRefCntPtr CompilerInstance::createDiagnostics( // File Manager -FileManager *CompilerInstance::createFileManager( - IntrusiveRefCntPtr VFS) { - if (!VFS) - VFS = FileMgr ? FileMgr->getVirtualFileSystemPtr() - : createVFSFromCompilerInvocation(getInvocation(), - getDiagnostics()); - assert(VFS && "FileManager has no VFS?"); - if (getFrontendOpts().ShowStats) - VFS = - llvm::makeIntrusiveRefCnt(std::move(VFS)); - FileMgr = llvm::makeIntrusiveRefCnt(getFileSystemOpts(), - std::move(VFS)); +FileManager *CompilerInstance::createFileManager() { + assert(VFS && "CompilerInstance needs a VFS for creating FileManager"); + FileMgr = llvm::makeIntrusiveRefCnt(getFileSystemOpts(), VFS); return FileMgr.get(); } @@ -1174,20 +1165,21 @@ std::unique_ptr CompilerInstance::cloneForModuleCompileImpl( auto &Inv = Instance.getInvocation(); if (ThreadSafeConfig) { - Instance.createFileManager(ThreadSafeConfig->getVFS()); + Instance.setVirtualFileSystem(ThreadSafeConfig->getVFS()); + Instance.createFileManager(); } else if (FrontendOpts.ModulesShareFileManager) { + Instance.setVirtualFileSystem(getVirtualFileSystemPtr()); Instance.setFileManager(getFileManagerPtr()); } else { - 
Instance.createFileManager(getVirtualFileSystemPtr()); + Instance.setVirtualFileSystem(getVirtualFileSystemPtr()); + Instance.createFileManager(); } if (ThreadSafeConfig) { - Instance.createDiagnostics(Instance.getVirtualFileSystem(), - &ThreadSafeConfig->getDiagConsumer(), + Instance.createDiagnostics(&ThreadSafeConfig->getDiagConsumer(), /*ShouldOwnClient=*/false); } else { Instance.createDiagnostics( - Instance.getVirtualFileSystem(), new ForwardingDiagnosticConsumer(getDiagnosticClient()), /*ShouldOwnClient=*/true); } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 931766db4b0c8..422375240bab6 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1680,6 +1680,9 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, else GenerateArg(Consumer, OPT_fno_loop_interchange); + if (Opts.FuseLoops) + GenerateArg(Consumer, OPT_fexperimental_loop_fusion); + if (!Opts.BinutilsVersion.empty()) GenerateArg(Consumer, OPT_fbinutils_version_EQ, Opts.BinutilsVersion); @@ -1975,9 +1978,10 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, } const llvm::Triple::ArchType DebugEntryValueArchs[] = { - llvm::Triple::x86, llvm::Triple::x86_64, llvm::Triple::aarch64, - llvm::Triple::arm, llvm::Triple::armeb, llvm::Triple::mips, - llvm::Triple::mipsel, llvm::Triple::mips64, llvm::Triple::mips64el}; + llvm::Triple::x86, llvm::Triple::x86_64, llvm::Triple::aarch64, + llvm::Triple::arm, llvm::Triple::armeb, llvm::Triple::mips, + llvm::Triple::mipsel, llvm::Triple::mips64, llvm::Triple::mips64el, + llvm::Triple::riscv32, llvm::Triple::riscv64}; if (Opts.OptimizationLevel > 0 && Opts.hasReducedDebugInfo() && llvm::is_contained(DebugEntryValueArchs, T.getArch())) @@ -2000,6 +2004,8 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, (Opts.OptimizationLevel > 1)); Opts.InterchangeLoops = 
Args.hasFlag(OPT_floop_interchange, OPT_fno_loop_interchange, false); + Opts.FuseLoops = Args.hasFlag(OPT_fexperimental_loop_fusion, + OPT_fno_experimental_loop_fusion, false); Opts.BinutilsVersion = std::string(Args.getLastArgValue(OPT_fbinutils_version_EQ)); diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 6b1fcac75ac2b..ca37e0661476d 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -876,6 +876,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, // Set the shared objects, these are reset when we finish processing the // file, otherwise the CompilerInstance will happily destroy them. + CI.setVirtualFileSystem(AST->getFileManager().getVirtualFileSystemPtr()); CI.setFileManager(AST->getFileManagerPtr()); CI.createSourceManager(CI.getFileManager()); CI.getSourceManager().initializeForReplay(AST->getSourceManager()); @@ -966,7 +967,9 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, return true; } - // Set up the file and source managers, if needed. + // Set up the file system, file and source managers, if needed. 
+ if (!CI.hasVirtualFileSystem()) + CI.createVirtualFileSystem(); if (!CI.hasFileManager()) { if (!CI.createFileManager()) { return false; diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index e65c8b4f6facf..edf0a091e087c 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -766,7 +766,7 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts, Builder.defineMacro("__cpp_pack_indexing", "202311L"); Builder.defineMacro("__cpp_deleted_function", "202403L"); Builder.defineMacro("__cpp_variadic_friend", "202403L"); - // Builder.defineMacro("__cpp_trivial_relocatability", "202502L"); + Builder.defineMacro("__cpp_trivial_relocatability", "202502L"); if (LangOpts.Char8) Builder.defineMacro("__cpp_char8_t", "202207L"); diff --git a/clang/lib/Frontend/MultiplexConsumer.cpp b/clang/lib/Frontend/MultiplexConsumer.cpp index 3fd3c9bd69037..f5f8848798a35 100644 --- a/clang/lib/Frontend/MultiplexConsumer.cpp +++ b/clang/lib/Frontend/MultiplexConsumer.cpp @@ -107,6 +107,8 @@ class MultiplexASTMutationListener : public ASTMutationListener { void ResolvedOperatorDelete(const CXXDestructorDecl *DD, const FunctionDecl *Delete, Expr *ThisArg) override; + void ResolvedOperatorGlobDelete(const CXXDestructorDecl *DD, + const FunctionDecl *GlobDelete) override; void CompletedImplicitDefinition(const FunctionDecl *D) override; void InstantiationRequested(const ValueDecl *D) override; void VariableDefinitionInstantiated(const VarDecl *D) override; @@ -184,6 +186,11 @@ void MultiplexASTMutationListener::ResolvedOperatorDelete( for (auto *L : Listeners) L->ResolvedOperatorDelete(DD, Delete, ThisArg); } +void MultiplexASTMutationListener::ResolvedOperatorGlobDelete( + const CXXDestructorDecl *DD, const FunctionDecl *GlobDelete) { + for (auto *L : Listeners) + L->ResolvedOperatorGlobDelete(DD, GlobDelete); +} void MultiplexASTMutationListener::CompletedImplicitDefinition( const 
FunctionDecl *D) { for (size_t i = 0, e = Listeners.size(); i != e; ++i) diff --git a/clang/lib/Frontend/Rewrite/FrontendActions.cpp b/clang/lib/Frontend/Rewrite/FrontendActions.cpp index 6c9c9d5b5c8d3..f5656b3b190e9 100644 --- a/clang/lib/Frontend/Rewrite/FrontendActions.cpp +++ b/clang/lib/Frontend/Rewrite/FrontendActions.cpp @@ -245,8 +245,8 @@ class RewriteIncludesAction::RewriteImportsListener : public ASTReaderListener { CompilerInstance Instance( std::make_shared(CI.getInvocation()), CI.getPCHContainerOperations(), &CI.getModuleCache()); + Instance.setVirtualFileSystem(CI.getVirtualFileSystemPtr()); Instance.createDiagnostics( - CI.getVirtualFileSystem(), new ForwardingDiagnosticConsumer(CI.getDiagnosticClient()), /*ShouldOwnClient=*/true); Instance.getFrontendOpts().DisableFree = false; diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 9a6844d5f7d40..8b0ab2eb37189 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -210,6 +210,8 @@ CreateFrontendAction(CompilerInstance &CI) { } bool ExecuteCompilerInvocation(CompilerInstance *Clang) { + unsigned NumErrorsBefore = Clang->getDiagnostics().getNumErrors(); + // Honor -help. if (Clang->getFrontendOpts().ShowHelp) { driver::getDriverOptTable().printHelp( @@ -292,9 +294,12 @@ bool ExecuteCompilerInvocation(CompilerInstance *Clang) { } #endif - // If there were errors in processing arguments, don't do anything else. - if (Clang->getDiagnostics().hasErrorOccurred()) + // If there were errors in the above, don't do anything else. + // This intentionally ignores errors emitted before this function to + // accommodate lenient callers that decided to make progress despite errors. + if (Clang->getDiagnostics().getNumErrors() != NumErrorsBefore) return false; + // Create and execute the frontend action. 
std::unique_ptr Act(CreateFrontendAction(*Clang)); if (!Act) diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index fc12a9bf15e57..e35c159fec7fd 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -2060,8 +2060,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) /// \param imm /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. -#define _mm256_slli_si256(a, imm) \ - ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) +#define _mm256_slli_si256(a, imm) \ + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \ + (int)(imm))) /// Shifts each 128-bit half of the 256-bit integer vector \a a left by /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm @@ -2080,8 +2081,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) /// \param imm /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. -#define _mm256_bslli_epi128(a, imm) \ - ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) +#define _mm256_bslli_epi128(a, imm) \ + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \ + (int)(imm))) /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a /// left by \a __count bits, shifting in zero bits, and returns the result. @@ -2299,8 +2301,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) /// \param imm /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. 
-#define _mm256_srli_si256(a, imm) \ - ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) +#define _mm256_srli_si256(a, imm) \ + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \ + (int)(imm))) /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by /// \a imm bytes, shifting in zero bytes, and returns the result. If @@ -2319,8 +2322,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) /// \param imm /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. -#define _mm256_bsrli_epi128(a, imm) \ - ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) +#define _mm256_bsrli_epi128(a, imm) \ + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \ + (int)(imm))) /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a /// right by \a __count bits, shifting in zero bits, and returns the result. diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 42fce7d89e1bb..599cfbe479676 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -464,17 +464,15 @@ _mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_blend_epi8(__mmask64 __U, __m512i __A, __m512i __W) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, (__v64qi) __W, (__v64qi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_blend_epi16(__mmask32 __U, __m512i __A, __m512i __W) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, (__v32hi) __W, (__v32hi) 
__A); @@ -1461,8 +1459,9 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { (__v32hi)_mm512_setzero_si512()); } -#define _mm512_bslli_epi128(a, imm) \ - ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) +#define _mm512_bslli_epi128(a, imm) \ + ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v64qi)(__m512i)(a), \ + (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_srlv_epi16(__m512i __A, __m512i __B) @@ -1592,8 +1591,9 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { (__v32hi)_mm512_setzero_si512()); } -#define _mm512_bsrli_epi128(a, imm) \ - ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) +#define _mm512_bsrli_epi128(a, imm) \ + ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a), \ + (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 7ba09039cd826..8ebfb75170e17 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -3209,33 +3209,29 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, /* Vector Blend */ -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) { return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, (__v8df) __W, (__v8df) __A); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) { return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, (__v16sf) __W, (__v16sf) __A); } -static __inline __m512i __DEFAULT_FN_ATTRS512 
-_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) { return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, (__v8di) __W, (__v8di) __A); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) { return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, (__v16si) __W, (__v16si) __A); diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index d30b49e552e1b..4bd798129a25d 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -3309,7 +3309,7 @@ _mm512_reduce_min_ph(__m512h __V) { return __builtin_ia32_reduce_fmin_ph512(__V); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, (__v32hf)__A); diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 6e3efa7b3562c..f1cd71af05ab5 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -452,33 +452,29 @@ _mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) { return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, (__v16qi) __W, (__v16qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W) -{ +static 
__inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) { return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, (__v32qi) __W, (__v32qi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, (__v8hi) __W, (__v8hi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) { return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, (__v16hi) __W, (__v16hi) __A); diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index 8eb31eae6173b..ec766e31c6769 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -1995,14 +1995,13 @@ _mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { (__v8sf)__C, (__mmask8)__U); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, - __m128h __A, - __m128h __W) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, (__v8hf)__A); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, (__v16hf)__A); diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index d85ea23d5ee5a..5f5a54e7284c1 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ 
b/clang/lib/Headers/avx512vlintrin.h @@ -1498,57 +1498,57 @@ _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U, (__v4si) __W, (__v4si) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_blend_epi32(__mmask8 __U, __m256i __A, __m256i __W) { return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, (__v8si) __W, (__v8si) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_blend_pd(__mmask8 __U, __m128d __A, __m128d __W) { return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, (__v2df) __W, (__v2df) __A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) { +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_blend_pd(__mmask8 __U, __m256d __A, __m256d __W) { return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, (__v4df) __W, (__v4df) __A); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_blend_ps(__mmask8 __U, __m128 __A, __m128 __W) { return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, (__v4sf) __W, (__v4sf) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) { +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_blend_ps(__mmask8 __U, __m256 
__A, __m256 __W) { return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, (__v8sf) __W, (__v8sf) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_blend_epi64(__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, (__v2di) __W, (__v2di) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_blend_epi64(__mmask8 __U, __m256i __A, __m256i __W) { return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, (__v4di) __W, (__v4di) __A); diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index e4fbe011239d6..12260ec6ea14c 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2745,11 +2745,11 @@ _mm_xor_si128(__m128i __a, __m128i __b) { /// \a a. /// \returns A 128-bit integer vector containing the left-shifted value. #define _mm_slli_si128(a, imm) \ - ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) #define _mm_bslli_si128(a, imm) \ - ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) /// Left-shifts each 16-bit value in the 128-bit integer vector operand @@ -2954,11 +2954,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, /// \a a. /// \returns A 128-bit integer vector containing the right-shifted value. 
#define _mm_srli_si128(a, imm) \ - ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) #define _mm_bsrli_si128(a, imm) \ - ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) /// Right-shifts each of 16-bit values in the 128-bit integer vector diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h index ede67afada766..83965334e2c9b 100644 --- a/clang/lib/Headers/f16cintrin.h +++ b/clang/lib/Headers/f16cintrin.h @@ -20,6 +20,14 @@ #define __DEFAULT_FN_ATTRS256 \ __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + /* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h, * but that's because icc can emulate these without f16c using a library call. * Since we don't do that let's leave these in f16cintrin.h. @@ -35,7 +43,7 @@ /// \param __a /// A 16-bit half-precision float value. /// \returns The converted 32-bit float value. -static __inline float __DEFAULT_FN_ATTRS128 +static __inline float __DEFAULT_FN_ATTRS128_CONSTEXPR _cvtsh_ss(unsigned short __a) { return (float)__builtin_bit_cast(__fp16, __a); @@ -104,7 +112,7 @@ _cvtsh_ss(unsigned short __a) /// A 128-bit vector containing 16-bit half-precision float values. The lower /// 64 bits are used in the conversion. /// \returns A 128-bit vector of [4 x float] containing converted float values. 
-static __inline __m128 __DEFAULT_FN_ATTRS128 +static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_cvtph_ps(__m128i __a) { typedef __fp16 __v4fp16 __attribute__((__vector_size__(8))); @@ -151,7 +159,7 @@ _mm_cvtph_ps(__m128i __a) /// converted to 32-bit single-precision float values. /// \returns A vector of [8 x float] containing the converted 32-bit /// single-precision float values. -static __inline __m256 __DEFAULT_FN_ATTRS256 +static __inline __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtph_ps(__m128i __a) { typedef __fp16 __v8fp16 __attribute__((__vector_size__(16), __aligned__(16))); @@ -161,5 +169,7 @@ _mm256_cvtph_ps(__m128i __a) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __F16CINTRIN_H */ diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h index 588c283cbdfba..210ed0c1f773b 100644 --- a/clang/lib/Headers/intrin.h +++ b/clang/lib/Headers/intrin.h @@ -30,6 +30,10 @@ #include #endif +#if defined(__ARM_ACLE) +#include +#endif + /* For the definition of jmp_buf. */ #if __STDC_HOSTED__ #include diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 6e7107e36ea79..d40f0c56b2c5a 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -175,11 +175,12 @@ _mm_abs_epi32(__m128i __a) { /// An immediate operand specifying how many bytes to right-shift the result. /// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. 
-#define _mm_alignr_pi8(a, b, n) \ - ((__m64)__builtin_shufflevector( \ - __builtin_ia32_psrldqi128_byteshift( \ - __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ - (n)), __extension__ (__v2di){}, 0)) +#define _mm_alignr_pi8(a, b, n) \ + ((__m64)__builtin_shufflevector( \ + (__v2di)__builtin_ia32_psrldqi128_byteshift( \ + (__v16qi)__builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ + (n)), \ + __extension__(__v2di){}, 0)) /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 6b70f245e2564..4891e3ce077b5 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -2520,8 +2520,8 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) // If there's a risk of spurious trap due to a 128-bit write, back up the // pointer by 8 bytes and shift values in registers to match. __p -= 8; - __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8); - __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8); + __d128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__d128, 8); + __n128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__n128, 8); } __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p); diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp index b0eb7d0e9f072..45620fcd358c8 100644 --- a/clang/lib/Interpreter/IncrementalExecutor.cpp +++ b/clang/lib/Interpreter/IncrementalExecutor.cpp @@ -172,7 +172,8 @@ createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC, llvm::Expected, uint32_t>> IncrementalExecutor::launchExecutor(llvm::StringRef ExecutablePath, bool UseSharedMemory, - unsigned SlabAllocateSize) { + unsigned SlabAllocateSize, + std::function CustomizeFork) { #ifndef LLVM_ON_UNIX // FIXME: Add support for Windows. 
return llvm::make_error( @@ -215,6 +216,9 @@ IncrementalExecutor::launchExecutor(llvm::StringRef ExecutablePath, close(ToExecutor[WriteEnd]); close(FromExecutor[ReadEnd]); + if (CustomizeFork) + CustomizeFork(); + // Execute the child process. std::unique_ptr ExecutorPath, FDSpecifier; { diff --git a/clang/lib/Interpreter/IncrementalExecutor.h b/clang/lib/Interpreter/IncrementalExecutor.h index d091535166770..bb1ec33452515 100644 --- a/clang/lib/Interpreter/IncrementalExecutor.h +++ b/clang/lib/Interpreter/IncrementalExecutor.h @@ -79,7 +79,8 @@ class IncrementalExecutor { static llvm::Expected< std::pair, uint32_t>> launchExecutor(llvm::StringRef ExecutablePath, bool UseSharedMemory, - unsigned SlabAllocateSize); + unsigned SlabAllocateSize, + std::function CustomizeFork = nullptr); #if LLVM_ON_UNIX && LLVM_ENABLE_THREADS static llvm::Expected> diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index 043e0c1e5754e..9cc1c450b7650 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -107,8 +107,10 @@ CreateCI(const llvm::opt::ArgStringList &Argv) { Clang->getHeaderSearchOpts().ResourceDir = CompilerInvocation::GetResourcesPath(Argv[0], nullptr); + Clang->createVirtualFileSystem(); + // Create the actual diagnostics engine. - Clang->createDiagnostics(*llvm::vfs::getRealFileSystem()); + Clang->createDiagnostics(); if (!Clang->hasDiagnostics()) return llvm::createStringError(llvm::errc::not_supported, "Initialization failed. " @@ -355,7 +357,8 @@ Interpreter::outOfProcessJITBuilder(JITConfig Config) { if (!Config.OOPExecutor.empty()) { // Launch an out-of-process executor locally in a child process. 
auto ResultOrErr = IncrementalExecutor::launchExecutor( - Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize); + Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize, + Config.CustomizeFork); if (!ResultOrErr) return ResultOrErr.takeError(); childPid = ResultOrErr->second; @@ -474,7 +477,8 @@ Interpreter::createWithCUDA(std::unique_ptr CI, std::make_unique( llvm::vfs::getRealFileSystem()); OverlayVFS->pushOverlay(IMVFS); - CI->createFileManager(OverlayVFS); + CI->createVirtualFileSystem(OverlayVFS); + CI->createFileManager(); llvm::Expected> InterpOrErr = Interpreter::create(std::move(CI)); @@ -647,6 +651,8 @@ llvm::Error Interpreter::CreateExecutor(JITConfig Config) { auto JTMB = createJITTargetMachineBuilder(TT); if (!JTMB) return JTMB.takeError(); + if (Config.CM) + JTMB->setCodeModel(Config.CM); auto JB = IncrementalExecutor::createDefaultJITBuilder(std::move(*JTMB)); if (!JB) return JB.takeError(); diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 10355bb874762..22c01c4e371f3 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -6007,10 +6007,9 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide, // A C++11 attribute here signals that we have a constructor, and is an // attribute on the first constructor parameter. - if (getLangOpts().CPlusPlus11 && - isCXX11AttributeSpecifier(/*Disambiguate*/ false, - /*OuterMightBeMessageSend*/ true) != - CXX11AttributeKind::NotAttributeSpecifier) { + if (isCXX11AttributeSpecifier(/*Disambiguate=*/false, + /*OuterMightBeMessageSend=*/true) != + CXX11AttributeKind::NotAttributeSpecifier) { return true; } @@ -6814,10 +6813,10 @@ void Parser::ParseDirectDeclarator(Declarator &D) { bool IsFunctionDeclaration = D.isFunctionDeclaratorAFunctionDeclaration(); // Enter function-declaration scope, limiting any declarators to the // function prototype scope, including parameter declarators. 
- ParseScope PrototypeScope(this, - Scope::FunctionPrototypeScope|Scope::DeclScope| - (IsFunctionDeclaration - ? Scope::FunctionDeclarationScope : 0)); + ParseScope PrototypeScope( + this, Scope::FunctionPrototypeScope | Scope::DeclScope | + (IsFunctionDeclaration ? Scope::FunctionDeclarationScope + : Scope::NoScope)); // The paren may be part of a C++ direct initializer, eg. "int x(1);". // In such a case, check if we actually have a function declarator; if it @@ -7098,8 +7097,9 @@ void Parser::ParseParenDeclarator(Declarator &D) { // function prototype scope, including parameter declarators. ParseScope PrototypeScope(this, Scope::FunctionPrototypeScope | Scope::DeclScope | - (D.isFunctionDeclaratorAFunctionDeclaration() - ? Scope::FunctionDeclarationScope : 0)); + (D.isFunctionDeclaratorAFunctionDeclaration() + ? Scope::FunctionDeclarationScope + : Scope::NoScope)); ParseFunctionDeclarator(D, attrs, T, false, RequiresArg); PrototypeScope.Exit(); } diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index 291c70e7bad4b..a64fb02294c5a 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -3295,9 +3295,9 @@ void Parser::ParseLexedObjCMethodDefs(LexedMethod &LM, bool parseMethod) { assert(Tok.isOneOf(tok::l_brace, tok::kw_try, tok::colon) && "Inline objective-c method not starting with '{' or 'try' or ':'"); // Enter a scope for the method or c-function body. - ParseScope BodyScope(this, (parseMethod ? Scope::ObjCMethodScope : 0) | - Scope::FnScope | Scope::DeclScope | - Scope::CompoundStmtScope); + ParseScope BodyScope( + this, (parseMethod ? 
Scope::ObjCMethodScope : Scope::NoScope) | + Scope::FnScope | Scope::DeclScope | Scope::CompoundStmtScope); Sema::FPFeaturesStateRAII SaveFPFeatures(Actions); // Tell the actions module that we have entered a method or c-function definition diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 5db2f2e2ccf86..7dceb2d208352 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -3083,7 +3083,6 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective); break; case OMPC_fail: - case OMPC_default: case OMPC_proc_bind: case OMPC_atomic_default_mem_order: case OMPC_at: @@ -3115,6 +3114,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_schedule: case OMPC_dist_schedule: case OMPC_defaultmap: + case OMPC_default: case OMPC_order: // OpenMP [2.7.1, Restrictions, p. 3] // Only one schedule clause can appear on a loop directive. @@ -3734,6 +3734,32 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, ConsumeAnyToken(); if (Arg.back() == OMPC_DIST_SCHEDULE_static && Tok.is(tok::comma)) DelimLoc = ConsumeAnyToken(); + } else if (Kind == OMPC_default) { + // Get a default modifier + unsigned Modifier = getOpenMPSimpleClauseType( + Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts()); + + Arg.push_back(Modifier); + KLoc.push_back(Tok.getLocation()); + if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) && + Tok.isNot(tok::annot_pragma_openmp_end)) + ConsumeAnyToken(); + // Parse ':' + if (Tok.is(tok::colon) && getLangOpts().OpenMP >= 60) { + ConsumeAnyToken(); + // Get a variable-category attribute for default clause modifier + OpenMPDefaultClauseVariableCategory VariableCategory = + getOpenMPDefaultVariableCategory( + Tok.isAnnotation() ? 
"" : PP.getSpelling(Tok), getLangOpts()); + Arg.push_back(VariableCategory); + KLoc.push_back(Tok.getLocation()); + if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) && + Tok.isNot(tok::annot_pragma_openmp_end)) + ConsumeAnyToken(); + } else { + Arg.push_back(OMPC_DEFAULT_VC_all); + KLoc.push_back(SourceLocation()); + } } else if (Kind == OMPC_defaultmap) { // Get a defaultmap modifier unsigned Modifier = getOpenMPSimpleClauseType( @@ -3932,6 +3958,18 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, if (NeedAnExpression && Val.isInvalid()) return nullptr; + if (Kind == OMPC_default && getLangOpts().OpenMP < 51 && Arg[0] && + (static_cast(Arg[0]) == OMP_DEFAULT_private || + static_cast(Arg[0]) == OMP_DEFAULT_firstprivate)) { + Diag(KLoc[0], diag::err_omp_invalid_dsa) + << getOpenMPClauseName(static_cast(Arg[0]) == + OMP_DEFAULT_private + ? OMPC_private + : OMPC_firstprivate) + << getOpenMPClauseName(OMPC_default) << "5.1"; + return nullptr; + } + if (ParseOnly) return nullptr; return Actions.OpenMP().ActOnOpenMPSingleExprWithArgClause( diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 62361c066a3f3..2e7af1219547e 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -2529,9 +2529,9 @@ StmtResult Parser::ParseCXXTryBlockCommon(SourceLocation TryLoc, bool FnTry) { return StmtError(Diag(Tok, diag::err_expected) << tok::l_brace); StmtResult TryBlock(ParseCompoundStatement( - /*isStmtExpr=*/false, Scope::DeclScope | Scope::TryScope | - Scope::CompoundStmtScope | - (FnTry ? Scope::FnTryCatchScope : 0))); + /*isStmtExpr=*/false, + Scope::DeclScope | Scope::TryScope | Scope::CompoundStmtScope | + (FnTry ? 
Scope::FnTryCatchScope : Scope::NoScope))); if (TryBlock.isInvalid()) return TryBlock; @@ -2593,9 +2593,9 @@ StmtResult Parser::ParseCXXCatchBlock(bool FnCatch) { // C++ 3.3.2p3: // The name in a catch exception-declaration is local to the handler and // shall not be redeclared in the outermost block of the handler. - ParseScope CatchScope(this, Scope::DeclScope | Scope::ControlScope | - Scope::CatchScope | - (FnCatch ? Scope::FnTryCatchScope : 0)); + ParseScope CatchScope( + this, Scope::DeclScope | Scope::ControlScope | Scope::CatchScope | + (FnCatch ? Scope::FnTryCatchScope : Scope::NoScope)); // exception-declaration is equivalent to '...' or a parameter-declaration // without default arguments. diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp index ecf9cfde8aa72..5eafd03d89efe 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp @@ -17,6 +17,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/Expr.h" +#include "clang/AST/Stmt.h" #include "clang/AST/Type.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" @@ -48,6 +49,14 @@ static FunctionDecl *lookupBuiltinFunction(Sema &S, StringRef Name) { "Since this is a builtin it should always resolve!"); return cast(R.getFoundDecl()); } + +CXXConstructorDecl *lookupCopyConstructor(QualType ResTy) { + assert(ResTy->isRecordType() && "not a CXXRecord type"); + for (auto *CD : ResTy->getAsCXXRecordDecl()->ctors()) + if (CD->isCopyConstructor()) + return CD; + return nullptr; +} } // namespace // Builder for template arguments of builtin types. 
Used internally @@ -580,6 +589,23 @@ BuiltinTypeMethodBuilder &BuiltinTypeMethodBuilder::returnValue(T ReturnValue) { Expr *ReturnValueExpr = convertPlaceholder(ReturnValue); ASTContext &AST = DeclBuilder.SemaRef.getASTContext(); + + QualType Ty = ReturnValueExpr->getType(); + if (Ty->isRecordType()) { + // For record types, create a call to copy constructor to ensure proper copy + // semantics. + auto *ICE = + ImplicitCastExpr::Create(AST, Ty.withConst(), CK_NoOp, ReturnValueExpr, + nullptr, VK_XValue, FPOptionsOverride()); + CXXConstructorDecl *CD = lookupCopyConstructor(Ty); + assert(CD && "no copy constructor found"); + ReturnValueExpr = CXXConstructExpr::Create( + AST, Ty, SourceLocation(), CD, /*Elidable=*/false, {ICE}, + /*HadMultipleCandidates=*/false, /*ListInitialization=*/false, + /*StdInitListInitialization=*/false, + /*ZeroInitListInitialization=*/false, CXXConstructionKind::Complete, + SourceRange()); + } StmtsList.push_back( ReturnStmt::Create(AST, SourceLocation(), ReturnValueExpr, nullptr)); return *this; @@ -735,49 +761,6 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addDefaultHandleConstructor() { .finalize(); } -BuiltinTypeDeclBuilder & -BuiltinTypeDeclBuilder::addHandleConstructorFromBinding() { - if (Record->isCompleteDefinition()) - return *this; - - using PH = BuiltinTypeMethodBuilder::PlaceHolder; - ASTContext &AST = SemaRef.getASTContext(); - QualType HandleType = getResourceHandleField()->getType(); - - return BuiltinTypeMethodBuilder(*this, "", AST.VoidTy, false, true) - .addParam("registerNo", AST.UnsignedIntTy) - .addParam("spaceNo", AST.UnsignedIntTy) - .addParam("range", AST.IntTy) - .addParam("index", AST.UnsignedIntTy) - .addParam("name", AST.getPointerType(AST.CharTy.withConst())) - .callBuiltin("__builtin_hlsl_resource_handlefrombinding", HandleType, - PH::Handle, PH::_0, PH::_1, PH::_2, PH::_3, PH::_4) - .assign(PH::Handle, PH::LastStmt) - .finalize(); -} - -BuiltinTypeDeclBuilder & 
-BuiltinTypeDeclBuilder::addHandleConstructorFromImplicitBinding() { - if (Record->isCompleteDefinition()) - return *this; - - using PH = BuiltinTypeMethodBuilder::PlaceHolder; - ASTContext &AST = SemaRef.getASTContext(); - QualType HandleType = getResourceHandleField()->getType(); - - return BuiltinTypeMethodBuilder(*this, "", AST.VoidTy, false, true) - .addParam("spaceNo", AST.UnsignedIntTy) - .addParam("range", AST.IntTy) - .addParam("index", AST.UnsignedIntTy) - .addParam("orderId", AST.UnsignedIntTy) - .addParam("name", AST.getPointerType(AST.CharTy.withConst())) - .callBuiltin("__builtin_hlsl_resource_handlefromimplicitbinding", - HandleType, PH::Handle, PH::_3, PH::_0, PH::_1, PH::_2, - PH::_4) - .assign(PH::Handle, PH::LastStmt) - .finalize(); -} - // Adds static method that initializes resource from binding: // // static Resource __createFromBinding(unsigned registerNo, diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h index b898417e9fe14..9448af13530cb 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h @@ -78,8 +78,6 @@ class BuiltinTypeDeclBuilder { // Builtin types constructors BuiltinTypeDeclBuilder &addDefaultHandleConstructor(); - BuiltinTypeDeclBuilder &addHandleConstructorFromBinding(); - BuiltinTypeDeclBuilder &addHandleConstructorFromImplicitBinding(); BuiltinTypeDeclBuilder &addCopyConstructor(); BuiltinTypeDeclBuilder &addCopyAssignmentOperator(); diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 3386d8da281e9..781f0445d0b61 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -135,9 +135,7 @@ static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S, .addCopyConstructor() .addCopyAssignmentOperator() .addCreateFromBinding() - .addCreateFromImplicitBinding() - .addHandleConstructorFromBinding() - 
.addHandleConstructorFromImplicitBinding(); + .addCreateFromImplicitBinding(); } // This function is responsible for constructing the constraint expression for diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp index 6d79f3feeaace..a7fe4ea28b72c 100644 --- a/clang/lib/Sema/HeuristicResolver.cpp +++ b/clang/lib/Sema/HeuristicResolver.cpp @@ -13,7 +13,6 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" -#include "llvm/ADT/identity.h" namespace clang { @@ -36,14 +35,13 @@ class HeuristicResolverImpl { resolveMemberExpr(const CXXDependentScopeMemberExpr *ME); std::vector resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE); - std::vector resolveTypeOfCallExpr(const CallExpr *CE); std::vector resolveCalleeOfCallExpr(const CallExpr *CE); std::vector resolveUsingValueDecl(const UnresolvedUsingValueDecl *UUVD); std::vector resolveDependentNameType(const DependentNameType *DNT); - std::vector resolveTemplateSpecializationType( - const DependentTemplateSpecializationType *DTST); + std::vector + resolveTemplateSpecializationType(const TemplateSpecializationType *TST); QualType resolveNestedNameSpecifierToType(NestedNameSpecifier NNS); QualType getPointeeType(QualType T); std::vector @@ -51,6 +49,7 @@ class HeuristicResolverImpl { llvm::function_ref Filter); TagDecl *resolveTypeToTagDecl(QualType T); QualType simplifyType(QualType Type, const Expr *E, bool UnwrapPointer); + QualType resolveExprToType(const Expr *E); FunctionProtoTypeLoc getFunctionProtoTypeLoc(const Expr *Fn); private: @@ -72,10 +71,8 @@ class HeuristicResolverImpl { resolveDependentMember(QualType T, DeclarationName Name, llvm::function_ref Filter); - // Try to heuristically resolve the type of a possibly-dependent expression - // `E`. 
- QualType resolveExprToType(const Expr *E); std::vector resolveExprToDecls(const Expr *E); + QualType resolveTypeOfCallExpr(const CallExpr *CE); bool findOrdinaryMemberInDependentClasses(const CXXBaseSpecifier *Specifier, CXXBasePath &Path, @@ -97,18 +94,25 @@ const auto TemplateFilter = [](const NamedDecl *D) { return isa(D); }; -QualType resolveDeclsToType(const std::vector &Decls, - ASTContext &Ctx) { - if (Decls.size() != 1) // Names an overload set -- just bail. - return QualType(); - if (const auto *TD = dyn_cast(Decls[0])) +QualType resolveDeclToType(const NamedDecl *D, ASTContext &Ctx) { + if (const auto *TempD = dyn_cast(D)) { + D = TempD->getTemplatedDecl(); + } + if (const auto *TD = dyn_cast(D)) return Ctx.getCanonicalTypeDeclType(TD); - if (const auto *VD = dyn_cast(Decls[0])) { + if (const auto *VD = dyn_cast(D)) { return VD->getType(); } return QualType(); } +QualType resolveDeclsToType(const std::vector &Decls, + ASTContext &Ctx) { + if (Decls.size() != 1) // Names an overload set -- just bail. 
+ return QualType(); + return resolveDeclToType(Decls[0], Ctx); +} + TemplateName getReferencedTemplateName(const Type *T) { if (const auto *TST = T->getAs()) { return TST->getTemplateName(); @@ -330,19 +334,29 @@ HeuristicResolverImpl::resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE) { return resolveDependentMember(Qualifier, RE->getDeclName(), StaticFilter); } -std::vector -HeuristicResolverImpl::resolveTypeOfCallExpr(const CallExpr *CE) { - QualType CalleeType = resolveExprToType(CE->getCallee()); - if (CalleeType.isNull()) - return {}; - if (const auto *FnTypePtr = CalleeType->getAs()) - CalleeType = FnTypePtr->getPointeeType(); - if (const FunctionType *FnType = CalleeType->getAs()) { - if (const auto *D = resolveTypeToTagDecl(FnType->getReturnType())) { - return {D}; +QualType HeuristicResolverImpl::resolveTypeOfCallExpr(const CallExpr *CE) { + // resolveExprToType(CE->getCallee()) would bail in the case of multiple + // overloads, as it can't produce a single type for them. We can be more + // permissive here, and allow multiple overloads with a common return type. 
+ std::vector CalleeDecls = + resolveExprToDecls(CE->getCallee()); + QualType CommonReturnType; + for (const NamedDecl *CalleeDecl : CalleeDecls) { + QualType CalleeType = resolveDeclToType(CalleeDecl, Ctx); + if (CalleeType.isNull()) + continue; + if (const auto *FnTypePtr = CalleeType->getAs()) + CalleeType = FnTypePtr->getPointeeType(); + if (const FunctionType *FnType = CalleeType->getAs()) { + QualType ReturnType = + simplifyType(FnType->getReturnType(), nullptr, false); + if (!CommonReturnType.isNull() && CommonReturnType != ReturnType) { + return {}; // conflicting return types + } + CommonReturnType = ReturnType; } } - return {}; + return CommonReturnType; } std::vector @@ -374,8 +388,9 @@ HeuristicResolverImpl::resolveDependentNameType(const DependentNameType *DNT) { std::vector HeuristicResolverImpl::resolveTemplateSpecializationType( - const DependentTemplateSpecializationType *DTST) { - const DependentTemplateStorage &DTN = DTST->getDependentTemplateName(); + const TemplateSpecializationType *TST) { + const DependentTemplateStorage &DTN = + *TST->getTemplateName().getAsDependentTemplateName(); return resolveDependentMember( resolveNestedNameSpecifierToType(DTN.getQualifier()), DTN.getName().getIdentifier(), TemplateFilter); @@ -393,15 +408,41 @@ HeuristicResolverImpl::resolveExprToDecls(const Expr *E) { return {OE->decls_begin(), OE->decls_end()}; } if (const auto *CE = dyn_cast(E)) { - return resolveTypeOfCallExpr(CE); + QualType T = resolveTypeOfCallExpr(CE); + if (const auto *D = resolveTypeToTagDecl(T)) { + return {D}; + } + return {}; } if (const auto *ME = dyn_cast(E)) return {ME->getMemberDecl()}; + if (const auto *DRE = dyn_cast(E)) + return {DRE->getDecl()}; return {}; } QualType HeuristicResolverImpl::resolveExprToType(const Expr *E) { + // resolveExprToDecls on a CallExpr only succeeds if the return type is + // a TagDecl, but we may want the type of a call in other cases as well. 
+ // (FIXME: There are probably other cases where we can do something more + // flexible than resoveExprToDecls + resolveDeclsToType, e.g. in the case + // of OverloadExpr we can probably accept overloads with a common type). + if (const auto *CE = dyn_cast(E)) { + if (QualType Resolved = resolveTypeOfCallExpr(CE); !Resolved.isNull()) + return Resolved; + } + // Similarly, unwrapping a unary dereference operation does not work via + // resolveExprToDecls. + if (const auto *UO = dyn_cast(E->IgnoreParenCasts())) { + if (UO->getOpcode() == UnaryOperatorKind::UO_Deref) { + if (auto Pointee = getPointeeType(resolveExprToType(UO->getSubExpr())); + !Pointee.isNull()) { + return Pointee; + } + } + } + std::vector Decls = resolveExprToDecls(E); if (!Decls.empty()) return resolveDeclsToType(Decls, Ctx); @@ -562,7 +603,7 @@ HeuristicResolverImpl::getFunctionProtoTypeLoc(const Expr *Fn) { // In some edge cases the AST can contain a "trivial" FunctionProtoTypeLoc // which has null parameters. Avoid these as they don't contain useful // information. 
- if (llvm::all_of(F.getParams(), llvm::identity())) + if (!llvm::is_contained(F.getParams(), nullptr)) return F; } @@ -580,10 +621,6 @@ std::vector HeuristicResolver::resolveDeclRefExpr( return HeuristicResolverImpl(Ctx).resolveDeclRefExpr(RE); } std::vector -HeuristicResolver::resolveTypeOfCallExpr(const CallExpr *CE) const { - return HeuristicResolverImpl(Ctx).resolveTypeOfCallExpr(CE); -} -std::vector HeuristicResolver::resolveCalleeOfCallExpr(const CallExpr *CE) const { return HeuristicResolverImpl(Ctx).resolveCalleeOfCallExpr(CE); } @@ -597,8 +634,8 @@ std::vector HeuristicResolver::resolveDependentNameType( } std::vector HeuristicResolver::resolveTemplateSpecializationType( - const DependentTemplateSpecializationType *DTST) const { - return HeuristicResolverImpl(Ctx).resolveTemplateSpecializationType(DTST); + const TemplateSpecializationType *TST) const { + return HeuristicResolverImpl(Ctx).resolveTemplateSpecializationType(TST); } QualType HeuristicResolver::resolveNestedNameSpecifierToType( NestedNameSpecifier NNS) const { @@ -619,7 +656,9 @@ QualType HeuristicResolver::simplifyType(QualType Type, const Expr *E, bool UnwrapPointer) { return HeuristicResolverImpl(Ctx).simplifyType(Type, E, UnwrapPointer); } - +QualType HeuristicResolver::resolveExprToType(const Expr *E) const { + return HeuristicResolverImpl(Ctx).resolveExprToType(E); +} FunctionProtoTypeLoc HeuristicResolver::getFunctionProtoTypeLoc(const Expr *Fn) const { return HeuristicResolverImpl(Ctx).getFunctionProtoTypeLoc(Fn); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index baba503239e9f..bb98a39948fce 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -100,7 +100,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_bf16_bf6: case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6: case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6: - return 
SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 7); + return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 15); case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B: case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B: case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B: diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp index 4cc1b76264340..99a29add8211d 100644 --- a/clang/lib/Sema/SemaAPINotes.cpp +++ b/clang/lib/Sema/SemaAPINotes.cpp @@ -13,6 +13,7 @@ #include "CheckExprLifetime.h" #include "TypeLocBuilder.h" #include "clang/APINotes/APINotesReader.h" +#include "clang/APINotes/Types.h" #include "clang/AST/Decl.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" @@ -291,6 +292,29 @@ static void ProcessAPINotes(Sema &S, Decl *D, }); } + // swift_safety + if (auto SafetyKind = Info.getSwiftSafety()) { + bool Addition = *SafetyKind != api_notes::SwiftSafetyKind::Unspecified; + handleAPINotedAttribute( + S, D, Addition, Metadata, + [&] { + return SwiftAttrAttr::Create( + S.Context, *SafetyKind == api_notes::SwiftSafetyKind::Safe + ? 
"safe" + : "unsafe"); + }, + [](const Decl *D) { + return llvm::find_if(D->attrs(), [](const Attr *attr) { + if (const auto *swiftAttr = dyn_cast(attr)) { + if (swiftAttr->getAttribute() == "safe" || + swiftAttr->getAttribute() == "unsafe") + return true; + } + return false; + }); + }); + } + // swift_name if (!Info.SwiftName.empty()) { handleAPINotedAttribute( diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp index 3eed6ad7fe6b3..8411a3da8322d 100644 --- a/clang/lib/Sema/SemaAttr.cpp +++ b/clang/lib/Sema/SemaAttr.cpp @@ -157,8 +157,8 @@ void Sema::inferGslPointerAttribute(TypedefNameDecl *TD) { if (auto *TST = dyn_cast(Canonical.getTypePtr())) { - RD = dyn_cast_or_null( - TST->getTemplateName().getAsTemplateDecl()->getTemplatedDecl()); + if (const auto *TD = TST->getTemplateName().getAsTemplateDecl()) + RD = dyn_cast_or_null(TD->getTemplatedDecl()); } } diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index 2e3cbb336a0c8..31735a0f5feb3 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -143,6 +143,9 @@ CUDAFunctionTarget SemaCUDA::IdentifyTarget(const FunctionDecl *D, if (D->hasAttr()) return CUDAFunctionTarget::Global; + if (D->isConsteval()) + return CUDAFunctionTarget::HostDevice; + if (hasAttr(D, IgnoreImplicitHDAttr)) { if (hasAttr(D, IgnoreImplicitHDAttr)) return CUDAFunctionTarget::HostDevice; diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp index 437c69aa1587d..e89243b9d767a 100644 --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp @@ -896,64 +896,15 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, if (SS.isInvalid()) return true; - TemplateName Template = OpaqueTemplate.get(); - // Translate the parser's template argument list in our AST format. 
TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc); translateTemplateArguments(TemplateArgsIn, TemplateArgs); - DependentTemplateName *DTN = Template.getAsDependentTemplateName(); - if (DTN && DTN->getName().getIdentifier()) { - // Handle a dependent template specialization for which we cannot resolve - // the template name. - assert(DTN->getQualifier() == SS.getScopeRep()); - QualType T = Context.getDependentTemplateSpecializationType( - ElaboratedTypeKeyword::None, - {SS.getScopeRep(), DTN->getName().getIdentifier(), - TemplateKWLoc.isValid()}, - TemplateArgs.arguments()); - - // Create source-location information for this type. - TypeLocBuilder Builder; - DependentTemplateSpecializationTypeLoc SpecTL - = Builder.push(T); - SpecTL.setElaboratedKeywordLoc(SourceLocation()); - SpecTL.setQualifierLoc(SS.getWithLocInContext(Context)); - SpecTL.setTemplateKeywordLoc(TemplateKWLoc); - SpecTL.setTemplateNameLoc(TemplateNameLoc); - SpecTL.setLAngleLoc(LAngleLoc); - SpecTL.setRAngleLoc(RAngleLoc); - for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I) - SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo()); - - SS.clear(); - SS.Make(Context, Builder.getTypeLocInContext(Context, T), CCLoc); - return false; - } - - // If we assumed an undeclared identifier was a template name, try to - // typo-correct it now. - if (Template.getAsAssumedTemplateName() && - resolveAssumedTemplateNameAsType(S, Template, TemplateNameLoc)) - return true; - - TemplateDecl *TD = Template.getAsTemplateDecl(); - if (Template.getAsOverloadedTemplate() || DTN || - isa(TD) || isa(TD)) { - SourceRange R(TemplateNameLoc, RAngleLoc); - if (SS.getRange().isValid()) - R.setBegin(SS.getRange().getBegin()); - - Diag(CCLoc, diag::err_non_type_template_in_nested_name_specifier) - << isa_and_nonnull(TD) << Template << R; - NoteAllFoundTemplates(Template); - return true; - } - // We were able to resolve the template name to an actual template. // Build an appropriate nested-name-specifier. 
- QualType T = CheckTemplateIdType(ElaboratedTypeKeyword::None, Template, - TemplateNameLoc, TemplateArgs); + QualType T = CheckTemplateIdType( + ElaboratedTypeKeyword::None, OpaqueTemplate.get(), TemplateNameLoc, + TemplateArgs, /*Scope=*/S, /*ForNestedNameSpecifier=*/true); if (T.isNull()) return true; @@ -961,7 +912,7 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, // nested name specifiers. if (!T->isDependentType() && !isa(T.getCanonicalType())) { Diag(TemplateNameLoc, diag::err_nested_name_spec_non_tag) << T; - NoteAllFoundTemplates(Template); + NoteAllFoundTemplates(OpaqueTemplate.get()); return true; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 077f4311ed729..654a0670c0c4c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -177,9 +177,7 @@ bool Sema::checkArgCount(CallExpr *Call, unsigned DesiredArgCount) { static bool checkBuiltinVerboseTrap(CallExpr *Call, Sema &S) { bool HasError = false; - for (unsigned I = 0; I < Call->getNumArgs(); ++I) { - Expr *Arg = Call->getArg(I); - + for (const Expr *Arg : Call->arguments()) { if (Arg->isValueDependent()) continue; @@ -5558,17 +5556,18 @@ bool Sema::BuiltinComplex(CallExpr *TheCall) { /// BuiltinShuffleVector - Handle __builtin_shufflevector. // This is declared to take (...), so we have to check everything. 
ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) { - if (TheCall->getNumArgs() < 2) + unsigned NumArgs = TheCall->getNumArgs(); + if (NumArgs < 2) return ExprError(Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args_at_least) - << 0 /*function call*/ << 2 << TheCall->getNumArgs() + << 0 /*function call*/ << 2 << NumArgs << /*is non object*/ 0 << TheCall->getSourceRange()); // Determine which of the following types of shufflevector we're checking: // 1) unary, vector mask: (lhs, mask) // 2) binary, scalar mask: (lhs, rhs, index, ..., index) - QualType resType = TheCall->getArg(0)->getType(); - unsigned numElements = 0; + QualType ResType = TheCall->getArg(0)->getType(); + unsigned NumElements = 0; if (!TheCall->getArg(0)->isTypeDependent() && !TheCall->getArg(1)->isTypeDependent()) { @@ -5578,48 +5577,48 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) { if (!LHSType->isVectorType() || !RHSType->isVectorType()) return ExprError( Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_non_vector) - << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false + << TheCall->getDirectCallee() << /*isMoreThanTwoArgs*/ false << SourceRange(TheCall->getArg(0)->getBeginLoc(), TheCall->getArg(1)->getEndLoc())); - numElements = LHSType->castAs()->getNumElements(); - unsigned numResElements = TheCall->getNumArgs() - 2; + NumElements = LHSType->castAs()->getNumElements(); + unsigned NumResElements = NumArgs - 2; // Check to see if we have a call with 2 vector arguments, the unary shuffle // with mask. If so, verify that RHS is an integer vector type with the // same number of elts as lhs. 
- if (TheCall->getNumArgs() == 2) { + if (NumArgs == 2) { if (!RHSType->hasIntegerRepresentation() || - RHSType->castAs()->getNumElements() != numElements) + RHSType->castAs()->getNumElements() != NumElements) return ExprError(Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_incompatible_vector) << TheCall->getDirectCallee() - << /*isMorethantwoArgs*/ false + << /*isMoreThanTwoArgs*/ false << SourceRange(TheCall->getArg(1)->getBeginLoc(), TheCall->getArg(1)->getEndLoc())); } else if (!Context.hasSameUnqualifiedType(LHSType, RHSType)) { return ExprError(Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_incompatible_vector) << TheCall->getDirectCallee() - << /*isMorethantwoArgs*/ false + << /*isMoreThanTwoArgs*/ false << SourceRange(TheCall->getArg(0)->getBeginLoc(), TheCall->getArg(1)->getEndLoc())); - } else if (numElements != numResElements) { - QualType eltType = LHSType->castAs()->getElementType(); - resType = resType->isExtVectorType() - ? Context.getExtVectorType(eltType, numResElements) - : Context.getVectorType(eltType, numResElements, + } else if (NumElements != NumResElements) { + QualType EltType = LHSType->castAs()->getElementType(); + ResType = ResType->isExtVectorType() + ? 
Context.getExtVectorType(EltType, NumResElements) + : Context.getVectorType(EltType, NumResElements, VectorKind::Generic); } } - for (unsigned i = 2; i < TheCall->getNumArgs(); i++) { - Expr *Arg = TheCall->getArg(i); + for (unsigned I = 2; I != NumArgs; ++I) { + Expr *Arg = TheCall->getArg(I); if (Arg->isTypeDependent() || Arg->isValueDependent()) continue; - std::optional Result; - if (!(Result = Arg->getIntegerConstantExpr(Context))) + std::optional Result = Arg->getIntegerConstantExpr(Context); + if (!Result) return ExprError(Diag(TheCall->getBeginLoc(), diag::err_shufflevector_nonconstant_argument) << Arg->getSourceRange()); @@ -5628,23 +5627,21 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) { if (Result->isSigned() && Result->isAllOnes()) ; else if (Result->getActiveBits() > 64 || - Result->getZExtValue() >= numElements * 2) + Result->getZExtValue() >= NumElements * 2) return ExprError(Diag(TheCall->getBeginLoc(), diag::err_shufflevector_argument_too_large) << Arg->getSourceRange()); - TheCall->setArg(i, ConstantExpr::Create(Context, Arg, APValue(*Result))); + TheCall->setArg(I, ConstantExpr::Create(Context, Arg, APValue(*Result))); } - SmallVector exprs; - for (unsigned i = 0, e = TheCall->getNumArgs(); i != e; i++) { - exprs.push_back(TheCall->getArg(i)); - TheCall->setArg(i, nullptr); - } + auto *Result = new (Context) ShuffleVectorExpr( + Context, ArrayRef(TheCall->getArgs(), NumArgs), ResType, + TheCall->getCallee()->getBeginLoc(), TheCall->getRParenLoc()); - return new (Context) ShuffleVectorExpr(Context, exprs, resType, - TheCall->getCallee()->getBeginLoc(), - TheCall->getRParenLoc()); + // All moved to Result. 
+ TheCall->shrinkNumArgs(0); + return Result; } ExprResult Sema::ConvertVectorExpr(Expr *E, TypeSourceInfo *TInfo, @@ -5886,23 +5883,26 @@ bool Sema::BuiltinOSLogFormat(CallExpr *TheCall) { return false; } -bool Sema::BuiltinConstantArg(CallExpr *TheCall, int ArgNum, +bool Sema::BuiltinConstantArg(CallExpr *TheCall, unsigned ArgNum, llvm::APSInt &Result) { Expr *Arg = TheCall->getArg(ArgNum); - DeclRefExpr *DRE =cast(TheCall->getCallee()->IgnoreParenCasts()); - FunctionDecl *FDecl = cast(DRE->getDecl()); - if (Arg->isTypeDependent() || Arg->isValueDependent()) return false; + if (Arg->isTypeDependent() || Arg->isValueDependent()) + return false; - std::optional R; - if (!(R = Arg->getIntegerConstantExpr(Context))) + std::optional R = Arg->getIntegerConstantExpr(Context); + if (!R) { + auto *DRE = cast(TheCall->getCallee()->IgnoreParenCasts()); + auto *FDecl = cast(DRE->getDecl()); return Diag(TheCall->getBeginLoc(), diag::err_constant_integer_arg_type) << FDecl->getDeclName() << Arg->getSourceRange(); + } Result = *R; + return false; } -bool Sema::BuiltinConstantArgRange(CallExpr *TheCall, int ArgNum, int Low, +bool Sema::BuiltinConstantArgRange(CallExpr *TheCall, unsigned ArgNum, int Low, int High, bool RangeIsError) { if (isConstantEvaluatedContext()) return false; @@ -5933,7 +5933,7 @@ bool Sema::BuiltinConstantArgRange(CallExpr *TheCall, int ArgNum, int Low, return false; } -bool Sema::BuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum, +bool Sema::BuiltinConstantArgMultiple(CallExpr *TheCall, unsigned ArgNum, unsigned Num) { llvm::APSInt Result; @@ -5953,7 +5953,7 @@ bool Sema::BuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum, return false; } -bool Sema::BuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum) { +bool Sema::BuiltinConstantArgPower2(CallExpr *TheCall, unsigned ArgNum) { llvm::APSInt Result; // We can't check the value of a dependent argument. 
@@ -5965,9 +5965,7 @@ bool Sema::BuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum) { if (BuiltinConstantArg(TheCall, ArgNum, Result)) return true; - // Bit-twiddling to test for a power of 2: for x > 0, x & (x-1) is zero if - // and only if x is a power of 2. - if (Result.isStrictlyPositive() && (Result & (Result - 1)) == 0) + if (Result.isPowerOf2()) return false; return Diag(TheCall->getBeginLoc(), diag::err_argument_not_power_of_2) @@ -5996,7 +5994,7 @@ static bool IsShiftedByte(llvm::APSInt Value) { } } -bool Sema::BuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum, +bool Sema::BuiltinConstantArgShiftedByte(CallExpr *TheCall, unsigned ArgNum, unsigned ArgBits) { llvm::APSInt Result; @@ -6020,7 +6018,8 @@ bool Sema::BuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum, << Arg->getSourceRange(); } -bool Sema::BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum, +bool Sema::BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, + unsigned ArgNum, unsigned ArgBits) { llvm::APSInt Result; diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 03bf4b3690b13..5dd49497ce9fd 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -5827,96 +5827,13 @@ class ConceptInfo { // We accept some lossiness (like dropping parameters). // We only try to handle common expressions on the LHS of MemberExpr. QualType getApproximateType(const Expr *E, HeuristicResolver &Resolver) { - if (E->getType().isNull()) - return QualType(); - // Don't drop implicit cast if it's an array decay. 
- if (auto *ICE = dyn_cast(E); - !ICE || ICE->getCastKind() != CK_ArrayToPointerDecay) - E = E->IgnoreParenImpCasts(); - QualType Unresolved = E->getType(); - // Resolve DependentNameType - if (const auto *DNT = Unresolved->getAs()) { - if (auto Decls = Resolver.resolveDependentNameType(DNT); - Decls.size() == 1) { - if (const auto *TD = dyn_cast(Decls[0])) - return TD->getASTContext().getTypeDeclType(TD); - } - } - // We only resolve DependentTy, or undeduced autos (including auto* etc). - if (!Unresolved->isSpecificBuiltinType(BuiltinType::Dependent)) { - AutoType *Auto = Unresolved->getContainedAutoType(); - if (!Auto || !Auto->isUndeducedAutoType()) - return Unresolved; - } - // A call: approximate-resolve callee to a function type, get its return type - if (const CallExpr *CE = llvm::dyn_cast(E)) { - QualType Callee = getApproximateType(CE->getCallee(), Resolver); - if (Callee.isNull() || - Callee->isSpecificPlaceholderType(BuiltinType::BoundMember)) - Callee = Expr::findBoundMemberType(CE->getCallee()); - if (Callee.isNull()) - return Unresolved; - - if (const auto *FnTypePtr = Callee->getAs()) { - Callee = FnTypePtr->getPointeeType(); - } else if (const auto *BPT = Callee->getAs()) { - Callee = BPT->getPointeeType(); - } - if (const FunctionType *FnType = Callee->getAs()) - return FnType->getReturnType().getNonReferenceType(); - - // Unresolved call: try to guess the return type. - if (const auto *OE = llvm::dyn_cast(CE->getCallee())) { - // If all candidates have the same approximate return type, use it. - // Discard references and const to allow more to be "the same". - // (In particular, if there's one candidate + ADL, resolve it). 
- const Type *Common = nullptr; - for (const auto *D : OE->decls()) { - QualType ReturnType; - if (const auto *FD = llvm::dyn_cast(D)) - ReturnType = FD->getReturnType(); - else if (const auto *FTD = llvm::dyn_cast(D)) - ReturnType = FTD->getTemplatedDecl()->getReturnType(); - if (ReturnType.isNull()) - continue; - const Type *Candidate = - ReturnType.getNonReferenceType().getCanonicalType().getTypePtr(); - if (Common && Common != Candidate) - return Unresolved; // Multiple candidates. - Common = Candidate; - } - if (Common != nullptr) - return QualType(Common, 0); - } - } - // A dependent member: resolve using HeuristicResolver. - if (const auto *CDSME = llvm::dyn_cast(E)) { - for (const auto *Member : Resolver.resolveMemberExpr(CDSME)) { - if (const auto *VD = dyn_cast(Member)) { - return VD->getType().getNonReferenceType(); - } - } - } - // A reference to an `auto` variable: approximate-resolve its initializer. - if (const auto *DRE = llvm::dyn_cast(E)) { - if (const auto *VD = llvm::dyn_cast(DRE->getDecl())) { - if (VD->hasInit()) - return getApproximateType(VD->getInit(), Resolver); - } - } - if (const auto *UO = llvm::dyn_cast(E)) { - if (UO->getOpcode() == UnaryOperatorKind::UO_Deref) { - // We recurse into the subexpression because it could be of dependent - // type. - if (auto Pointee = - getApproximateType(UO->getSubExpr(), Resolver)->getPointeeType(); - !Pointee.isNull()) - return Pointee; - // Our caller expects a non-null result, even though the SubType is - // supposed to have a pointee. Fall through to Unresolved anyway. 
- } - } - return Unresolved; + QualType Result = Resolver.resolveExprToType(E); + if (Result.isNull()) + return Result; + Result = Resolver.simplifyType(Result.getNonReferenceType(), E, false); + if (Result.isNull()) + return Result; + return Result.getNonReferenceType(); } // If \p Base is ParenListExpr, assume a chain of comma operators and pick the diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp index cc03616e0dfe1..229e91ed04caa 100644 --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -90,7 +90,8 @@ static QualType lookupPromiseType(Sema &S, const FunctionDecl *FD, // Build the template-id. QualType CoroTrait = S.CheckTemplateIdType( - ElaboratedTypeKeyword::None, TemplateName(CoroTraits), KwLoc, Args); + ElaboratedTypeKeyword::None, TemplateName(CoroTraits), KwLoc, Args, + /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false); if (CoroTrait.isNull()) return QualType(); if (S.RequireCompleteType(KwLoc, CoroTrait, @@ -163,7 +164,8 @@ static QualType lookupCoroutineHandleType(Sema &S, QualType PromiseType, // Build the template-id. QualType CoroHandleType = S.CheckTemplateIdType( - ElaboratedTypeKeyword::None, TemplateName(CoroHandle), Loc, Args); + ElaboratedTypeKeyword::None, TemplateName(CoroHandle), Loc, Args, + /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false); if (CoroHandleType.isNull()) return QualType(); if (S.RequireCompleteType(Loc, CoroHandleType, diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 7c1459e320167..e10511cc7fc4e 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3877,6 +3877,23 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S, RequiresAdjustment = true; } + // If the declaration is marked with cfi_unchecked_callee but the definition + // isn't, the definition is also cfi_unchecked_callee. 
+ if (auto *FPT1 = OldType->getAs()) { + if (auto *FPT2 = NewType->getAs()) { + FunctionProtoType::ExtProtoInfo EPI1 = FPT1->getExtProtoInfo(); + FunctionProtoType::ExtProtoInfo EPI2 = FPT2->getExtProtoInfo(); + + if (EPI1.CFIUncheckedCallee && !EPI2.CFIUncheckedCallee) { + EPI2.CFIUncheckedCallee = true; + NewQType = Context.getFunctionType(FPT2->getReturnType(), + FPT2->getParamTypes(), EPI2); + NewType = cast(NewQType); + New->setType(NewQType); + } + } + } + // Merge regparm attribute. if (OldTypeInfo.getHasRegParm() != NewTypeInfo.getHasRegParm() || OldTypeInfo.getRegParm() != NewTypeInfo.getRegParm()) { @@ -6392,12 +6409,6 @@ bool Sema::diagnoseQualifiedDeclaration(CXXScopeSpec &SS, DeclContext *DC, NextTL = TL.castAs().getQualifierLoc().getAsTypeLoc(); break; - case TypeLoc::DependentTemplateSpecialization: { - auto TST = TL.castAs(); - TemplateKeywordLoc = TST.getTemplateKeywordLoc(); - NextTL = TST.getQualifierLoc().getAsTypeLoc(); - break; - } default: break; } @@ -8401,7 +8412,7 @@ static ShadowedDeclKind computeShadowedDeclKind(const NamedDecl *ShadowedDecl, /// Return the location of the capture if the given lambda captures the given /// variable \p VD, or an invalid source location otherwise. 
static SourceLocation getCaptureLocation(const LambdaScopeInfo *LSI, - const VarDecl *VD) { + const ValueDecl *VD) { for (const Capture &Capture : LSI->Captures) { if (Capture.isVariableCapture() && Capture.getVariable() == VD) return Capture.getLocation(); @@ -8498,7 +8509,9 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl, if (isa(D) && NewDC && isa(NewDC)) { if (const auto *RD = dyn_cast(NewDC->getParent())) { if (RD->isLambda() && OldDC->Encloses(NewDC->getLexicalParent())) { - if (const auto *VD = dyn_cast(ShadowedDecl)) { + // Handle both VarDecl and BindingDecl in lambda contexts + if (isa(ShadowedDecl)) { + const auto *VD = cast(ShadowedDecl); const auto *LSI = cast(getCurFunction()); if (RD->getLambdaCaptureDefault() == LCD_None) { // Try to avoid warnings for lambdas with an explicit capture @@ -8527,18 +8540,27 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl, return; } } - if (const auto *VD = dyn_cast(ShadowedDecl); - VD && VD->hasLocalStorage()) { - // A variable can't shadow a local variable in an enclosing scope, if - // they are separated by a non-capturing declaration context. - for (DeclContext *ParentDC = NewDC; - ParentDC && !ParentDC->Equals(OldDC); - ParentDC = getLambdaAwareParentOfDeclContext(ParentDC)) { - // Only block literals, captured statements, and lambda expressions - // can capture; other scopes don't. 
- if (!isa(ParentDC) && !isa(ParentDC) && - !isLambdaCallOperator(ParentDC)) { - return; + // Apply scoping logic to both VarDecl and BindingDecl with local storage + if (isa(ShadowedDecl)) { + bool HasLocalStorage = false; + if (const auto *VD = dyn_cast(ShadowedDecl)) + HasLocalStorage = VD->hasLocalStorage(); + else if (const auto *BD = dyn_cast(ShadowedDecl)) + HasLocalStorage = + cast(BD->getDecomposedDecl())->hasLocalStorage(); + + if (HasLocalStorage) { + // A variable can't shadow a local variable or binding in an enclosing + // scope, if they are separated by a non-capturing declaration + // context. + for (DeclContext *ParentDC = NewDC; + ParentDC && !ParentDC->Equals(OldDC); + ParentDC = getLambdaAwareParentOfDeclContext(ParentDC)) { + // Only block literals, captured statements, and lambda expressions + // can capture; other scopes don't. + if (!isa(ParentDC) && !isa(ParentDC) && + !isLambdaCallOperator(ParentDC)) + return; } } } @@ -8585,7 +8607,8 @@ void Sema::DiagnoseShadowingLambdaDecls(const LambdaScopeInfo *LSI) { const NamedDecl *ShadowedDecl = Shadow.ShadowedDecl; // Try to avoid the warning when the shadowed decl isn't captured. const DeclContext *OldDC = ShadowedDecl->getDeclContext(); - if (const auto *VD = dyn_cast(ShadowedDecl)) { + if (isa(ShadowedDecl)) { + const auto *VD = cast(ShadowedDecl); SourceLocation CaptureLoc = getCaptureLocation(LSI, VD); Diag(Shadow.VD->getLocation(), CaptureLoc.isInvalid() ? 
diag::warn_decl_shadow_uncaptured_local diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 44906456f3371..b6ebe54764282 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -1802,7 +1802,11 @@ static void handleRestrictAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.getNumArgs() == 1) { DeallocPtrIdx = ParamIdx(1, DeallocFD); - if (!DeallocPtrIdx.isValid() || + // FIXME: We could probably be better about diagnosing that there IS no + // argument, or that the function doesn't have a prototype, but this is how + // GCC diagnoses this, and is reasonably clear. + if (!DeallocPtrIdx.isValid() || !hasFunctionProto(DeallocFD) || + getFunctionOrMethodNumParams(DeallocFD) < 1 || !getFunctionOrMethodParamType(DeallocFD, DeallocPtrIdx.getASTIndex()) .getCanonicalType() ->isPointerType()) { diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 63ce87b9b0607..fb57b43882911 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -1138,8 +1138,9 @@ static QualType getStdTrait(Sema &S, SourceLocation Loc, StringRef Trait, } // Build the template-id. 
- QualType TraitTy = S.CheckTemplateIdType(ElaboratedTypeKeyword::None, - TemplateName(TraitTD), Loc, Args); + QualType TraitTy = S.CheckTemplateIdType( + ElaboratedTypeKeyword::None, TemplateName(TraitTD), Loc, Args, + /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false); if (TraitTy.isNull()) return QualType(); @@ -11123,8 +11124,8 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) { Loc = RD->getLocation(); // If we have a virtual destructor, look up the deallocation function - if (FunctionDecl *OperatorDelete = - FindDeallocationFunctionForDestructor(Loc, RD)) { + if (FunctionDecl *OperatorDelete = FindDeallocationFunctionForDestructor( + Loc, RD, /*Diagnose=*/true, /*LookForGlobal=*/false)) { Expr *ThisArg = nullptr; // If the notional 'delete this' expression requires a non-trivial @@ -11159,6 +11160,22 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) { DiagnoseUseOfDecl(OperatorDelete, Loc); MarkFunctionReferenced(Loc, OperatorDelete); Destructor->setOperatorDelete(OperatorDelete, ThisArg); + + if (isa(OperatorDelete) && + Context.getTargetInfo().callGlobalDeleteInDeletingDtor( + Context.getLangOpts())) { + // In Microsoft ABI whenever a class has a defined operator delete, + // scalar deleting destructors check the 3rd bit of the implicit + // parameter and if it is set, then, global operator delete must be + // called instead of the class-specific one. Find and save the global + // operator delete for that case. Do not diagnose at this point because + // the lack of a global operator delete is not an error if there are no + // delete calls that require it. 
+ FunctionDecl *GlobalOperatorDelete = + FindDeallocationFunctionForDestructor(Loc, RD, /*Diagnose*/ false, + /*LookForGlobal*/ true); + Destructor->setOperatorGlobalDelete(GlobalOperatorDelete); + } } } @@ -12315,7 +12332,8 @@ static QualType BuildStdClassTemplate(Sema &S, ClassTemplateDecl *CTD, Args.addArgument(TemplateArgumentLoc(TemplateArgument(TypeParam), TSI)); return S.CheckTemplateIdType(ElaboratedTypeKeyword::None, TemplateName(CTD), - Loc, Args); + Loc, Args, /*Scope=*/nullptr, + /*ForNestedNameSpecifier=*/false); } QualType Sema::BuildStdInitializerList(QualType Element, SourceLocation Loc) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index aba00dc8ff9b6..03def26fe53bd 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9385,11 +9385,21 @@ AssignConvertType Sema::CheckAssignmentConstraints(QualType LHSType, return AssignConvertType::Incompatible; } - // Allow scalar to ExtVector assignments, and assignments of an ExtVector type - // to the same ExtVector type. - if (LHSType->isExtVectorType()) { - if (RHSType->isExtVectorType()) + // Allow scalar to ExtVector assignments, assignment to bool, and assignments + // of an ExtVector type to the same ExtVector type. + if (auto *LHSExtType = LHSType->getAs()) { + if (auto *RHSExtType = RHSType->getAs()) { + // Implicit conversions require the same number of elements. + if (LHSExtType->getNumElements() != RHSExtType->getNumElements()) + return AssignConvertType::Incompatible; + + if (LHSType->isExtVectorBoolType() && + RHSExtType->getElementType()->isIntegerType()) { + Kind = CK_IntegralToBoolean; + return AssignConvertType::Compatible; + } return AssignConvertType::Incompatible; + } if (RHSType->isArithmeticType()) { // CK_VectorSplat does T -> vector T, so first cast to the element type. 
if (ConvertRHS) @@ -14715,8 +14725,9 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) { return MPTy; } } - } else if (!isa(dcl)) + } else if (!isa(dcl)) llvm_unreachable("Unknown/unexpected decl type"); } @@ -14784,7 +14795,7 @@ static QualType CheckIndirectionOperand(Sema &S, Expr *Op, ExprValueKind &VK, QualType OpTy = Op->getType(); QualType Result; - if (isa(Op)) { + if (isa(Op->IgnoreParens())) { QualType OpOrigType = Op->IgnoreParenCasts()->getType(); S.CheckCompatibleReinterpretCast(OpOrigType, OpTy, /*IsDereference*/true, Op->getSourceRange()); @@ -21360,8 +21371,9 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) { QualType TST; { SFINAETrap Trap(*this); - TST = CheckTemplateIdType(ElaboratedTypeKeyword::None, TN, - NameInfo.getBeginLoc(), TAL); + TST = CheckTemplateIdType( + ElaboratedTypeKeyword::None, TN, NameInfo.getBeginLoc(), TAL, + /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false); } if (TST.isNull()) TST = Context.getTemplateSpecializationType( diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 5a9279d928465..1e8bb6e3064a9 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -3579,7 +3579,7 @@ void Sema::DeclareGlobalAllocationFunction(DeclarationName Name, FunctionDecl * Sema::FindUsualDeallocationFunction(SourceLocation StartLoc, ImplicitDeallocationParameters IDP, - DeclarationName Name) { + DeclarationName Name, bool Diagnose) { DeclareGlobalNewDelete(); LookupResult FoundDelete(*this, Name, StartLoc, LookupOrdinaryName); @@ -3594,7 +3594,7 @@ Sema::FindUsualDeallocationFunction(SourceLocation StartLoc, if (!Result) return nullptr; - if (CheckDeleteOperator(*this, StartLoc, StartLoc, /*Diagnose=*/true, + if (CheckDeleteOperator(*this, StartLoc, StartLoc, Diagnose, FoundDelete.getNamingClass(), Result.Found, Result.FD)) return nullptr; @@ -3605,7 +3605,8 @@ Sema::FindUsualDeallocationFunction(SourceLocation StartLoc, FunctionDecl 
*Sema::FindDeallocationFunctionForDestructor(SourceLocation Loc, CXXRecordDecl *RD, - bool Diagnose) { + bool Diagnose, + bool LookForGlobal) { DeclarationName Name = Context.DeclarationNames.getCXXOperatorName(OO_Delete); FunctionDecl *OperatorDelete = nullptr; @@ -3614,18 +3615,20 @@ FunctionDecl *Sema::FindDeallocationFunctionForDestructor(SourceLocation Loc, DeallocType, ShouldUseTypeAwareOperatorNewOrDelete(), AlignedAllocationMode::No, SizedDeallocationMode::No}; - if (FindDeallocationFunction(Loc, RD, Name, OperatorDelete, IDP, Diagnose)) - return nullptr; + if (!LookForGlobal) { + if (FindDeallocationFunction(Loc, RD, Name, OperatorDelete, IDP, Diagnose)) + return nullptr; - if (OperatorDelete) - return OperatorDelete; + if (OperatorDelete) + return OperatorDelete; + } // If there's no class-specific operator delete, look up the global // non-array delete. IDP.PassAlignment = alignedAllocationModeFromBool( hasNewExtendedAlignment(*this, DeallocType)); IDP.PassSize = SizedDeallocationMode::Yes; - return FindUsualDeallocationFunction(Loc, IDP, Name); + return FindUsualDeallocationFunction(Loc, IDP, Name, Diagnose); } bool Sema::FindDeallocationFunction(SourceLocation StartLoc, CXXRecordDecl *RD, diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 0af38472b0fec..55be036207eec 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1228,6 +1228,15 @@ struct PerVisibilityBindingChecker { } }; +static CXXMethodDecl *lookupMethod(Sema &S, CXXRecordDecl *RecordDecl, + StringRef Name, SourceLocation Loc) { + DeclarationName DeclName(&S.getASTContext().Idents.get(Name)); + LookupResult Result(S, DeclName, Loc, Sema::LookupMemberName); + if (!S.LookupQualifiedName(Result, static_cast(RecordDecl))) + return nullptr; + return cast(Result.getFoundDecl()); +} + } // end anonymous namespace bool SemaHLSL::handleRootSignatureElements( @@ -3784,30 +3793,17 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) { 
deduceAddressSpace(VD); } -static bool initVarDeclWithCtor(Sema &S, VarDecl *VD, - MutableArrayRef Args) { - InitializedEntity Entity = InitializedEntity::InitializeVariable(VD); - InitializationKind Kind = InitializationKind::CreateDirect( - VD->getLocation(), SourceLocation(), SourceLocation()); - - InitializationSequence InitSeq(S, Entity, Kind, Args); - if (InitSeq.Failed()) - return false; - - ExprResult Init = InitSeq.Perform(S, Entity, Kind, Args); - if (!Init.get()) - return false; +bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) { + assert(VD->getType()->isHLSLResourceRecord() && + "expected resource record type"); - VD->setInit(S.MaybeCreateExprWithCleanups(Init.get())); - VD->setInitStyle(VarDecl::CallInit); - S.CheckCompleteVariableDeclaration(VD); - return true; -} + ASTContext &AST = SemaRef.getASTContext(); + uint64_t UIntTySize = AST.getTypeSize(AST.UnsignedIntTy); + uint64_t IntTySize = AST.getTypeSize(AST.IntTy); -void SemaHLSL::createResourceRecordCtorArgs( - const Type *ResourceTy, StringRef VarName, HLSLResourceBindingAttr *RBA, - HLSLVkBindingAttr *VkBinding, uint32_t ArrayIndex, - llvm::SmallVectorImpl &Args) { + // Gather resource binding information from attributes. 
+ HLSLResourceBindingAttr *RBA = VD->getAttr(); + HLSLVkBindingAttr *VkBinding = VD->getAttr(); std::optional RegisterSlot; uint32_t SpaceNo = 0; if (VkBinding) { @@ -3819,46 +3815,85 @@ void SemaHLSL::createResourceRecordCtorArgs( SpaceNo = RBA->getSpaceNumber(); } - ASTContext &AST = SemaRef.getASTContext(); - uint64_t UIntTySize = AST.getTypeSize(AST.UnsignedIntTy); - uint64_t IntTySize = AST.getTypeSize(AST.IntTy); - IntegerLiteral *RangeSize = IntegerLiteral::Create( - AST, llvm::APInt(IntTySize, 1), AST.IntTy, SourceLocation()); - IntegerLiteral *Index = - IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, ArrayIndex), - AST.UnsignedIntTy, SourceLocation()); - IntegerLiteral *Space = - IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, SpaceNo), - AST.UnsignedIntTy, SourceLocation()); - StringLiteral *Name = StringLiteral::Create( - AST, VarName, StringLiteralKind::Ordinary, false, - AST.getStringLiteralArrayType(AST.CharTy.withConst(), VarName.size()), - SourceLocation()); + // Find correct initialization method and create its arguments. + QualType ResourceTy = VD->getType(); + CXXRecordDecl *ResourceDecl = ResourceTy->getAsCXXRecordDecl(); + CXXMethodDecl *CreateMethod = nullptr; + llvm::SmallVector Args; - // resource with explicit binding if (RegisterSlot.has_value()) { + // The resource has explicit binding. + CreateMethod = lookupMethod(SemaRef, ResourceDecl, "__createFromBinding", + VD->getLocation()); IntegerLiteral *RegSlot = IntegerLiteral::Create( AST, llvm::APInt(UIntTySize, RegisterSlot.value()), AST.UnsignedIntTy, SourceLocation()); - Args.append({RegSlot, Space, RangeSize, Index, Name}); + Args.push_back(RegSlot); } else { - // resource with implicit binding + // The resource has implicit binding. + CreateMethod = + lookupMethod(SemaRef, ResourceDecl, "__createFromImplicitBinding", + VD->getLocation()); uint32_t OrderID = (RBA && RBA->hasImplicitBindingOrderID()) ? 
RBA->getImplicitBindingOrderID() : getNextImplicitBindingOrderID(); IntegerLiteral *OrderId = IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, OrderID), AST.UnsignedIntTy, SourceLocation()); - Args.append({Space, RangeSize, Index, OrderId, Name}); + Args.push_back(OrderId); } -} -bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) { - SmallVector Args; - createResourceRecordCtorArgs(VD->getType().getTypePtr(), VD->getName(), - VD->getAttr(), - VD->getAttr(), 0, Args); - return initVarDeclWithCtor(SemaRef, VD, Args); + if (!CreateMethod) + // This can happen if someone creates a struct that looks like an HLSL + // resource record but does not have the required static create method. + // No binding will be generated for it. + return false; + + IntegerLiteral *Space = + IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, SpaceNo), + AST.UnsignedIntTy, SourceLocation()); + Args.push_back(Space); + + IntegerLiteral *RangeSize = IntegerLiteral::Create( + AST, llvm::APInt(IntTySize, 1), AST.IntTy, SourceLocation()); + Args.push_back(RangeSize); + + IntegerLiteral *Index = IntegerLiteral::Create( + AST, llvm::APInt(UIntTySize, 0), AST.UnsignedIntTy, SourceLocation()); + Args.push_back(Index); + + StringRef VarName = VD->getName(); + StringLiteral *Name = StringLiteral::Create( + AST, VarName, StringLiteralKind::Ordinary, false, + AST.getStringLiteralArrayType(AST.CharTy.withConst(), VarName.size()), + SourceLocation()); + ImplicitCastExpr *NameCast = ImplicitCastExpr::Create( + AST, AST.getPointerType(AST.CharTy.withConst()), CK_ArrayToPointerDecay, + Name, nullptr, VK_PRValue, FPOptionsOverride()); + Args.push_back(NameCast); + + // Make sure the create method template is instantiated and emitted. + if (!CreateMethod->isDefined() && CreateMethod->isTemplateInstantiation()) + SemaRef.InstantiateFunctionDefinition(VD->getLocation(), CreateMethod, + true); + + // Create CallExpr with a call to the static method and set it as the decl + // initialization. 
+ DeclRefExpr *DRE = DeclRefExpr::Create( + AST, NestedNameSpecifierLoc(), SourceLocation(), CreateMethod, false, + CreateMethod->getNameInfo(), CreateMethod->getType(), VK_PRValue); + + auto *ImpCast = ImplicitCastExpr::Create( + AST, AST.getPointerType(CreateMethod->getType()), + CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride()); + + CallExpr *InitExpr = + CallExpr::Create(AST, ImpCast, Args, ResourceTy, VK_PRValue, + SourceLocation(), FPOptionsOverride()); + VD->setInit(InitExpr); + VD->setInitStyle(VarDecl::CallInit); + SemaRef.CheckCompleteVariableDeclaration(VD); + return true; } bool SemaHLSL::initGlobalResourceArrayDecl(VarDecl *VD) { @@ -3867,28 +3902,39 @@ bool SemaHLSL::initGlobalResourceArrayDecl(VarDecl *VD) { // Individual resources in a resource array are not initialized here. They // are initialized later on during codegen when the individual resources are - // accessed. Codegen will emit a call to the resource constructor with the - // specified array index. We need to make sure though that the constructor + // accessed. Codegen will emit a call to the resource initialization method + // with the specified array index. We need to make sure though that the method // for the specific resource type is instantiated, so codegen can emit a call // to it when the array element is accessed. - SmallVector Args; - QualType ResElementTy = VD->getASTContext().getBaseElementType(VD->getType()); - createResourceRecordCtorArgs(ResElementTy.getTypePtr(), VD->getName(), - VD->getAttr(), - VD->getAttr(), 0, Args); - SourceLocation Loc = VD->getLocation(); - InitializedEntity Entity = - InitializedEntity::InitializeTemporary(ResElementTy); - InitializationKind Kind = InitializationKind::CreateDirect(Loc, Loc, Loc); - InitializationSequence InitSeq(SemaRef, Entity, Kind, Args); - if (InitSeq.Failed()) + // Find correct initialization method based on the resource binding + // information. 
+ ASTContext &AST = SemaRef.getASTContext(); + QualType ResElementTy = AST.getBaseElementType(VD->getType()); + CXXRecordDecl *ResourceDecl = ResElementTy->getAsCXXRecordDecl(); + + HLSLResourceBindingAttr *RBA = VD->getAttr(); + HLSLVkBindingAttr *VkBinding = VD->getAttr(); + CXXMethodDecl *CreateMethod = nullptr; + + if (VkBinding || (RBA && RBA->hasRegisterSlot())) + // Resource has explicit binding. + CreateMethod = lookupMethod(SemaRef, ResourceDecl, "__createFromBinding", + VD->getLocation()); + else + // Resource has implicit binding. + CreateMethod = + lookupMethod(SemaRef, ResourceDecl, "__createFromImplicitBinding", + VD->getLocation()); + + if (!CreateMethod) return false; - // This takes care of instantiating and emitting of the constructor that will - // be called from codegen when the array is accessed. - ExprResult OneResInit = InitSeq.Perform(SemaRef, Entity, Kind, Args); - return !OneResInit.isInvalid(); + // Make sure the create method template is instantiated and emitted. + if (!CreateMethod->isDefined() && CreateMethod->isTemplateInstantiation()) + SemaRef.InstantiateFunctionDefinition(VD->getLocation(), CreateMethod, + true); + return true; } // Returns true if the initialization has been handled. 
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 54918c560b655..25728de1779ad 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -4575,6 +4575,13 @@ static void getNestedNameSpecifierIdentifiers( case Type::TemplateSpecialization: { TemplateName Name = cast(T)->getTemplateName(); + if (const DependentTemplateName *DTN = + Name.getAsDependentTemplateName()) { + getNestedNameSpecifierIdentifiers(DTN->getQualifier(), Identifiers); + if (const auto *II = DTN->getName().getIdentifier()) + Identifiers.push_back(II); + return; + } if (const QualifiedTemplateName *QTN = Name.getAsQualifiedTemplateName()) { getNestedNameSpecifierIdentifiers(QTN->getQualifier(), Identifiers); @@ -4584,15 +4591,6 @@ static void getNestedNameSpecifierIdentifiers( Identifiers.push_back(TD->getIdentifier()); return; } - case Type::DependentTemplateSpecialization: { - const DependentTemplateStorage &S = - cast(T) - ->getDependentTemplateName(); - getNestedNameSpecifierIdentifiers(S.getQualifier(), Identifiers); - // FIXME: Should this dig into the Name as well? - // Identifiers.push_back(S.getName().getIdentifier()); - return; - } case Type::SubstTemplateTypeParm: T = cast(T) ->getReplacementType() diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 63a56a6583efc..6a7a5a9a4303a 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -73,6 +73,18 @@ enum DefaultDataSharingAttributes { DSA_firstprivate = 1 << 3, /// Default data sharing attribute 'firstprivate'. }; +/// Variable Category attributes to restrict the modifier of the +/// default clause (DefaultDataSharingAttributes) +/// Not mentioning any Variable category attribute indicates +/// the modifier (DefaultDataSharingAttributes) is for all variables. +enum DefaultDataSharingVCAttributes { + DSA_VC_all = 0, /// for all variables. + DSA_VC_aggregate, /// for aggregate variables. 
+ DSA_VC_allocatable, /// for allocatable variables. + DSA_VC_pointer, /// for pointer variables. + DSA_VC_scalar, /// for scalar variables. +}; + /// Stack for tracking declarations used in OpenMP directives and /// clauses and their data-sharing attributes. class DSAStackTy { @@ -168,6 +180,8 @@ class DSAStackTy { LoopControlVariablesMapTy LCVMap; DefaultDataSharingAttributes DefaultAttr = DSA_unspecified; SourceLocation DefaultAttrLoc; + DefaultDataSharingVCAttributes DefaultVCAttr = DSA_VC_all; + SourceLocation DefaultAttrVCLoc; DefaultmapInfo DefaultmapMap[OMPC_DEFAULTMAP_unknown + 1]; OpenMPDirectiveKind Directive = OMPD_unknown; DeclarationNameInfo DirectiveName; @@ -735,6 +749,31 @@ class DSAStackTy { getTopOfStack().DefaultAttr = DSA_firstprivate; getTopOfStack().DefaultAttrLoc = Loc; } + /// Set default data sharing variable category attribute to aggregate. + void setDefaultDSAVCAggregate(SourceLocation VCLoc) { + getTopOfStack().DefaultVCAttr = DSA_VC_aggregate; + getTopOfStack().DefaultAttrVCLoc = VCLoc; + } + /// Set default data sharing variable category attribute to all. + void setDefaultDSAVCAll(SourceLocation VCLoc) { + getTopOfStack().DefaultVCAttr = DSA_VC_all; + getTopOfStack().DefaultAttrVCLoc = VCLoc; + } + /// Set default data sharing variable category attribute to allocatable. + void setDefaultDSAVCAllocatable(SourceLocation VCLoc) { + getTopOfStack().DefaultVCAttr = DSA_VC_allocatable; + getTopOfStack().DefaultAttrVCLoc = VCLoc; + } + /// Set default data sharing variable category attribute to pointer. + void setDefaultDSAVCPointer(SourceLocation VCLoc) { + getTopOfStack().DefaultVCAttr = DSA_VC_pointer; + getTopOfStack().DefaultAttrVCLoc = VCLoc; + } + /// Set default data sharing variable category attribute to scalar. 
+ void setDefaultDSAVCScalar(SourceLocation VCLoc) { + getTopOfStack().DefaultVCAttr = DSA_VC_scalar; + getTopOfStack().DefaultAttrVCLoc = VCLoc; + } /// Set default data mapping attribute to Modifier:Kind void setDefaultDMAAttr(OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind, SourceLocation Loc) { @@ -804,7 +843,8 @@ class DSAStackTy { (M == OMPC_DEFAULTMAP_MODIFIER_to) || (M == OMPC_DEFAULTMAP_MODIFIER_from) || (M == OMPC_DEFAULTMAP_MODIFIER_tofrom) || - (M == OMPC_DEFAULTMAP_MODIFIER_present); + (M == OMPC_DEFAULTMAP_MODIFIER_present) || + (M == OMPC_DEFAULTMAP_MODIFIER_storage); } return true; } @@ -1326,11 +1366,34 @@ DSAStackTy::DSAVarData DSAStackTy::getDSA(const_iterator &Iter, return DVar; } + DefaultDataSharingAttributes IterDA = Iter->DefaultAttr; + switch (Iter->DefaultVCAttr) { + case DSA_VC_aggregate: + if (!VD->getType()->isAggregateType()) + IterDA = DSA_none; + break; + case DSA_VC_allocatable: + if (!(VD->getType()->isPointerType() || + VD->getType()->isVariableArrayType())) + IterDA = DSA_none; + break; + case DSA_VC_pointer: + if (!VD->getType()->isPointerType()) + IterDA = DSA_none; + break; + case DSA_VC_scalar: + if (!VD->getType()->isScalarType()) + IterDA = DSA_none; + break; + case DSA_VC_all: + break; + } + // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced // in a Construct, C/C++, implicitly determined, p.1] // In a parallel or task construct, the data-sharing attributes of these // variables are determined by the default clause, if present. 
- switch (Iter->DefaultAttr) { + switch (IterDA) { case DSA_shared: DVar.CKind = OMPC_shared; DVar.ImplicitDSALoc = Iter->DefaultAttrLoc; @@ -3686,6 +3749,7 @@ getMapClauseKindFromModifier(OpenMPDefaultmapClauseModifier M, OpenMPMapClauseKind Kind = OMPC_MAP_unknown; switch (M) { case OMPC_DEFAULTMAP_MODIFIER_alloc: + case OMPC_DEFAULTMAP_MODIFIER_storage: Kind = OMPC_MAP_alloc; break; case OMPC_DEFAULTMAP_MODIFIER_to: @@ -3706,6 +3770,7 @@ getMapClauseKindFromModifier(OpenMPDefaultmapClauseModifier M, Kind = OMPC_MAP_alloc; break; case OMPC_DEFAULTMAP_MODIFIER_firstprivate: + case OMPC_DEFAULTMAP_MODIFIER_private: case OMPC_DEFAULTMAP_MODIFIER_last: llvm_unreachable("Unexpected defaultmap implicit behavior"); case OMPC_DEFAULTMAP_MODIFIER_none: @@ -3942,9 +4007,13 @@ class DSAAttrChecker final : public StmtVisitor { } else { OpenMPDefaultmapClauseModifier M = Stack->getDefaultmapModifier(ClauseKind); - OpenMPMapClauseKind Kind = getMapClauseKindFromModifier( - M, ClauseKind == OMPC_DEFAULTMAP_aggregate || Res); - ImpInfo.Mappings[ClauseKind][Kind].insert(E); + if (M == OMPC_DEFAULTMAP_MODIFIER_private) { + ImpInfo.Privates.insert(E); + } else { + OpenMPMapClauseKind Kind = getMapClauseKindFromModifier( + M, ClauseKind == OMPC_DEFAULTMAP_aggregate || Res); + ImpInfo.Mappings[ClauseKind][Kind].insert(E); + } } return; } @@ -14919,12 +14988,13 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef Clauses, Body, OriginalInits)) return StmtError(); - unsigned NumGeneratedLoops = PartialClause ? 1 : 0; + unsigned NumGeneratedTopLevelLoops = PartialClause ? 1 : 0; // Delay unrolling to when template is completely instantiated. 
if (SemaRef.CurContext->isDependentContext()) return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - NumGeneratedLoops, nullptr, nullptr); + NumGeneratedTopLevelLoops, nullptr, + nullptr); assert(LoopHelpers.size() == NumLoops && "Expecting a single-dimensional loop iteration space"); @@ -14947,9 +15017,10 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef Clauses, // The generated loop may only be passed to other loop-associated directive // when a partial clause is specified. Without the requirement it is // sufficient to generate loop unroll metadata at code-generation. - if (NumGeneratedLoops == 0) + if (NumGeneratedTopLevelLoops == 0) return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - NumGeneratedLoops, nullptr, nullptr); + NumGeneratedTopLevelLoops, nullptr, + nullptr); // Otherwise, we need to provide a de-sugared/transformed AST that can be // associated with another loop directive. @@ -15164,7 +15235,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef Clauses, LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc()); return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - NumGeneratedLoops, OuterFor, + NumGeneratedTopLevelLoops, OuterFor, buildPreInits(Context, PreInits)); } @@ -16265,10 +16336,6 @@ OMPClause *SemaOpenMP::ActOnOpenMPSimpleClause( SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { OMPClause *Res = nullptr; switch (Kind) { - case OMPC_default: - Res = ActOnOpenMPDefaultClause(static_cast(Argument), - ArgumentLoc, StartLoc, LParenLoc, EndLoc); - break; case OMPC_proc_bind: Res = ActOnOpenMPProcBindClause(static_cast(Argument), ArgumentLoc, StartLoc, LParenLoc, EndLoc); @@ -16349,6 +16416,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSimpleClause( case OMPC_num_tasks: case OMPC_hint: case OMPC_dist_schedule: + case OMPC_default: case OMPC_defaultmap: case OMPC_unknown: case OMPC_uniform: @@ -16382,38 +16450,58 @@ OMPClause 
*SemaOpenMP::ActOnOpenMPSimpleClause( return Res; } -OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause(DefaultKind Kind, - SourceLocation KindKwLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { - if (Kind == OMP_DEFAULT_unknown) { - Diag(KindKwLoc, diag::err_omp_unexpected_clause_value) +OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause( + llvm::omp::DefaultKind M, SourceLocation MLoc, + OpenMPDefaultClauseVariableCategory VCKind, SourceLocation VCKindLoc, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { + if (M == OMP_DEFAULT_unknown) { + Diag(MLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_default, /*First=*/0, /*Last=*/unsigned(OMP_DEFAULT_unknown)) << getOpenMPClauseNameForDiag(OMPC_default); return nullptr; } - switch (Kind) { + switch (M) { case OMP_DEFAULT_none: - DSAStack->setDefaultDSANone(KindKwLoc); + DSAStack->setDefaultDSANone(MLoc); break; case OMP_DEFAULT_shared: - DSAStack->setDefaultDSAShared(KindKwLoc); + DSAStack->setDefaultDSAShared(MLoc); break; case OMP_DEFAULT_firstprivate: - DSAStack->setDefaultDSAFirstPrivate(KindKwLoc); + DSAStack->setDefaultDSAFirstPrivate(MLoc); break; case OMP_DEFAULT_private: - DSAStack->setDefaultDSAPrivate(KindKwLoc); + DSAStack->setDefaultDSAPrivate(MLoc); break; default: llvm_unreachable("DSA unexpected in OpenMP default clause"); } + switch (VCKind) { + case OMPC_DEFAULT_VC_aggregate: + DSAStack->setDefaultDSAVCAggregate(VCKindLoc); + break; + case OMPC_DEFAULT_VC_all: + DSAStack->setDefaultDSAVCAll(VCKindLoc); + break; + case OMPC_DEFAULT_VC_allocatable: + DSAStack->setDefaultDSAVCAllocatable(VCKindLoc); + break; + case OMPC_DEFAULT_VC_pointer: + DSAStack->setDefaultDSAVCPointer(VCKindLoc); + break; + case OMPC_DEFAULT_VC_scalar: + DSAStack->setDefaultDSAVCScalar(VCKindLoc); + break; + default: + Diag(VCKindLoc, diag::err_omp_default_vc) + << getOpenMPSimpleClauseTypeName(OMPC_default, unsigned(M)); + } + return new 
(getASTContext()) - OMPDefaultClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); + OMPDefaultClause(M, MLoc, VCKind, VCKindLoc, StartLoc, LParenLoc, EndLoc); } OMPClause *SemaOpenMP::ActOnOpenMPProcBindClause(ProcBindKind Kind, @@ -16742,6 +16830,15 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause( static_cast(Argument.back()), Expr, StartLoc, LParenLoc, ArgumentLoc.back(), DelimLoc, EndLoc); break; + case OMPC_default: + enum { DefaultModifier, DefaultVarCategory }; + Res = ActOnOpenMPDefaultClause( + static_cast(Argument[DefaultModifier]), + ArgumentLoc[DefaultModifier], + static_cast( + Argument[DefaultVarCategory]), + ArgumentLoc[DefaultVarCategory], StartLoc, LParenLoc, EndLoc); + break; case OMPC_defaultmap: enum { Modifier, DefaultmapKind }; Res = ActOnOpenMPDefaultmapClause( @@ -16790,7 +16887,6 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause( case OMPC_sizes: case OMPC_allocator: case OMPC_collapse: - case OMPC_default: case OMPC_proc_bind: case OMPC_private: case OMPC_firstprivate: @@ -23025,8 +23121,11 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultmapClause( } } else { StringRef ModifierValue = - "'alloc', 'from', 'to', 'tofrom', " - "'firstprivate', 'none', 'default', 'present'"; + getLangOpts().OpenMP < 60 + ? "'alloc', 'from', 'to', 'tofrom', " + "'firstprivate', 'none', 'default', 'present'" + : "'storage', 'from', 'to', 'tofrom', " + "'firstprivate', 'private', 'none', 'default', 'present'"; if (!isDefaultmapKind && isDefaultmapModifier) { Diag(KindLoc, diag::err_omp_unexpected_clause_value) << KindValue << getOpenMPClauseNameForDiag(OMPC_defaultmap); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 941542247e240..ea5c4265d736d 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -2162,9 +2162,18 @@ static bool IsVectorConversion(Sema &S, QualType FromType, QualType ToType, // There are no conversions between extended vector types, only identity. 
if (auto *ToExtType = ToType->getAs()) { - if (FromType->getAs()) { - // There are no conversions between extended vector types other than the - // identity conversion. + if (auto *FromExtType = FromType->getAs()) { + // Implicit conversions require the same number of elements. + if (ToExtType->getNumElements() != FromExtType->getNumElements()) + return false; + + // Permit implicit conversions from integral values to boolean vectors. + if (ToType->isExtVectorBoolType() && + FromExtType->getElementType()->isIntegerType()) { + ICK = ICK_Boolean_Conversion; + return true; + } + // There are no other conversions between extended vector types. return false; } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 58dae32569bcc..e1b1269e0d4d8 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2845,6 +2845,16 @@ TemplateParameterList *Sema::MatchTemplateParametersToScopeSpecifier( if (const TemplateSpecializationType *TST = T->getAs()) { + TemplateName Name = TST->getTemplateName(); + if (const auto *DTS = Name.getAsDependentTemplateName()) { + // Look one step prior in a dependent template specialization type. + if (NestedNameSpecifier NNS = DTS->getQualifier(); + NNS.getKind() == NestedNameSpecifier::Kind::Type) + T = QualType(NNS.getAsType(), 0); + else + T = QualType(); + continue; + } if (TemplateDecl *Template = TST->getTemplateName().getAsTemplateDecl()) { if (TypeDecl *Parent = dyn_cast(Template->getDeclContext())) T = Context.getTypeDeclType(Parent); @@ -2854,18 +2864,6 @@ TemplateParameterList *Sema::MatchTemplateParametersToScopeSpecifier( } } - // Look one step prior in a dependent template specialization type. 
- if (const DependentTemplateSpecializationType *DependentTST - = T->getAs()) { - if (NestedNameSpecifier NNS = - DependentTST->getDependentTemplateName().getQualifier(); - NNS.getKind() == NestedNameSpecifier::Kind::Type) - T = QualType(NNS.getAsType(), 0); - else - T = QualType(); - continue; - } - // Look one step prior in a dependent name type. if (const DependentNameType *DependentName = T->getAs()){ if (NestedNameSpecifier NNS = DependentName->getQualifier(); @@ -2985,16 +2983,16 @@ TemplateParameterList *Sema::MatchTemplateParametersToScopeSpecifier( continue; } - } else if (const TemplateSpecializationType *TST - = T->getAs()) { - if (TemplateDecl *Template = TST->getTemplateName().getAsTemplateDecl()) { + } else if (const auto *TST = T->getAs()) { + TemplateName Name = TST->getTemplateName(); + if (TemplateDecl *Template = Name.getAsTemplateDecl()) { ExpectedTemplateParams = Template->getTemplateParameters(); NeedNonemptyTemplateHeader = true; + } else if (Name.getAsDeducedTemplateName()) { + // FIXME: We actually could/should check the template arguments here + // against the corresponding template parameter list. + NeedNonemptyTemplateHeader = false; } - } else if (T->getAs()) { - // FIXME: We actually could/should check the template arguments here - // against the corresponding template parameter list. 
- NeedNonemptyTemplateHeader = false; } // C++ [temp.expl.spec]p16: @@ -3203,8 +3201,9 @@ static QualType builtinCommonTypeImpl(Sema &S, ElaboratedTypeKeyword Keyword, Sema::SFINAETrap SFINAE(S, /*ForValidityCheck=*/true); Sema::ContextRAII TUContext(S, S.Context.getTranslationUnitDecl()); - QualType BaseTemplateInst = - S.CheckTemplateIdType(Keyword, BaseTemplate, TemplateLoc, Args); + QualType BaseTemplateInst = S.CheckTemplateIdType( + Keyword, BaseTemplate, TemplateLoc, Args, + /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false); if (SFINAE.hasErrorOccurred()) return QualType(); @@ -3422,7 +3421,9 @@ static QualType checkBuiltinTemplateIdType( // The first template argument will be reused as the template decl that // our synthetic template arguments will be applied to. return SemaRef.CheckTemplateIdType(Keyword, Converted[0].getAsTemplate(), - TemplateLoc, SyntheticTemplateArgs); + TemplateLoc, SyntheticTemplateArgs, + /*Scope=*/nullptr, + /*ForNestedNameSpecifier=*/false); } case BTK__type_pack_element: { @@ -3467,7 +3468,8 @@ static QualType checkBuiltinTemplateIdType( CT, TemplateArgs[1].getLocation()))); TemplateName HasTypeMember = Converted[1].getAsTemplate(); return SemaRef.CheckTemplateIdType(Keyword, HasTypeMember, TemplateLoc, - TAs); + TAs, /*Scope=*/nullptr, + /*ForNestedNameSpecifier=*/false); } QualType HasNoTypeMember = Converted[2].getAsType(); return HasNoTypeMember; @@ -3666,40 +3668,81 @@ Sema::findFailedBooleanCondition(Expr *Cond) { return { FailedCond, Description }; } +static TemplateName +resolveAssumedTemplateNameAsType(Sema &S, Scope *Scope, + const AssumedTemplateStorage *ATN, + SourceLocation NameLoc) { + // We assumed this undeclared identifier to be an (ADL-only) function + // template name, but it was used in a context where a type was required. + // Try to typo-correct it now. 
+ LookupResult R(S, ATN->getDeclName(), NameLoc, S.LookupOrdinaryName); + struct CandidateCallback : CorrectionCandidateCallback { + bool ValidateCandidate(const TypoCorrection &TC) override { + return TC.getCorrectionDecl() && + getAsTypeTemplateDecl(TC.getCorrectionDecl()); + } + std::unique_ptr clone() override { + return std::make_unique(*this); + } + } FilterCCC; + + TypoCorrection Corrected = + S.CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), Scope, + /*SS=*/nullptr, FilterCCC, CorrectTypoKind::ErrorRecovery); + if (Corrected && Corrected.getFoundDecl()) { + S.diagnoseTypo(Corrected, S.PDiag(diag::err_no_template_suggest) + << ATN->getDeclName()); + return S.Context.getQualifiedTemplateName( + /*Qualifier=*/std::nullopt, /*TemplateKeyword=*/false, + TemplateName(Corrected.getCorrectionDeclAs())); + } + + return TemplateName(); +} + QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword, TemplateName Name, SourceLocation TemplateLoc, - TemplateArgumentListInfo &TemplateArgs) { - // FIXME: 'getUnderlying' loses SubstTemplateTemplateParm nodes from alias - // template substitutions. - if (DependentTemplateName *DTN = - Name.getUnderlying().getAsDependentTemplateName(); - DTN && DTN->getName().getIdentifier()) - // When building a template-id where the template-name is dependent, - // assume the template is a type template. Either our assumption is - // correct, or the code is ill-formed and will be diagnosed when the - // dependent name is substituted. 
- return Context.getDependentTemplateSpecializationType( - ElaboratedTypeKeyword::None, *DTN, TemplateArgs.arguments()); - - if (Name.getAsAssumedTemplateName() && - resolveAssumedTemplateNameAsType(/*Scope=*/nullptr, Name, TemplateLoc)) - return QualType(); + TemplateArgumentListInfo &TemplateArgs, + Scope *Scope, bool ForNestedNameSpecifier) { + auto [UnderlyingName, DefaultArgs] = Name.getTemplateDeclAndDefaultArgs(); - TemplateDecl *Template; - DefaultArguments DefaultArgs; - if (const SubstTemplateTemplateParmPackStorage *S = - Name.getAsSubstTemplateTemplateParmPack()) { - Template = S->getParameterPack(); - } else { - std::tie(Template, DefaultArgs) = Name.getTemplateDeclAndDefaultArgs(); - if (!Template || isa(Template) || - isa(Template) || isa(Template)) { - Diag(TemplateLoc, diag::err_template_id_not_a_type) << Name; - NoteAllFoundTemplates(Name); - return QualType(); + TemplateDecl *Template = UnderlyingName.getAsTemplateDecl(); + if (!Template) { + if (const auto *S = UnderlyingName.getAsSubstTemplateTemplateParmPack()) { + Template = S->getParameterPack(); + } else if (const auto *DTN = UnderlyingName.getAsDependentTemplateName()) { + if (DTN->getName().getIdentifier()) + // When building a template-id where the template-name is dependent, + // assume the template is a type template. Either our assumption is + // correct, or the code is ill-formed and will be diagnosed when the + // dependent name is substituted. 
+ return Context.getTemplateSpecializationType(Keyword, Name, + TemplateArgs.arguments(), + /*CanonicalArgs=*/{}); + } else if (const auto *ATN = UnderlyingName.getAsAssumedTemplateName()) { + if (TemplateName CorrectedName = ::resolveAssumedTemplateNameAsType( + *this, Scope, ATN, TemplateLoc); + CorrectedName.isNull()) { + Diag(TemplateLoc, diag::err_no_template) << ATN->getDeclName(); + return QualType(); + } else { + Name = CorrectedName; + Template = Name.getAsTemplateDecl(); + } } } + if (!Template || + isa(Template)) { + SourceRange R(TemplateLoc, TemplateArgs.getRAngleLoc()); + if (ForNestedNameSpecifier) + Diag(TemplateLoc, diag::err_non_type_template_in_nested_name_specifier) + << isa_and_nonnull(Template) << Name << R; + else + Diag(TemplateLoc, diag::err_template_id_not_a_type) << Name << R; + NoteAllFoundTemplates(Name); + return QualType(); + } // Check that the template argument list is well-formed for this // template. @@ -3810,6 +3853,7 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword, // // template struct A; CanonType = Context.getCanonicalTemplateSpecializationType( + ElaboratedTypeKeyword::None, Context.getCanonicalTemplateName(Name, /*IgnoreDeduced=*/true), CTAI.CanonicalConverted); assert(CanonType->isCanonicalUnqualified()); @@ -3908,55 +3952,19 @@ void Sema::ActOnUndeclaredTypeTemplateName(Scope *S, TemplateTy &ParsedName, IdentifierInfo *&II) { assert(TNK == TNK_Undeclared_template && "not an undeclared template name"); - TemplateName Name = ParsedName.get(); - auto *ATN = Name.getAsAssumedTemplateName(); + auto *ATN = ParsedName.get().getAsAssumedTemplateName(); assert(ATN && "not an assumed template name"); II = ATN->getDeclName().getAsIdentifierInfo(); - if (!resolveAssumedTemplateNameAsType(S, Name, NameLoc, /*Diagnose*/false)) { + if (TemplateName Name = + ::resolveAssumedTemplateNameAsType(*this, S, ATN, NameLoc); + !Name.isNull()) { // Resolved to a type template name. 
ParsedName = TemplateTy::make(Name); TNK = TNK_Type_template; } } -bool Sema::resolveAssumedTemplateNameAsType(Scope *S, TemplateName &Name, - SourceLocation NameLoc, - bool Diagnose) { - // We assumed this undeclared identifier to be an (ADL-only) function - // template name, but it was used in a context where a type was required. - // Try to typo-correct it now. - AssumedTemplateStorage *ATN = Name.getAsAssumedTemplateName(); - assert(ATN && "not an assumed template name"); - - LookupResult R(*this, ATN->getDeclName(), NameLoc, LookupOrdinaryName); - struct CandidateCallback : CorrectionCandidateCallback { - bool ValidateCandidate(const TypoCorrection &TC) override { - return TC.getCorrectionDecl() && - getAsTypeTemplateDecl(TC.getCorrectionDecl()); - } - std::unique_ptr clone() override { - return std::make_unique(*this); - } - } FilterCCC; - - TypoCorrection Corrected = - CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), S, nullptr, - FilterCCC, CorrectTypoKind::ErrorRecovery); - if (Corrected && Corrected.getFoundDecl()) { - diagnoseTypo(Corrected, PDiag(diag::err_no_template_suggest) - << ATN->getDeclName()); - Name = Context.getQualifiedTemplateName( - /*Qualifier=*/std::nullopt, /*TemplateKeyword=*/false, - TemplateName(Corrected.getCorrectionDeclAs())); - return false; - } - - if (Diagnose) - Diag(R.getNameLoc(), diag::err_no_template) << R.getLookupName(); - return true; -} - TypeResult Sema::ActOnTemplateIdType( Scope *S, ElaboratedTypeKeyword ElaboratedKeyword, SourceLocation ElaboratedKeywordLoc, CXXScopeSpec &SS, @@ -4013,36 +4021,13 @@ TypeResult Sema::ActOnTemplateIdType( } } - TemplateName Template = TemplateD.get(); - if (Template.getAsAssumedTemplateName() && - resolveAssumedTemplateNameAsType(S, Template, TemplateIILoc)) - return true; - // Translate the parser's template argument list in our AST format. 
TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc); translateTemplateArguments(TemplateArgsIn, TemplateArgs); - if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) { - assert(SS.getScopeRep() == DTN->getQualifier()); - QualType T = Context.getDependentTemplateSpecializationType( - ElaboratedKeyword, *DTN, TemplateArgs.arguments()); - // Build type-source information. - TypeLocBuilder TLB; - DependentTemplateSpecializationTypeLoc SpecTL - = TLB.push(T); - SpecTL.setElaboratedKeywordLoc(ElaboratedKeywordLoc); - SpecTL.setQualifierLoc(SS.getWithLocInContext(Context)); - SpecTL.setTemplateKeywordLoc(TemplateKWLoc); - SpecTL.setTemplateNameLoc(TemplateIILoc); - SpecTL.setLAngleLoc(LAngleLoc); - SpecTL.setRAngleLoc(RAngleLoc); - for (unsigned I = 0, N = SpecTL.getNumArgs(); I != N; ++I) - SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo()); - return CreateParsedType(T, TLB.getTypeSourceInfo(Context, T)); - } - - QualType SpecTy = CheckTemplateIdType(ElaboratedKeyword, Template, - TemplateIILoc, TemplateArgs); + QualType SpecTy = CheckTemplateIdType( + ElaboratedKeyword, TemplateD.get(), TemplateIILoc, TemplateArgs, + /*Scope=*/S, /*ForNestedNameSpecifier=*/false); if (SpecTy.isNull()) return true; @@ -4067,8 +4052,6 @@ TypeResult Sema::ActOnTagTemplateIdType(TagUseKind TUK, if (SS.isInvalid()) return TypeResult(true); - TemplateName Template = TemplateD.get(); - // Translate the parser's template argument list in our AST format. 
TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc); translateTemplateArguments(TemplateArgsIn, TemplateArgs); @@ -4078,28 +4061,9 @@ TypeResult Sema::ActOnTagTemplateIdType(TagUseKind TUK, ElaboratedTypeKeyword Keyword = TypeWithKeyword::getKeywordForTagTypeKind(TagKind); - if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) { - assert(SS.getScopeRep() == DTN->getQualifier()); - QualType T = Context.getDependentTemplateSpecializationType( - Keyword, *DTN, TemplateArgs.arguments()); - - // Build type-source information. - TypeLocBuilder TLB; - DependentTemplateSpecializationTypeLoc SpecTL - = TLB.push(T); - SpecTL.setElaboratedKeywordLoc(TagLoc); - SpecTL.setQualifierLoc(SS.getWithLocInContext(Context)); - SpecTL.setTemplateKeywordLoc(TemplateKWLoc); - SpecTL.setTemplateNameLoc(TemplateLoc); - SpecTL.setLAngleLoc(LAngleLoc); - SpecTL.setRAngleLoc(RAngleLoc); - for (unsigned I = 0, N = SpecTL.getNumArgs(); I != N; ++I) - SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo()); - return CreateParsedType(T, TLB.getTypeSourceInfo(Context, T)); - } - QualType Result = - CheckTemplateIdType(Keyword, Template, TemplateLoc, TemplateArgs); + CheckTemplateIdType(Keyword, TemplateD.get(), TemplateLoc, TemplateArgs, + /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false); if (Result.isNull()) return TypeResult(true); @@ -6389,11 +6353,6 @@ bool UnnamedLocalNoLinkageFinder::VisitDependentNameType( return VisitNestedNameSpecifier(T->getQualifier()); } -bool UnnamedLocalNoLinkageFinder::VisitDependentTemplateSpecializationType( - const DependentTemplateSpecializationType* T) { - return VisitNestedNameSpecifier(T->getDependentTemplateName().getQualifier()); -} - bool UnnamedLocalNoLinkageFinder::VisitPackExpansionType( const PackExpansionType* T) { return Visit(T->getPattern()); @@ -7388,6 +7347,9 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, return Arg; } + // These should have all been handled above using the C++17 
rules. + assert(!ArgPE && !StrictCheck); + // C++ [temp.arg.nontype]p5: // The following conversions are performed on each expression used // as a non-type template-argument. If a non-type @@ -7415,13 +7377,13 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, // template-parameter; or llvm::APSInt Value; ExprResult ArgResult = CheckConvertedConstantExpression( - DeductionArg, ParamType, Value, CCEKind::TemplateArg); + Arg, ParamType, Value, CCEKind::TemplateArg); if (ArgResult.isInvalid()) return ExprError(); - setDeductionArg(ArgResult.get()); + Arg = ArgResult.get(); // We can't check arbitrary value-dependent arguments. - if (DeductionArg->isValueDependent()) { + if (Arg->isValueDependent()) { SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false); CanonicalConverted = Context.getCanonicalTemplateArgument(SugaredConverted); @@ -7438,24 +7400,18 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, ? Context.getIntWidth(IntegerType) : Context.getTypeSize(IntegerType)); - if (ArgPE) { - SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false); - CanonicalConverted = - Context.getCanonicalTemplateArgument(SugaredConverted); - } else { - SugaredConverted = TemplateArgument(Context, Value, ParamType); - CanonicalConverted = TemplateArgument( - Context, Value, Context.getCanonicalType(ParamType)); - } + SugaredConverted = TemplateArgument(Context, Value, ParamType); + CanonicalConverted = + TemplateArgument(Context, Value, Context.getCanonicalType(ParamType)); return Arg; } ExprResult ArgResult = DefaultLvalueConversion(Arg); if (ArgResult.isInvalid()) return ExprError(); - DeductionArg = ArgResult.get(); + Arg = ArgResult.get(); - QualType ArgType = DeductionArg->getType(); + QualType ArgType = Arg->getType(); // C++ [temp.arg.nontype]p1: // A template-argument for a non-type, non-template @@ -7466,11 +7422,12 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, // -- 
the name of a non-type template-parameter; or llvm::APSInt Value; if (!ArgType->isIntegralOrEnumerationType()) { - Diag(StartLoc, diag::err_template_arg_not_integral_or_enumeral) - << ArgType << DeductionArg->getSourceRange(); + Diag(Arg->getBeginLoc(), diag::err_template_arg_not_integral_or_enumeral) + << ArgType << Arg->getSourceRange(); NoteTemplateParameterLocation(*Param); return ExprError(); - } else if (!DeductionArg->isValueDependent()) { + } + if (!Arg->isValueDependent()) { class TmplArgICEDiagnoser : public VerifyICEDiagnoser { QualType T; @@ -7483,10 +7440,8 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, } } Diagnoser(ArgType); - DeductionArg = - VerifyIntegerConstantExpression(DeductionArg, &Value, Diagnoser) - .get(); - if (!DeductionArg) + Arg = VerifyIntegerConstantExpression(Arg, &Value, Diagnoser).get(); + if (!Arg) return ExprError(); } @@ -7499,28 +7454,23 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, // Okay: no conversion necessary } else if (ParamType->isBooleanType()) { // This is an integral-to-boolean conversion. - DeductionArg = - ImpCastExprToType(DeductionArg, ParamType, CK_IntegralToBoolean) - .get(); + Arg = ImpCastExprToType(Arg, ParamType, CK_IntegralToBoolean).get(); } else if (IsIntegralPromotion(Arg, ArgType, ParamType) || !ParamType->isEnumeralType()) { // This is an integral promotion or conversion. - DeductionArg = - ImpCastExprToType(DeductionArg, ParamType, CK_IntegralCast).get(); + Arg = ImpCastExprToType(Arg, ParamType, CK_IntegralCast).get(); } else { // We can't perform this conversion. Diag(StartLoc, diag::err_template_arg_not_convertible) - << DeductionArg->getType() << ParamType - << DeductionArg->getSourceRange(); + << Arg->getType() << ParamType << Arg->getSourceRange(); NoteTemplateParameterLocation(*Param); return ExprError(); } - setDeductionArg(DeductionArg); // Add the value of this argument to the list of converted // arguments. 
We use the bitwidth and signedness of the template // parameter. - if (DeductionArg->isValueDependent()) { + if (Arg->isValueDependent()) { // The argument is value-dependent. Create a new // TemplateArgument with the converted expression. SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false); @@ -7578,20 +7528,14 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, } } - if (ArgPE) { - SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false); - CanonicalConverted = - Context.getCanonicalTemplateArgument(SugaredConverted); - } else { - QualType T = ParamType->isEnumeralType() ? ParamType : IntegerType; - SugaredConverted = TemplateArgument(Context, Value, T); - CanonicalConverted = - TemplateArgument(Context, Value, Context.getCanonicalType(T)); - } + QualType T = ParamType->isEnumeralType() ? ParamType : IntegerType; + SugaredConverted = TemplateArgument(Context, Value, T); + CanonicalConverted = + TemplateArgument(Context, Value, Context.getCanonicalType(T)); return Arg; } - QualType ArgType = DeductionArg->getType(); + QualType ArgType = Arg->getType(); DeclAccessPair FoundResult; // temporary for ResolveOverloadedFunction // Handle pointer-to-function, reference-to-function, and @@ -7618,7 +7562,7 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, ParamType->castAs()->getPointeeType() ->isFunctionType())) { - if (DeductionArg->getType() == Context.OverloadTy) { + if (Arg->getType() == Context.OverloadTy) { if (FunctionDecl *Fn = ResolveAddressOfOverloadedFunction(Arg, ParamType, true, FoundResult)) { @@ -7628,12 +7572,11 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, ExprResult Res = FixOverloadedFunctionReference(Arg, FoundResult, Fn); if (Res.isInvalid()) return ExprError(); - DeductionArg = Res.get(); + Arg = Res.get(); ArgType = Arg->getType(); } else return ExprError(); } - setDeductionArg(DeductionArg); if (!ParamType->isMemberPointerType()) { if 
(CheckTemplateArgumentAddressOfObjectOrFunction( @@ -7649,8 +7592,6 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, return Arg; } - setDeductionArg(DeductionArg); - if (ParamType->isPointerType()) { // -- for a non-type template-parameter of type pointer to // object, qualification conversions (4.4) and the @@ -7659,7 +7600,6 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, assert(ParamType->getPointeeType()->isIncompleteOrObjectType() && "Only object pointers allowed here"); - // FIXME: Deal with pack expansions here. if (CheckTemplateArgumentAddressOfObjectOrFunction( *this, Param, ParamType, Arg, SugaredConverted, CanonicalConverted)) return ExprError(); @@ -7676,7 +7616,6 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, assert(ParamRefType->getPointeeType()->isIncompleteOrObjectType() && "Only object references allowed here"); - // FIXME: Deal with pack expansions here. if (Arg->getType() == Context.OverloadTy) { if (FunctionDecl *Fn = ResolveAddressOfOverloadedFunction(Arg, ParamRefType->getPointeeType(), @@ -7701,18 +7640,17 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, // Deal with parameters of type std::nullptr_t. 
if (ParamType->isNullPtrType()) { - if (DeductionArg->isTypeDependent() || DeductionArg->isValueDependent()) { + if (Arg->isTypeDependent() || Arg->isValueDependent()) { SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false); CanonicalConverted = Context.getCanonicalTemplateArgument(SugaredConverted); return Arg; } - switch (isNullPointerValueTemplateArgument(*this, Param, ParamType, - DeductionArg)) { + switch (isNullPointerValueTemplateArgument(*this, Param, ParamType, Arg)) { case NPV_NotNullPointer: Diag(Arg->getExprLoc(), diag::err_template_arg_not_convertible) - << DeductionArg->getType() << ParamType; + << Arg->getType() << ParamType; NoteTemplateParameterLocation(*Param); return ExprError(); @@ -7721,17 +7659,10 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, case NPV_NullPointer: Diag(Arg->getExprLoc(), diag::warn_cxx98_compat_template_arg_null); - if (ArgPE) { - SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false); - CanonicalConverted = - Context.getCanonicalTemplateArgument(SugaredConverted); - } else { - SugaredConverted = TemplateArgument(ParamType, + SugaredConverted = TemplateArgument(ParamType, + /*isNullPtr=*/true); + CanonicalConverted = TemplateArgument(Context.getCanonicalType(ParamType), /*isNullPtr=*/true); - CanonicalConverted = - TemplateArgument(Context.getCanonicalType(ParamType), - /*isNullPtr=*/true); - } return Arg; } } @@ -7740,7 +7671,6 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, // member, qualification conversions (4.4) are applied. assert(ParamType->isMemberPointerType() && "Only pointers to members remain"); - // FIXME: Deal with pack expansions here. 
if (CheckTemplateArgumentPointerToMember( *this, Param, ParamType, Arg, SugaredConverted, CanonicalConverted)) return ExprError(); @@ -7832,8 +7762,10 @@ bool Sema::CheckTemplateTemplateArgument(TemplateTemplateParmDecl *Param, bool PartialOrdering, bool *StrictPackMatch) { TemplateName Name = Arg.getArgument().getAsTemplateOrTemplatePattern(); - auto [Template, DefaultArgs] = Name.getTemplateDeclAndDefaultArgs(); + auto [UnderlyingName, DefaultArgs] = Name.getTemplateDeclAndDefaultArgs(); + TemplateDecl *Template = UnderlyingName.getAsTemplateDecl(); if (!Template) { + // FIXME: Handle AssumedTemplateNames // Any dependent template name is fine. assert(Name.isDependent() && "Non-dependent template isn't a declaration?"); return false; @@ -8949,6 +8881,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization( } else { CanQualType CanonType = CanQualType::CreateUnsafe( Context.getCanonicalTemplateSpecializationType( + ElaboratedTypeKeyword::None, TemplateName(ClassTemplate->getCanonicalDecl()), CTAI.CanonicalConverted)); if (Context.hasSameType( @@ -11128,43 +11061,11 @@ Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc, TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc); translateTemplateArguments(TemplateArgsIn, TemplateArgs); - auto Keyword = TypenameLoc.isValid() ? ElaboratedTypeKeyword::Typename - : ElaboratedTypeKeyword::None; - - TemplateName Template = TemplateIn.get(); - if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) { - // Construct a dependent template specialization type. - assert(DTN && "dependent template has non-dependent name?"); - assert(DTN->getQualifier() == SS.getScopeRep()); - - if (!DTN->getName().getIdentifier()) { - Diag(TemplateIILoc, diag::err_template_id_not_a_type) << Template; - NoteAllFoundTemplates(Template); - return true; - } - - QualType T = Context.getDependentTemplateSpecializationType( - Keyword, *DTN, TemplateArgs.arguments()); - - // Create source-location information for this type. 
- TypeLocBuilder Builder; - DependentTemplateSpecializationTypeLoc SpecTL - = Builder.push(T); - SpecTL.setElaboratedKeywordLoc(TypenameLoc); - SpecTL.setQualifierLoc(SS.getWithLocInContext(Context)); - SpecTL.setTemplateKeywordLoc(TemplateKWLoc); - SpecTL.setTemplateNameLoc(TemplateIILoc); - SpecTL.setLAngleLoc(LAngleLoc); - SpecTL.setRAngleLoc(RAngleLoc); - for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I) - SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo()); - return CreateParsedType(T, Builder.getTypeSourceInfo(Context, T)); - } - - QualType T = CheckTemplateIdType(TypenameLoc.isValid() - ? ElaboratedTypeKeyword::Typename - : ElaboratedTypeKeyword::None, - Template, TemplateIILoc, TemplateArgs); + QualType T = CheckTemplateIdType( + TypenameLoc.isValid() ? ElaboratedTypeKeyword::Typename + : ElaboratedTypeKeyword::None, + TemplateIn.get(), TemplateIILoc, TemplateArgs, + /*Scope=*/S, /*ForNestedNameSpecifier=*/false); if (T.isNull()) return true; diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index cce40c0c91f95..62e867c44ad14 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -696,6 +696,11 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams, if (isa(P.getCanonicalType())) { const TemplateSpecializationType *TP = ::getLastTemplateSpecType(P); TNP = TP->getTemplateName(); + + // No deduction for specializations of dependent template names. + if (TNP.getAsDependentTemplateName()) + return TemplateDeductionResult::Success; + // FIXME: To preserve sugar, the TST needs to carry sugared resolved // arguments. 
PResolved = @@ -2540,7 +2545,6 @@ static TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch( case Type::Decltype: case Type::UnaryTransform: case Type::DeducedTemplateSpecialization: - case Type::DependentTemplateSpecialization: case Type::PackExpansion: case Type::Pipe: case Type::ArrayParameter: @@ -5176,7 +5180,7 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, TemplateArgs.addArgument(TypeLoc.getArgLoc(I)); Sema::CheckTemplateArgumentInfo CTAI; - if (S.CheckTemplateArgumentList(Concept, SourceLocation(), TemplateArgs, + if (S.CheckTemplateArgumentList(Concept, TypeLoc.getNameLoc(), TemplateArgs, /*DefaultArgs=*/{}, /*PartialTemplateArgs=*/false, CTAI)) return true; @@ -6495,9 +6499,9 @@ Sema::getMoreSpecializedPartialSpecialization( " the same template."); TemplateName Name(PS1->getSpecializedTemplate()->getCanonicalDecl()); QualType PT1 = Context.getCanonicalTemplateSpecializationType( - Name, PS1->getTemplateArgs().asArray()); + ElaboratedTypeKeyword::None, Name, PS1->getTemplateArgs().asArray()); QualType PT2 = Context.getCanonicalTemplateSpecializationType( - Name, PS2->getTemplateArgs().asArray()); + ElaboratedTypeKeyword::None, Name, PS2->getTemplateArgs().asArray()); TemplateDeductionInfo Info(Loc); return getMoreSpecialized(*this, PT1, PT2, PS1, PS2, Info); @@ -6512,10 +6516,10 @@ bool Sema::isMoreSpecializedThanPrimary( Primary->getInjectedTemplateArgs(Context)); Context.canonicalizeTemplateArguments(PrimaryCanonArgs); - QualType PrimaryT = - Context.getCanonicalTemplateSpecializationType(Name, PrimaryCanonArgs); + QualType PrimaryT = Context.getCanonicalTemplateSpecializationType( + ElaboratedTypeKeyword::None, Name, PrimaryCanonArgs); QualType PartialT = Context.getCanonicalTemplateSpecializationType( - Name, Spec->getTemplateArgs().asArray()); + ElaboratedTypeKeyword::None, Name, Spec->getTemplateArgs().asArray()); VarTemplatePartialSpecializationDecl *MaybeSpec = getMoreSpecialized(*this, PartialT, PrimaryT, 
Spec, Primary, Info); @@ -6993,8 +6997,12 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T, case Type::TemplateSpecialization: { const TemplateSpecializationType *Spec = cast(T); - MarkUsedTemplateParameters(Ctx, Spec->getTemplateName(), OnlyDeduced, - Depth, Used); + + TemplateName Name = Spec->getTemplateName(); + if (OnlyDeduced && Name.getAsDependentTemplateName()) + break; + + MarkUsedTemplateParameters(Ctx, Name, OnlyDeduced, Depth, Used); // C++0x [temp.deduct.type]p9: // If the template argument list of P contains a pack expansion that is @@ -7030,31 +7038,6 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T, OnlyDeduced, Depth, Used); break; - case Type::DependentTemplateSpecialization: { - // C++14 [temp.deduct.type]p5: - // The non-deduced contexts are: - // -- The nested-name-specifier of a type that was specified using a - // qualified-id - // - // C++14 [temp.deduct.type]p6: - // When a type name is specified in a way that includes a non-deduced - // context, all of the types that comprise that type name are also - // non-deduced. 
- if (OnlyDeduced) - break; - - const DependentTemplateSpecializationType *Spec - = cast(T); - - MarkUsedTemplateParameters(Ctx, - Spec->getDependentTemplateName().getQualifier(), - OnlyDeduced, Depth, Used); - - for (const auto &Arg : Spec->template_arguments()) - MarkUsedTemplateParameters(Ctx, Arg, OnlyDeduced, Depth, Used); - break; - } - case Type::TypeOf: if (!OnlyDeduced) MarkUsedTemplateParameters(Ctx, cast(T)->getUnmodifiedType(), diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index b3cbd7f8c1efe..df1a100cab22c 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -6951,8 +6951,9 @@ NamedDecl *Sema::FindInstantiatedDecl(SourceLocation Loc, NamedDecl *D, Args.addArgument( getTrivialTemplateArgumentLoc(UnpackedArg, QualType(), Loc)); } - QualType T = CheckTemplateIdType(ElaboratedTypeKeyword::None, - TemplateName(TD), Loc, Args); + QualType T = CheckTemplateIdType( + ElaboratedTypeKeyword::None, TemplateName(TD), Loc, Args, + /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false); // We may get a non-null type with errors, in which case // `getAsCXXRecordDecl` will return `nullptr`. 
For instance, this // happens when one of the template arguments is an invalid diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 0f655d7f684a5..d723fb80f437e 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -6036,15 +6036,6 @@ namespace { assert(TInfo); TL.copy(TInfo->getTypeLoc().castAs()); } - void VisitDependentTemplateSpecializationTypeLoc( - DependentTemplateSpecializationTypeLoc TL) { - assert(DS.getTypeSpecType() == TST_typename); - TypeSourceInfo *TInfo = nullptr; - Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo); - assert(TInfo); - TL.copy( - TInfo->getTypeLoc().castAs()); - } void VisitAutoTypeLoc(AutoTypeLoc TL) { assert(DS.getTypeSpecType() == TST_auto || DS.getTypeSpecType() == TST_decltype_auto || diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 0587a7decbd8d..242ffb09af006 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -744,10 +744,11 @@ class TreeTransform { StmtResult TransformSEHHandler(Stmt *Handler); - QualType TransformDependentTemplateSpecializationType( - TypeLocBuilder &TLB, DependentTemplateSpecializationTypeLoc TL, - QualType ObjectType, NamedDecl *UnqualLookup, - bool AllowInjectedClassName); + QualType TransformTemplateSpecializationType(TypeLocBuilder &TLB, + TemplateSpecializationTypeLoc TL, + QualType ObjectType, + NamedDecl *FirstQualifierInScope, + bool AllowInjectedClassName); QualType TransformTagType(TypeLocBuilder &TLB, TagTypeLoc TL); @@ -1163,24 +1164,6 @@ class TreeTransform { return SemaRef.BuildParenType(InnerType); } - /// Build a new typename type that refers to a template-id. - /// - /// By default, builds a new DependentNameType type from the - /// nested-name-specifier and the given type. Subclasses may override - /// this routine to provide different behavior. 
- QualType RebuildDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, SourceLocation TemplateKWLoc, - TemplateName Name, SourceLocation NameLoc, TemplateArgumentListInfo &Args, - bool AllowInjectedClassName) { - // If it's still dependent, make a dependent specialization. - if (const DependentTemplateStorage *S = Name.getAsDependentTemplateName()) - return SemaRef.Context.getDependentTemplateSpecializationType( - Keyword, *S, Args.arguments()); - - return getDerived().RebuildTemplateSpecializationType(Keyword, Name, - NameLoc, Args); - } - /// Build a new typename type that refers to an identifier. /// /// By default, performs semantic analysis when building the typename type @@ -1827,11 +1810,13 @@ class TreeTransform { /// By default, performs semantic analysis to build the new OpenMP clause. /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPDefaultClause(DefaultKind Kind, SourceLocation KindKwLoc, + OpenMPDefaultClauseVariableCategory VCKind, + SourceLocation VCLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { return getSema().OpenMP().ActOnOpenMPDefaultClause( - Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); + Kind, KindKwLoc, VCKind, VCLoc, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'proc_bind' clause. 
@@ -5526,19 +5511,18 @@ QualType TreeTransform::RebuildQualifiedType(QualType T, template QualType TreeTransform::TransformTypeInObjectScope( TypeLocBuilder &TLB, TypeLoc TL, QualType ObjectType, - NamedDecl *UnqualLookup) { + NamedDecl *FirstQualifierInScope) { assert(!getDerived().AlreadyTransformed(TL.getType())); switch (TL.getTypeLocClass()) { - case TypeLoc::DependentTemplateSpecialization: - return getDerived().TransformDependentTemplateSpecializationType( - TLB, TL.castAs(), ObjectType, - UnqualLookup, /*AllowInjectedClassName=*/true); - case TypeLoc::DependentName: { + case TypeLoc::TemplateSpecialization: + return getDerived().TransformTemplateSpecializationType( + TLB, TL.castAs(), ObjectType, + FirstQualifierInScope, /*AllowInjectedClassName=*/true); + case TypeLoc::DependentName: return getDerived().TransformDependentNameType( TLB, TL.castAs(), /*DeducedTSTContext=*/false, - ObjectType, UnqualLookup); - } + ObjectType, FirstQualifierInScope); default: // Any dependent canonical type can appear here, through type alias // templates. 
@@ -7504,12 +7488,22 @@ QualType TreeTransform::TransformAutoType(TypeLocBuilder &TLB, template QualType TreeTransform::TransformTemplateSpecializationType( TypeLocBuilder &TLB, TemplateSpecializationTypeLoc TL) { + return getDerived().TransformTemplateSpecializationType( + TLB, TL, /*ObjectType=*/QualType(), /*FirstQualifierInScope=*/nullptr, + /*AllowInjectedClassName=*/false); +} + +template +QualType TreeTransform::TransformTemplateSpecializationType( + TypeLocBuilder &TLB, TemplateSpecializationTypeLoc TL, QualType ObjectType, + NamedDecl *FirstQualifierInScope, bool AllowInjectedClassName) { const TemplateSpecializationType *T = TL.getTypePtr(); NestedNameSpecifierLoc QualifierLoc = TL.getQualifierLoc(); TemplateName Template = getDerived().TransformTemplateName( QualifierLoc, TL.getTemplateKeywordLoc(), T->getTemplateName(), - TL.getTemplateNameLoc()); + TL.getTemplateNameLoc(), ObjectType, FirstQualifierInScope, + AllowInjectedClassName); if (Template.isNull()) return QualType(); @@ -7532,23 +7526,6 @@ QualType TreeTransform::TransformTemplateSpecializationType( NewTemplateArgs); if (!Result.isNull()) { - // Specializations of template template parameters are represented as - // TemplateSpecializationTypes, and substitution of type alias templates - // within a dependent context can transform them into - // DependentTemplateSpecializationTypes. 
- if (isa(Result)) { - DependentTemplateSpecializationTypeLoc NewTL - = TLB.push(Result); - NewTL.setElaboratedKeywordLoc(TL.getElaboratedKeywordLoc()); - NewTL.setQualifierLoc(QualifierLoc); - NewTL.setTemplateKeywordLoc(TL.getTemplateKeywordLoc()); - NewTL.setTemplateNameLoc(TL.getTemplateNameLoc()); - NewTL.setLAngleLoc(TL.getLAngleLoc()); - NewTL.setRAngleLoc(TL.getRAngleLoc()); - for (unsigned i = 0, e = NewTemplateArgs.size(); i != e; ++i) - NewTL.setArgLocInfo(i, NewTemplateArgs[i].getLocInfo()); - return Result; - } TLB.push(Result).set( TL.getElaboratedKeywordLoc(), QualifierLoc, TL.getTemplateKeywordLoc(), TL.getTemplateNameLoc(), NewTemplateArgs); @@ -7799,83 +7776,6 @@ QualType TreeTransform::TransformDependentNameType( return Result; } -template -QualType TreeTransform::TransformDependentTemplateSpecializationType( - TypeLocBuilder &TLB, DependentTemplateSpecializationTypeLoc TL) { - return getDerived().TransformDependentTemplateSpecializationType( - TLB, TL, QualType(), nullptr, false); -} - -template -QualType TreeTransform::TransformDependentTemplateSpecializationType( - TypeLocBuilder &TLB, DependentTemplateSpecializationTypeLoc TL, - QualType ObjectType, NamedDecl *UnqualLookup, bool AllowInjectedClassName) { - const DependentTemplateSpecializationType *T = TL.getTypePtr(); - - NestedNameSpecifierLoc QualifierLoc = TL.getQualifierLoc(); - if (QualifierLoc) { - QualifierLoc = getDerived().TransformNestedNameSpecifierLoc( - QualifierLoc, ObjectType, UnqualLookup); - if (!QualifierLoc) - return QualType(); - // These only apply to the leftmost prefix. 
- ObjectType = QualType(); - UnqualLookup = nullptr; - } - CXXScopeSpec SS; - SS.Adopt(QualifierLoc); - - TemplateArgumentListInfo NewTemplateArgs(TL.getLAngleLoc(), - TL.getRAngleLoc()); - auto ArgsRange = llvm::make_range>({TL, 0}, {TL, TL.getNumArgs()}); - - if (getDerived().TransformTemplateArguments(ArgsRange.begin(), - ArgsRange.end(), NewTemplateArgs)) - return QualType(); - bool TemplateArgumentsChanged = !llvm::equal( - ArgsRange, NewTemplateArgs.arguments(), - [](const TemplateArgumentLoc &A, const TemplateArgumentLoc &B) { - return A.getArgument().structurallyEquals(B.getArgument()); - }); - - const DependentTemplateStorage &DTN = T->getDependentTemplateName(); - - QualType Result = TL.getType(); - if (getDerived().AlwaysRebuild() || SS.getScopeRep() != DTN.getQualifier() || - TemplateArgumentsChanged || !ObjectType.isNull()) { - TemplateName Name = getDerived().RebuildTemplateName( - SS, TL.getTemplateKeywordLoc(), DTN.getName(), TL.getTemplateNameLoc(), - ObjectType, AllowInjectedClassName); - if (Name.isNull()) - return QualType(); - Result = getDerived().RebuildDependentTemplateSpecializationType( - T->getKeyword(), TL.getTemplateKeywordLoc(), Name, - TL.getTemplateNameLoc(), NewTemplateArgs, - /*AllowInjectedClassName=*/false); - if (Result.isNull()) - return QualType(); - } - - QualifierLoc = SS.getWithLocInContext(SemaRef.Context); - if (isa(Result)) { - TLB.push(Result).set( - TL.getElaboratedKeywordLoc(), QualifierLoc, TL.getTemplateKeywordLoc(), - TL.getTemplateNameLoc(), NewTemplateArgs); - } else { - auto SpecTL = TLB.push(Result); - SpecTL.setElaboratedKeywordLoc(TL.getElaboratedKeywordLoc()); - SpecTL.setQualifierLoc(QualifierLoc); - SpecTL.setTemplateKeywordLoc(TL.getTemplateKeywordLoc()); - SpecTL.setTemplateNameLoc(TL.getTemplateNameLoc()); - SpecTL.setLAngleLoc(TL.getLAngleLoc()); - SpecTL.setRAngleLoc(TL.getRAngleLoc()); - for (unsigned I = 0, E = NewTemplateArgs.size(); I != E; ++I) - SpecTL.setArgLocInfo(I, 
NewTemplateArgs[I].getLocInfo()); - } - return Result; -} - template QualType TreeTransform::TransformPackExpansionType(TypeLocBuilder &TLB, PackExpansionTypeLoc TL) { @@ -10614,8 +10514,9 @@ template OMPClause * TreeTransform::TransformOMPDefaultClause(OMPDefaultClause *C) { return getDerived().RebuildOMPDefaultClause( - C->getDefaultKind(), C->getDefaultKindKwLoc(), C->getBeginLoc(), - C->getLParenLoc(), C->getEndLoc()); + C->getDefaultKind(), C->getDefaultKindKwLoc(), C->getDefaultVC(), + C->getDefaultVCLoc(), C->getBeginLoc(), C->getLParenLoc(), + C->getEndLoc()); } template @@ -17468,8 +17369,9 @@ template QualType TreeTransform::RebuildTemplateSpecializationType( ElaboratedTypeKeyword Keyword, TemplateName Template, SourceLocation TemplateNameLoc, TemplateArgumentListInfo &TemplateArgs) { - return SemaRef.CheckTemplateIdType(Keyword, Template, TemplateNameLoc, - TemplateArgs); + return SemaRef.CheckTemplateIdType( + Keyword, Template, TemplateNameLoc, TemplateArgs, + /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false); } template diff --git a/clang/lib/Serialization/ASTCommon.h b/clang/lib/Serialization/ASTCommon.h index 371db4bace013..c9b9b1bbf8743 100644 --- a/clang/lib/Serialization/ASTCommon.h +++ b/clang/lib/Serialization/ASTCommon.h @@ -41,7 +41,8 @@ enum class DeclUpdateKind { DeclMarkedOpenMPAllocate, DeclMarkedOpenMPDeclareTarget, DeclExported, - AddedAttrToRecord + AddedAttrToRecord, + CXXResolvedDtorGlobDelete }; TypeIdx TypeIdxFromBuiltin(const BuiltinType *BT); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 1b3a8b13f1fb1..9ee8a0fb0f060 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -7532,20 +7532,6 @@ void TypeLocReader::VisitDependentNameTypeLoc(DependentNameTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } -void TypeLocReader::VisitDependentTemplateSpecializationTypeLoc( - DependentTemplateSpecializationTypeLoc TL) { - 
TL.setElaboratedKeywordLoc(readSourceLocation()); - TL.setQualifierLoc(ReadNestedNameSpecifierLoc()); - TL.setTemplateKeywordLoc(readSourceLocation()); - TL.setTemplateNameLoc(readSourceLocation()); - TL.setLAngleLoc(readSourceLocation()); - TL.setRAngleLoc(readSourceLocation()); - for (unsigned I = 0, E = TL.getNumArgs(); I != E; ++I) - TL.setArgLocInfo(I, - Reader.readTemplateArgumentLocInfo( - TL.getTypePtr()->template_arguments()[I].getKind())); -} - void TypeLocReader::VisitPackExpansionTypeLoc(PackExpansionTypeLoc TL) { TL.setEllipsisLoc(readSourceLocation()); } @@ -11646,6 +11632,9 @@ void OMPClauseReader::VisitOMPDefaultClause(OMPDefaultClause *C) { C->setDefaultKind(static_cast(Record.readInt())); C->setLParenLoc(Record.readSourceLocation()); C->setDefaultKindKwLoc(Record.readSourceLocation()); + C->setDefaultVariableCategory( + Record.readEnum()); + C->setDefaultVariableCategoryLocation(Record.readSourceLocation()); } void OMPClauseReader::VisitOMPProcBindClause(OMPProcBindClause *C) { diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 44b8ba6f4c984..4356f2b734fb0 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2339,8 +2339,8 @@ void ASTDeclReader::VisitCXXConstructorDecl(CXXConstructorDecl *D) { void ASTDeclReader::VisitCXXDestructorDecl(CXXDestructorDecl *D) { VisitCXXMethodDecl(D); + CXXDestructorDecl *Canon = D->getCanonicalDecl(); if (auto *OperatorDelete = readDeclAs()) { - CXXDestructorDecl *Canon = D->getCanonicalDecl(); auto *ThisArg = Record.readExpr(); // FIXME: Check consistency if we have an old and new operator delete. 
if (!Canon->OperatorDelete) { @@ -2348,6 +2348,11 @@ void ASTDeclReader::VisitCXXDestructorDecl(CXXDestructorDecl *D) { Canon->OperatorDeleteThisArg = ThisArg; } } + if (auto *OperatorGlobDelete = readDeclAs()) { + if (!Canon->OperatorGlobalDelete) { + Canon->OperatorGlobalDelete = OperatorGlobDelete; + } + } } void ASTDeclReader::VisitCXXConversionDecl(CXXConversionDecl *D) { @@ -4846,6 +4851,14 @@ void ASTDeclReader::UpdateDecl(Decl *D) { break; } + case DeclUpdateKind::CXXResolvedDtorGlobDelete: { + auto *Del = readDeclAs(); + auto *Canon = cast(D->getCanonicalDecl()); + if (!Canon->OperatorGlobalDelete) + Canon->OperatorGlobalDelete = Del; + break; + } + case DeclUpdateKind::CXXResolvedExceptionSpec: { SmallVector ExceptionStorage; auto ESI = Record.readExceptionSpecInfo(ExceptionStorage); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 7ec8e450fbaca..213c2c2148f64 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2450,7 +2450,7 @@ void ASTStmtReader::VisitOMPSimdDirective(OMPSimdDirective *D) { void ASTStmtReader::VisitOMPCanonicalLoopNestTransformationDirective( OMPCanonicalLoopNestTransformationDirective *D) { VisitOMPLoopBasedDirective(D); - D->setNumGeneratedLoops(Record.readUInt32()); + D->setNumGeneratedTopLevelLoops(Record.readUInt32()); } void ASTStmtReader::VisitOMPTileDirective(OMPTileDirective *D) { diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index a3a25e48f9065..3293a54a0a093 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -659,18 +659,6 @@ void TypeLocWriter::VisitDependentNameTypeLoc(DependentNameTypeLoc TL) { addSourceLocation(TL.getNameLoc()); } -void TypeLocWriter::VisitDependentTemplateSpecializationTypeLoc( - DependentTemplateSpecializationTypeLoc TL) { - addSourceLocation(TL.getElaboratedKeywordLoc()); - 
Record.AddNestedNameSpecifierLoc(TL.getQualifierLoc()); - addSourceLocation(TL.getTemplateKeywordLoc()); - addSourceLocation(TL.getTemplateNameLoc()); - addSourceLocation(TL.getLAngleLoc()); - addSourceLocation(TL.getRAngleLoc()); - for (unsigned I = 0, E = TL.getNumArgs(); I != E; ++I) - Record.AddTemplateArgumentLocInfo(TL.getArgLoc(I)); -} - void TypeLocWriter::VisitPackExpansionTypeLoc(PackExpansionTypeLoc TL) { addSourceLocation(TL.getEllipsisLoc()); } @@ -1058,7 +1046,6 @@ void ASTWriter::WriteBlockInfoBlock() { RECORD(TYPE_TEMPLATE_TYPE_PARM); RECORD(TYPE_TEMPLATE_SPECIALIZATION); RECORD(TYPE_DEPENDENT_NAME); - RECORD(TYPE_DEPENDENT_TEMPLATE_SPECIALIZATION); RECORD(TYPE_DEPENDENT_SIZED_ARRAY); RECORD(TYPE_PAREN); RECORD(TYPE_MACRO_QUALIFIED); @@ -6541,6 +6528,10 @@ void ASTWriter::WriteDeclUpdatesBlocks(ASTContext &Context, Record.AddStmt(cast(D)->getOperatorDeleteThisArg()); break; + case DeclUpdateKind::CXXResolvedDtorGlobDelete: + Record.AddDeclRef(Update.getDecl()); + break; + case DeclUpdateKind::CXXResolvedExceptionSpec: { auto prototype = cast(D)->getType()->castAs(); @@ -7589,6 +7580,20 @@ void ASTWriter::ResolvedOperatorDelete(const CXXDestructorDecl *DD, }); } +void ASTWriter::ResolvedOperatorGlobDelete(const CXXDestructorDecl *DD, + const FunctionDecl *GlobDelete) { + if (Chain && Chain->isProcessingUpdateRecords()) + return; + assert(!WritingAST && "Already writing the AST!"); + assert(GlobDelete && "Not given an operator delete"); + if (!Chain) + return; + Chain->forEachImportedKeyDecl(DD, [&](const Decl *D) { + DeclUpdates[D].push_back( + DeclUpdate(DeclUpdateKind::CXXResolvedDtorGlobDelete, GlobDelete)); + }); +} + void ASTWriter::CompletedImplicitDefinition(const FunctionDecl *D) { if (Chain && Chain->isProcessingUpdateRecords()) return; assert(!WritingAST && "Already writing the AST!"); @@ -7888,6 +7893,8 @@ void OMPClauseWriter::VisitOMPDefaultClause(OMPDefaultClause *C) { Record.push_back(unsigned(C->getDefaultKind())); 
Record.AddSourceLocation(C->getLParenLoc()); Record.AddSourceLocation(C->getDefaultKindKwLoc()); + Record.push_back(unsigned(C->getDefaultVC())); + Record.AddSourceLocation(C->getDefaultVCLoc()); } void OMPClauseWriter::VisitOMPProcBindClause(OMPProcBindClause *C) { diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index ec3dda1fcdf48..a8c487005f6ec 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -1793,6 +1793,7 @@ void ASTDeclWriter::VisitCXXDestructorDecl(CXXDestructorDecl *D) { Record.AddDeclRef(D->getOperatorDelete()); if (D->getOperatorDelete()) Record.AddStmt(D->getOperatorDeleteThisArg()); + Record.AddDeclRef(D->getOperatorGlobalDelete()); Code = serialization::DECL_CXX_DESTRUCTOR; } diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 07a5cde47a9a8..21c04ddbc2c7a 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2459,7 +2459,7 @@ void ASTStmtWriter::VisitOMPSimdDirective(OMPSimdDirective *D) { void ASTStmtWriter::VisitOMPCanonicalLoopNestTransformationDirective( OMPCanonicalLoopNestTransformationDirective *D) { VisitOMPLoopBasedDirective(D); - Record.writeUInt32(D->getNumGeneratedLoops()); + Record.writeUInt32(D->getNumGeneratedTopLevelLoops()); } void ASTStmtWriter::VisitOMPTileDirective(OMPTileDirective *D) { diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp index d35031b5c22df..6ad5acd4e76f2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp @@ -130,6 +130,20 @@ struct Messages { std::string Short, Full; }; +enum class BadOffsetKind { Negative, Overflowing, Indeterminate }; + +constexpr llvm::StringLiteral Adjectives[] = {"a negative", "an overflowing", + "a negative or overflowing"}; 
+static StringRef asAdjective(BadOffsetKind Problem) { + return Adjectives[static_cast(Problem)]; +} + +constexpr llvm::StringLiteral Prepositions[] = {"preceding", "after the end of", + "around"}; +static StringRef asPreposition(BadOffsetKind Problem) { + return Prepositions[static_cast(Problem)]; +} + // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt` // instead of `PreStmt` because the current implementation passes the whole // expression to `CheckerContext::getSVal()` which only works after the @@ -388,18 +402,6 @@ static std::optional getConcreteValue(std::optional SV) { return SV ? getConcreteValue(*SV) : std::nullopt; } -static Messages getPrecedesMsgs(const MemSpaceRegion *Space, - const SubRegion *Region, NonLoc Offset) { - std::string RegName = getRegionName(Space, Region), OffsetStr = ""; - - if (auto ConcreteOffset = getConcreteValue(Offset)) - OffsetStr = formatv(" {0}", ConcreteOffset); - - return { - formatv("Out of bound access to memory preceding {0}", RegName), - formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)}; -} - /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if /// it can be performed (`Divisor` is nonzero and there is no remainder). 
The /// values `Val1` and `Val2` may be nullopt and in that case the corresponding @@ -419,10 +421,11 @@ static bool tryDividePair(std::optional &Val1, return true; } -static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space, - const SubRegion *Region, NonLoc Offset, - NonLoc Extent, SVal Location, - bool AlsoMentionUnderflow) { +static Messages getNonTaintMsgs(const ASTContext &ACtx, + const MemSpaceRegion *Space, + const SubRegion *Region, NonLoc Offset, + std::optional Extent, SVal Location, + BadOffsetKind Problem) { std::string RegName = getRegionName(Space, Region); const auto *EReg = Location.getAsRegion()->getAs(); assert(EReg && "this checker only handles element access"); @@ -439,15 +442,21 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space, SmallString<256> Buf; llvm::raw_svector_ostream Out(Buf); Out << "Access of "; - if (!ExtentN && !UseByteOffsets) + if (OffsetN && !ExtentN && !UseByteOffsets) { + // If the offset is reported as an index, then the report must mention the + // element type (because it is not always clear from the code). It's more + // natural to mention the element type later where the extent is described, + // but if the extent is unknown/irrelevant, then the element type can be + // inserted into the message at this point. Out << "'" << ElemType.getAsString() << "' element in "; + } Out << RegName << " at "; - if (AlsoMentionUnderflow) { - Out << "a negative or overflowing " << OffsetOrIndex; - } else if (OffsetN) { + if (OffsetN) { + if (Problem == BadOffsetKind::Negative) + Out << "negative "; Out << OffsetOrIndex << " " << *OffsetN; } else { - Out << "an overflowing " << OffsetOrIndex; + Out << asAdjective(Problem) << " " << OffsetOrIndex; } if (ExtentN) { Out << ", while it holds only "; @@ -465,8 +474,7 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space, } return {formatv("Out of bound access to memory {0} {1}", - AlsoMentionUnderflow ? 
"around" : "after the end of", - RegName), + asPreposition(Problem), RegName), std::string(Buf)}; } @@ -635,7 +643,9 @@ void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const { } else { if (!WithinLowerBound) { // ...and it cannot be valid (>= 0), so report an error. - Messages Msgs = getPrecedesMsgs(Space, Reg, ByteOffset); + Messages Msgs = getNonTaintMsgs(C.getASTContext(), Space, Reg, + ByteOffset, /*Extent=*/std::nullopt, + Location, BadOffsetKind::Negative); reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt); return; } @@ -677,9 +687,12 @@ void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const { return; } + BadOffsetKind Problem = AlsoMentionUnderflow + ? BadOffsetKind::Indeterminate + : BadOffsetKind::Overflowing; Messages Msgs = - getExceedsMsgs(C.getASTContext(), Space, Reg, ByteOffset, - *KnownSize, Location, AlsoMentionUnderflow); + getNonTaintMsgs(C.getASTContext(), Space, Reg, ByteOffset, + *KnownSize, Location, Problem); reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize); return; } diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp index 9a7f5b71cae71..b629de3254ed3 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp @@ -91,6 +91,13 @@ bool tryToFindPtrOrigin( continue; } if (auto *call = dyn_cast(E)) { + if (auto *Callee = call->getCalleeDecl()) { + if (Callee->hasAttr() || + Callee->hasAttr()) { + return callback(E, true); + } + } + if (auto *memberCall = dyn_cast(call)) { if (auto *decl = memberCall->getMethodDecl()) { std::optional IsGetterOfRefCt = isGetterOfSafePtr(decl); @@ -153,6 +160,29 @@ bool tryToFindPtrOrigin( if (Name == "__builtin___CFStringMakeConstantString" || Name == "NSClassFromString") return callback(E, true); + } else if (auto *CalleeE = call->getCallee()) { + if (auto *E = dyn_cast(CalleeE->IgnoreParenCasts())) { + 
if (isSingleton(E->getFoundDecl())) + return callback(E, true); + } + } + + // Sometimes, canonical type erroneously turns Ref into T. + // Workaround this problem by checking again if the original type was + // a SubstTemplateTypeParmType of a safe smart pointer type (e.g. Ref). + if (auto *CalleeDecl = call->getCalleeDecl()) { + if (auto *FD = dyn_cast(CalleeDecl)) { + auto RetType = FD->getReturnType(); + if (auto *Subst = dyn_cast(RetType)) { + if (auto *SubstType = Subst->desugar().getTypePtr()) { + if (auto *RD = dyn_cast(SubstType)) { + if (auto *CXX = dyn_cast(RD->getOriginalDecl())) + if (isSafePtr(CXX)) + return callback(E, true); + } + } + } + } } } if (auto *ObjCMsgExpr = dyn_cast(E)) { diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 884dbe90e7b12..90b2343b4be77 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -479,7 +479,7 @@ bool isTrivialBuiltinFunction(const FunctionDecl *F) { Name.starts_with("os_log") || Name.starts_with("_os_log"); } -bool isSingleton(const FunctionDecl *F) { +bool isSingleton(const NamedDecl *F) { assert(F); // FIXME: check # of params == 1 if (auto *MethodDecl = dyn_cast(F)) { @@ -666,6 +666,10 @@ class TrivialFunctionAnalysisVisitor return IsFunctionTrivial(Callee); } + bool VisitGCCAsmStmt(const GCCAsmStmt *AS) { + return AS->getAsmString() == "brk #0xc471"; + } + bool VisitSubstNonTypeTemplateParmExpr(const SubstNonTypeTemplateParmExpr *E) { // Non-type template paramter is compile time constant and trivial. 
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h index 3c9560cb8059b..d2095d07e1434 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h @@ -21,6 +21,7 @@ class CXXMethodDecl; class CXXRecordDecl; class Decl; class FunctionDecl; +class NamedDecl; class QualType; class RecordType; class Stmt; @@ -156,7 +157,7 @@ bool isPtrConversion(const FunctionDecl *F); bool isTrivialBuiltinFunction(const FunctionDecl *F); /// \returns true if \p F is a static singleton function. -bool isSingleton(const FunctionDecl *F); +bool isSingleton(const NamedDecl *F); /// An inter-procedural analysis facility that detects functions with "trivial" /// behavior with respect to reference counting, such as simple field getters. diff --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp index 2ef98e17cf9c0..217b853305ed1 100644 --- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp @@ -10,6 +10,9 @@ // //===----------------------------------------------------------------------===// +#include "HTMLDiagnostics.h" +#include "PlistDiagnostics.h" +#include "SarifDiagnostics.h" #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/Stmt.h" @@ -80,7 +83,7 @@ class HTMLDiagnostics : public PathDiagnosticConsumer { void FlushDiagnosticsImpl(std::vector &Diags, FilesMade *filesMade) override; - StringRef getName() const override { return "HTMLDiagnostics"; } + StringRef getName() const override { return HTML_DIAGNOSTICS_NAME; } bool supportsCrossFileDiagnostics() const override { return SupportsCrossFileDiagnostics; @@ -169,6 +172,21 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const ArrowMap &Indices) { } // namespace +/// Creates and registers an HTML diagnostic consumer, without 
any additional +/// text consumer. +static void createHTMLDiagnosticConsumerImpl( + PathDiagnosticConsumerOptions DiagOpts, PathDiagnosticConsumers &C, + const std::string &OutputDir, const Preprocessor &PP, + bool SupportMultipleFiles) { + + // TODO: Emit an error here. + if (OutputDir.empty()) + return; + + C.emplace_back(std::make_unique( + std::move(DiagOpts), OutputDir, PP, SupportMultipleFiles)); +} + void ento::createHTMLDiagnosticConsumer( PathDiagnosticConsumerOptions DiagOpts, PathDiagnosticConsumers &C, const std::string &OutputDir, const Preprocessor &PP, @@ -183,12 +201,8 @@ void ento::createHTMLDiagnosticConsumer( createTextMinimalPathDiagnosticConsumer(DiagOpts, C, OutputDir, PP, CTU, MacroExpansions); - // TODO: Emit an error here. - if (OutputDir.empty()) - return; - - C.emplace_back(std::make_unique(std::move(DiagOpts), - OutputDir, PP, true)); + createHTMLDiagnosticConsumerImpl(DiagOpts, C, OutputDir, PP, + /*SupportMultipleFiles=*/true); } void ento::createHTMLSingleFileDiagnosticConsumer( @@ -199,12 +213,8 @@ void ento::createHTMLSingleFileDiagnosticConsumer( createTextMinimalPathDiagnosticConsumer(DiagOpts, C, OutputDir, PP, CTU, MacroExpansions); - // TODO: Emit an error here. 
- if (OutputDir.empty()) - return; - - C.emplace_back(std::make_unique(std::move(DiagOpts), - OutputDir, PP, false)); + createHTMLDiagnosticConsumerImpl(DiagOpts, C, OutputDir, PP, + /*SupportMultipleFiles=*/false); } void ento::createPlistHTMLDiagnosticConsumer( @@ -212,11 +222,10 @@ void ento::createPlistHTMLDiagnosticConsumer( const std::string &prefix, const Preprocessor &PP, const cross_tu::CrossTranslationUnitContext &CTU, const MacroExpansionContext &MacroExpansions) { - createHTMLDiagnosticConsumer( - DiagOpts, C, std::string(llvm::sys::path::parent_path(prefix)), PP, CTU, - MacroExpansions); - createPlistMultiFileDiagnosticConsumer(DiagOpts, C, prefix, PP, CTU, - MacroExpansions); + createHTMLDiagnosticConsumerImpl( + DiagOpts, C, std::string(llvm::sys::path::parent_path(prefix)), PP, true); + createPlistDiagnosticConsumerImpl(DiagOpts, C, prefix, PP, CTU, + MacroExpansions, true); createTextMinimalPathDiagnosticConsumer(std::move(DiagOpts), C, prefix, PP, CTU, MacroExpansions); } @@ -226,11 +235,11 @@ void ento::createSarifHTMLDiagnosticConsumer( const std::string &sarif_file, const Preprocessor &PP, const cross_tu::CrossTranslationUnitContext &CTU, const MacroExpansionContext &MacroExpansions) { - createHTMLDiagnosticConsumer( + createHTMLDiagnosticConsumerImpl( DiagOpts, C, std::string(llvm::sys::path::parent_path(sarif_file)), PP, - CTU, MacroExpansions); - createSarifDiagnosticConsumer(DiagOpts, C, sarif_file, PP, CTU, - MacroExpansions); + true); + createSarifDiagnosticConsumerImpl(DiagOpts, C, sarif_file, PP); + createTextMinimalPathDiagnosticConsumer(std::move(DiagOpts), C, sarif_file, PP, CTU, MacroExpansions); } @@ -246,18 +255,6 @@ void HTMLDiagnostics::FlushDiagnosticsImpl( ReportDiag(*Diag, filesMade); } -static llvm::SmallString<32> getIssueHash(const PathDiagnostic &D, - const Preprocessor &PP) { - SourceManager &SMgr = PP.getSourceManager(); - PathDiagnosticLocation UPDLoc = D.getUniqueingLoc(); - FullSourceLoc 
L(SMgr.getExpansionLoc(UPDLoc.isValid() - ? UPDLoc.asLocation() - : D.getLocation().asLocation()), - SMgr); - return getIssueHash(L, D.getCheckerName(), D.getBugType(), - D.getDeclWithIssue(), PP.getLangOpts()); -} - void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D, FilesMade *filesMade) { // Create the HTML directory if it is missing. @@ -302,7 +299,8 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D, } } - SmallString<32> IssueHash = getIssueHash(D, PP); + SmallString<32> IssueHash = + D.getIssueHash(PP.getSourceManager(), PP.getLangOpts()); auto [It, IsNew] = EmittedHashes.insert(IssueHash); if (!IsNew) { // We've already emitted a duplicate issue. It'll get overwritten anyway. @@ -361,6 +359,12 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D, if (EC != llvm::errc::file_exists) { llvm::errs() << "warning: could not create file in '" << Directory << "': " << EC.message() << '\n'; + } else if (filesMade) { + // Record that we created the file so that it gets referenced in the + // plist and SARIF reports for every translation unit that found the + // issue. 
+ filesMade->addDiagnostic(D, getName(), + llvm::sys::path::filename(ResultPath)); } return; } @@ -671,8 +675,8 @@ void HTMLDiagnostics::FinalizeHTML(const PathDiagnostic &D, Rewriter &R, os << "\n\n"; - os << "\n\n"; + os << "\n\n"; os << "\n + issue_hash_content_of_line_in_context665591022ee1cf653566ea441043d888 + issue_context_kindfunction + issue_contextfoo + issue_hash_function_offset4 + location + + line8 + col16 + file0 + + HTMLDiagnostics_files + + report-665591.html + + ExecutedLines + + 0 + + 4 + 5 + 8 + + + + + files + + + + diff --git a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif index 0bded6f0925d1..76f25475e3b21 100644 --- a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif +++ b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif @@ -4,9 +4,10 @@ { "artifacts": [ { - "length": 425, + "length": -1, "location": { "index": 0, + "uri": "file:///[...]/sarif-diagnostics-taint-test.c" }, "mimeType": "text/plain", "roles": [ @@ -31,6 +32,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-diagnostics-taint-test.c" }, "region": { "endColumn": 6, @@ -50,6 +52,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-diagnostics-taint-test.c" }, "region": { "endColumn": 18, @@ -71,6 +74,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-diagnostics-taint-test.c" }, "region": { "endColumn": 18, @@ -84,6 +88,9 @@ "message": { "text": "tainted" }, + "partialFingerprints": { + "clang/issueHash/v1": "5c964815b8d6db3989bacdd308e657d0" + }, "ruleId": "debug.TaintTest", "ruleIndex": 0 } @@ -108,8 +115,10 @@ "name": "debug.TaintTest" } ], + "version": "[clang version]" } } } ], + "version": "[SARIF version]" } diff --git 
a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif index 7f9deea304832..4aa6239f6312d 100644 --- a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif +++ b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif @@ -4,9 +4,10 @@ { "artifacts": [ { - "length": 1071, + "length": -1, "location": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "mimeType": "text/plain", "roles": [ @@ -31,6 +32,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 6, @@ -50,6 +52,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 18, @@ -65,12 +68,14 @@ ] } ], + "hostedViewerUri": "file:///[...]/report-5c9648.html", "level": "warning", "locations": [ { "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 18, @@ -84,6 +89,9 @@ "message": { "text": "tainted" }, + "partialFingerprints": { + "clang/issueHash/v1": "5c964815b8d6db3989bacdd308e657d0" + }, "ruleId": "debug.TaintTest", "ruleIndex": 0 }, @@ -102,6 +110,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 6, @@ -121,6 +130,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 12, @@ -140,6 +150,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 9, @@ -155,12 +166,14 @@ ] } ], + "hostedViewerUri": "file:///[...]/report-256f65.html", "level": "warning", "locations": [ { 
"physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 9, @@ -174,6 +187,9 @@ "message": { "text": "Called function pointer is an uninitialized pointer value" }, + "partialFingerprints": { + "clang/issueHash/v1": "256f6502719de88bece09a676d4102c6" + }, "ruleId": "core.CallAndMessage", "ruleIndex": 1 }, @@ -192,6 +208,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 13, @@ -211,6 +228,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 3, @@ -229,6 +247,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 14, @@ -243,12 +262,14 @@ ] } ], + "hostedViewerUri": "file:///[...]/report-91023b.html", "level": "warning", "locations": [ { "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 14, @@ -261,6 +282,9 @@ "message": { "text": "Division by zero" }, + "partialFingerprints": { + "clang/issueHash/v1": "91023b85b7e0ff79f11ab603e63cfa58" + }, "ruleId": "core.DivideZero", "ruleIndex": 2 }, @@ -279,6 +303,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 24, @@ -298,6 +323,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 12, @@ -317,6 +343,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 3, @@ -335,6 +362,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 12, @@ 
-349,12 +377,14 @@ ] } ], + "hostedViewerUri": "file:///[...]/report-b18daa.html", "level": "warning", "locations": [ { "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 12, @@ -367,6 +397,9 @@ "message": { "text": "Potential leak of memory pointed to by 'mem'" }, + "partialFingerprints": { + "clang/issueHash/v1": "b18daabce2816b9efb6afffaa64ca9f9" + }, "ruleId": "unix.Malloc", "ruleIndex": 3 }, @@ -385,6 +418,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 12, @@ -404,6 +438,7 @@ "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 20, @@ -418,12 +453,14 @@ ] } ], + "hostedViewerUri": "file:///[...]/report-4e5361.html", "level": "warning", "locations": [ { "physicalLocation": { "artifactLocation": { "index": 0, + "uri": "file:///[...]/sarif-multi-diagnostic-test.c" }, "region": { "endColumn": 20, @@ -436,6 +473,9 @@ "message": { "text": "Division by zero" }, + "partialFingerprints": { + "clang/issueHash/v1": "4e53611783411e0dae06a4084b00281c" + }, "ruleId": "core.DivideZero", "ruleIndex": 2 } @@ -499,8 +539,10 @@ "name": "unix.Malloc" } ], + "version": "[clang version]" } } } ], + "version": "[SARIF version]" } diff --git a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif new file mode 100644 index 0000000000000..85e710fc7bac3 --- /dev/null +++ b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif @@ -0,0 +1,144 @@ +{ + "$schema": "https://docs.oasis-open.org/sarif/sarif/v2.1.0/cos02/schemas/sarif-schema-2.1.0.json", + "runs": [ + { + "artifacts": [ + { + "length": -1, + "location": { + "index": 0, + "uri": 
"file:///[...]/sarif-multi-file-diagnostics.c" + }, + "mimeType": "text/plain", + "roles": [ + "resultFile" + ] + } + ], + "columnKind": "unicodeCodePoints", + "results": [ + { + "codeFlows": [ + { + "threadFlows": [ + { + "locations": [ + { + "importance": "important", + "location": { + "message": { + "text": "Assuming 'p' is null" + }, + "physicalLocation": { + "artifactLocation": { + "index": 0, + "uri": "file:///[...]/sarif-multi-file-diagnostics.c" + }, + "region": { + "endColumn": 7, + "startColumn": 7, + "startLine": 8 + } + } + } + }, + { + "importance": "unimportant", + "location": { + "message": { + "text": "Taking false branch" + }, + "physicalLocation": { + "artifactLocation": { + "index": 0, + "uri": "file:///[...]/sarif-multi-file-diagnostics.c" + }, + "region": { + "endColumn": 3, + "startColumn": 3, + "startLine": 8 + } + } + } + }, + { + "importance": "essential", + "location": { + "message": { + "text": "Dereference of null pointer (loaded from variable 'p')" + }, + "physicalLocation": { + "artifactLocation": { + "index": 0, + "uri": "file:///[...]/sarif-multi-file-diagnostics.c" + }, + "region": { + "endColumn": 14, + "endLine": 11, + "startColumn": 12, + "startLine": 11 + } + } + } + } + ] + } + ] + } + ], + "hostedViewerUri": "file:///[...]/report-d03238.html", + "level": "warning", + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "index": 0, + "uri": "file:///[...]/sarif-multi-file-diagnostics.c" + }, + "region": { + "endColumn": 14, + "endLine": 11, + "startColumn": 12, + "startLine": 11 + } + } + } + ], + "message": { + "text": "Dereference of null pointer (loaded from variable 'p')" + }, + "partialFingerprints": { + "clang/issueHash/v1": "d0323824ffaf9fee78b866e18d300fda" + }, + "ruleId": "core.NullDereference", + "ruleIndex": 0 + } + ], + "tool": { + "driver": { + "fullName": "clang static analyzer", + "informationUri": "https://clang.llvm.org/docs/UsersManual.html", + "language": "en-US", + "name": "clang", + 
"rules": [ + { + "defaultConfiguration": { + "enabled": true, + "level": "warning", + "rank": -1 + }, + "fullDescription": { + "text": "Check for dereferences of null pointers" + }, + "helpUri": "https://clang.llvm.org/docs/analyzer/checkers.html#core-nulldereference", + "id": "core.NullDereference", + "name": "core.NullDereference" + } + ], + "version": "[clang version]" + } + } + } + ], + "version": "[SARIF version]" +} \ No newline at end of file diff --git a/clang/test/Analysis/diagnostics/plist-html.c b/clang/test/Analysis/diagnostics/plist-html.c new file mode 100644 index 0000000000000..7ee0aa5681d3a --- /dev/null +++ b/clang/test/Analysis/diagnostics/plist-html.c @@ -0,0 +1,10 @@ +// RUN: rm -rf %t && mkdir %t && %clang_analyze_cc1 -analyzer-checker=core -analyzer-output=plist-html %s -o %t/plist-html.c.plist -verify +// RUN: tail -n +11 %t/plist-html.c.plist | %normalize_plist | diff -ub %S/Inputs/expected-plists/plist-html.c.plist - + +int foo(int *p) { + if (p) { + return 0; + } else { + return *p; // expected-warning {{Dereference of null pointer (loaded from variable 'p')}} + } +} diff --git a/clang/test/Analysis/diagnostics/sarif-multi-diagnostic-test.c b/clang/test/Analysis/diagnostics/sarif-multi-diagnostic-test.c index eeafd178628b3..5842574793bce 100644 --- a/clang/test/Analysis/diagnostics/sarif-multi-diagnostic-test.c +++ b/clang/test/Analysis/diagnostics/sarif-multi-diagnostic-test.c @@ -1,8 +1,8 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.taint,debug.TaintTest,unix.Malloc %s -verify -analyzer-output=sarif -o - | %normalize_sarif | diff -U1 -b %S/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif - +// RUN: rm -rf %t && mkdir %t && %clang_analyze_cc1 -analyzer-checker=core,optin.taint,debug.TaintTest,unix.Malloc %s -verify -analyzer-output=sarif-html -o %t%{fs-sep}out.sarif +// RUN: cat %t%{fs-sep}out.sarif | %normalize_sarif | diff -U1 -b %S/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif - #include 
"../Inputs/system-header-simulator.h" #include "../Inputs/system-header-simulator-for-malloc.h" #define ERR -1 - int atoi(const char *nptr); void f(void) { diff --git a/clang/test/Analysis/diagnostics/sarif-multi-file-diagnostics.c b/clang/test/Analysis/diagnostics/sarif-multi-file-diagnostics.c new file mode 100644 index 0000000000000..48880b592f261 --- /dev/null +++ b/clang/test/Analysis/diagnostics/sarif-multi-file-diagnostics.c @@ -0,0 +1,12 @@ +// RUN: rm -rf %t && mkdir %t +// RUN: %clang_analyze_cc1 -analyzer-checker=core %s -verify -analyzer-output=sarif-html -o %t%{fs-sep}out1.sarif +// RUN: %clang_analyze_cc1 -analyzer-checker=core %s -verify -analyzer-output=sarif-html -o %t%{fs-sep}out2.sarif +// RUN: cat %t%{fs-sep}out1.sarif | %normalize_sarif | diff -U1 -b %S/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif - +// RUN: cat %t%{fs-sep}out2.sarif | %normalize_sarif | diff -U1 -b %S/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif - + +int test(int *p) { + if (p) + return 0; + else + return *p; // expected-warning {{Dereference of null pointer (loaded from variable 'p')}} +} diff --git a/clang/test/Analysis/lit.local.cfg b/clang/test/Analysis/lit.local.cfg index f08ff8d6cce63..3d60a16405ea6 100644 --- a/clang/test/Analysis/lit.local.cfg +++ b/clang/test/Analysis/lit.local.cfg @@ -21,11 +21,15 @@ config.substitutions.append( config.substitutions.append( ( "%normalize_sarif", - "grep -Ev '^[[:space:]]*(%s|%s|%s)[[:space:]]*$'" + "sed -r '%s;%s;%s;%s'" % ( - '"uri": "file:.*%basename_t"', - '"version": ".* version .*"', - '"version": "2.1.0"', + # Replace version strings that are likely to change. 
+ r's/"version": ".* version .*"/"version": "[clang version]"/', + r's/"version": "2.1.0"/"version": "[SARIF version]"/', + # Strip directories from file URIs + r's/"file:(\/+)([^"\/]+\/)*([^"]+)"/"file:\1[...]\/\3"/', + # Set "length" to -1 + r's/"length": [[:digit:]]+/"length": -1/' ), ) ) diff --git a/clang/test/Analysis/novoidtypecrash.c b/clang/test/Analysis/novoidtypecrash.c index 5af30c2010438..325dd505a4201 100644 --- a/clang/test/Analysis/novoidtypecrash.c +++ b/clang/test/Analysis/novoidtypecrash.c @@ -3,7 +3,7 @@ x; y(void **z) { // no-crash *z = x; int *w; - y(&w); + y((void**)&w); *w; } diff --git a/clang/test/Analysis/override-werror.c b/clang/test/Analysis/override-werror.c index e84c20fc0696f..1aeed80492cca 100644 --- a/clang/test/Analysis/override-werror.c +++ b/clang/test/Analysis/override-werror.c @@ -5,9 +5,9 @@ // -Werror. This allows basic warnings not to interfere with producing // analyzer results. -char* f(int *p) { - return p; // expected-warning{{incompatible pointer types}} \ - werror-warning{{incompatible pointer types}} +void f(int *p) { + int; // expected-warning{{declaration does not declare anything}} \ + werror-warning{{declaration does not declare anything}} } void g(int *p) { diff --git a/clang/test/Analysis/scan-build/cxx-name.test b/clang/test/Analysis/scan-build/cxx-name.test index 483762d619d17..b602cb5c5231c 100644 --- a/clang/test/Analysis/scan-build/cxx-name.test +++ b/clang/test/Analysis/scan-build/cxx-name.test @@ -1,5 +1,3 @@ -REQUIRES: shell - RUN: %scan-build sh -c 'echo "CLANG_CXX=/$(basename "$CLANG_CXX")/"' | FileCheck %s Check that scan-build sets the CLANG_CXX environment variable (meant to be diff --git a/clang/test/Analysis/scan-build/deduplication.test b/clang/test/Analysis/scan-build/deduplication.test index 2ec3061701fce..067a5153d67db 100644 --- a/clang/test/Analysis/scan-build/deduplication.test +++ b/clang/test/Analysis/scan-build/deduplication.test @@ -1,5 +1,3 @@ -REQUIRES: shell - RUN: rm -rf 
%t.output_dir && mkdir %t.output_dir RUN: %scan-build -o %t.output_dir \ RUN: %clang -S %S/Inputs/deduplication/1.c \ diff --git a/clang/test/Analysis/scan-build/html_output.test b/clang/test/Analysis/scan-build/html_output.test index c2b509d9ef661..1eb4e73611cf2 100644 --- a/clang/test/Analysis/scan-build/html_output.test +++ b/clang/test/Analysis/scan-build/html_output.test @@ -1,5 +1,3 @@ -REQUIRES: shell - RUN: rm -rf %t.output_dir && mkdir %t.output_dir RUN: %scan-build -o %t.output_dir %clang -S %S/Inputs/single_null_dereference.c \ RUN: | FileCheck %s -check-prefix CHECK-STDOUT diff --git a/clang/test/Analysis/scan-build/plist_html_output.test b/clang/test/Analysis/scan-build/plist_html_output.test index ca9c5256b9d75..b995aa6d5d36a 100644 --- a/clang/test/Analysis/scan-build/plist_html_output.test +++ b/clang/test/Analysis/scan-build/plist_html_output.test @@ -1,5 +1,3 @@ -REQUIRES: shell - RUN: rm -rf %t.output_dir && mkdir %t.output_dir RUN: %scan-build -plist-html -o %t.output_dir %clang -S %S/Inputs/single_null_dereference.c \ RUN: | FileCheck %s -check-prefix CHECK-STDOUT diff --git a/clang/test/Analysis/scan-build/plist_output.test b/clang/test/Analysis/scan-build/plist_output.test index 4d01640bff6ea..1e7bef1035b51 100644 --- a/clang/test/Analysis/scan-build/plist_output.test +++ b/clang/test/Analysis/scan-build/plist_output.test @@ -1,5 +1,3 @@ -REQUIRES: shell - RUN: rm -rf %t.output_dir && mkdir %t.output_dir RUN: %scan-build -plist -o %t.output_dir %clang -S %S/Inputs/single_null_dereference.c \ RUN: | FileCheck %s -check-prefix CHECK-STDOUT diff --git a/clang/test/Analysis/uninit-vals-union.c b/clang/test/Analysis/uninit-vals-union.c index e16cccfc9115a..079632d642a3b 100644 --- a/clang/test/Analysis/uninit-vals-union.c +++ b/clang/test/Analysis/uninit-vals-union.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core.builtin -verify -Wno-unused %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core.builtin -verify -Wno-unused 
-Wno-error=incompatible-pointer-types %s typedef union { int y; diff --git a/clang/test/C/C11/n1285_1.c b/clang/test/C/C11/n1285_1.c index 25b68e3145b04..345ec94a1eeef 100644 --- a/clang/test/C/C11/n1285_1.c +++ b/clang/test/C/C11/n1285_1.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple=x86_64 -std=c99 -Wno-dangling -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK // RUN: %clang_cc1 -triple=x86_64 -std=c11 -Wno-dangling -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK // RUN: %clang_cc1 -triple=x86_64 -std=c11 -O2 -disable-llvm-passes -Wno-dangling -emit-llvm -o - %s | FileCheck %s --check-prefix=C11-O2 @@ -32,9 +32,9 @@ struct X f(void); // C11-O2-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A]], i64 0, i64 0 // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr [[P]], align 8, !tbaa [[TBAA2:![0-9]+]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA7:![0-9]+]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr [[P]], align 8, !tbaa [[INTPTR_TBAA2:![0-9]+]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7:![0-9]+]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[P]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[TMP1]] // @@ -91,18 +91,18 @@ int func_return(void) { // C11-O2: [[COND_END]]: // C11-O2-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], 
ptr [[A1]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[TBAA2]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] // C11-O2-NEXT: call void @llvm.lifetime.start.p0(ptr [[Q]]) #[[ATTR5]] // C11-O2-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[DOTCOMPOUNDLITERAL]], i8 0, i64 20, i1 false) // C11-O2-NEXT: [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[DOTCOMPOUNDLITERAL]], i32 0, i32 0 // C11-O2-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[DOTCOMPOUNDLITERAL]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY4:%.*]] = getelementptr inbounds [5 x i32], ptr [[A3]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY4]], ptr [[Q]], align 8, !tbaa [[TBAA2]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA7]] -// C11-O2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Q]], align 8, !tbaa [[TBAA2]] -// C11-O2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA7]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY4]], ptr [[Q]], align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7]] +// C11-O2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Q]], align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA7]] // C11-O2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[Q]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[ADD]] @@ -138,10 +138,10 @@ int ternary(void) { // C11-O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[REF_TMP]], ptr align 4 [[X]], i64 20, i1 false), !tbaa.struct [[TBAA_STRUCT9:![0-9]+]] // C11-O2-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], 
ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[TBAA2]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA7]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[X]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[TMP1]] // @@ -175,10 +175,10 @@ int comma(void) { // C11-O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[REF_TMP]], ptr align 4 [[X]], i64 20, i1 false), !tbaa.struct [[TBAA_STRUCT9]] // C11-O2-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[TBAA2]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA7]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[X]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[TMP1]] // @@ -217,10 +217,10 @@ int cast(void) { // C11-O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[REF_TMP]], ptr align 4 [[X]], i64 20, i1 false), !tbaa.struct 
[[TBAA_STRUCT9]] // C11-O2-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[TBAA2]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA7]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[S]]) #[[ATTR5]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[X]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[TMP1]] @@ -232,12 +232,12 @@ int assign(void) { return *p; } //. -// C11-O2: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// C11-O2: [[INTPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // C11-O2: [[META3]] = !{!"p1 int", [[META4:![0-9]+]], i64 0} // C11-O2: [[META4]] = !{!"any pointer", [[META5:![0-9]+]], i64 0} // C11-O2: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} // C11-O2: [[META6]] = !{!"Simple C/C++ TBAA"} -// C11-O2: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +// C11-O2: [[INT_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} // C11-O2: [[META8]] = !{!"int", [[META5]], i64 0} // C11-O2: [[TBAA_STRUCT9]] = !{i64 0, i64 20, [[META10:![0-9]+]]} // C11-O2: [[META10]] = !{[[META5]], [[META5]], i64 0} diff --git a/clang/test/C/C23/n3007.c b/clang/test/C/C23/n3007.c index 34ec419b71b27..b8b84519fc19d 100644 --- a/clang/test/C/C23/n3007.c +++ b/clang/test/C/C23/n3007.c @@ -13,7 +13,7 @@ void test_qualifiers(int x, const int y, int * restrict z) { static auto c = 1UL; int* pa = &a; // expected-warning 
{{initializing 'int *' with an expression of type 'const int *' discards qualifiers}} const int* pb = &b; - int* pc = &c; // expected-warning {{incompatible pointer types initializing 'int *' with an expression of type 'unsigned long *'}} + int* pc = &c; // expected-error {{incompatible pointer types initializing 'int *' with an expression of type 'unsigned long *'}} const int ci = 12; auto yup = ci; diff --git a/clang/test/C/C2y/n3254.c b/clang/test/C/C2y/n3254.c index e114735a9cb79..9f8c47756df32 100644 --- a/clang/test/C/C2y/n3254.c +++ b/clang/test/C/C2y/n3254.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple=x86_64 -std=c2y %s -emit-llvm -o - | FileCheck %s /* WG14 N3254: Yes diff --git a/clang/test/C/C2y/n3369.c b/clang/test/C/C2y/n3369.c index db26040d8cf44..dd3125709279d 100644 --- a/clang/test/C/C2y/n3369.c +++ b/clang/test/C/C2y/n3369.c @@ -100,11 +100,11 @@ void test_funcs() { int i5[5]; char c35[3][5]; test_func_fix_fix(5, &c35, &i3, NULL); - test_func_fix_fix(5, &c35, &i5, NULL); // expected-warning {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}} + test_func_fix_fix(5, &c35, &i5, NULL); // expected-error {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}} test_func_fix_var(5, &c35, &i3, NULL); - test_func_fix_var(5, &c35, &i5, NULL); // expected-warning {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}} + test_func_fix_var(5, &c35, &i5, NULL); // expected-error {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}} test_func_fix_uns(5, &c35, &i3, NULL); - test_func_fix_uns(5, &c35, &i5, NULL); // expected-warning {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}} + test_func_fix_uns(5, 
&c35, &i5, NULL); // expected-error {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}} } void test_multidimensional_arrays() { diff --git a/clang/test/C/drs/dr0xx.c b/clang/test/C/drs/dr0xx.c index c2b1a5b4bbecd..77ade3c351171 100644 --- a/clang/test/C/drs/dr0xx.c +++ b/clang/test/C/drs/dr0xx.c @@ -459,7 +459,7 @@ void dr088_1(void) { /* Distinct type from the file scope forward declaration. */ struct dr088_t_1; /* FIXME: this diagnostic could be improved to not be utterly baffling. */ - dr088_f((struct dr088_t_1 *)0); /* expected-warning {{incompatible pointer types passing 'struct dr088_t_1 *' to parameter of type 'struct dr088_t_1 *'}} */ + dr088_f((struct dr088_t_1 *)0); /* expected-error {{incompatible pointer types passing 'struct dr088_t_1 *' to parameter of type 'struct dr088_t_1 *'}} */ } void dr088_2(struct dr088_t_1 *p) { /* Pointer to incomplete type. */ } diff --git a/clang/test/CIR/CodeGen/builtins-elementwise.c b/clang/test/CIR/CodeGen/builtins-elementwise.c index e3460f06d166a..f64080b829bdf 100644 --- a/clang/test/CIR/CodeGen/builtins-elementwise.c +++ b/clang/test/CIR/CodeGen/builtins-elementwise.c @@ -89,3 +89,30 @@ void test_builtin_elementwise_atan(float f, double d, vfloat4 vf4, // OGCG: %{{.*}} = call <4 x double> @llvm.atan.v4f64(<4 x double> %{{.*}}) vd4 = __builtin_elementwise_atan(vd4); } + +void test_builtin_elementwise_cos(float f, double d, vfloat4 vf4, + vdouble4 vd4) { + // CIR-LABEL: test_builtin_elementwise_cos + // LLVM-LABEL: test_builtin_elementwise_cos + // OGCG-LABEL: test_builtin_elementwise_cos + + // CIR: {{%.*}} = cir.cos {{%.*}} : !cir.float + // LLVM: {{%.*}} = call float @llvm.cos.f32(float {{%.*}}) + // OGCG: {{%.*}} = call float @llvm.cos.f32(float {{%.*}}) + f = __builtin_elementwise_cos(f); + + // CIR: {{%.*}} = cir.cos {{%.*}} : !cir.double + // LLVM: {{%.*}} = call double @llvm.cos.f64(double {{%.*}}) + // OGCG: {{%.*}} = call double @llvm.cos.f64(double {{%.*}}) + d = 
__builtin_elementwise_cos(d); + + // CIR: {{%.*}} = cir.cos {{%.*}} : !cir.vector<4 x !cir.float> + // LLVM: {{%.*}} = call <4 x float> @llvm.cos.v4f32(<4 x float> {{%.*}}) + // OGCG: {{%.*}} = call <4 x float> @llvm.cos.v4f32(<4 x float> {{%.*}}) + vf4 = __builtin_elementwise_cos(vf4); + + // CIR: {{%.*}} = cir.cos {{%.*}} : !cir.vector<4 x !cir.double> + // LLVM: {{%.*}} = call <4 x double> @llvm.cos.v4f64(<4 x double> {{%.*}}) + // OGCG: {{%.*}} = call <4 x double> @llvm.cos.v4f64(<4 x double> {{%.*}}) + vd4 = __builtin_elementwise_cos(vd4); +} diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c b/clang/test/CIR/CodeGen/builtins-floating-point.c new file mode 100644 index 0000000000000..193cc172d37d2 --- /dev/null +++ b/clang/test/CIR/CodeGen/builtins-floating-point.c @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +float cosf(float f) { + return __builtin_cosf(f); + // CHECK: %{{.*}} = cir.cos {{.*}} : !cir.float + // LLVM: %{{.*}} = call float @llvm.cos.f32(float %{{.*}}) + // OGCG: %{{.*}} = call float @llvm.cos.f32(float %{{.*}}) +} + +double cos(double f) { + return __builtin_cos(f); + // CIR: {{.+}} = cir.cos {{.+}} : !cir.double + // LLVM: %{{.*}} = call double @llvm.cos.f64(double %{{.*}}) + // OGCG: %{{.*}} = call double @llvm.cos.f64(double %{{.*}}) +} diff --git a/clang/test/CIR/CodeGen/opaque.cpp b/clang/test/CIR/CodeGen/opaque.cpp new file mode 100644 index 0000000000000..a48c013e5c20b --- /dev/null +++ b/clang/test/CIR/CodeGen/opaque.cpp @@ -0,0 +1,156 @@ +// RUN: %clang_cc1 
-std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +void foo() { + int a; + int b = 1 ?: a; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[CONST_1]], %[[B_ADDR]] : !s32i, !cir.ptr + +// LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: store i32 1, ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca i32, align 4 +// OGCG: store i32 1, ptr %[[B_ADDR]], align 4 + +void foo2() { + float _Complex a; + float _Complex b; + float _Complex c = a ?: b; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"] +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.float +// CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.float +// CIR: %[[A_REAL_BOOL:.*]] = cir.cast(float_to_bool, %[[A_REAL]] : !cir.float), !cir.bool +// CIR: %[[A_IMAG_BOOL:.*]] = cir.cast(float_to_bool, %[[A_IMAG]] : !cir.float), !cir.bool +// CIR: %[[CONST_TRUE:.*]] = cir.const #true +// CIR: %[[COND:.*]] = cir.select if %[[A_REAL_BOOL]] then %[[CONST_TRUE]] else %[[A_IMAG_BOOL]] : (!cir.bool, 
!cir.bool, !cir.bool) -> !cir.bool +// CIR: %[[RESULT:.*]] = cir.ternary(%[[COND]], true { +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR: cir.yield %[[TMP_A]] : !cir.complex +// CIR: }, false { +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex +// CIR: cir.yield %[[TMP_B]] : !cir.complex +// CIR: }) : (!cir.bool) -> !cir.complex +// CIR: cir.store{{.*}} %[[RESULT]], %[[C_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[C_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP_A:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[A_REAL:.*]] = extractvalue { float, float } %[[TMP_A]], 0 +// LLVM: %[[A_IMAG:.*]] = extractvalue { float, float } %[[TMP_A]], 1 +// LLVM: %[[A_REAL_BOOL:.*]] = fcmp une float %[[A_REAL]], 0.000000e+00 +// LLVM: %[[A_IMAG_BOOL:.*]] = fcmp une float %[[A_IMAG]], 0.000000e+00 +// LLVM: %[[COND:.*]] = or i1 %[[A_REAL_BOOL]], %[[A_IMAG_BOOL]] +// LLVM: br i1 %[[COND]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// LLVM: [[COND_TRUE]]: +// LLVM: %[[TMP_A:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: br label %[[COND_RESULT:.*]] +// LLVM: [[COND_FALSE]]: +// LLVM: %[[TMP_B:.*]] = load { float, float }, ptr %[[B_ADDR]], align 4 +// LLVM: br label %[[COND_RESULT]] +// LLVM: [[COND_RESULT]]: +// LLVM: %[[RESULT:.*]] = phi { float, float } [ %[[TMP_B]], %[[COND_FALSE]] ], [ %[[TMP_A]], %[[COND_TRUE]] ] +// LLVM: br label %[[COND_END:.*]] +// LLVM: [[COND_END]]: +// LLVM: store { float, float } %[[RESULT]], ptr %[[C_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[C_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr 
%[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_BOOL:.*]] = fcmp une float %[[A_REAL]], 0.000000e+00 +// OGCG: %[[A_IMAG_BOOL:.*]] = fcmp une float %[[A_IMAG]], 0.000000e+00 +// OGCG: %[[COND:.*]] = or i1 %[[A_REAL_BOOL]], %[[A_IMAG_BOOL]] +// OGCG: br i1 %tobool2, label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// OGCG: [[COND_TRUE]]: +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: br label %[[COND_END:.*]] +// OGCG: [[COND_FALSE]]: +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[B_REAL:.*]] = load float, ptr %[[B_REAL_PTR]], align 4 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: %[[B_IMAG:.*]] = load float, ptr %[[B_IMAG_PTR]], align 4 +// OGCG: br label %[[COND_END]] +// OGCG: [[COND_END]]: +// OGCG: %[[RESULT_REAL:.*]] = phi float [ %[[A_REAL]], %[[COND_TRUE]] ], [ %[[B_REAL]], %[[COND_FALSE]] ] +// OGCG: %[[RESULT_IMAG:.*]] = phi float [ %[[A_IMAG]], %[[COND_TRUE]] ], [ %[[B_IMAG]], %[[COND_FALSE]] ] +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[C_ADDR]], i32 0, i32 0 +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[C_ADDR]], i32 0, i32 1 +// OGCG: store float %[[RESULT_REAL]], ptr %[[C_REAL_PTR]], align 4 +// OGCG: store float %[[RESULT_IMAG]], ptr %[[C_IMAG_PTR]], align 4 + +void foo3() { + int a; + int b; + 
int c = a ?: b; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["b"] +// CIR: %[[C_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["c", init] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !s32i +// CIR: %[[A_BOOL:.*]] = cir.cast(int_to_bool, %[[TMP_A]] : !s32i), !cir.bool +// CIR: %[[RESULT:.*]] = cir.ternary(%[[A_BOOL]], true { +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !s32i +// CIR: cir.yield %[[TMP_A]] : !s32i +// CIR: }, false { +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr, !s32i +// CIR: cir.yield %[[TMP_B]] : !s32i +// CIR: }) : (!cir.bool) -> !s32i +// CIR: cir.store{{.*}} %[[RESULT]], %[[C_ADDR]] : !s32i, !cir.ptr + +// LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[C_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[TMP_A:.*]] = load i32, ptr %[[A_ADDR]], align 4 +// LLVM: %[[COND:.*]] = icmp ne i32 %[[TMP_A]], 0 +// LLVM: br i1 %[[COND]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// LLVM: [[COND_TRUE]]: +// LLVM: %[[TMP_A:.*]] = load i32, ptr %[[A_ADDR]], align 4 +// LLVM: br label %[[COND_RESULT:.*]] +// LLVM: [[COND_FALSE]]: +// LLVM: %[[TMP_B:.*]] = load i32, ptr %[[B_ADDR]], align 4 +// LLVM: br label %[[COND_RESULT]] +// LLVM: [[COND_RESULT]]: +// LLVM: %[[RESULT:.*]] = phi i32 [ %[[TMP_B]], %[[COND_FALSE]] ], [ %[[TMP_A]], %[[COND_TRUE]] ] +// LLVM: br label %[[COND_END:.*]] +// LLVM: [[COND_END]]: +// LLVM: store i32 %[[RESULT]], ptr %[[C_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[C_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[TMP_A:.*]] = load i32, ptr %[[A_ADDR]], align 4 +// OGCG: %[[A_BOOL:.*]] = icmp ne i32 %[[TMP_A]], 0 +// OGCG: br i1 %[[A_BOOL]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// OGCG: [[COND_TRUE]]: +// OGCG: %[[TMP_A:.*]] = load i32, ptr 
%[[A_ADDR]], align 4 +// OGCG: br label %[[COND_END:.*]] +// OGCG: [[COND_FALSE]]: +// OGCG: %[[TMP_B:.*]] = load i32, ptr %[[B_ADDR]], align 4 +// OGCG: br label %[[COND_END]] +// OGCG: [[COND_END]]: +// OGCG: %[[RESULT:.*]] = phi i32 [ %[[TMP_A]], %[[COND_TRUE]] ], [ %[[TMP_B]], %[[COND_FALSE]] ] +// OGCG: store i32 %[[RESULT]], ptr %[[C_ADDR]], align 4 diff --git a/clang/test/CXX/drs/cwg30xx.cpp b/clang/test/CXX/drs/cwg30xx.cpp index a0e13013b1bbf..0be3f0b1e88ea 100644 --- a/clang/test/CXX/drs/cwg30xx.cpp +++ b/clang/test/CXX/drs/cwg30xx.cpp @@ -7,7 +7,7 @@ // RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected %s -namespace cwg3005 { // cwg3005: 21 open 2025-03-10 +namespace cwg3005 { // cwg3005: 21 tentatively ready 2025-09-12 void f( int _, // #cwg3005-first-param diff --git a/clang/test/ClangScanDeps/module-format.c b/clang/test/ClangScanDeps/module-format.c index 0a6abec80dd90..acfe195c4e080 100644 --- a/clang/test/ClangScanDeps/module-format.c +++ b/clang/test/ClangScanDeps/module-format.c @@ -6,8 +6,6 @@ // section in XCOFF yet. // UNSUPPORTED: target={{.*}}-aix{{.*}} -// REQUIRES: shell - // RUN: rm -rf %t && mkdir %t // RUN: cp %S/Inputs/modules-pch/* %t diff --git a/clang/test/ClangScanDeps/modules-context-hash-cwd.c b/clang/test/ClangScanDeps/modules-context-hash-cwd.c index c609a7dcbc80e..b5086ed409223 100644 --- a/clang/test/ClangScanDeps/modules-context-hash-cwd.c +++ b/clang/test/ClangScanDeps/modules-context-hash-cwd.c @@ -1,7 +1,7 @@ +// Most likely platform specific sed differences +// UNSUPPORTED: system-windows // Test current directory pruning when computing the context hash. 
-// REQUIRES: shell - // RUN: rm -rf %t // RUN: split-file %s %t // RUN: sed -e "s|DIR|%/t|g" %t/cdb0.json.in > %t/cdb0.json diff --git a/clang/test/ClangScanDeps/modules-file-path-isolation.c b/clang/test/ClangScanDeps/modules-file-path-isolation.c index 2bd0a58ca9ae6..55784cf41700e 100644 --- a/clang/test/ClangScanDeps/modules-file-path-isolation.c +++ b/clang/test/ClangScanDeps/modules-file-path-isolation.c @@ -3,7 +3,8 @@ // Note: the spelling of the modulemap path still depends on the includer, since // that is the only source of information about it. -// REQUIRES: shell +// Needs symlinks +// UNSUPPORTED: system-windows // RUN: rm -rf %t // RUN: split-file %s %t diff --git a/clang/test/ClangScanDeps/modules-in-stable-dirs.c b/clang/test/ClangScanDeps/modules-in-stable-dirs.c index 066c5445f41f4..f54e09fecee94 100644 --- a/clang/test/ClangScanDeps/modules-in-stable-dirs.c +++ b/clang/test/ClangScanDeps/modules-in-stable-dirs.c @@ -1,3 +1,6 @@ +// Most likely platform specific sed differences +// UNSUPPORTED: system-windows + // This test verifies modules that are entirely comprised from stable directory inputs are captured in // dependency information. @@ -5,7 +8,6 @@ // The second compilation verifies that external paths are resolved when a // vfsoverlay for determining is-in-stable-directories. -// REQUIRES: shell // RUN: rm -rf %t // RUN: split-file %s %t // RUN: sed -e "s|DIR|%/t|g" %t/compile-commands.json.in > %t/compile-commands.json diff --git a/clang/test/ClangScanDeps/modules-symlink-dir-from-module.c b/clang/test/ClangScanDeps/modules-symlink-dir-from-module.c index 5f0ebc13eb2ee..85f5f1acc3793 100644 --- a/clang/test/ClangScanDeps/modules-symlink-dir-from-module.c +++ b/clang/test/ClangScanDeps/modules-symlink-dir-from-module.c @@ -3,7 +3,7 @@ // module below does not transitively import Mod via a symlink, so it should not // see the symlinked path. 
-// REQUIRES: shell +// REQUIRES: symlinks // RUN: rm -rf %t // RUN: split-file %s %t diff --git a/clang/test/ClangScanDeps/modules-symlink-dir-vfs.c b/clang/test/ClangScanDeps/modules-symlink-dir-vfs.c index f2e5758aa41fb..eb49ab90c4d18 100644 --- a/clang/test/ClangScanDeps/modules-symlink-dir-vfs.c +++ b/clang/test/ClangScanDeps/modules-symlink-dir-vfs.c @@ -5,7 +5,7 @@ // RUN: rm -rf %t // RUN: split-file %s %t -// REQUIRES: shell +// REQUIRES: symlinks // RUN: mkdir -p %t/frameworks-symlink // RUN: ln -s %t/frameworks/FW.framework %t/frameworks-symlink/FW.framework diff --git a/clang/test/ClangScanDeps/modules-symlink-dir.c b/clang/test/ClangScanDeps/modules-symlink-dir.c index 35e830e8c6c57..9946b57c6680f 100644 --- a/clang/test/ClangScanDeps/modules-symlink-dir.c +++ b/clang/test/ClangScanDeps/modules-symlink-dir.c @@ -1,8 +1,8 @@ +// REQUIRES: symlinks + // Check that we canonicalize the module map path without changing the module // directory, which would break header lookup. -// REQUIRES: shell - // RUN: rm -rf %t // RUN: split-file %s %t // RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.in > %t/cdb.json diff --git a/clang/test/ClangScanDeps/prebuilt-modules-in-stable-dirs.c b/clang/test/ClangScanDeps/prebuilt-modules-in-stable-dirs.c index acacda1466b59..39b2863d966c3 100644 --- a/clang/test/ClangScanDeps/prebuilt-modules-in-stable-dirs.c +++ b/clang/test/ClangScanDeps/prebuilt-modules-in-stable-dirs.c @@ -1,3 +1,6 @@ +/// Most likely platform specific sed differences +// UNSUPPORTED: system-windows + /// This test validates that modules that depend on prebuilt modules /// resolve `is-in-stable-directories` correctly. /// The steps are: @@ -5,8 +8,7 @@ /// that is seemingly from the sysroot. However, it depends on a local header that is overlaid. /// 2. Build the PCH & dependency PCMs. /// 3. Scan a source file that transitively depends on the same modules as the pcm. 
- -// REQUIRES: shell + // RUN: rm -rf %t // RUN: split-file %s %t // RUN: sed -e "s|DIR|%/t|g" %t/overlay.json.template > %t/overlay.json diff --git a/clang/test/ClangScanDeps/subframework_header_dir_symlink.m b/clang/test/ClangScanDeps/subframework_header_dir_symlink.m index 3bbc5320d4e0c..66ff9df80527b 100644 --- a/clang/test/ClangScanDeps/subframework_header_dir_symlink.m +++ b/clang/test/ClangScanDeps/subframework_header_dir_symlink.m @@ -1,4 +1,4 @@ -// REQUIRES: shell +// REQUIRES: symlinks // RUN: rm -rf %t.dir // RUN: rm -rf %t.cdb // RUN: mkdir -p %t.dir diff --git a/clang/test/ClangScanDeps/symlink.cpp b/clang/test/ClangScanDeps/symlink.cpp index d262f8c7f1d95..5b13d88544d98 100644 --- a/clang/test/ClangScanDeps/symlink.cpp +++ b/clang/test/ClangScanDeps/symlink.cpp @@ -1,4 +1,5 @@ -// REQUIRES: shell +// REQUIRES: symlinks + // RUN: rm -rf %t.dir // RUN: rm -rf %t.cdb // RUN: mkdir -p %t.dir diff --git a/clang/test/CodeCompletion/included-symlinks.cpp b/clang/test/CodeCompletion/included-symlinks.cpp index 7ac5e20e7fdef..c3e7c0ab0c5a3 100644 --- a/clang/test/CodeCompletion/included-symlinks.cpp +++ b/clang/test/CodeCompletion/included-symlinks.cpp @@ -1,4 +1,4 @@ -// REQUIRES: shell +// REQUIRES: symlinks // RUN: rm -rf %t && mkdir -p %t/real/myproj && mkdir -p %t/links // RUN: touch %t/real/foo.h && ln -s %t/real/foo.h %t/links/foo.h // RUN: touch %t/real/foobar.h && ln -s %t/real/foobar.h %t/links/foobar.h diff --git a/clang/test/CodeGen/2008-03-05-syncPtr.c b/clang/test/CodeGen/2008-03-05-syncPtr.c index 8968a7dc2678c..30dc4f26e1d3e 100644 --- a/clang/test/CodeGen/2008-03-05-syncPtr.c +++ b/clang/test/CodeGen/2008-03-05-syncPtr.c @@ -34,7 +34,7 @@ int* foo5(int** a, int* b) { int* foo6(int** a, int*** b) { - return __sync_lock_test_and_set (a, b); + return __sync_lock_test_and_set (a, (int*)b); } // CHECK-LABEL: define{{.*}} ptr @foo6 // CHECK: atomicrmw xchg {{.*}}, align 8 diff --git a/clang/test/CodeGen/AArch64/fp8-init-list.c 
b/clang/test/CodeGen/AArch64/fp8-init-list.c index 8b4b31a71c46a..7c0f6278b2090 100644 --- a/clang/test/CodeGen/AArch64/fp8-init-list.c +++ b/clang/test/CodeGen/AArch64/fp8-init-list.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX @@ -34,25 +34,25 @@ struct S s; // CHECK-LABEL: define dso_local void @f( // CHECK-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[__MFP8_TBAA2:![0-9]+]] // CHECK-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z1fu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] +// CHECK-CXX-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[__MFP8_TBAA2:![0-9]+]] // CHECK-CXX-NEXT: ret void // void f(__mfp8 x) { s = (struct S){x}; } //. -// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[__MFP8_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK: [[META3]] = !{!"__mfp8", [[META4:![0-9]+]], i64 0} // CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} //. 
-// CHECK-CXX: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-CXX: [[__MFP8_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK-CXX: [[META3]] = !{!"__mfp8", [[META4:![0-9]+]], i64 0} // CHECK-CXX: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-CXX: [[META5]] = !{!"Simple C++ TBAA"} diff --git a/clang/test/CodeGen/AArch64/ls64-inline-asm.c b/clang/test/CodeGen/AArch64/ls64-inline-asm.c index 8aa0684dba14d..1d217eb8801e5 100644 --- a/clang/test/CodeGen/AArch64/ls64-inline-asm.c +++ b/clang/test/CodeGen/AArch64/ls64-inline-asm.c @@ -1,12 +1,13 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -O1 -emit-llvm -x c %s -o - | FileCheck %s struct foo { unsigned long long x[8]; }; -// CHECK-LABEL: @load( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]] -// CHECK-NEXT: store i512 [[TMP0]], ptr [[OUTPUT:%.*]], align 8 +// CHECK-LABEL: define dso_local void @load( +// CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 64)) [[OUTPUT:%.*]], ptr noundef [[ADDR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[ADDR]]) #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]] +// CHECK-NEXT: store i512 [[TMP0]], ptr [[OUTPUT]], align 8 // CHECK-NEXT: ret void // void load(struct foo *output, void *addr) @@ -14,10 +15,11 @@ void load(struct foo *output, void *addr) __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory"); } -// CHECK-LABEL: @store( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i512, ptr [[INPUT:%.*]], align 8 -// CHECK-NEXT: tail call void asm 
sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP0]], ptr [[ADDR:%.*]]) #[[ATTR1]], !srcloc [[META3:![0-9]+]] +// CHECK-LABEL: define dso_local void @store( +// CHECK-SAME: ptr noundef readonly captures(none) [[INPUT:%.*]], ptr noundef [[ADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i512, ptr [[INPUT]], align 8 +// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP0]], ptr [[ADDR]]) #[[ATTR1]], !srcloc [[META3:![0-9]+]] // CHECK-NEXT: ret void // void store(const struct foo *input, void *addr) @@ -25,30 +27,31 @@ void store(const struct foo *input, void *addr) __asm__ volatile ("st64b %0,[%1]" : : "r" (*input), "r" (addr) : "memory" ); } -// CHECK-LABEL: @store2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[IN:%.*]], align 4, !tbaa [[TBAA4:![0-9]+]] +// CHECK-LABEL: define dso_local void @store2( +// CHECK-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef [[ADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[IN]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64 // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP1]] to i64 // CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP2]] to i64 // CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 64 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr 
[[ARRAYIDX7]], align 4, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CONV8:%.*]] = sext i32 [[TMP3]] to i64 // CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 100 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CONV11:%.*]] = sext i32 [[TMP4]] to i64 // CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 144 -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CONV14:%.*]] = sext i32 [[TMP5]] to i64 // CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 196 -// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CONV17:%.*]] = sext i32 [[TMP6]] to i64 // CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 256 -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CONV20:%.*]] = sext i32 [[TMP7]] to i64 // CHECK-NEXT: [[S_SROA_10_0_INSERT_EXT:%.*]] = zext i64 [[CONV20]] to i512 // CHECK-NEXT: [[S_SROA_10_0_INSERT_SHIFT:%.*]] = shl nuw i512 [[S_SROA_10_0_INSERT_EXT]], 448 @@ -72,7 +75,7 @@ void store(const struct foo *input, void *addr) // CHECK-NEXT: [[S_SROA_0_0_INSERT_EXT:%.*]] = zext i64 [[CONV]] to i512 // CHECK-NEXT: [[S_SROA_0_0_INSERT_MASK:%.*]] = or disjoint i512 [[S_SROA_4_0_INSERT_MASK]], [[S_SROA_4_0_INSERT_SHIFT]] // CHECK-NEXT: [[S_SROA_0_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_0_0_INSERT_MASK]], 
[[S_SROA_0_0_INSERT_EXT]] -// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], ptr [[ADDR:%.*]]) #[[ATTR1]], !srcloc [[META8:![0-9]+]] +// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], ptr [[ADDR]]) #[[ATTR1]], !srcloc [[META8:![0-9]+]] // CHECK-NEXT: ret void // void store2(int *in, void *addr) @@ -80,3 +83,12 @@ void store2(int *in, void *addr) struct foo s = { in[0], in[1], in[4], in[16], in[25], in[36], in[49], in[64] }; __asm__ volatile ("st64b %0,[%1]" : : "r" (s), "r" (addr) : "memory" ); } +//. +// CHECK: [[META2]] = !{i64 789} +// CHECK: [[META3]] = !{i64 1368} +// CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META8]] = !{i64 5992} +//. diff --git a/clang/test/CodeGen/AArch64/ptrauth-fmv.c b/clang/test/CodeGen/AArch64/ptrauth-fmv.c new file mode 100644 index 0000000000000..3b60ea7412f1b --- /dev/null +++ b/clang/test/CodeGen/AArch64/ptrauth-fmv.c @@ -0,0 +1,42 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -mbranch-target-enforce -msign-return-address=all -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK,BTI-SIGNRA %s +// RUN: %clang_cc1 -triple arm64-apple-ios -mbranch-target-enforce -msign-return-address=all -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK,BTI-SIGNRA %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK,PAUTHTEST %s +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK,PAUTHTEST %s + +// Check that both multi-versioned functions themselves and corresponding +// resolvers generated by Clang have the 
correct PAC/BTI attributes. + +int __attribute__((target_clones("crc", "default"))) global_target_clones(void) { return 0; } + +int __attribute__((target_version("crc"))) global_target_version(void) { return 0; } +int __attribute__((target_version("default"))) global_target_version(void) { return 0; } + +static int __attribute__((target_clones("crc", "default"))) static_target_clones(void) { return 0; } + +static int __attribute__((target_version("crc"))) static_target_version(void) { return 0; } +static int __attribute__((target_version("default"))) static_target_version(void) { return 0; } + +// Force emission of static_* functions. +void *get_ptr1(void) { return static_target_clones; } +void *get_ptr2(void) { return static_target_version; } + +// CHECK-DAG: define{{( dso_local)?}} i32 @global_target_clones._Mcrc() #[[ATTR_CRC:[0-9]+]] +// CHECK-DAG: define{{( dso_local)?}} i32 @global_target_clones.default() #[[ATTR_DEFAULT:[0-9]+]] +// CHECK-DAG: define weak_odr ptr @global_target_clones.resolver() #[[ATTR_RESOLVER:[0-9]+]] +// CHECK-DAG: define{{( dso_local)?}} i32 @global_target_version._Mcrc() #[[ATTR_CRC]] +// CHECK-DAG: define{{( dso_local)?}} i32 @global_target_version.default() #[[ATTR_DEFAULT]] +// CHECK-DAG: define weak_odr ptr @global_target_version.resolver() #[[ATTR_RESOLVER]] + +// CHECK-DAG: define internal i32 @static_target_clones._Mcrc() #[[ATTR_CRC:[0-9]+]] +// CHECK-DAG: define internal i32 @static_target_clones.default() #[[ATTR_DEFAULT:[0-9]+]] +// CHECK-DAG: define internal ptr @static_target_clones.resolver() #[[ATTR_RESOLVER:[0-9]+]] +// CHECK-DAG: define internal i32 @static_target_version._Mcrc() #[[ATTR_CRC]] +// CHECK-DAG: define internal i32 @static_target_version.default() #[[ATTR_DEFAULT]] +// CHECK-DAG: define internal ptr @static_target_version.resolver() #[[ATTR_RESOLVER]] + +// BTI-SIGNRA-DAG: attributes #[[ATTR_CRC]] = { {{.*}}"branch-target-enforcement" {{.*}}"sign-return-address"="all" 
"sign-return-address-key"="a_key"{{.*}} } +// BTI-SIGNRA-DAG: attributes #[[ATTR_RESOLVER]] = { {{.*}}"branch-target-enforcement" {{.*}}"sign-return-address"="all" "sign-return-address-key"="a_key"{{.*}} } +// BTI-SIGNRA-DAG: attributes #[[ATTR_DEFAULT]] = { {{.*}}"branch-target-enforcement" {{.*}}"sign-return-address"="all" "sign-return-address-key"="a_key"{{.*}} } +// PAUTHTEST-DAG: attributes #[[ATTR_CRC]] = { {{.*}}"ptrauth-auth-traps" "ptrauth-calls" "ptrauth-returns"{{.*}} } +// PAUTHTEST-DAG: attributes #[[ATTR_RESOLVER]] = { {{.*}}"ptrauth-auth-traps" "ptrauth-calls" "ptrauth-returns"{{.*}} } +// PAUTHTEST-DAG: attributes #[[ATTR_DEFAULT]] = { {{.*}}"ptrauth-auth-traps" "ptrauth-calls" "ptrauth-returns"{{.*}} } diff --git a/clang/test/CodeGen/AArch64/resolver-attributes.c b/clang/test/CodeGen/AArch64/resolver-attributes.c new file mode 100644 index 0000000000000..6e4497cdc8611 --- /dev/null +++ b/clang/test/CodeGen/AArch64/resolver-attributes.c @@ -0,0 +1,62 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -mbranch-target-enforce -emit-llvm %s -o - | FileCheck --check-prefixes=BTI,ELF %s +// RUN: %clang_cc1 -triple arm64-apple-ios -mbranch-target-enforce -emit-llvm %s -o - | FileCheck --check-prefixes=BTI %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefixes=NOBTI,ELF %s +// RUN: %clang_cc1 -triple arm64-apple-ios -emit-llvm %s -o - | FileCheck --check-prefixes=NOBTI %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fvisibility=hidden -emit-llvm %s -o - | FileCheck --check-prefixes=HIDDEN,ELF %s +// RUN: %clang_cc1 -triple arm64-apple-ios -fvisibility=hidden -emit-llvm %s -o - | FileCheck --check-prefixes=HIDDEN %s + +// Check that the resolver functions generated by Clang have the correct attributes. +// In these test cases, branch-target-enforcement is used as an example of +// target-specific attribute that has to be set on every function by default. 
+ +// FIXME: `cpu_specific`/`cpu_dispatch` and `target` attributes cannot be +// tested on AArch64. + +__attribute__((target_clones("crc", "default"))) +int global_target_clones(void) { return 0; } + +__attribute__((target_version("crc"))) int global_target_version(void) { return 0; } +__attribute__((target_version("default"))) int global_target_version(void) { return 0; } + +__attribute__((target_clones("crc", "default"))) +static int static_target_clones(void) { return 0; } + +__attribute__((target_version("crc"))) static int static_target_version(void) { return 0; } +__attribute__((target_version("default"))) static int static_target_version(void) { return 0; } + +// Force emission of static_* functions. +void *get_ptr1(void) { return static_target_clones; } +void *get_ptr2(void) { return static_target_version; } + +#ifdef __ELF__ +// Make sure target-specific attributes can be overriden as needed for +// non-autogenerated resolver functions. +// Note that since there is only a single definition of ifunc_resolver, it +// is not itself a multi-versioned function, even though it has target(...) +// attribute. 
+int ifunc_func(void) { return 0; } +__attribute__((target("branch-protection=bti"))) void *ifunc_resolver(void) { return ifunc_func; } +__attribute__((ifunc("ifunc_resolver"))) int ifunc(void); +#endif + +// ELF: define{{.*}} ptr @ifunc_resolver() #[[ATTR_IFUNC_RESOLVER:[0-9]+]] + +// BTI: define weak_odr ptr @global_target_clones.resolver() #[[ATTR_RESOLVER:[0-9]+]] +// BTI: define weak_odr ptr @global_target_version.resolver() #[[ATTR_RESOLVER]] +// BTI: define internal ptr @static_target_clones.resolver() #[[ATTR_RESOLVER]] +// BTI: define internal ptr @static_target_version.resolver() #[[ATTR_RESOLVER]] + +// In NOBTI case, no attribute groups are assigned to the resolver functions: +// NOBTI: define weak_odr ptr @global_target_clones.resolver(){{( comdat)?}} { +// NOBTI: define weak_odr ptr @global_target_version.resolver(){{( comdat)?}} { +// NOBTI: define internal ptr @static_target_clones.resolver() { +// NOBTI: define internal ptr @static_target_version.resolver() { + +// HIDDEN: define weak_odr hidden ptr @global_target_clones.resolver(){{( comdat)?}} { +// HIDDEN: define weak_odr hidden ptr @global_target_version.resolver(){{( comdat)?}} { +// HIDDEN: define internal ptr @static_target_clones.resolver() { +// HIDDEN: define internal ptr @static_target_version.resolver() { + +// ELF: attributes #[[ATTR_IFUNC_RESOLVER]] = { {{.*}}"branch-target-enforcement"{{.*}} } + +// BTI: attributes #[[ATTR_RESOLVER]] = { {{.*}}"branch-target-enforcement"{{.*}} } diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c index c0b3e1a06b0ff..049c1742e5a9d 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c @@ -6,49 +6,55 @@ #include -// CHECK-C-LABEL: define dso_local i64 @test_svcntsb( +// CHECK-C-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test_svcntsb( // CHECK-C-SAME: ) local_unnamed_addr 
#[[ATTR0:[0-9]+]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: ret i64 [[TMP0]] +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-C-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +// CHECK-C-NEXT: ret i64 [[MULSVL]] // -// CHECK-CXX-LABEL: define dso_local noundef i64 @_Z12test_svcntsbv( +// CHECK-CXX-LABEL: define dso_local noundef range(i64 0, -9223372036854775808) i64 @_Z12test_svcntsbv( // CHECK-CXX-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: ret i64 [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-CXX-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +// CHECK-CXX-NEXT: ret i64 [[MULSVL]] // uint64_t test_svcntsb() { return svcntsb(); } -// CHECK-C-LABEL: define dso_local i64 @test_svcntsh( +// CHECK-C-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test_svcntsh( // CHECK-C-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() -// CHECK-C-NEXT: ret i64 [[TMP0]] +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-C-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +// CHECK-C-NEXT: ret i64 [[MULSVL]] // -// CHECK-CXX-LABEL: define dso_local noundef i64 @_Z12test_svcntshv( +// CHECK-CXX-LABEL: define dso_local noundef range(i64 0, -9223372036854775808) i64 @_Z12test_svcntshv( // CHECK-CXX-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() -// CHECK-CXX-NEXT: ret i64 [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-CXX-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +// CHECK-CXX-NEXT: ret i64 [[MULSVL]] // uint64_t 
test_svcntsh() { return svcntsh(); } -// CHECK-C-LABEL: define dso_local i64 @test_svcntsw( +// CHECK-C-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test_svcntsw( // CHECK-C-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() -// CHECK-C-NEXT: ret i64 [[TMP0]] +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-C-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +// CHECK-C-NEXT: ret i64 [[MULSVL]] // -// CHECK-CXX-LABEL: define dso_local noundef i64 @_Z12test_svcntswv( +// CHECK-CXX-LABEL: define dso_local noundef range(i64 0, -9223372036854775808) i64 @_Z12test_svcntswv( // CHECK-CXX-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() -// CHECK-CXX-NEXT: ret i64 [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-CXX-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +// CHECK-CXX-NEXT: ret i64 [[MULSVL]] // uint64_t test_svcntsw() { return svcntsw(); diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c index 9a8ce224bcfd0..dd094e5493a60 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c @@ -1,6386 +1,7130 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -O2 -emit-llvm %s -o - | FileCheck %s #include -// CHECK-LABEL: @xvsll_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvsll_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsll_b(v32i8 _1, v32i8 _2) { return __lasx_xvsll_b(_1, _2); } -// CHECK-LABEL: @xvsll_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsll_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> 
[[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsll_h(v16i16 _1, v16i16 _2) { return __lasx_xvsll_h(_1, _2); } -// CHECK-LABEL: @xvsll_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsll_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsll_w(v8i32 _1, v8i32 _2) { return __lasx_xvsll_w(_1, _2); } -// CHECK-LABEL: @xvsll_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsll_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] 
= load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsll_d(v4i64 _1, v4i64 _2) { return __lasx_xvsll_d(_1, _2); } -// CHECK-LABEL: @xvslli_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslli_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslli_b(v32i8 _1) { return __lasx_xvslli_b(_1, 1); } -// CHECK-LABEL: @xvslli_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslli_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslli_h(v16i16 _1) { return __lasx_xvslli_h(_1, 1); } -// CHECK-LABEL: @xvslli_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslli_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslli_w(v8i32 _1) { return __lasx_xvslli_w(_1, 1); } -// CHECK-LABEL: @xvslli_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslli_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1]], i32 1) -// 
CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslli_d(v4i64 _1) { return __lasx_xvslli_d(_1, 1); } -// CHECK-LABEL: @xvsra_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsra_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsra_b(v32i8 _1, v32i8 _2) { return __lasx_xvsra_b(_1, _2); } -// CHECK-LABEL: @xvsra_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsra_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsra_h(v16i16 _1, v16i16 _2) { return __lasx_xvsra_h(_1, _2); } -// CHECK-LABEL: @xvsra_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsra_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsra_w(v8i32 _1, v8i32 _2) { return __lasx_xvsra_w(_1, _2); } -// CHECK-LABEL: @xvsra_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], 
align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsra_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsra_d(v4i64 _1, v4i64 _2) { return __lasx_xvsra_d(_1, _2); } -// CHECK-LABEL: @xvsrai_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrai_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrai_b(v32i8 _1) { return __lasx_xvsrai_b(_1, 1); 
} -// CHECK-LABEL: @xvsrai_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrai_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrai_h(v16i16 _1) { return __lasx_xvsrai_h(_1, 1); } -// CHECK-LABEL: @xvsrai_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrai_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrai_w(v8i32 _1) { return __lasx_xvsrai_w(_1, 1); } -// CHECK-LABEL: @xvsrai_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrai_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrai_d(v4i64 _1) { return __lasx_xvsrai_d(_1, 1); } -// CHECK-LABEL: @xvsrar_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrar_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrar_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrar_b(_1, _2); 
} -// CHECK-LABEL: @xvsrar_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrar_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrar_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrar_h(_1, _2); } -// CHECK-LABEL: @xvsrar_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrar_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 
32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrar_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrar_w(_1, _2); } -// CHECK-LABEL: @xvsrar_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrar_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrar_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrar_d(_1, _2); } -// CHECK-LABEL: @xvsrari_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrari_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrari_b(v32i8 _1) { return __lasx_xvsrari_b(_1, 1); } -// CHECK-LABEL: @xvsrari_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrari_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrari_h(v16i16 _1) { return __lasx_xvsrari_h(_1, 1); } -// CHECK-LABEL: @xvsrari_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrari_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load 
<8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrari_w(v8i32 _1) { return __lasx_xvsrari_w(_1, 1); } -// CHECK-LABEL: @xvsrari_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrari_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrari_d(v4i64 _1) { return __lasx_xvsrari_d(_1, 1); } -// CHECK-LABEL: @xvsrl_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrl_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrl_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrl_b(_1, _2); } -// CHECK-LABEL: @xvsrl_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrl_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrl_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrl_h(_1, _2); } -// CHECK-LABEL: @xvsrl_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrl_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrl_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrl_w(_1, _2); } -// CHECK-LABEL: @xvsrl_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrl_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: 
store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrl_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrl_d(_1, _2); } -// CHECK-LABEL: @xvsrli_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrli_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrli_b(v32i8 _1) { return __lasx_xvsrli_b(_1, 1); } -// CHECK-LABEL: @xvsrli_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrli_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrli_h(v16i16 
_1) { return __lasx_xvsrli_h(_1, 1); } -// CHECK-LABEL: @xvsrli_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrli_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrli_w(v8i32 _1) { return __lasx_xvsrli_w(_1, 1); } -// CHECK-LABEL: @xvsrli_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrli_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrli_d(v4i64 _1) { return __lasx_xvsrli_d(_1, 1); } -// CHECK-LABEL: @xvsrlr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlr_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrlr_b(_1, _2); } -// CHECK-LABEL: @xvsrlr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x 
i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrlr_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrlr_h(_1, _2); } -// CHECK-LABEL: @xvsrlr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlr_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrlr_w(_1, _2); } -// CHECK-LABEL: @xvsrlr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrlr_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrlr_d(_1, _2); } -// CHECK-LABEL: @xvsrlri_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlri_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlri_b(v32i8 _1) { return __lasx_xvsrlri_b(_1, 1); } -// CHECK-LABEL: @xvsrlri_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlri_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrlri_h(v16i16 _1) { return __lasx_xvsrlri_h(_1, 1); } -// CHECK-LABEL: @xvsrlri_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlri_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlri_w(v8i32 _1) { return __lasx_xvsrlri_w(_1, 1); } -// CHECK-LABEL: @xvsrlri_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlri_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrlri_d(v4i64 _1) { return __lasx_xvsrlri_d(_1, 1); } -// CHECK-LABEL: @xvbitclr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitclr_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitclr_b(_1, _2); } -// CHECK-LABEL: @xvbitclr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclr_h( 
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitclr_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitclr_h(_1, _2); } -// CHECK-LABEL: @xvbitclr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> 
[[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitclr_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitclr_w(_1, _2); } -// CHECK-LABEL: @xvbitclr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitclr_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitclr_d(_1, _2); } -// CHECK-LABEL: @xvbitclri_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclri_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitclri_b(v32u8 _1) { return __lasx_xvbitclri_b(_1, 1); } -// CHECK-LABEL: @xvbitclri_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclri_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitclri_h(v16u16 _1) { return __lasx_xvbitclri_h(_1, 1); } -// CHECK-LABEL: @xvbitclri_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclri_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitclri_w(v8u32 _1) { return __lasx_xvbitclri_w(_1, 1); } -// CHECK-LABEL: @xvbitclri_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclri_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitclri_d(v4u64 _1) { return __lasx_xvbitclri_d(_1, 1); } -// CHECK-LABEL: @xvbitset_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitset_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitset_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitset_b(_1, _2); } -// CHECK-LABEL: @xvbitset_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitset_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitset_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitset_h(_1, _2); } -// CHECK-LABEL: @xvbitset_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitset_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitset_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitset_w(_1, _2); } -// CHECK-LABEL: @xvbitset_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitset_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitset_d(v4u64 _1, 
v4u64 _2) { return __lasx_xvbitset_d(_1, _2); } -// CHECK-LABEL: @xvbitseti_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseti_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitseti_b(v32u8 _1) { return __lasx_xvbitseti_b(_1, 1); } -// CHECK-LABEL: @xvbitseti_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseti_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitseti_h(v16u16 _1) { return __lasx_xvbitseti_h(_1, 1); } -// CHECK-LABEL: @xvbitseti_w( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseti_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitseti_w(v8u32 _1) { return __lasx_xvbitseti_w(_1, 1); } -// CHECK-LABEL: @xvbitseti_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseti_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitseti_d(v4u64 _1) { return __lasx_xvbitseti_d(_1, 1); } -// CHECK-LABEL: @xvbitrev_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] 
= load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrev_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitrev_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitrev_b(_1, _2); } -// CHECK-LABEL: @xvbitrev_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrev_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store 
<16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitrev_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitrev_h(_1, _2); } -// CHECK-LABEL: @xvbitrev_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrev_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitrev_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitrev_w(_1, _2); } -// CHECK-LABEL: @xvbitrev_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrev_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitrev_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitrev_d(_1, _2); } -// CHECK-LABEL: @xvbitrevi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrevi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitrevi_b(v32u8 _1) { return __lasx_xvbitrevi_b(_1, 1); } -// CHECK-LABEL: @xvbitrevi_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrevi_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitrevi_h(v16u16 _1) { return __lasx_xvbitrevi_h(_1, 1); } -// CHECK-LABEL: @xvbitrevi_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrevi_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitrevi_w(v8u32 _1) { return __lasx_xvbitrevi_w(_1, 1); } -// CHECK-LABEL: @xvbitrevi_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrevi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] 
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitrevi_d(v4u64 _1) { return __lasx_xvbitrevi_d(_1, 1); } -// CHECK-LABEL: @xvadd_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvadd_b(v32i8 _1, v32i8 _2) { return __lasx_xvadd_b(_1, _2); } -// CHECK-LABEL: @xvadd_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_h( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvadd_h(v16i16 _1, v16i16 _2) { return __lasx_xvadd_h(_1, _2); } -// CHECK-LABEL: @xvadd_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] 
// CHECK-NEXT: ret void // v8i32 xvadd_w(v8i32 _1, v8i32 _2) { return __lasx_xvadd_w(_1, _2); } -// CHECK-LABEL: @xvadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvadd_d(v4i64 _1, v4i64 _2) { return __lasx_xvadd_d(_1, _2); } -// CHECK-LABEL: @xvaddi_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddi_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvaddi_bu(v32i8 _1) { return __lasx_xvaddi_bu(_1, 1); } -// CHECK-LABEL: @xvaddi_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddi_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddi_hu(v16i16 _1) { return __lasx_xvaddi_hu(_1, 1); } -// CHECK-LABEL: @xvaddi_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddi_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddi_wu(v8i32 _1) { return __lasx_xvaddi_wu(_1, 1); } -// CHECK-LABEL: @xvaddi_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddi_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddi_du(v4i64 _1) { return __lasx_xvaddi_du(_1, 1); } -// CHECK-LABEL: @xvsub_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], 
ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsub_b(v32i8 _1, v32i8 _2) { return __lasx_xvsub_b(_1, _2); } -// CHECK-LABEL: @xvsub_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsub_h(v16i16 _1, v16i16 _2) { return __lasx_xvsub_h(_1, _2); } -// CHECK-LABEL: @xvsub_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsub_w(v8i32 _1, v8i32 _2) { return __lasx_xvsub_w(_1, _2); } -// CHECK-LABEL: @xvsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsub_d(v4i64 _1, v4i64 _2) { return __lasx_xvsub_d(_1, _2); } -// CHECK-LABEL: @xvsubi_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvsubi_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsubi_bu(v32i8 _1) { return __lasx_xvsubi_bu(_1, 1); } -// CHECK-LABEL: @xvsubi_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubi_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubi_hu(v16i16 _1) { return __lasx_xvsubi_hu(_1, 1); } -// CHECK-LABEL: @xvsubi_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubi_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) 
align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubi_wu(v8i32 _1) { return __lasx_xvsubi_wu(_1, 1); } -// CHECK-LABEL: @xvsubi_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubi_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubi_du(v4i64 _1) { return __lasx_xvsubi_du(_1, 1); } -// CHECK-LABEL: @xvmax_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmax_b(v32i8 _1, v32i8 _2) { return __lasx_xvmax_b(_1, _2); } -// CHECK-LABEL: @xvmax_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmax_h(v16i16 _1, v16i16 _2) { return 
__lasx_xvmax_h(_1, _2); } -// CHECK-LABEL: @xvmax_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmax_w(v8i32 _1, v8i32 _2) { return __lasx_xvmax_w(_1, _2); } -// CHECK-LABEL: @xvmax_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmax_d(v4i64 _1, v4i64 _2) { return __lasx_xvmax_d(_1, _2); } -// CHECK-LABEL: @xvmaxi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmaxi_b(v32i8 _1) { return __lasx_xvmaxi_b(_1, 1); } -// CHECK-LABEL: @xvmaxi_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1]], 
i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaxi_h(v16i16 _1) { return __lasx_xvmaxi_h(_1, 1); } -// CHECK-LABEL: @xvmaxi_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaxi_w(v8i32 _1) { return __lasx_xvmaxi_w(_1, 1); } -// CHECK-LABEL: @xvmaxi_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], 
ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaxi_d(v4i64 _1) { return __lasx_xvmaxi_d(_1, 1); } -// CHECK-LABEL: @xvmax_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmax_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmax_bu(_1, _2); } -// CHECK-LABEL: @xvmax_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load 
<16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmax_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmax_hu(_1, _2); } -// CHECK-LABEL: @xvmax_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmax_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmax_wu(_1, _2); } -// CHECK-LABEL: @xvmax_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvmax_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmax_du(v4u64 _1, v4u64 _2) { return __lasx_xvmax_du(_1, _2); } -// CHECK-LABEL: @xvmaxi_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmaxi_bu(v32u8 _1) { return __lasx_xvmaxi_bu(_1, 1); } -// CHECK-LABEL: @xvmaxi_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmaxi_hu(v16u16 _1) { return __lasx_xvmaxi_hu(_1, 1); } -// CHECK-LABEL: @xvmaxi_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmaxi_wu(v8u32 _1) { return __lasx_xvmaxi_wu(_1, 1); } -// CHECK-LABEL: @xvmaxi_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_du( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaxi_du(v4u64 _1) { return __lasx_xvmaxi_du(_1, 1); } -// CHECK-LABEL: @xvmin_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmin_b(v32i8 _1, v32i8 _2) { return __lasx_xvmin_b(_1, _2); } -// CHECK-LABEL: @xvmin_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x 
i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmin_h(v16i16 _1, v16i16 _2) { return __lasx_xvmin_h(_1, _2); } -// CHECK-LABEL: @xvmin_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmin_w(v8i32 _1, v8i32 _2) { return __lasx_xvmin_w(_1, _2); } -// CHECK-LABEL: @xvmin_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmin_d(v4i64 _1, v4i64 _2) { return __lasx_xvmin_d(_1, _2); } -// CHECK-LABEL: @xvmini_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] 
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmini_b(v32i8 _1) { return __lasx_xvmini_b(_1, 1); } -// CHECK-LABEL: @xvmini_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmini_h(v16i16 _1) { return __lasx_xvmini_h(_1, 1); } -// CHECK-LABEL: @xvmini_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x 
i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmini_w(v8i32 _1) { return __lasx_xvmini_w(_1, 1); } -// CHECK-LABEL: @xvmini_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmini_d(v4i64 _1) { return __lasx_xvmini_d(_1, 1); } -// CHECK-LABEL: @xvmin_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = 
load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmin_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmin_bu(_1, _2); } -// CHECK-LABEL: @xvmin_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmin_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmin_hu(_1, _2); } -// CHECK-LABEL: @xvmin_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_wu( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmin_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmin_wu(_1, _2); } -// CHECK-LABEL: @xvmin_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
ret void // v4u64 xvmin_du(v4u64 _1, v4u64 _2) { return __lasx_xvmin_du(_1, _2); } -// CHECK-LABEL: @xvmini_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmini_bu(v32u8 _1) { return __lasx_xvmini_bu(_1, 1); } -// CHECK-LABEL: @xvmini_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmini_hu(v16u16 _1) { return __lasx_xvmini_hu(_1, 1); } -// CHECK-LABEL: @xvmini_wu( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmini_wu(v8u32 _1) { return __lasx_xvmini_wu(_1, 1); } -// CHECK-LABEL: @xvmini_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmini_du(v4u64 _1) { return __lasx_xvmini_du(_1, 1); } -// CHECK-LABEL: @xvseq_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: 
[[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseq_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvseq_b(v32i8 _1, v32i8 _2) { return __lasx_xvseq_b(_1, _2); } -// CHECK-LABEL: @xvseq_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseq_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x 
i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvseq_h(v16i16 _1, v16i16 _2) { return __lasx_xvseq_h(_1, _2); } -// CHECK-LABEL: @xvseq_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseq_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvseq_w(v8i32 _1, v8i32 _2) { return __lasx_xvseq_w(_1, _2); } -// CHECK-LABEL: @xvseq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvseq_d(v4i64 _1, v4i64 _2) { return __lasx_xvseq_d(_1, _2); } -// CHECK-LABEL: @xvseqi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseqi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvseqi_b(v32i8 _1) { return __lasx_xvseqi_b(_1, 1); } -// CHECK-LABEL: @xvseqi_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseqi_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvseqi_h(v16i16 _1) { return __lasx_xvseqi_h(_1, 1); } -// CHECK-LABEL: @xvseqi_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseqi_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvseqi_w(v8i32 _1) { return __lasx_xvseqi_w(_1, 1); } -// CHECK-LABEL: @xvseqi_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseqi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvseqi_d(v4i64 _1) { return __lasx_xvseqi_d(_1, 1); } -// CHECK-LABEL: @xvslt_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslt_b(v32i8 _1, v32i8 _2) { return __lasx_xvslt_b(_1, _2); } -// CHECK-LABEL: @xvslt_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslt_h(v16i16 _1, v16i16 _2) { return __lasx_xvslt_h(_1, _2); } -// CHECK-LABEL: @xvslt_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslt_w(v8i32 _1, v8i32 _2) { return __lasx_xvslt_w(_1, 
_2); } -// CHECK-LABEL: @xvslt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslt_d(v4i64 _1, v4i64 _2) { return __lasx_xvslt_d(_1, _2); } -// CHECK-LABEL: @xvslti_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslti_b(v32i8 _1) { return __lasx_xvslti_b(_1, 1); } -// CHECK-LABEL: @xvslti_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslti_h(v16i16 _1) { return __lasx_xvslti_h(_1, 1); } -// CHECK-LABEL: @xvslti_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslti_w(v8i32 _1) { return __lasx_xvslti_w(_1, 1); } 
-// CHECK-LABEL: @xvslti_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslti_d(v4i64 _1) { return __lasx_xvslti_d(_1, 1); } -// CHECK-LABEL: @xvslt_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslt_bu(v32u8 _1, v32u8 _2) { return __lasx_xvslt_bu(_1, _2); } -// CHECK-LABEL: @xvslt_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslt_hu(v16u16 _1, v16u16 _2) { return __lasx_xvslt_hu(_1, _2); } -// CHECK-LABEL: @xvslt_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslt_wu(v8u32 _1, v8u32 _2) { return __lasx_xvslt_wu(_1, _2); } -// CHECK-LABEL: @xvslt_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslt_du(v4u64 _1, v4u64 _2) { return __lasx_xvslt_du(_1, _2); } -// CHECK-LABEL: @xvslti_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_bu( +// CHECK-SAME: ptr dead_on_unwind noalias 
writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslti_bu(v32u8 _1) { return __lasx_xvslti_bu(_1, 1); } -// CHECK-LABEL: @xvslti_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslti_hu(v16u16 _1) { return __lasx_xvslti_hu(_1, 1); } -// CHECK-LABEL: @xvslti_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef 
readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslti_wu(v8u32 _1) { return __lasx_xvslti_wu(_1, 1); } -// CHECK-LABEL: @xvslti_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslti_du(v4u64 _1) { return __lasx_xvslti_du(_1, 1); } -// CHECK-LABEL: @xvsle_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsle_b(v32i8 _1, v32i8 _2) { return __lasx_xvsle_b(_1, _2); } -// CHECK-LABEL: @xvsle_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsle_h(v16i16 _1, v16i16 _2) { return __lasx_xvsle_h(_1, _2); } -// CHECK-LABEL: @xvsle_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] 
= load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsle_w(v8i32 _1, v8i32 _2) { return __lasx_xvsle_w(_1, _2); } -// CHECK-LABEL: @xvsle_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x 
i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsle_d(v4i64 _1, v4i64 _2) { return __lasx_xvsle_d(_1, _2); } -// CHECK-LABEL: @xvslei_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslei_b(v32i8 _1) { return __lasx_xvslei_b(_1, 1); } -// CHECK-LABEL: @xvslei_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslei_h(v16i16 _1) { return __lasx_xvslei_h(_1, 1); } -// CHECK-LABEL: @xvslei_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslei_w(v8i32 _1) { return __lasx_xvslei_w(_1, 1); } -// CHECK-LABEL: @xvslei_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslei_d(v4i64 _1) { 
return __lasx_xvslei_d(_1, 1); } -// CHECK-LABEL: @xvsle_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsle_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsle_bu(_1, _2); } -// CHECK-LABEL: @xvsle_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, 
ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsle_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsle_hu(_1, _2); } -// CHECK-LABEL: @xvsle_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsle_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsle_wu(_1, _2); } -// CHECK-LABEL: @xvsle_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsle_du(v4u64 _1, v4u64 _2) { return __lasx_xvsle_du(_1, _2); } -// CHECK-LABEL: @xvslei_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslei_bu(v32u8 _1) { return __lasx_xvslei_bu(_1, 1); } -// CHECK-LABEL: @xvslei_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_hu( +// 
CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslei_hu(v16u16 _1) { return __lasx_xvslei_hu(_1, 1); } -// CHECK-LABEL: @xvslei_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslei_wu(v8u32 _1) { return __lasx_xvslei_wu(_1, 1); } -// CHECK-LABEL: @xvslei_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslei_du(v4u64 _1) { return __lasx_xvslei_du(_1, 1); } -// CHECK-LABEL: @xvsat_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsat_b(v32i8 _1) { return __lasx_xvsat_b(_1, 1); } -// CHECK-LABEL: @xvsat_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] 
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsat_h(v16i16 _1) { return __lasx_xvsat_h(_1, 1); } -// CHECK-LABEL: @xvsat_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsat_w(v8i32 _1) { return __lasx_xvsat_w(_1, 1); } -// CHECK-LABEL: @xvsat_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsat_d(v4i64 _1) { return __lasx_xvsat_d(_1, 1); } -// CHECK-LABEL: @xvsat_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvsat_bu(v32u8 _1) { return __lasx_xvsat_bu(_1, 1); } -// CHECK-LABEL: @xvsat_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvsat_hu(v16u16 _1) { return __lasx_xvsat_hu(_1, 1); } -// CHECK-LABEL: @xvsat_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvsat_wu(v8u32 _1) { return __lasx_xvsat_wu(_1, 1); } -// CHECK-LABEL: @xvsat_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 
xvsat_du(v4u64 _1) { return __lasx_xvsat_du(_1, 1); } -// CHECK-LABEL: @xvadda_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadda_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvadda_b(v32i8 _1, v32i8 _2) { return __lasx_xvadda_b(_1, _2); } -// CHECK-LABEL: @xvadda_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadda_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvadda_h(v16i16 _1, v16i16 _2) { return __lasx_xvadda_h(_1, _2); } -// CHECK-LABEL: @xvadda_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadda_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvadda_w(v8i32 _1, v8i32 _2) { return __lasx_xvadda_w(_1, _2); } -// CHECK-LABEL: @xvadda_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadda_d( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvadda_d(v4i64 _1, v4i64 _2) { return __lasx_xvadda_d(_1, _2); } -// CHECK-LABEL: @xvsadd_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsadd_b(v32i8 _1, v32i8 _2) { return __lasx_xvsadd_b(_1, _2); } -// CHECK-LABEL: @xvsadd_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsadd_h(v16i16 _1, v16i16 _2) { return __lasx_xvsadd_h(_1, _2); } -// CHECK-LABEL: @xvsadd_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsadd_w(v8i32 _1, v8i32 _2) { return __lasx_xvsadd_w(_1, _2); } -// CHECK-LABEL: @xvsadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsadd_d(v4i64 _1, v4i64 _2) { return __lasx_xvsadd_d(_1, _2); } -// CHECK-LABEL: @xvsadd_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local 
void @xvsadd_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvsadd_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsadd_bu(_1, _2); } -// CHECK-LABEL: @xvsadd_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x 
i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvsadd_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsadd_hu(_1, _2); } -// CHECK-LABEL: @xvsadd_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvsadd_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsadd_wu(_1, _2); } -// CHECK-LABEL: @xvsadd_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvsadd_du(v4u64 _1, v4u64 _2) { return __lasx_xvsadd_du(_1, _2); } -// CHECK-LABEL: @xvavg_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvavg_b(v32i8 _1, v32i8 _2) { return __lasx_xvavg_b(_1, _2); } -// CHECK-LABEL: @xvavg_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvavg_h(v16i16 _1, v16i16 _2) { return __lasx_xvavg_h(_1, _2); } -// CHECK-LABEL: @xvavg_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvavg_w(v8i32 _1, v8i32 _2) { return __lasx_xvavg_w(_1, _2); } -// CHECK-LABEL: @xvavg_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvavg_d(v4i64 _1, v4i64 _2) { return __lasx_xvavg_d(_1, _2); } -// CHECK-LABEL: @xvavg_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvavg_bu(v32u8 _1, v32u8 _2) { return __lasx_xvavg_bu(_1, _2); } -// CHECK-LABEL: @xvavg_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvavg_hu(v16u16 _1, v16u16 _2) { return __lasx_xvavg_hu(_1, _2); } -// CHECK-LABEL: @xvavg_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 
x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvavg_wu(v8u32 _1, v8u32 _2) { return __lasx_xvavg_wu(_1, _2); } -// CHECK-LABEL: @xvavg_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvavg_du(v4u64 _1, v4u64 _2) { return __lasx_xvavg_du(_1, _2); } -// CHECK-LABEL: @xvavgr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvavgr_b(v32i8 _1, v32i8 _2) { return __lasx_xvavgr_b(_1, _2); } -// CHECK-LABEL: @xvavgr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvavgr_h(v16i16 _1, v16i16 _2) { return __lasx_xvavgr_h(_1, _2); } -// CHECK-LABEL: @xvavgr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvavgr_w(v8i32 _1, v8i32 _2) { return __lasx_xvavgr_w(_1, _2); } -// CHECK-LABEL: @xvavgr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// 
CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvavgr_d(v4i64 _1, v4i64 _2) { return __lasx_xvavgr_d(_1, _2); } -// CHECK-LABEL: @xvavgr_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 
x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvavgr_bu(v32u8 _1, v32u8 _2) { return __lasx_xvavgr_bu(_1, _2); } -// CHECK-LABEL: @xvavgr_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvavgr_hu(v16u16 _1, v16u16 _2) { return __lasx_xvavgr_hu(_1, _2); } -// CHECK-LABEL: @xvavgr_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvavgr_wu(v8u32 _1, v8u32 _2) { return __lasx_xvavgr_wu(_1, _2); } -// CHECK-LABEL: @xvavgr_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvavgr_du(v4u64 _1, v4u64 _2) { return __lasx_xvavgr_du(_1, _2); } -// CHECK-LABEL: @xvssub_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssub_b(v32i8 _1, v32i8 _2) { return __lasx_xvssub_b(_1, _2); } -// CHECK-LABEL: @xvssub_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x 
i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssub_h(v16i16 _1, v16i16 _2) { return __lasx_xvssub_h(_1, _2); } -// CHECK-LABEL: @xvssub_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssub_w(v8i32 _1, v8i32 _2) { return __lasx_xvssub_w(_1, _2); } -// CHECK-LABEL: @xvssub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssub_d(v4i64 _1, v4i64 _2) { return __lasx_xvssub_d(_1, _2); } -// CHECK-LABEL: @xvssub_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssub_bu(v32u8 _1, v32u8 _2) { return __lasx_xvssub_bu(_1, _2); } -// CHECK-LABEL: @xvssub_hu( -// CHECK-NEXT: entry: 
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssub_hu(v16u16 _1, v16u16 _2) { return __lasx_xvssub_hu(_1, _2); } -// CHECK-LABEL: @xvssub_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssub_wu(v8u32 _1, v8u32 _2) { return __lasx_xvssub_wu(_1, _2); } -// CHECK-LABEL: @xvssub_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssub_du(v4u64 _1, v4u64 _2) { return __lasx_xvssub_du(_1, _2); } -// CHECK-LABEL: @xvabsd_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvabsd_b(v32i8 _1, v32i8 _2) { return __lasx_xvabsd_b(_1, _2); } -// CHECK-LABEL: @xvabsd_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvabsd_h(v16i16 _1, v16i16 _2) { return 
__lasx_xvabsd_h(_1, _2); } -// CHECK-LABEL: @xvabsd_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvabsd_w(v8i32 _1, v8i32 _2) { return __lasx_xvabsd_w(_1, _2); } -// CHECK-LABEL: @xvabsd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvabsd_d(v4i64 _1, v4i64 _2) { return __lasx_xvabsd_d(_1, _2); } -// CHECK-LABEL: @xvabsd_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvabsd_bu(v32u8 _1, v32u8 _2) { return __lasx_xvabsd_bu(_1, _2); } -// CHECK-LABEL: @xvabsd_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x 
i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvabsd_hu(v16u16 _1, v16u16 _2) { return __lasx_xvabsd_hu(_1, _2); } -// CHECK-LABEL: @xvabsd_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // 
v8u32 xvabsd_wu(v8u32 _1, v8u32 _2) { return __lasx_xvabsd_wu(_1, _2); } -// CHECK-LABEL: @xvabsd_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvabsd_du(v4u64 _1, v4u64 _2) { return __lasx_xvabsd_du(_1, _2); } -// CHECK-LABEL: @xvmul_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmul_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmul_b(v32i8 _1, v32i8 _2) { return __lasx_xvmul_b(_1, _2); } -// CHECK-LABEL: @xvmul_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmul_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmul_h(v16i16 _1, v16i16 _2) { return __lasx_xvmul_h(_1, _2); } -// CHECK-LABEL: @xvmul_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmul_w( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmul_w(v8i32 _1, v8i32 _2) { return __lasx_xvmul_w(_1, _2); } -// CHECK-LABEL: @xvmul_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmul_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v4i64 xvmul_d(v4i64 _1, v4i64 _2) { return __lasx_xvmul_d(_1, _2); } -// CHECK-LABEL: @xvmadd_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmadd_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmadd_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmadd_b(_1, _2, _3); } -// CHECK-LABEL: @xvmadd_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmadd_h( +// CHECK-SAME: ptr dead_on_unwind noalias 
writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmadd_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmadd_h(_1, _2, _3); } -// CHECK-LABEL: @xvmadd_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmadd_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, 
ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmadd_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmadd_w(_1, _2, _3); } -// CHECK-LABEL: @xvmadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmadd_d(v4i64 _1, v4i64 _2, v4i64 
_3) { return __lasx_xvmadd_d(_1, _2, _3); } -// CHECK-LABEL: @xvmsub_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsub_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmsub_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmsub_b(_1, _2, _3); } -// CHECK-LABEL: @xvmsub_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsub_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmsub_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmsub_h(_1, _2, _3); } -// CHECK-LABEL: @xvmsub_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsub_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] 
+// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmsub_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmsub_w(_1, _2, _3); } -// CHECK-LABEL: @xvmsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmsub_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmsub_d(_1, _2, _3); } 
-// CHECK-LABEL: @xvdiv_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvdiv_b(v32i8 _1, v32i8 _2) { return __lasx_xvdiv_b(_1, _2); } -// CHECK-LABEL: @xvdiv_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvdiv_h(v16i16 _1, v16i16 _2) { return __lasx_xvdiv_h(_1, _2); } -// CHECK-LABEL: @xvdiv_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvdiv_w(v8i32 _1, v8i32 _2) { return __lasx_xvdiv_w(_1, _2); } -// CHECK-LABEL: @xvdiv_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvdiv_d(v4i64 _1, v4i64 _2) { return __lasx_xvdiv_d(_1, _2); } -// CHECK-LABEL: @xvdiv_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvdiv_bu(v32u8 _1, v32u8 _2) { 
return __lasx_xvdiv_bu(_1, _2); } -// CHECK-LABEL: @xvdiv_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvdiv_hu(v16u16 _1, v16u16 _2) { return __lasx_xvdiv_hu(_1, _2); } -// CHECK-LABEL: @xvdiv_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load 
<8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvdiv_wu(v8u32 _1, v8u32 _2) { return __lasx_xvdiv_wu(_1, _2); } -// CHECK-LABEL: @xvdiv_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvdiv_du(v4u64 _1, v4u64 _2) { return __lasx_xvdiv_du(_1, _2); } -// CHECK-LABEL: @xvhaddw_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvhaddw_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvhaddw_h_b(_1, _2); } -// CHECK-LABEL: @xvhaddw_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvhaddw_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvhaddw_w_h(_1, _2); } -// CHECK-LABEL: @xvhaddw_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhaddw_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvhaddw_d_w(_1, _2); } -// CHECK-LABEL: @xvhaddw_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] 
= load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvhaddw_hu_bu(v32u8 _1, v32u8 _2) { return __lasx_xvhaddw_hu_bu(_1, _2); } -// CHECK-LABEL: @xvhaddw_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvhaddw_wu_hu(v16u16 _1, v16u16 _2) { return __lasx_xvhaddw_wu_hu(_1, _2); } -// CHECK-LABEL: @xvhaddw_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load 
<8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvhaddw_du_wu(v8u32 _1, v8u32 _2) { return __lasx_xvhaddw_du_wu(_1, _2); } -// CHECK-LABEL: @xvhsubw_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: 
store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvhsubw_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvhsubw_h_b(_1, _2); } -// CHECK-LABEL: @xvhsubw_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvhsubw_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvhsubw_w_h(_1, _2); } -// CHECK-LABEL: @xvhsubw_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhsubw_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvhsubw_d_w(_1, _2); } -// CHECK-LABEL: @xvhsubw_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvhsubw_hu_bu(v32u8 _1, v32u8 _2) { return __lasx_xvhsubw_hu_bu(_1, _2); } -// CHECK-LABEL: 
@xvhsubw_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvhsubw_wu_hu(v16u16 _1, v16u16 _2) { return __lasx_xvhsubw_wu_hu(_1, _2); } -// CHECK-LABEL: @xvhsubw_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr 
[[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhsubw_du_wu(v8u32 _1, v8u32 _2) { return __lasx_xvhsubw_du_wu(_1, _2); } -// CHECK-LABEL: @xvmod_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmod_b(v32i8 _1, v32i8 _2) { return __lasx_xvmod_b(_1, _2); } -// CHECK-LABEL: @xvmod_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmod_h(v16i16 _1, v16i16 _2) { return __lasx_xvmod_h(_1, _2); } -// CHECK-LABEL: @xvmod_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
ret void // v8i32 xvmod_w(v8i32 _1, v8i32 _2) { return __lasx_xvmod_w(_1, _2); } -// CHECK-LABEL: @xvmod_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmod_d(v4i64 _1, v4i64 _2) { return __lasx_xvmod_d(_1, _2); } -// CHECK-LABEL: @xvmod_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmod_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmod_bu(_1, _2); } -// CHECK-LABEL: @xvmod_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmod_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmod_hu(_1, _2); } -// CHECK-LABEL: @xvmod_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_wu( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmod_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmod_wu(_1, _2); } -// CHECK-LABEL: @xvmod_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v4u64 xvmod_du(v4u64 _1, v4u64 _2) { return __lasx_xvmod_du(_1, _2); } -// CHECK-LABEL: @xvrepl128vei_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrepl128vei_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvrepl128vei_b(v32i8 _1) { return __lasx_xvrepl128vei_b(_1, 1); } -// CHECK-LABEL: @xvrepl128vei_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrepl128vei_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvrepl128vei_h(v16i16 _1) { return 
__lasx_xvrepl128vei_h(_1, 1); } -// CHECK-LABEL: @xvrepl128vei_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrepl128vei_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvrepl128vei_w(v8i32 _1) { return __lasx_xvrepl128vei_w(_1, 1); } -// CHECK-LABEL: @xvrepl128vei_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrepl128vei_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvrepl128vei_d(v4i64 _1) { return __lasx_xvrepl128vei_d(_1, 1); } -// CHECK-LABEL: @xvpickev_b( -// CHECK-NEXT: entry: 
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickev_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpickev_b(v32i8 _1, v32i8 _2) { return __lasx_xvpickev_b(_1, _2); } -// CHECK-LABEL: @xvpickev_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickev_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpickev_h(v16i16 _1, v16i16 _2) { return __lasx_xvpickev_h(_1, _2); } -// CHECK-LABEL: @xvpickev_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickev_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpickev_w(v8i32 _1, v8i32 _2) { return __lasx_xvpickev_w(_1, _2); } -// CHECK-LABEL: @xvpickev_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickev_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpickev_d(v4i64 _1, v4i64 _2) { return __lasx_xvpickev_d(_1, _2); } -// CHECK-LABEL: @xvpickod_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickod_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpickod_b(v32i8 _1, 
v32i8 _2) { return __lasx_xvpickod_b(_1, _2); } -// CHECK-LABEL: @xvpickod_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickod_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpickod_h(v16i16 _1, v16i16 _2) { return __lasx_xvpickod_h(_1, _2); } -// CHECK-LABEL: @xvpickod_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickod_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpickod_w(v8i32 _1, v8i32 _2) { return __lasx_xvpickod_w(_1, _2); } -// CHECK-LABEL: @xvpickod_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickod_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpickod_d(v4i64 _1, v4i64 _2) { return __lasx_xvpickod_d(_1, _2); } -// CHECK-LABEL: @xvilvh_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvh_b( +// CHECK-SAME: 
ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvilvh_b(v32i8 _1, v32i8 _2) { return __lasx_xvilvh_b(_1, _2); } -// CHECK-LABEL: @xvilvh_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvh_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 
32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvilvh_h(v16i16 _1, v16i16 _2) { return __lasx_xvilvh_h(_1, _2); } -// CHECK-LABEL: @xvilvh_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvh_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvilvh_w(v8i32 _1, v8i32 _2) { return __lasx_xvilvh_w(_1, _2); } -// CHECK-LABEL: @xvilvh_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvh_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvilvh_d(v4i64 _1, v4i64 _2) { return __lasx_xvilvh_d(_1, _2); } -// CHECK-LABEL: @xvilvl_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvl_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvilvl_b(v32i8 _1, v32i8 _2) { return __lasx_xvilvl_b(_1, _2); } -// CHECK-LABEL: @xvilvl_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define 
dso_local void @xvilvl_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvilvl_h(v16i16 _1, v16i16 _2) { return __lasx_xvilvl_h(_1, _2); } -// CHECK-LABEL: @xvilvl_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvl_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x 
i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvilvl_w(v8i32 _1, v8i32 _2) { return __lasx_xvilvl_w(_1, _2); } -// CHECK-LABEL: @xvilvl_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvl_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvilvl_d(v4i64 _1, v4i64 _2) { return __lasx_xvilvl_d(_1, _2); } -// CHECK-LABEL: @xvpackev_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackev_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpackev_b(v32i8 _1, v32i8 _2) { return __lasx_xvpackev_b(_1, _2); } -// CHECK-LABEL: @xvpackev_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackev_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpackev_h(v16i16 _1, v16i16 _2) { return __lasx_xvpackev_h(_1, _2); } -// CHECK-LABEL: @xvpackev_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackev_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpackev_w(v8i32 _1, v8i32 _2) { return __lasx_xvpackev_w(_1, _2); } -// CHECK-LABEL: @xvpackev_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackev_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpackev_d(v4i64 _1, v4i64 _2) { return __lasx_xvpackev_d(_1, _2); } -// CHECK-LABEL: @xvpackod_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackod_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpackod_b(v32i8 _1, v32i8 _2) { return __lasx_xvpackod_b(_1, _2); } -// CHECK-LABEL: @xvpackod_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackod_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpackod_h(v16i16 _1, v16i16 _2) { return __lasx_xvpackod_h(_1, _2); } -// CHECK-LABEL: @xvpackod_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackod_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpackod_w(v8i32 _1, v8i32 _2) { return __lasx_xvpackod_w(_1, _2); } -// CHECK-LABEL: @xvpackod_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 
x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackod_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpackod_d(v4i64 _1, v4i64 _2) { return __lasx_xvpackod_d(_1, _2); } -// CHECK-LABEL: @xvshuf_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvshuf_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvshuf_b(_1, _2, _3); } -// CHECK-LABEL: @xvshuf_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v16i16 xvshuf_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvshuf_h(_1, _2, _3); } -// CHECK-LABEL: @xvshuf_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvshuf_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvshuf_w(_1, _2, _3); } -// CHECK-LABEL: @xvshuf_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf_d( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvshuf_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvshuf_d(_1, _2, _3); } -// CHECK-LABEL: @xvand_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvand_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) 
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvand_v(v32u8 _1, v32u8 _2) { return __lasx_xvand_v(_1, _2); } -// CHECK-LABEL: @xvandi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvandi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvandi_b(v32u8 _1) { return __lasx_xvandi_b(_1, 1); } -// CHECK-LABEL: @xvor_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvor_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvor_v(v32u8 _1, v32u8 _2) { return __lasx_xvor_v(_1, _2); } -// CHECK-LABEL: @xvori_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvori_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvori_b(v32u8 _1) { return __lasx_xvori_b(_1, 1); } -// CHECK-LABEL: @xvnor_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvnor_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, 
ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvnor_v(v32u8 _1, v32u8 _2) { return __lasx_xvnor_v(_1, _2); } -// CHECK-LABEL: @xvnori_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvnori_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvnori_b(v32u8 _1) { return __lasx_xvnori_b(_1, 1); } -// CHECK-LABEL: @xvxor_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvxor_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvxor_v(v32u8 _1, v32u8 _2) { return __lasx_xvxor_v(_1, _2); } -// CHECK-LABEL: @xvxori_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvxori_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvxori_b(v32u8 _1) { return __lasx_xvxori_b(_1, 1); } -// CHECK-LABEL: @xvbitsel_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitsel_v( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitsel_v(v32u8 _1, v32u8 _2, v32u8 _3) { return __lasx_xvbitsel_v(_1, _2, _3); } -// CHECK-LABEL: @xvbitseli_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseli_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1]], 
<32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitseli_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitseli_b(_1, _2, 1); } -// CHECK-LABEL: @xvshuf4i_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf4i_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvshuf4i_b(v32i8 _1) { return __lasx_xvshuf4i_b(_1, 1); } -// CHECK-LABEL: @xvshuf4i_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf4i_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvshuf4i_h(v16i16 _1) { return __lasx_xvshuf4i_h(_1, 1); } -// CHECK-LABEL: @xvshuf4i_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf4i_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvshuf4i_w(v8i32 _1) { return __lasx_xvshuf4i_w(_1, 1); } -// CHECK-LABEL: @xvreplgr2vr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1:%.*]]) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplgr2vr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvreplgr2vr_b(int _1) { return __lasx_xvreplgr2vr_b(_1); } -// CHECK-LABEL: 
@xvreplgr2vr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1:%.*]]) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplgr2vr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvreplgr2vr_h(int _1) { return __lasx_xvreplgr2vr_h(_1); } -// CHECK-LABEL: @xvreplgr2vr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1:%.*]]) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplgr2vr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvreplgr2vr_w(int _1) { return __lasx_xvreplgr2vr_w(_1); } -// CHECK-LABEL: @xvreplgr2vr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1:%.*]] to i64 +// CHECK-LABEL: define dso_local void @xvreplgr2vr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) 
local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1]] to i64 // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 [[CONV]]) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvreplgr2vr_d(int _1) { return __lasx_xvreplgr2vr_d(_1); } -// CHECK-LABEL: @xvpcnt_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpcnt_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpcnt_b(v32i8 _1) { return __lasx_xvpcnt_b(_1); } -// CHECK-LABEL: @xvpcnt_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpcnt_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] 
= tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpcnt_h(v16i16 _1) { return __lasx_xvpcnt_h(_1); } -// CHECK-LABEL: @xvpcnt_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpcnt_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpcnt_w(v8i32 _1) { return __lasx_xvpcnt_w(_1); } -// CHECK-LABEL: @xvpcnt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpcnt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpcnt_d(v4i64 _1) { return __lasx_xvpcnt_d(_1); } -// CHECK-LABEL: @xvclo_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclo_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvclo_b(v32i8 _1) { return __lasx_xvclo_b(_1); } -// CHECK-LABEL: @xvclo_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclo_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvclo_h(v16i16 
_1) { return __lasx_xvclo_h(_1); } -// CHECK-LABEL: @xvclo_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclo_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvclo_w(v8i32 _1) { return __lasx_xvclo_w(_1); } -// CHECK-LABEL: @xvclo_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclo_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvclo_d(v4i64 _1) { return __lasx_xvclo_d(_1); } -// CHECK-LABEL: @xvclz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclz_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvclz_b(v32i8 _1) { return __lasx_xvclz_b(_1); } -// CHECK-LABEL: @xvclz_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclz_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvclz_h(v16i16 _1) { return __lasx_xvclz_h(_1); } -// CHECK-LABEL: @xvclz_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclz_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvclz_w(v8i32 _1) { return __lasx_xvclz_w(_1); } -// CHECK-LABEL: @xvclz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclz_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvclz_d(v4i64 _1) { return __lasx_xvclz_d(_1); } -// CHECK-LABEL: @xvfadd_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfadd_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfadd_s(v8f32 _1, v8f32 _2) { return __lasx_xvfadd_s(_1, _2); } -// CHECK-LABEL: @xvfadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfadd_d(v4f64 _1, v4f64 _2) { return __lasx_xvfadd_d(_1, _2); } -// 
CHECK-LABEL: @xvfsub_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfsub_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfsub_s(v8f32 _1, v8f32 _2) { return __lasx_xvfsub_s(_1, _2); } -// CHECK-LABEL: @xvfsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfsub_d(v4f64 _1, v4f64 _2) { return __lasx_xvfsub_d(_1, _2); } -// CHECK-LABEL: @xvfmul_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmul_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmul_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmul_s(_1, _2); } -// CHECK-LABEL: @xvfmul_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmul_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmul_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmul_d(_1, _2); } -// CHECK-LABEL: @xvfdiv_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfdiv_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v8f32 xvfdiv_s(v8f32 _1, v8f32 _2) { return __lasx_xvfdiv_s(_1, _2); } -// CHECK-LABEL: @xvfdiv_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfdiv_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfdiv_d(v4f64 _1, v4f64 _2) { return __lasx_xvfdiv_d(_1, _2); } -// CHECK-LABEL: @xvfcvt_h_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvt_h_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvfcvt_h_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcvt_h_s(_1, _2); } -// CHECK-LABEL: @xvfcvt_s_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvt_s_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfcvt_s_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcvt_s_d(_1, _2); } -// CHECK-LABEL: @xvfmin_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmin_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmin_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmin_s(_1, _2); } -// CHECK-LABEL: @xvfmin_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmin_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmin_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmin_d(_1, _2); } -// CHECK-LABEL: @xvfmina_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmina_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmina_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmina_s(_1, _2); } -// CHECK-LABEL: @xvfmina_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmina_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmina_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmina_d(_1, _2); } -// CHECK-LABEL: @xvfmax_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmax_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmax_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmax_s(_1, _2); } -// CHECK-LABEL: @xvfmax_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], 
align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmax_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmax_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmax_d(_1, _2); } -// CHECK-LABEL: @xvfmaxa_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmaxa_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x 
float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmaxa_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmaxa_s(_1, _2); } -// CHECK-LABEL: @xvfmaxa_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmaxa_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmaxa_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmaxa_d(_1, _2); } -// CHECK-LABEL: @xvfclass_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfclass_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfclass_s(v8f32 _1) { return __lasx_xvfclass_s(_1); } -// CHECK-LABEL: @xvfclass_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfclass_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfclass_d(v4f64 _1) { return __lasx_xvfclass_d(_1); } -// CHECK-LABEL: @xvfsqrt_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfsqrt_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfsqrt_s(v8f32 _1) { return __lasx_xvfsqrt_s(_1); } -// CHECK-LABEL: @xvfsqrt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfsqrt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfsqrt_d(v4f64 _1) { return __lasx_xvfsqrt_d(_1); } -// CHECK-LABEL: @xvfrecip_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrecip_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1]]) -// CHECK-NEXT: 
store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrecip_s(v8f32 _1) { return __lasx_xvfrecip_s(_1); } -// CHECK-LABEL: @xvfrecip_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrecip_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrecip_d(v4f64 _1) { return __lasx_xvfrecip_d(_1); } -// CHECK-LABEL: @xvfrint_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrint_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> 
[[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrint_s(v8f32 _1) { return __lasx_xvfrint_s(_1); } -// CHECK-LABEL: @xvfrint_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrint_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrint_d(v4f64 _1) { return __lasx_xvfrint_d(_1); } -// CHECK-LABEL: @xvfrsqrt_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrsqrt_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrsqrt_s(v8f32 _1) { 
return __lasx_xvfrsqrt_s(_1); } -// CHECK-LABEL: @xvfrsqrt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrsqrt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrsqrt_d(v4f64 _1) { return __lasx_xvfrsqrt_d(_1); } -// CHECK-LABEL: @xvflogb_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvflogb_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvflogb_s(v8f32 _1) { return __lasx_xvflogb_s(_1); } -// CHECK-LABEL: @xvflogb_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = 
load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvflogb_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvflogb_d(v4f64 _1) { return __lasx_xvflogb_d(_1); } -// CHECK-LABEL: @xvfcvth_s_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvth_s_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfcvth_s_h(v16i16 _1) { return __lasx_xvfcvth_s_h(_1); } -// CHECK-LABEL: @xvfcvth_d_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvfcvth_d_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfcvth_d_s(v8f32 _1) { return __lasx_xvfcvth_d_s(_1); } -// CHECK-LABEL: @xvfcvtl_s_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvtl_s_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfcvtl_s_h(v16i16 _1) { return __lasx_xvfcvtl_s_h(_1); } -// CHECK-LABEL: @xvfcvtl_d_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvtl_d_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) 
align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfcvtl_d_s(v8f32 _1) { return __lasx_xvfcvtl_d_s(_1); } -// CHECK-LABEL: @xvftint_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftint_w_s(v8f32 _1) { return __lasx_xvftint_w_s(_1); } -// CHECK-LABEL: @xvftint_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftint_l_d(v4f64 _1) { return __lasx_xvftint_l_d(_1); } -// CHECK-LABEL: @xvftint_wu_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_wu_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvftint_wu_s(v8f32 _1) { return __lasx_xvftint_wu_s(_1); } -// CHECK-LABEL: @xvftint_lu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_lu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvftint_lu_d(v4f64 _1) { return __lasx_xvftint_lu_d(_1); } -// CHECK-LABEL: @xvftintrz_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrz_w_s(v8f32 _1) { return __lasx_xvftintrz_w_s(_1); } -// CHECK-LABEL: @xvftintrz_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrz_l_d(v4f64 _1) { return __lasx_xvftintrz_l_d(_1); } -// CHECK-LABEL: @xvftintrz_wu_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_wu_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvftintrz_wu_s(v8f32 _1) { return __lasx_xvftintrz_wu_s(_1); } -// CHECK-LABEL: @xvftintrz_lu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_lu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1]]) 
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvftintrz_lu_d(v4f64 _1) { return __lasx_xvftintrz_lu_d(_1); } -// CHECK-LABEL: @xvffint_s_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_s_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvffint_s_w(v8i32 _1) { return __lasx_xvffint_s_w(_1); } -// CHECK-LABEL: @xvffint_d_l( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_d_l( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvffint_d_l(v4i64 _1) { return __lasx_xvffint_d_l(_1); } -// CHECK-LABEL: @xvffint_s_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_s_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvffint_s_wu(v8u32 _1) { return __lasx_xvffint_s_wu(_1); } -// CHECK-LABEL: @xvffint_d_lu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_d_lu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v4f64 xvffint_d_lu(v4u64 _1) { return __lasx_xvffint_d_lu(_1); } -// CHECK-LABEL: @xvreplve_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_112]], i32 [[_2:%.*]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_112]], i32 [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvreplve_b(v32i8 _1, int _2) { return __lasx_xvreplve_b(_1, _2); } -// CHECK-LABEL: @xvreplve_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_112]], i32 [[_2:%.*]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_112]], i32 [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvreplve_h(v16i16 _1, int _2) { return __lasx_xvreplve_h(_1, _2); } -// CHECK-LABEL: @xvreplve_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_112]], i32 [[_2:%.*]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_112]], i32 [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvreplve_w(v8i32 _1, int _2) { return __lasx_xvreplve_w(_1, _2); } -// CHECK-LABEL: @xvreplve_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2:%.*]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvreplve_d(v4i64 _1, int _2) { return __lasx_xvreplve_d(_1, _2); } -// CHECK-LABEL: @xvpermi_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpermi_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpermi_w(v8i32 _1, v8i32 _2) { return __lasx_xvpermi_w(_1, _2, 1); } -// CHECK-LABEL: @xvandn_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvandn_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvandn_v(v32u8 _1, v32u8 _2) { return __lasx_xvandn_v(_1, _2); } -// CHECK-LABEL: @xvneg_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvneg_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvneg_b(v32i8 _1) { return __lasx_xvneg_b(_1); } -// CHECK-LABEL: @xvneg_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvneg_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvneg_h(v16i16 _1) { return __lasx_xvneg_h(_1); } -// CHECK-LABEL: @xvneg_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvneg_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvneg_w(v8i32 _1) { return __lasx_xvneg_w(_1); } -// CHECK-LABEL: @xvneg_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvneg_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvneg_d(v4i64 _1) { return __lasx_xvneg_d(_1); } -// CHECK-LABEL: @xvmuh_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmuh_b(v32i8 _1, v32i8 _2) { return __lasx_xvmuh_b(_1, _2); } -// CHECK-LABEL: @xvmuh_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmuh_h(v16i16 _1, v16i16 _2) { return __lasx_xvmuh_h(_1, _2); } -// CHECK-LABEL: @xvmuh_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) 
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmuh_w(v8i32 _1, v8i32 _2) { return __lasx_xvmuh_w(_1, _2); } -// CHECK-LABEL: @xvmuh_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmuh_d(v4i64 _1, v4i64 _2) { return __lasx_xvmuh_d(_1, _2); } -// CHECK-LABEL: @xvmuh_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmuh_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmuh_bu(_1, _2); } -// CHECK-LABEL: @xvmuh_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmuh_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmuh_hu(_1, _2); } -// CHECK-LABEL: @xvmuh_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmuh_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmuh_wu(_1, _2); } -// CHECK-LABEL: @xvmuh_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1]], <4 x 
i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmuh_du(v4u64 _1, v4u64 _2) { return __lasx_xvmuh_du(_1, _2); } -// CHECK-LABEL: @xvsllwil_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsllwil_h_b(v32i8 _1) { return __lasx_xvsllwil_h_b(_1, 1); } -// CHECK-LABEL: @xvsllwil_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsllwil_w_h(v16i16 _1) { return __lasx_xvsllwil_w_h(_1, 1); } -// CHECK-LABEL: @xvsllwil_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsllwil_d_w(v8i32 _1) { return __lasx_xvsllwil_d_w(_1, 1); } -// CHECK-LABEL: @xvsllwil_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvsllwil_hu_bu(v32u8 _1) { return __lasx_xvsllwil_hu_bu(_1, 1); } -// CHECK-LABEL: @xvsllwil_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvsllwil_wu_hu(v16u16 _1) { return __lasx_xvsllwil_wu_hu(_1, 1); } -// CHECK-LABEL: @xvsllwil_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvsllwil_du_wu(v8u32 _1) { 
return __lasx_xvsllwil_du_wu(_1, 1); } -// CHECK-LABEL: @xvsran_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsran_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsran_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsran_b_h(_1, _2); } -// CHECK-LABEL: @xvsran_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsran_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsran_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsran_h_w(_1, _2); } -// CHECK-LABEL: @xvsran_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsran_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsran_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsran_w_d(_1, _2); } -// CHECK-LABEL: @xvssran_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_b_h( +// CHECK-SAME: 
ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssran_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssran_b_h(_1, _2); } -// CHECK-LABEL: @xvssran_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssran_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssran_h_w(_1, _2); } -// CHECK-LABEL: @xvssran_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssran_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssran_w_d(_1, _2); } -// CHECK-LABEL: @xvssran_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssran_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssran_bu_h(_1, _2); } -// CHECK-LABEL: @xvssran_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssran_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssran_hu_w(_1, _2); } -// CHECK-LABEL: @xvssran_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: 
[[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssran_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssran_wu_d(_1, _2); } -// CHECK-LABEL: @xvsrarn_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarn_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// 
CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrarn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrarn_b_h(_1, _2); } -// CHECK-LABEL: @xvsrarn_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarn_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrarn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrarn_h_w(_1, _2); } -// CHECK-LABEL: @xvsrarn_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarn_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrarn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrarn_w_d(_1, _2); } -// CHECK-LABEL: @xvssrarn_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrarn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrarn_b_h(_1, _2); } -// CHECK-LABEL: @xvssrarn_h_w( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrarn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrarn_h_w(_1, _2); } -// CHECK-LABEL: @xvssrarn_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrarn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrarn_w_d(_1, _2); } -// CHECK-LABEL: @xvssrarn_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrarn_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrarn_bu_h(_1, _2); } -// CHECK-LABEL: @xvssrarn_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrarn_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrarn_hu_w(_1, _2); } -// CHECK-LABEL: @xvssrarn_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] 
// CHECK-NEXT: ret void // v8u32 xvssrarn_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrarn_wu_d(_1, _2); } -// CHECK-LABEL: @xvsrln_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrln_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrln_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrln_b_h(_1, _2); } -// CHECK-LABEL: @xvsrln_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrln_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x 
i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrln_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrln_h_w(_1, _2); } -// CHECK-LABEL: @xvsrln_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrln_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrln_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrln_w_d(_1, _2); } -// CHECK-LABEL: @xvssrln_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] 
+// CHECK-LABEL: define dso_local void @xvssrln_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrln_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrln_bu_h(_1, _2); } -// CHECK-LABEL: @xvssrln_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrln_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrln_hu_w(_1, _2); } -// CHECK-LABEL: @xvssrln_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrln_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrln_wu_d(_1, _2); } -// CHECK-LABEL: @xvsrlrn_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrn_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlrn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrlrn_b_h(_1, _2); } -// CHECK-LABEL: @xvsrlrn_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrn_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrlrn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrlrn_h_w(_1, _2); } -// CHECK-LABEL: @xvsrlrn_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x 
i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrn_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlrn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrlrn_w_d(_1, _2); } -// CHECK-LABEL: @xvssrlrn_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrlrn_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrlrn_bu_h(_1, _2); } -// CHECK-LABEL: @xvssrlrn_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrlrn_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrlrn_hu_w(_1, _2); } -// CHECK-LABEL: @xvssrlrn_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrlrn_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrlrn_wu_d(_1, _2); } -// CHECK-LABEL: @xvfrstpi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrstpi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvfrstpi_b(v32i8 _1, v32i8 _2) 
{ return __lasx_xvfrstpi_b(_1, _2, 1); } -// CHECK-LABEL: @xvfrstpi_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrstpi_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvfrstpi_h(v16i16 _1, v16i16 _2) { return __lasx_xvfrstpi_h(_1, _2, 1); } -// CHECK-LABEL: @xvfrstp_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrstp_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvfrstp_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvfrstp_b(_1, _2, _3); } -// CHECK-LABEL: @xvfrstp_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrstp_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <16 x i16> 
[[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvfrstp_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvfrstp_h(_1, _2, _3); } -// CHECK-LABEL: @xvshuf4i_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf4i_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvshuf4i_d(v4i64 _1, v4i64 _2) { return __lasx_xvshuf4i_d(_1, _2, 1); } -// CHECK-LABEL: @xvbsrl_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbsrl_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvbsrl_v(v32i8 _1) { return __lasx_xvbsrl_v(_1, 1); } -// CHECK-LABEL: @xvbsll_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbsll_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvbsll_v(v32i8 _1) { return __lasx_xvbsll_v(_1, 1); } -// CHECK-LABEL: @xvextrins_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextrins_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load 
<32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvextrins_b(v32i8 _1, v32i8 _2) { return __lasx_xvextrins_b(_1, _2, 1); } -// CHECK-LABEL: @xvextrins_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextrins_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvextrins_h(v16i16 _1, v16i16 _2) { return __lasx_xvextrins_h(_1, _2, 1); } -// CHECK-LABEL: @xvextrins_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextrins_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvextrins_w(v8i32 _1, v8i32 _2) { return __lasx_xvextrins_w(_1, _2, 1); } -// CHECK-LABEL: @xvextrins_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextrins_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvextrins_d(v4i64 _1, v4i64 _2) { return __lasx_xvextrins_d(_1, _2, 1); } -// CHECK-LABEL: @xvmskltz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskltz_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmskltz_b(v32i8 _1) { return __lasx_xvmskltz_b(_1); } -// CHECK-LABEL: @xvmskltz_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskltz_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmskltz_h(v16i16 _1) { return __lasx_xvmskltz_h(_1); } -// CHECK-LABEL: @xvmskltz_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskltz_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmskltz_w(v8i32 _1) { return __lasx_xvmskltz_w(_1); } -// CHECK-LABEL: @xvmskltz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskltz_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmskltz_d(v4i64 _1) { return __lasx_xvmskltz_d(_1); } -// CHECK-LABEL: 
@xvsigncov_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsigncov_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsigncov_b(v32i8 _1, v32i8 _2) { return __lasx_xvsigncov_b(_1, _2); } -// CHECK-LABEL: @xvsigncov_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsigncov_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsigncov_h(v16i16 _1, v16i16 _2) { return __lasx_xvsigncov_h(_1, _2); } -// CHECK-LABEL: @xvsigncov_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsigncov_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsigncov_w(v8i32 _1, v8i32 _2) { return __lasx_xvsigncov_w(_1, _2); } -// CHECK-LABEL: @xvsigncov_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsigncov_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsigncov_d(v4i64 _1, v4i64 _2) { return __lasx_xvsigncov_d(_1, _2); } -// CHECK-LABEL: @xvfmadd_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmadd_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> 
@llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfmadd_s(_1, _2, _3); } -// CHECK-LABEL: @xvfmadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfmadd_d(_1, _2, _3); } -// CHECK-LABEL: @xvfmsub_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x 
float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmsub_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfmsub_s(_1, _2, _3); } -// CHECK-LABEL: @xvfmsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], 
ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfmsub_d(_1, _2, _3); } -// CHECK-LABEL: @xvfnmadd_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfnmadd_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfnmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfnmadd_s(_1, _2, _3); } -// CHECK-LABEL: @xvfnmadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfnmadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfnmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfnmadd_d(_1, _2, _3); } -// CHECK-LABEL: @xvfnmsub_s( -// CHECK-NEXT: 
entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfnmsub_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfnmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfnmsub_s(_1, _2, _3); } -// CHECK-LABEL: @xvfnmsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfnmsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfnmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfnmsub_d(_1, _2, _3); } -// CHECK-LABEL: @xvftintrne_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrne_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrne_w_s(v8f32 _1) { return __lasx_xvftintrne_w_s(_1); } -// CHECK-LABEL: @xvftintrne_l_d( -// CHECK-NEXT: entry: 
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrne_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrne_l_d(v4f64 _1) { return __lasx_xvftintrne_l_d(_1); } -// CHECK-LABEL: @xvftintrp_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrp_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrp_w_s(v8f32 _1) { return __lasx_xvftintrp_w_s(_1); } -// CHECK-LABEL: @xvftintrp_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvftintrp_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrp_l_d(v4f64 _1) { return __lasx_xvftintrp_l_d(_1); } -// CHECK-LABEL: @xvftintrm_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrm_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrm_w_s(v8f32 _1) { return __lasx_xvftintrm_w_s(_1); } -// CHECK-LABEL: @xvftintrm_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrm_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias 
writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrm_l_d(v4f64 _1) { return __lasx_xvftintrm_l_d(_1); } -// CHECK-LABEL: @xvftint_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftint_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftint_w_d(_1, _2); } -// CHECK-LABEL: @xvffint_s_l( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_s_l( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvffint_s_l(v4i64 _1, v4i64 _2) { return __lasx_xvffint_s_l(_1, _2); } -// CHECK-LABEL: @xvftintrz_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrz_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrz_w_d(_1, _2); } -// CHECK-LABEL: @xvftintrp_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrp_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrp_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrp_w_d(_1, _2); } -// CHECK-LABEL: @xvftintrm_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrm_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrm_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrm_w_d(_1, _2); } -// CHECK-LABEL: @xvftintrne_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrne_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrne_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrne_w_d(_1, _2); } -// CHECK-LABEL: @xvftinth_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftinth_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftinth_l_s(v8f32 _1) { return __lasx_xvftinth_l_s(_1); } -// CHECK-LABEL: @xvftintl_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintl_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintl_l_s(v8f32 _1) { return 
__lasx_xvftintl_l_s(_1); } -// CHECK-LABEL: @xvffinth_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffinth_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvffinth_d_w(v8i32 _1) { return __lasx_xvffinth_d_w(_1); } -// CHECK-LABEL: @xvffintl_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffintl_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvffintl_d_w(v8i32 _1) { return __lasx_xvffintl_d_w(_1); } -// CHECK-LABEL: @xvftintrzh_l_s( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrzh_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrzh_l_s(v8f32 _1) { return __lasx_xvftintrzh_l_s(_1); } -// CHECK-LABEL: @xvftintrzl_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrzl_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrzl_l_s(v8f32 _1) { return __lasx_xvftintrzl_l_s(_1); } -// CHECK-LABEL: @xvftintrph_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvftintrph_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrph_l_s(v8f32 _1) { return __lasx_xvftintrph_l_s(_1); } -// CHECK-LABEL: @xvftintrpl_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrpl_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrpl_l_s(v8f32 _1) { return __lasx_xvftintrpl_l_s(_1); } -// CHECK-LABEL: @xvftintrmh_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrmh_l_s( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrmh_l_s(v8f32 _1) { return __lasx_xvftintrmh_l_s(_1); } -// CHECK-LABEL: @xvftintrml_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrml_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrml_l_s(v8f32 _1) { return __lasx_xvftintrml_l_s(_1); } -// CHECK-LABEL: @xvftintrneh_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrneh_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrneh_l_s(v8f32 _1) { return __lasx_xvftintrneh_l_s(_1); } -// CHECK-LABEL: @xvftintrnel_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrnel_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrnel_l_s(v8f32 _1) { return __lasx_xvftintrnel_l_s(_1); } -// CHECK-LABEL: @xvfrintrne_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrne_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfrintrne_s(v8f32 _1) { return __lasx_xvfrintrne_s(_1); } -// CHECK-LABEL: @xvfrintrne_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrne_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfrintrne_d(v4f64 _1) { return __lasx_xvfrintrne_d(_1); } -// CHECK-LABEL: @xvfrintrz_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrz_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load 
<8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfrintrz_s(v8f32 _1) { return __lasx_xvfrintrz_s(_1); } -// CHECK-LABEL: @xvfrintrz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrz_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfrintrz_d(v4f64 _1) { return __lasx_xvfrintrz_d(_1); } -// CHECK-LABEL: @xvfrintrp_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrp_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x 
float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfrintrp_s(v8f32 _1) { return __lasx_xvfrintrp_s(_1); } -// CHECK-LABEL: @xvfrintrp_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrp_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfrintrp_d(v4f64 _1) { return __lasx_xvfrintrp_d(_1); } -// CHECK-LABEL: @xvfrintrm_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrm_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], 
ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfrintrm_s(v8f32 _1) { return __lasx_xvfrintrm_s(_1); } -// CHECK-LABEL: @xvfrintrm_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrm_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfrintrm_d(v4f64 _1) { return __lasx_xvfrintrm_d(_1); } -// CHECK-LABEL: @xvld( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvld( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvld(void * _1) { return __lasx_xvld(_1, 1); } -// CHECK-LABEL: @xvst( 
-// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1) +// CHECK-LABEL: define dso_local void @xvst( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2]], i32 1) // CHECK-NEXT: ret void // void xvst(v32i8 _1, void * _2) { return __lasx_xvst(_1, _2, 1); } -// CHECK-LABEL: @xvstelm_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1, i32 1) +// CHECK-LABEL: define dso_local void @xvstelm_b( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2]], i32 1, i32 1) // CHECK-NEXT: ret void // void xvstelm_b(v32i8 _1, void * _2) { return __lasx_xvstelm_b(_1, _2, 1, 1); } -// CHECK-LABEL: @xvstelm_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2:%.*]], i32 2, i32 1) +// CHECK-LABEL: define dso_local void @xvstelm_h( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2]], i32 2, i32 1) // CHECK-NEXT: ret void // void xvstelm_h(v16i16 _1, void * _2) { return __lasx_xvstelm_h(_1, _2, 2, 1); } -// CHECK-LABEL: @xvstelm_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2:%.*]], i32 4, i32 1) +// CHECK-LABEL: define dso_local void @xvstelm_w( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2]], i32 4, i32 1) // CHECK-NEXT: ret void // void xvstelm_w(v8i32 _1, void * _2) { return __lasx_xvstelm_w(_1, _2, 4, 1); } -// CHECK-LABEL: @xvstelm_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2:%.*]], i32 8, i32 1) +// CHECK-LABEL: define dso_local void @xvstelm_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2]], i32 8, i32 1) // CHECK-NEXT: ret void // void xvstelm_d(v4i64 _1, void * _2) { return __lasx_xvstelm_d(_1, _2, 8, 1); } -// CHECK-LABEL: @xvinsve0_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvinsve0_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvinsve0_w(v8i32 _1, v8i32 _2) { return __lasx_xvinsve0_w(_1, _2, 1); } -// CHECK-LABEL: @xvinsve0_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvinsve0_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvinsve0_d(v4i64 _1, v4i64 _2) { return __lasx_xvinsve0_d(_1, _2, 1); } -// CHECK-LABEL: @xvpickve_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickve_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpickve_w(v8i32 _1) { return __lasx_xvpickve_w(_1, 1); } -// CHECK-LABEL: @xvpickve_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickve_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpickve_d(v4i64 _1) { return __lasx_xvpickve_d(_1, 1); } -// CHECK-LABEL: @xvssrlrn_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrlrn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrlrn_b_h(_1, _2); } -// CHECK-LABEL: @xvssrlrn_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrlrn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrlrn_h_w(_1, _2); } -// CHECK-LABEL: @xvssrlrn_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrlrn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrlrn_w_d(_1, _2); } -// CHECK-LABEL: @xvssrln_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrln_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrln_b_h(_1, _2); } -// CHECK-LABEL: @xvssrln_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> 
[[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrln_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrln_h_w(_1, _2); } -// CHECK-LABEL: @xvssrln_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrln_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrln_w_d(_1, _2); } -// CHECK-LABEL: @xvorn_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvorn_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __lasx_xvorn_v(_1, _2); } -// CHECK-LABEL: @xvldi( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvldi( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvldi() { return __lasx_xvldi(1); } -// CHECK-LABEL: @xvldx( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1:%.*]], i64 1), !noalias [[META5:![0-9]+]] -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldx( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1]], i64 1), !noalias [[META5:![0-9]+]] +// CHECK-NEXT: 
store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvldx(void * _1) { return __lasx_xvldx(_1, 1); } -// CHECK-LABEL: @xvstx( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_112]], ptr [[_2:%.*]], i64 1) +// CHECK-LABEL: define dso_local void @xvstx( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_112]], ptr [[_2]], i64 1) // CHECK-NEXT: ret void // void xvstx(v32i8 _1, void * _2) { return __lasx_xvstx(_1, _2, 1); } -// CHECK-LABEL: @xvextl_qu_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextl_qu_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvextl_qu_du(v4u64 _1) { return __lasx_xvextl_qu_du(_1); } -// CHECK-LABEL: @xvinsgr2vr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: 
define dso_local void @xvinsgr2vr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1]], i32 1, i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvinsgr2vr_w(v8i32 _1) { return __lasx_xvinsgr2vr_w(_1, 1, 1); } -// CHECK-LABEL: @xvinsgr2vr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvinsgr2vr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1]], i64 1, i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvinsgr2vr_d(v4i64 _1) { return __lasx_xvinsgr2vr_d(_1, 1, 1); } -// CHECK-LABEL: @xvreplve0_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_b( +// CHECK-SAME: ptr dead_on_unwind noalias 
writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvreplve0_b(v32i8 _1) { return __lasx_xvreplve0_b(_1); } -// CHECK-LABEL: @xvreplve0_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvreplve0_h(v16i16 _1) { return __lasx_xvreplve0_h(_1); } -// CHECK-LABEL: @xvreplve0_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvreplve0_w(v8i32 _1) { return __lasx_xvreplve0_w(_1); } -// CHECK-LABEL: @xvreplve0_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvreplve0_d(v4i64 _1) { return __lasx_xvreplve0_d(_1); } -// CHECK-LABEL: @xvreplve0_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvreplve0_q(v32i8 _1) { return __lasx_xvreplve0_q(_1); } -// CHECK-LABEL: @vext2xv_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 vext2xv_h_b(v32i8 _1) { return __lasx_vext2xv_h_b(_1); } -// CHECK-LABEL: @vext2xv_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] 
= tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 vext2xv_w_h(v16i16 _1) { return __lasx_vext2xv_w_h(_1); } -// CHECK-LABEL: @vext2xv_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_d_w(v8i32 _1) { return __lasx_vext2xv_d_w(_1); } -// CHECK-LABEL: @vext2xv_w_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_w_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 vext2xv_w_b(v32i8 _1) { return __lasx_vext2xv_w_b(_1); } -// CHECK-LABEL: @vext2xv_d_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_d_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_d_h(v16i16 _1) { return __lasx_vext2xv_d_h(_1); } -// CHECK-LABEL: @vext2xv_d_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_d_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_d_b(v32i8 _1) { return __lasx_vext2xv_d_b(_1); } -// CHECK-LABEL: @vext2xv_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 vext2xv_hu_bu(v32i8 _1) { return __lasx_vext2xv_hu_bu(_1); } -// CHECK-LABEL: @vext2xv_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 vext2xv_wu_hu(v16i16 _1) { return 
__lasx_vext2xv_wu_hu(_1); } -// CHECK-LABEL: @vext2xv_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_wu(v8i32 _1) { return __lasx_vext2xv_du_wu(_1); } -// CHECK-LABEL: @vext2xv_wu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_wu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 vext2xv_wu_bu(v32i8 _1) { return __lasx_vext2xv_wu_bu(_1); } -// CHECK-LABEL: @vext2xv_du_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_du_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_hu(v16i16 _1) { return __lasx_vext2xv_du_hu(_1); } -// CHECK-LABEL: @vext2xv_du_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_du_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_bu(v32i8 _1) { return __lasx_vext2xv_du_bu(_1); } -// CHECK-LABEL: @xvpermi_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = 
load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpermi_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpermi_q(v32i8 _1, v32i8 _2) { return __lasx_xvpermi_q(_1, _2, 1); } -// CHECK-LABEL: @xvpermi_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpermi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpermi_d(v4i64 _1) { return __lasx_xvpermi_d(_1, 1); } -// CHECK-LABEL: @xvperm_w( 
-// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvperm_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvperm_w(v8i32 _1, v8i32 _2) { return __lasx_xvperm_w(_1, _2); } -// CHECK-LABEL: @xvldrepl_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldrepl_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvldrepl_b(void * _1) { return __lasx_xvldrepl_b(_1, 1); } -// 
CHECK-LABEL: @xvldrepl_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1:%.*]], i32 2) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldrepl_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1]], i32 2) +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvldrepl_h(void * _1) { return __lasx_xvldrepl_h(_1, 2); } -// CHECK-LABEL: @xvldrepl_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1:%.*]], i32 4) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldrepl_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1]], i32 4) +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvldrepl_w(void * _1) { return __lasx_xvldrepl_w(_1, 4); } -// CHECK-LABEL: @xvldrepl_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1:%.*]], i32 8) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldrepl_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1]], i32 8) +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvldrepl_d(void * _1) { return __lasx_xvldrepl_d(_1, 8); } -// CHECK-LABEL: @xvpickve2gr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xvpickve2gr_w( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1]], i32 1) // CHECK-NEXT: ret i32 [[TMP1]] // int xvpickve2gr_w(v8i32 _1) { return __lasx_xvpickve2gr_w(_1, 1); } -// CHECK-LABEL: @xvpickve2gr_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xvpickve2gr_wu( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1]], i32 1) // CHECK-NEXT: ret i32 [[TMP1]] // unsigned int xvpickve2gr_wu(v8i32 _1) { return __lasx_xvpickve2gr_wu(_1, 1); } -// CHECK-LABEL: @xvpickve2gr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local i64 @xvpickve2gr_d( +// CHECK-SAME: ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1]], i32 1) // CHECK-NEXT: ret i64 [[TMP1]] // long xvpickve2gr_d(v4i64 _1) { return __lasx_xvpickve2gr_d(_1, 1); } -// CHECK-LABEL: @xvpickve2gr_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local i64 @xvpickve2gr_du( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1]], i32 1) // CHECK-NEXT: ret i64 [[TMP1]] // unsigned long int xvpickve2gr_du(v4i64 _1) { return __lasx_xvpickve2gr_du(_1, 1); } -// CHECK-LABEL: @xvaddwev_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1]], <4 x i64> 
[[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvaddwev_q_d(_1, _2); } -// CHECK-LABEL: @xvaddwev_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvaddwev_d_w(_1, _2); } -// CHECK-LABEL: @xvaddwev_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvaddwev_w_h(_1, _2); } -// CHECK-LABEL: @xvaddwev_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvaddwev_h_b(_1, _2); } -// 
CHECK-LABEL: @xvaddwev_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvaddwev_q_du(_1, _2); } -// CHECK-LABEL: @xvaddwev_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvaddwev_d_wu(_1, _2); } -// CHECK-LABEL: @xvaddwev_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvaddwev_w_hu(_1, _2); } -// CHECK-LABEL: @xvaddwev_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_h_bu( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvaddwev_h_bu(_1, _2); } -// CHECK-LABEL: @xvsubwev_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvsubwev_q_d(_1, _2); } -// CHECK-LABEL: @xvsubwev_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvsubwev_d_w(_1, _2); } -// CHECK-LABEL: @xvsubwev_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvsubwev_w_h(_1, _2); } -// CHECK-LABEL: @xvsubwev_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvsubwev_h_b(_1, _2); } -// CHECK-LABEL: @xvsubwev_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: 
[[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvsubwev_q_du(_1, _2); } -// CHECK-LABEL: @xvsubwev_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// 
CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsubwev_d_wu(_1, _2); } -// CHECK-LABEL: @xvsubwev_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsubwev_w_hu(_1, _2); } -// CHECK-LABEL: @xvsubwev_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsubwev_h_bu(_1, _2); } -// CHECK-LABEL: @xvmulwev_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvmulwev_q_d(_1, _2); } -// CHECK-LABEL: 
@xvmulwev_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvmulwev_d_w(_1, _2); } -// CHECK-LABEL: @xvmulwev_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 
32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvmulwev_w_h(_1, _2); } -// CHECK-LABEL: @xvmulwev_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvmulwev_h_b(_1, _2); } -// CHECK-LABEL: @xvmulwev_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_q_du( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvmulwev_q_du(_1, _2); } -// CHECK-LABEL: @xvmulwev_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmulwev_d_wu(_1, _2); } -// CHECK-LABEL: @xvmulwev_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmulwev_w_hu(_1, _2); } -// CHECK-LABEL: @xvmulwev_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmulwev_h_bu(_1, _2); } -// CHECK-LABEL: @xvaddwod_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvaddwod_q_d(_1, _2); } -// CHECK-LABEL: @xvaddwod_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 
x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvaddwod_d_w(_1, _2); } -// CHECK-LABEL: @xvaddwod_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: 
store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvaddwod_w_h(_1, _2); } -// CHECK-LABEL: @xvaddwod_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvaddwod_h_b(_1, _2); } -// CHECK-LABEL: @xvaddwod_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvaddwod_q_du(_1, _2); } -// CHECK-LABEL: @xvaddwod_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvaddwod_d_wu(_1, _2); } -// CHECK-LABEL: @xvaddwod_w_hu( 
-// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvaddwod_w_hu(_1, _2); } -// CHECK-LABEL: @xvaddwod_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvaddwod_h_bu(_1, _2); } -// CHECK-LABEL: @xvsubwod_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvsubwod_q_d(_1, _2); } -// CHECK-LABEL: @xvsubwod_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvsubwod_d_w(_1, _2); } -// CHECK-LABEL: @xvsubwod_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvsubwod_w_h(_1, _2); } -// CHECK-LABEL: @xvsubwod_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvsubwod_h_b(_1, _2); } -// CHECK-LABEL: @xvsubwod_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvsubwod_q_du(_1, _2); } -// CHECK-LABEL: @xvsubwod_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsubwod_d_wu(_1, _2); } -// CHECK-LABEL: @xvsubwod_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x 
i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsubwod_w_hu(_1, _2); } -// CHECK-LABEL: @xvsubwod_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// 
CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsubwod_h_bu(_1, _2); } -// CHECK-LABEL: @xvmulwod_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvmulwod_q_d(_1, _2); } -// CHECK-LABEL: @xvmulwod_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvmulwod_d_w(_1, _2); } -// CHECK-LABEL: @xvmulwod_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvmulwod_w_h(_1, _2); } -// CHECK-LABEL: 
@xvmulwod_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvmulwod_h_b(_1, _2); } -// CHECK-LABEL: @xvmulwod_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvmulwod_q_du(_1, _2); } -// CHECK-LABEL: @xvmulwod_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmulwod_d_wu(_1, _2); } -// CHECK-LABEL: @xvmulwod_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_w_hu( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmulwod_w_hu(_1, _2); } -// CHECK-LABEL: @xvmulwod_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmulwod_h_bu(_1, _2); } -// CHECK-LABEL: @xvaddwev_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvaddwev_d_wu_w(_1, _2); } -// CHECK-LABEL: @xvaddwev_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { 
+// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvaddwev_w_hu_h(_1, _2); } -// CHECK-LABEL: @xvaddwev_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvaddwev_h_bu_b(_1, _2); } -// CHECK-LABEL: @xvmulwev_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvmulwev_d_wu_w(_1, _2); } -// CHECK-LABEL: @xvmulwev_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x 
i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvmulwev_w_hu_h(_1, _2); } -// CHECK-LABEL: @xvmulwev_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvmulwev_h_bu_b(_1, _2); } -// CHECK-LABEL: @xvaddwod_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x 
i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvaddwod_d_wu_w(_1, _2); } -// CHECK-LABEL: @xvaddwod_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwod_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvaddwod_w_hu_h(_1, _2); } -// CHECK-LABEL: @xvaddwod_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvaddwod_h_bu_b(_1, _2); } -// CHECK-LABEL: @xvmulwod_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvmulwod_d_wu_w(_1, _2); } -// CHECK-LABEL: @xvmulwod_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwod_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvmulwod_w_hu_h(_1, _2); } -// CHECK-LABEL: @xvmulwod_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwod_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvmulwod_h_bu_b(_1, _2); } -// CHECK-LABEL: @xvhaddw_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhaddw_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvhaddw_q_d(_1, _2); } -// CHECK-LABEL: @xvhaddw_qu_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_qu_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvhaddw_qu_du(v4u64 _1, v4u64 _2) { return __lasx_xvhaddw_qu_du(_1, _2); } -// CHECK-LABEL: @xvhsubw_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhsubw_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvhsubw_q_d(_1, _2); } -// CHECK-LABEL: @xvhsubw_qu_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_qu_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvhsubw_qu_du(v4u64 _1, v4u64 _2) { return 
__lasx_xvhsubw_qu_du(_1, _2); } -// CHECK-LABEL: @xvmaddwev_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwev_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmaddwev_q_d(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwev_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmaddwev_d_w(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaddwev_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmaddwev_w_h(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 
xvmaddwev_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmaddwev_h_b(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaddwev_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __lasx_xvmaddwev_q_du(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_d_wu( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaddwev_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __lasx_xvmaddwev_d_wu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmaddwev_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __lasx_xvmaddwev_w_hu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmaddwev_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __lasx_xvmaddwev_h_bu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmaddwod_q_d(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvmaddwod_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmaddwod_d_w(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 
x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaddwod_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmaddwod_w_h(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] 
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaddwod_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmaddwod_h_b(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaddwod_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __lasx_xvmaddwod_q_du(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_346:%.*]] = load <8 x 
i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaddwod_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __lasx_xvmaddwod_d_wu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmaddwod_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __lasx_xvmaddwod_w_hu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store 
<16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmaddwod_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __lasx_xvmaddwod_h_bu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwev_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __lasx_xvmaddwev_q_du_d(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwev_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __lasx_xvmaddwev_d_wu_w(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaddwev_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __lasx_xvmaddwev_w_hu_h(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaddwev_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __lasx_xvmaddwev_h_bu_b(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __lasx_xvmaddwod_q_du_d(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __lasx_xvmaddwod_d_wu_w(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaddwod_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __lasx_xvmaddwod_w_hu_h(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x 
i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaddwod_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __lasx_xvmaddwod_h_bu_b(_1, _2, _3); } -// CHECK-LABEL: @xvrotr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvrotr_b(v32i8 _1, v32i8 _2) { return __lasx_xvrotr_b(_1, _2); } -// CHECK-LABEL: @xvrotr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvrotr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvrotr_h(v16i16 _1, v16i16 _2) { return __lasx_xvrotr_h(_1, _2); } -// CHECK-LABEL: @xvrotr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], 
ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvrotr_w(v8i32 _1, v8i32 _2) { return __lasx_xvrotr_w(_1, _2); } -// CHECK-LABEL: @xvrotr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvrotr_d(v4i64 _1, v4i64 _2) { return __lasx_xvrotr_d(_1, _2); } -// CHECK-LABEL: @xvadd_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, 
ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvadd_q(v4i64 _1, v4i64 _2) { return __lasx_xvadd_q(_1, _2); } -// CHECK-LABEL: @xvsub_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsub_q(v4i64 _1, v4i64 _2) { return __lasx_xvsub_q(_1, _2); } -// CHECK-LABEL: @xvaddwev_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvaddwev_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvaddwev_q_du_d(_1, _2); } -// CHECK-LABEL: @xvaddwod_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store 
<4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvaddwod_q_du_d(_1, _2); } -// CHECK-LABEL: @xvmulwev_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvmulwev_q_du_d(_1, _2); } -// CHECK-LABEL: @xvmulwod_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvmulwod_q_du_d(_1, _2); } -// CHECK-LABEL: @xvmskgez_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskgez_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmskgez_b(v32i8 _1) { return __lasx_xvmskgez_b(_1); } -// CHECK-LABEL: @xvmsknz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsknz_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmsknz_b(v32i8 _1) { return __lasx_xvmsknz_b(_1); } -// CHECK-LABEL: @xvexth_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvexth_h_b(v32i8 _1) { return __lasx_xvexth_h_b(_1); } -// CHECK-LABEL: @xvexth_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvexth_w_h(v16i16 _1) { return __lasx_xvexth_w_h(_1); } -// CHECK-LABEL: @xvexth_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvexth_d_w(v8i32 _1) { return __lasx_xvexth_d_w(_1); } -// CHECK-LABEL: @xvexth_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1]]) -// CHECK-NEXT: 
store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvexth_q_d(v4i64 _1) { return __lasx_xvexth_q_d(_1); } -// CHECK-LABEL: @xvexth_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvexth_hu_bu(v32u8 _1) { return __lasx_xvexth_hu_bu(_1); } -// CHECK-LABEL: @xvexth_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> 
[[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvexth_wu_hu(v16u16 _1) { return __lasx_xvexth_wu_hu(_1); } -// CHECK-LABEL: @xvexth_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvexth_du_wu(v8u32 _1) { return __lasx_xvexth_du_wu(_1); } -// CHECK-LABEL: @xvexth_qu_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_qu_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvexth_qu_du(v4u64 _1) { 
return __lasx_xvexth_qu_du(_1); } -// CHECK-LABEL: @xvrotri_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotri_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvrotri_b(v32i8 _1) { return __lasx_xvrotri_b(_1, 1); } -// CHECK-LABEL: @xvrotri_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotri_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvrotri_h(v16i16 _1) { return __lasx_xvrotri_h(_1, 1); } -// CHECK-LABEL: @xvrotri_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x 
i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotri_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvrotri_w(v8i32 _1) { return __lasx_xvrotri_w(_1, 1); } -// CHECK-LABEL: @xvrotri_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotri_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvrotri_d(v4i64 _1) { return __lasx_xvrotri_d(_1, 1); } -// CHECK-LABEL: @xvextl_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextl_q_d( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvextl_q_d(v4i64 _1) { return __lasx_xvextl_q_d(_1); } -// CHECK-LABEL: @xvsrlni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrlni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvsrlni_h_w( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrlni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrlni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvsrlni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrlni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvsrlni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrlni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrlni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvsrlrni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlrni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrlrni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvsrlrni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // 
v16i16 xvsrlrni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrlrni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvsrlrni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlrni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrlrni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvsrlrni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, 
!tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrlrni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrlrni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrlni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrlni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvssrlni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrlni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrlni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrlni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrlni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssrlni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrlni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrlni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrlni_bu_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrlni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrlni_hu_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrlni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrlni_wu_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_du_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_du_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call 
<4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssrlni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrlni_du_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrlrni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrlrni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrlrni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrlrni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret 
void // v8i32 xvssrlrni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrlrni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssrlrni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrlrni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrlrni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrlrni_bu_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrlrni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrlrni_hu_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrlrni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrlrni_wu_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_du_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_du_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x 
i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssrlrni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrlrni_du_q(_1, _2, 1); } -// CHECK-LABEL: @xvsrani_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrani_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrani_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrani_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvsrani_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrani_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrani_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrani_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvsrani_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrani_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrani_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrani_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvsrani_d_q( -// CHECK-NEXT: 
entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrani_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrani_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrani_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvsrarni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrarni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrarni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvsrarni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrarni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrarni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvsrarni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 
x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrarni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrarni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvsrarni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret 
void // v4i64 xvsrarni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrarni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrani_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrani_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], 
align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrani_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrani_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrani_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrani_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssrani_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrani_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrani_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrani_bu_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrani_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrani_hu_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef 
readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrani_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrani_wu_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_du_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_du_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssrani_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrani_du_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_b_h( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrarni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrarni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrarni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrarni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrarni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrarni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssrarni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrarni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v32u8 xvssrarni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrarni_bu_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrarni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrarni_hu_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrarni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrarni_wu_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_du_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_du_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssrarni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrarni_du_q(_1, _2, 1); } -// CHECK-LABEL: @xbnz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local 
signext i32 @xbnz_b( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_b(v32u8 _1) { return __lasx_xbnz_b(_1); } -// CHECK-LABEL: @xbnz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_d(v4u64 _1) { return __lasx_xbnz_d(_1); } -// CHECK-LABEL: @xbnz_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_h( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_h(v16u16 _1) { return __lasx_xbnz_h(_1); } -// CHECK-LABEL: @xbnz_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_v( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_v(v32u8 _1) { return __lasx_xbnz_v(_1); } -// CHECK-LABEL: @xbnz_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_w( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_w(v8u32 _1) { return __lasx_xbnz_w(_1); } -// CHECK-LABEL: @xbz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_b( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_b(v32u8 _1) { return __lasx_xbz_b(_1); } -// CHECK-LABEL: @xbz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // 
int xbz_d(v4u64 _1) { return __lasx_xbz_d(_1); } -// CHECK-LABEL: @xbz_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_h( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_h(v16u16 _1) { return __lasx_xbz_h(_1); } -// CHECK-LABEL: @xbz_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_v( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_v(v32u8 _1) { return __lasx_xbz_v(_1); } -// CHECK-LABEL: @xbz_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_w( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_w(v8u32 _1) { return __lasx_xbz_w(_1); } -// CHECK-LABEL: @xvfcmp_caf_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// 
CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_caf_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_caf_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_caf_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_caf_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_caf_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1]], <8 x 
float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_caf_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_caf_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_ceq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_ceq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_ceq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_ceq_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_ceq_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_ceq_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef 
readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_ceq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_ceq_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cle_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cle_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cle_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cle_d(_1, _2); } -// 
CHECK-LABEL: @xvfcmp_cle_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cle_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cle_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cle_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_clt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_clt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr 
[[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_clt_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_clt_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_clt_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_clt_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_clt_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_clt_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cne_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cne_d( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cne_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cne_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cne_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cne_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cne_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cne_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cor_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cor_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cor_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cor_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cor_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cor_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cor_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cor_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cueq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cueq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cueq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cueq_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cueq_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// 
CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cueq_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cueq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cueq_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cule_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cule_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> 
[[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cule_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cule_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cule_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cule_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cule_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cule_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cult_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cult_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cult_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cult_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cult_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cult_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cult_s(v8f32 _1, v8f32 _2) { return 
__lasx_xvfcmp_cult_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cun_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cun_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cun_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cun_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cune_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cune_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cune_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cune_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cune_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cune_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cune_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cune_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cun_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define 
dso_local void @xvfcmp_cun_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cun_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cun_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_saf_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_saf_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_saf_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_saf_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_saf_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_saf_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_saf_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_saf_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_seq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_seq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_seq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_seq_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_seq_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_seq_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_seq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_seq_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sle_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sle_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sle_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sle_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sle_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sle_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sle_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sle_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_slt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_slt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_slt_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_slt_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_slt_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_slt_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_slt_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_slt_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sne_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sne_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 
xvfcmp_sne_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sne_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sne_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sne_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sne_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sne_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sor_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sor_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, 
!tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sor_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sor_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sor_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sor_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sor_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sor_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sueq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvfcmp_sueq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sueq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sueq_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sueq_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sueq_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sueq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sueq_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sule_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sule_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sule_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sule_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sule_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sule_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sule_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sule_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sult_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sult_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sult_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sult_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sult_s( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sult_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sult_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sult_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sun_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sun_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sun_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sun_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sune_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sune_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sune_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sune_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sune_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sune_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sune_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sune_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sun_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sun_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v8i32 xvfcmp_sun_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sun_s(_1, _2); } -// CHECK-LABEL: @xvpickve_d_f( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickve_d_f( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1]], i32 1) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvpickve_d_f(v4f64 _1) { return __lasx_xvpickve_d_f(_1, 1); } -// CHECK-LABEL: @xvpickve_w_f( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickve_w_f( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1]], i32 1) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvpickve_w_f(v8f32 _1) { 
return __lasx_xvpickve_w_f(_1, 1); } -// CHECK-LABEL: @xvrepli_b( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvrepli_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvrepli_b() { return __lasx_xvrepli_b(1); } -// CHECK-LABEL: @xvrepli_d( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvrepli_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvrepli_d() { return __lasx_xvrepli_d(1); } -// CHECK-LABEL: @xvrepli_h( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvrepli_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvrepli_h() { return 
__lasx_xvrepli_h(1); } -// CHECK-LABEL: @xvrepli_w( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvrepli_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __lasx_xvrepli_w(1); } +//. +// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META5]] = !{[[META6:![0-9]+]]} +// CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"__lasx_xvldx: %agg.result"} +// CHECK: [[META7]] = distinct !{[[META7]], !"__lasx_xvldx"} +//. 
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c index b79f939403993..b194ea8f3182a 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c @@ -1,37 +1,46 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s #include -// CHECK-LABEL: @xvfrecipe_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @xvfrecipe_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrecipe_s(v8f32 _1) { return __lasx_xvfrecipe_s(_1); } -// CHECK-LABEL: @xvfrecipe_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @xvfrecipe_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef 
readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrecipe_d(v4f64 _1) { return __lasx_xvfrecipe_d(_1); } -// CHECK-LABEL: @xvfrsqrte_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @xvfrsqrte_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrsqrte_s(v8f32 _1) { return __lasx_xvfrsqrte_s(_1); } -// CHECK-LABEL: @xvfrsqrte_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @xvfrsqrte_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrsqrte_d(v4f64 _1) { return __lasx_xvfrsqrte_d(_1); } +//. +// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c index 63e9ba639ea2c..9d543dfabe3d2 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c @@ -1,38 +1,47 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s typedef float v8f32 __attribute__((vector_size(32), aligned(32))); typedef double v4f64 __attribute__((vector_size(32), aligned(32))); -// CHECK-LABEL: @xvfrecipe_s -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @xvfrecipe_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], 
align 32, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrecipe_s(v8f32 _1) { return __builtin_lasx_xvfrecipe_s(_1); } -// CHECK-LABEL: @xvfrecipe_d -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @xvfrecipe_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrecipe_d(v4f64 _1) { return __builtin_lasx_xvfrecipe_d(_1); } -// CHECK-LABEL: @xvfrsqrte_s -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @xvfrsqrte_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = 
tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrsqrte_s(v8f32 _1) { return __builtin_lasx_xvfrsqrte_s(_1); } -// CHECK-LABEL: @xvfrsqrte_d -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @xvfrsqrte_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrsqrte_d(v4f64 _1) { return __builtin_lasx_xvfrsqrte_d(_1); } +//. +// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin.c b/clang/test/CodeGen/LoongArch/lasx/builtin.c index f52a23a5faea7..9b21c7ea3e8a5 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -O2 -emit-llvm %s -o - | FileCheck %s typedef signed char v32i8 __attribute__((vector_size(32), aligned(32))); @@ -25,6384 +25,7125 @@ typedef double v4f64_d __attribute__((vector_size(32), aligned(8))); typedef double v4f64 __attribute__((vector_size(32), aligned(32))); typedef double v4f64_d __attribute__((vector_size(32), aligned(8))); -// CHECK-LABEL: @xvsll_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsll_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 
xvsll_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsll_b(_1, _2); } -// CHECK-LABEL: @xvsll_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsll_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsll_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsll_h(_1, _2); } -// CHECK-LABEL: @xvsll_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsll_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsll_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsll_w(_1, _2); } -// CHECK-LABEL: @xvsll_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsll_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsll_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsll_d(_1, _2); } -// CHECK-LABEL: @xvslli_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslli_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslli_b(v32i8 _1) { return __builtin_lasx_xvslli_b(_1, 1); } -// CHECK-LABEL: @xvslli_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslli_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslli_h(v16i16 _1) { return __builtin_lasx_xvslli_h(_1, 1); } -// CHECK-LABEL: @xvslli_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslli_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslli_w(v8i32 _1) { return __builtin_lasx_xvslli_w(_1, 1); } -// CHECK-LABEL: @xvslli_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslli_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslli_d(v4i64 _1) { return __builtin_lasx_xvslli_d(_1, 1); } -// CHECK-LABEL: @xvsra_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsra_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsra_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsra_b(_1, _2); } -// CHECK-LABEL: @xvsra_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsra_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsra_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsra_h(_1, _2); } -// CHECK-LABEL: @xvsra_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// 
CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsra_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsra_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsra_w(_1, _2); } -// CHECK-LABEL: @xvsra_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsra_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> 
[[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsra_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsra_d(_1, _2); } -// CHECK-LABEL: @xvsrai_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrai_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrai_b(v32i8 _1) { return __builtin_lasx_xvsrai_b(_1, 1); } -// CHECK-LABEL: @xvsrai_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrai_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrai_h(v16i16 _1) { return __builtin_lasx_xvsrai_h(_1, 1); } -// CHECK-LABEL: @xvsrai_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrai_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrai_w(v8i32 _1) { return __builtin_lasx_xvsrai_w(_1, 1); } -// CHECK-LABEL: @xvsrai_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrai_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrai_d(v4i64 _1) { return 
__builtin_lasx_xvsrai_d(_1, 1); } -// CHECK-LABEL: @xvsrar_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrar_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrar_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrar_b(_1, _2); } -// CHECK-LABEL: @xvsrar_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrar_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr 
[[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrar_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrar_h(_1, _2); } -// CHECK-LABEL: @xvsrar_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrar_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrar_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrar_w(_1, _2); } -// CHECK-LABEL: @xvsrar_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrar_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrar_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrar_d(_1, _2); } -// CHECK-LABEL: @xvsrari_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrari_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrari_b(v32i8 _1) { return __builtin_lasx_xvsrari_b(_1, 1); } -// CHECK-LABEL: @xvsrari_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrari_h( +// 
CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrari_h(v16i16 _1) { return __builtin_lasx_xvsrari_h(_1, 1); } -// CHECK-LABEL: @xvsrari_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrari_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrari_w(v8i32 _1) { return __builtin_lasx_xvsrari_w(_1, 1); } -// CHECK-LABEL: @xvsrari_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrari_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrari_d(v4i64 _1) { return __builtin_lasx_xvsrari_d(_1, 1); } -// CHECK-LABEL: @xvsrl_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrl_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrl_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrl_b(_1, _2); } -// CHECK-LABEL: @xvsrl_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: 
[[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrl_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrl_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrl_h(_1, _2); } -// CHECK-LABEL: @xvsrl_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrl_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> 
[[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrl_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrl_w(_1, _2); } -// CHECK-LABEL: @xvsrl_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrl_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrl_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrl_d(_1, _2); } -// CHECK-LABEL: @xvsrli_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrli_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrli_b(v32i8 _1) { return __builtin_lasx_xvsrli_b(_1, 1); } -// CHECK-LABEL: @xvsrli_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrli_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrli_h(v16i16 _1) { return __builtin_lasx_xvsrli_h(_1, 1); } -// CHECK-LABEL: @xvsrli_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrli_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1]], i32 1) 
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrli_w(v8i32 _1) { return __builtin_lasx_xvsrli_w(_1, 1); } -// CHECK-LABEL: @xvsrli_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrli_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrli_d(v4i64 _1) { return __builtin_lasx_xvsrli_d(_1, 1); } -// CHECK-LABEL: @xvsrlr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlr_b(_1, _2); } -// CHECK-LABEL: @xvsrlr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrlr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlr_h(_1, _2); } -// CHECK-LABEL: @xvsrlr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 
32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlr_w(_1, _2); } -// CHECK-LABEL: @xvsrlr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrlr_d(v4i64 _1, v4i64 _2) { 
return __builtin_lasx_xvsrlr_d(_1, _2); } -// CHECK-LABEL: @xvsrlri_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlri_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlri_b(v32i8 _1) { return __builtin_lasx_xvsrlri_b(_1, 1); } -// CHECK-LABEL: @xvsrlri_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlri_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrlri_h(v16i16 _1) { return __builtin_lasx_xvsrlri_h(_1, 1); } -// CHECK-LABEL: @xvsrlri_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlri_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlri_w(v8i32 _1) { return __builtin_lasx_xvsrlri_w(_1, 1); } -// CHECK-LABEL: @xvsrlri_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlri_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrlri_d(v4i64 _1) { return __builtin_lasx_xvsrlri_d(_1, 1); } -// CHECK-LABEL: @xvbitclr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, 
ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitclr_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitclr_b(_1, _2); } -// CHECK-LABEL: @xvbitclr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitclr_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitclr_h(_1, _2); } -// CHECK-LABEL: @xvbitclr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitclr_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitclr_w(_1, _2); } -// CHECK-LABEL: @xvbitclr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitclr_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitclr_d(_1, _2); } -// CHECK-LABEL: @xvbitclri_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclri_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitclri_b(v32u8 _1) { return __builtin_lasx_xvbitclri_b(_1, 1); } -// CHECK-LABEL: @xvbitclri_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclri_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitclri_h(v16u16 _1) { return __builtin_lasx_xvbitclri_h(_1, 1); } -// CHECK-LABEL: @xvbitclri_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclri_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitclri_w(v8u32 _1) { return __builtin_lasx_xvbitclri_w(_1, 1); } -// CHECK-LABEL: @xvbitclri_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitclri_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitclri_d(v4u64 _1) { return __builtin_lasx_xvbitclri_d(_1, 1); } -// CHECK-LABEL: @xvbitset_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitset_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitset_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitset_b(_1, _2); } -// CHECK-LABEL: @xvbitset_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvbitset_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitset_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitset_h(_1, _2); } -// CHECK-LABEL: @xvbitset_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitset_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> 
[[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitset_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitset_w(_1, _2); } -// CHECK-LABEL: @xvbitset_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitset_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitset_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitset_d(_1, _2); } -// CHECK-LABEL: @xvbitseti_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseti_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitseti_b(v32u8 _1) { return __builtin_lasx_xvbitseti_b(_1, 1); } -// CHECK-LABEL: @xvbitseti_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseti_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitseti_h(v16u16 _1) { return __builtin_lasx_xvbitseti_h(_1, 1); } -// CHECK-LABEL: @xvbitseti_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseti_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> 
[[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitseti_w(v8u32 _1) { return __builtin_lasx_xvbitseti_w(_1, 1); } -// CHECK-LABEL: @xvbitseti_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseti_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitseti_d(v4u64 _1) { return __builtin_lasx_xvbitseti_d(_1, 1); } -// CHECK-LABEL: @xvbitrev_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrev_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitrev_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitrev_b(_1, _2); } -// CHECK-LABEL: @xvbitrev_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrev_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitrev_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitrev_h(_1, _2); } -// CHECK-LABEL: @xvbitrev_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrev_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitrev_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitrev_w(_1, _2); } -// CHECK-LABEL: @xvbitrev_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrev_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 
xvbitrev_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitrev_d(_1, _2); } -// CHECK-LABEL: @xvbitrevi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrevi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitrevi_b(v32u8 _1) { return __builtin_lasx_xvbitrevi_b(_1, 1); } -// CHECK-LABEL: @xvbitrevi_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrevi_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvbitrevi_h(v16u16 _1) { return __builtin_lasx_xvbitrevi_h(_1, 1); } -// 
CHECK-LABEL: @xvbitrevi_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrevi_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvbitrevi_w(v8u32 _1) { return __builtin_lasx_xvbitrevi_w(_1, 1); } -// CHECK-LABEL: @xvbitrevi_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitrevi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvbitrevi_d(v4u64 _1) { return __builtin_lasx_xvbitrevi_d(_1, 1); } -// CHECK-LABEL: @xvadd_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvadd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvadd_b(_1, _2); } -// CHECK-LABEL: @xvadd_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_1]], <16 x 
i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvadd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvadd_h(_1, _2); } -// CHECK-LABEL: @xvadd_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvadd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvadd_w(_1, _2); } -// CHECK-LABEL: @xvadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvadd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadd_d(_1, _2); } -// CHECK-LABEL: @xvaddi_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddi_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvaddi_bu(v32i8 _1) { return __builtin_lasx_xvaddi_bu(_1, 1); } -// CHECK-LABEL: @xvaddi_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddi_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddi_hu(v16i16 _1) { return __builtin_lasx_xvaddi_hu(_1, 1); } -// CHECK-LABEL: @xvaddi_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddi_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddi_wu(v8i32 _1) { return __builtin_lasx_xvaddi_wu(_1, 1); } -// CHECK-LABEL: @xvaddi_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddi_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddi_du(v4i64 _1) { return __builtin_lasx_xvaddi_du(_1, 1); } -// CHECK-LABEL: @xvsub_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsub_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsub_b(_1, _2); } -// CHECK-LABEL: @xvsub_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_h( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsub_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsub_h(_1, _2); } -// CHECK-LABEL: @xvsub_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsub_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsub_w(_1, _2); } -// CHECK-LABEL: @xvsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsub_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsub_d(_1, _2); } -// CHECK-LABEL: @xvsubi_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubi_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 
x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsubi_bu(v32i8 _1) { return __builtin_lasx_xvsubi_bu(_1, 1); } -// CHECK-LABEL: @xvsubi_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubi_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubi_hu(v16i16 _1) { return __builtin_lasx_xvsubi_hu(_1, 1); } -// CHECK-LABEL: @xvsubi_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubi_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], 
ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubi_wu(v8i32 _1) { return __builtin_lasx_xvsubi_wu(_1, 1); } -// CHECK-LABEL: @xvsubi_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubi_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubi_du(v4i64 _1) { return __builtin_lasx_xvsubi_du(_1, 1); } -// CHECK-LABEL: @xvmax_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// 
CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmax_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmax_b(_1, _2); } -// CHECK-LABEL: @xvmax_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmax_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmax_h(_1, _2); } -// CHECK-LABEL: @xvmax_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmax_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmax_w(_1, _2); } -// CHECK-LABEL: @xvmax_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmax_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmax_d(_1, _2); } -// CHECK-LABEL: @xvmaxi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmaxi_b(v32i8 _1) { return __builtin_lasx_xvmaxi_b(_1, 1); } -// CHECK-LABEL: @xvmaxi_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaxi_h(v16i16 _1) { return __builtin_lasx_xvmaxi_h(_1, 1); } -// CHECK-LABEL: @xvmaxi_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_w( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaxi_w(v8i32 _1) { return __builtin_lasx_xvmaxi_w(_1, 1); } -// CHECK-LABEL: @xvmaxi_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaxi_d(v4i64 _1) { return __builtin_lasx_xvmaxi_d(_1, 1); } -// CHECK-LABEL: @xvmax_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmax_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmax_bu(_1, _2); } -// CHECK-LABEL: @xvmax_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void 
// v16u16 xvmax_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmax_hu(_1, _2); } -// CHECK-LABEL: @xvmax_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmax_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmax_wu(_1, _2); } -// CHECK-LABEL: @xvmax_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmax_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmax_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmax_du(_1, _2); } -// CHECK-LABEL: @xvmaxi_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmaxi_bu(v32u8 _1) { return __builtin_lasx_xvmaxi_bu(_1, 1); } -// CHECK-LABEL: @xvmaxi_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmaxi_hu(v16u16 _1) { return __builtin_lasx_xvmaxi_hu(_1, 1); } -// CHECK-LABEL: @xvmaxi_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmaxi_wu(v8u32 _1) { return __builtin_lasx_xvmaxi_wu(_1, 1); } -// CHECK-LABEL: @xvmaxi_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaxi_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1]], i32 1) -// 
CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaxi_du(v4u64 _1) { return __builtin_lasx_xvmaxi_du(_1, 1); } -// CHECK-LABEL: @xvmin_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmin_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmin_b(_1, _2); } -// CHECK-LABEL: @xvmin_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmin_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmin_h(_1, _2); } -// CHECK-LABEL: @xvmin_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmin_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmin_w(_1, _2); } -// CHECK-LABEL: @xvmin_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmin_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmin_d(_1, _2); } -// CHECK-LABEL: @xvmini_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmini_b(v32i8 _1) { return 
__builtin_lasx_xvmini_b(_1, 1); } -// CHECK-LABEL: @xvmini_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmini_h(v16i16 _1) { return __builtin_lasx_xvmini_h(_1, 1); } -// CHECK-LABEL: @xvmini_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmini_w(v8i32 _1) { return __builtin_lasx_xvmini_w(_1, 1); } -// CHECK-LABEL: @xvmini_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load 
<4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmini_d(v4i64 _1) { return __builtin_lasx_xvmini_d(_1, 1); } -// CHECK-LABEL: @xvmin_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmin_bu(v32u8 _1, 
v32u8 _2) { return __builtin_lasx_xvmin_bu(_1, _2); } -// CHECK-LABEL: @xvmin_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmin_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmin_hu(_1, _2); } -// CHECK-LABEL: @xvmin_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] 
= load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmin_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmin_wu(_1, _2); } -// CHECK-LABEL: @xvmin_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmin_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmin_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmin_du(_1, _2); } -// CHECK-LABEL: @xvmini_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmini_bu(v32u8 _1) { return __builtin_lasx_xvmini_bu(_1, 1); } -// CHECK-LABEL: @xvmini_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmini_hu(v16u16 _1) { return __builtin_lasx_xvmini_hu(_1, 1); } -// CHECK-LABEL: @xvmini_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmini_wu(v8u32 _1) { return __builtin_lasx_xvmini_wu(_1, 1); } -// CHECK-LABEL: @xvmini_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmini_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmini_du(v4u64 _1) { return __builtin_lasx_xvmini_du(_1, 1); } -// CHECK-LABEL: @xvseq_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseq_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvseq_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvseq_b(_1, _2); } -// CHECK-LABEL: @xvseq_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseq_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvseq_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvseq_h(_1, _2); } -// CHECK-LABEL: @xvseq_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseq_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvseq_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvseq_w(_1, _2); } -// CHECK-LABEL: @xvseq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 
x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvseq_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvseq_d(_1, _2); } -// CHECK-LABEL: @xvseqi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseqi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvseqi_b(v32i8 _1) { return __builtin_lasx_xvseqi_b(_1, 1); } -// CHECK-LABEL: @xvseqi_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseqi_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvseqi_h(v16i16 _1) { return __builtin_lasx_xvseqi_h(_1, 1); } -// CHECK-LABEL: @xvseqi_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseqi_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvseqi_w(v8i32 _1) { return __builtin_lasx_xvseqi_w(_1, 1); } -// CHECK-LABEL: @xvseqi_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvseqi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvseqi_d(v4i64 _1) { return 
__builtin_lasx_xvseqi_d(_1, 1); } -// CHECK-LABEL: @xvslt_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslt_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvslt_b(_1, _2); } -// CHECK-LABEL: @xvslt_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslt_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvslt_h(_1, _2); } -// CHECK-LABEL: @xvslt_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslt_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvslt_w(_1, _2); } -// CHECK-LABEL: @xvslt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 
32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslt_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvslt_d(_1, _2); } -// CHECK-LABEL: @xvslti_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslti_b(v32i8 _1) { return __builtin_lasx_xvslti_b(_1, 1); } -// CHECK-LABEL: @xvslti_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_h( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslti_h(v16i16 _1) { return __builtin_lasx_xvslti_h(_1, 1); } -// CHECK-LABEL: @xvslti_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslti_w(v8i32 _1) { return __builtin_lasx_xvslti_w(_1, 1); } -// CHECK-LABEL: @xvslti_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslti_d(v4i64 _1) { return __builtin_lasx_xvslti_d(_1, 1); } -// CHECK-LABEL: @xvslt_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslt_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvslt_bu(_1, _2); } -// CHECK-LABEL: @xvslt_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslt_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvslt_hu(_1, _2); } -// CHECK-LABEL: @xvslt_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslt_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvslt_wu(_1, _2); } -// CHECK-LABEL: @xvslt_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslt_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslt_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvslt_du(_1, _2); } -// CHECK-LABEL: @xvslti_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslti_bu(v32u8 _1) { return __builtin_lasx_xvslti_bu(_1, 1); } -// CHECK-LABEL: @xvslti_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslti_hu(v16u16 _1) { return __builtin_lasx_xvslti_hu(_1, 1); } -// CHECK-LABEL: @xvslti_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: 
store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslti_wu(v8u32 _1) { return __builtin_lasx_xvslti_wu(_1, 1); } -// CHECK-LABEL: @xvslti_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslti_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslti_du(v4u64 _1) { return __builtin_lasx_xvslti_du(_1, 1); } -// CHECK-LABEL: @xvsle_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] 
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsle_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsle_b(_1, _2); } -// CHECK-LABEL: @xvsle_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsle_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsle_h(_1, _2); } -// CHECK-LABEL: @xvsle_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsle_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsle_w(_1, _2); } -// CHECK-LABEL: @xvsle_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsle_d(v4i64 _1, v4i64 _2) { return 
__builtin_lasx_xvsle_d(_1, _2); } -// CHECK-LABEL: @xvslei_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslei_b(v32i8 _1) { return __builtin_lasx_xvslei_b(_1, 1); } -// CHECK-LABEL: @xvslei_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslei_h(v16i16 _1) { return __builtin_lasx_xvslei_h(_1, 1); } -// CHECK-LABEL: @xvslei_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load 
<8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslei_w(v8i32 _1) { return __builtin_lasx_xvslei_w(_1, 1); } -// CHECK-LABEL: @xvslei_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslei_d(v4i64 _1) { return __builtin_lasx_xvslei_d(_1, 1); } -// CHECK-LABEL: @xvsle_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsle_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsle_bu(_1, _2); } -// CHECK-LABEL: @xvsle_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] 
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsle_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsle_hu(_1, _2); } -// CHECK-LABEL: @xvsle_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsle_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsle_wu(_1, _2); } -// CHECK-LABEL: @xvsle_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsle_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsle_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsle_du(_1, _2); } -// CHECK-LABEL: @xvslei_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvslei_bu(v32u8 _1) { return __builtin_lasx_xvslei_bu(_1, 1); } -// CHECK-LABEL: @xvslei_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvslei_hu(v16u16 _1) { return __builtin_lasx_xvslei_hu(_1, 1); } -// CHECK-LABEL: @xvslei_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvslei_wu(v8u32 _1) { return __builtin_lasx_xvslei_wu(_1, 1); } -// CHECK-LABEL: @xvslei_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvslei_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvslei_du(v4u64 _1) { return __builtin_lasx_xvslei_du(_1, 1); } -// CHECK-LABEL: @xvsat_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsat_b(v32i8 _1) { return __builtin_lasx_xvsat_b(_1, 1); } -// CHECK-LABEL: @xvsat_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1]], i32 1) -// 
CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsat_h(v16i16 _1) { return __builtin_lasx_xvsat_h(_1, 1); } -// CHECK-LABEL: @xvsat_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsat_w(v8i32 _1) { return __builtin_lasx_xvsat_w(_1, 1); } -// CHECK-LABEL: @xvsat_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsat_d(v4i64 _1) { return __builtin_lasx_xvsat_d(_1, 1); } -// CHECK-LABEL: @xvsat_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvsat_bu(v32u8 _1) { return __builtin_lasx_xvsat_bu(_1, 1); } -// CHECK-LABEL: @xvsat_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvsat_hu(v16u16 _1) { return 
__builtin_lasx_xvsat_hu(_1, 1); } -// CHECK-LABEL: @xvsat_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvsat_wu(v8u32 _1) { return __builtin_lasx_xvsat_wu(_1, 1); } -// CHECK-LABEL: @xvsat_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsat_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvsat_du(v4u64 _1) { return __builtin_lasx_xvsat_du(_1, 1); } -// CHECK-LABEL: @xvadda_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, 
ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadda_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvadda_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvadda_b(_1, _2); } -// CHECK-LABEL: @xvadda_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadda_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> 
[[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvadda_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvadda_h(_1, _2); } -// CHECK-LABEL: @xvadda_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadda_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvadda_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvadda_w(_1, _2); } -// CHECK-LABEL: @xvadda_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadda_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvadda_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadda_d(_1, _2); } -// CHECK-LABEL: @xvsadd_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsadd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsadd_b(_1, _2); } -// CHECK-LABEL: @xvsadd_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsadd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsadd_h(_1, _2); } -// CHECK-LABEL: @xvsadd_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsadd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsadd_w(_1, _2); } -// CHECK-LABEL: @xvsadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsadd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsadd_d(_1, _2); } -// CHECK-LABEL: @xvsadd_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvsadd_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsadd_bu(_1, _2); } -// CHECK-LABEL: @xvsadd_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvsadd_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsadd_hu(_1, _2); } 
-// CHECK-LABEL: @xvsadd_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvsadd_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsadd_wu(_1, _2); } -// CHECK-LABEL: @xvsadd_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsadd_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvsadd_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsadd_du(_1, _2); } -// CHECK-LABEL: @xvavg_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvavg_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvavg_b(_1, _2); } -// CHECK-LABEL: @xvavg_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvavg_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvavg_h(_1, _2); } -// CHECK-LABEL: @xvavg_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvavg_w(v8i32 _1, v8i32 _2) { 
return __builtin_lasx_xvavg_w(_1, _2); } -// CHECK-LABEL: @xvavg_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvavg_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvavg_d(_1, _2); } -// CHECK-LABEL: @xvavg_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvavg_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvavg_bu(_1, _2); } -// CHECK-LABEL: @xvavg_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvavg_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvavg_hu(_1, _2); } -// CHECK-LABEL: @xvavg_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvavg_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvavg_wu(_1, _2); } -// CHECK-LABEL: @xvavg_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavg_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 
xvavg_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvavg_du(_1, _2); } -// CHECK-LABEL: @xvavgr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvavgr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvavgr_b(_1, _2); } -// CHECK-LABEL: @xvavgr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvavgr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvavgr_h(_1, _2); } -// CHECK-LABEL: @xvavgr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvavgr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvavgr_w(_1, _2); } -// CHECK-LABEL: @xvavgr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_d( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvavgr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvavgr_d(_1, _2); } -// CHECK-LABEL: @xvavgr_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] 
// CHECK-NEXT: ret void // v32u8 xvavgr_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvavgr_bu(_1, _2); } -// CHECK-LABEL: @xvavgr_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvavgr_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvavgr_hu(_1, _2); } -// CHECK-LABEL: @xvavgr_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvavgr_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvavgr_wu(_1, _2); } -// CHECK-LABEL: @xvavgr_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvavgr_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvavgr_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvavgr_du(_1, _2); } -// CHECK-LABEL: @xvssub_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define 
dso_local void @xvssub_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssub_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssub_b(_1, _2); } -// CHECK-LABEL: @xvssub_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> 
[[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssub_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssub_h(_1, _2); } -// CHECK-LABEL: @xvssub_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssub_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssub_w(_1, _2); } -// CHECK-LABEL: @xvssub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssub_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssub_d(_1, _2); } -// CHECK-LABEL: @xvssub_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssub_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvssub_bu(_1, _2); } -// CHECK-LABEL: @xvssub_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssub_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssub_hu(_1, _2); } -// CHECK-LABEL: @xvssub_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssub_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssub_wu(_1, _2); } -// CHECK-LABEL: @xvssub_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssub_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssub_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssub_du(_1, _2); } -// CHECK-LABEL: @xvabsd_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvabsd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvabsd_b(_1, _2); } -// CHECK-LABEL: @xvabsd_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvabsd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvabsd_h(_1, _2); } -// CHECK-LABEL: @xvabsd_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: 
[[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvabsd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvabsd_w(_1, _2); } -// CHECK-LABEL: @xvabsd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], 
ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvabsd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvabsd_d(_1, _2); } -// CHECK-LABEL: @xvabsd_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvabsd_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvabsd_bu(_1, _2); } -// CHECK-LABEL: @xvabsd_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvabsd_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvabsd_hu(_1, _2); } -// CHECK-LABEL: @xvabsd_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvabsd_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvabsd_wu(_1, _2); } -// CHECK-LABEL: @xvabsd_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], 
align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvabsd_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvabsd_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvabsd_du(_1, _2); } -// CHECK-LABEL: @xvmul_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmul_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) 
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmul_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmul_b(_1, _2); } -// CHECK-LABEL: @xvmul_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmul_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmul_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmul_h(_1, _2); } -// CHECK-LABEL: @xvmul_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmul_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmul_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmul_w(_1, _2); } -// CHECK-LABEL: @xvmul_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmul_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmul_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmul_d(_1, _2); } -// CHECK-LABEL: @xvmadd_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, 
ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmadd_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmadd_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmadd_b(_1, _2, _3); } -// CHECK-LABEL: @xvmadd_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmadd_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmadd_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmadd_h(_1, _2, _3); } -// CHECK-LABEL: @xvmadd_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmadd_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 
x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmadd_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmadd_w(_1, _2, _3); } -// CHECK-LABEL: @xvmadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmadd_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmadd_d(_1, _2, _3); } -// CHECK-LABEL: @xvmsub_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] 
= load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsub_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmsub_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmsub_b(_1, _2, _3); } -// CHECK-LABEL: @xvmsub_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsub_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmsub_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmsub_h(_1, _2, _3); } -// CHECK-LABEL: @xvmsub_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsub_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// 
CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmsub_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmsub_w(_1, _2, _3); } -// CHECK-LABEL: @xvmsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmsub_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmsub_d(_1, _2, _3); } -// CHECK-LABEL: @xvdiv_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvdiv_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvdiv_b(_1, _2); } -// CHECK-LABEL: @xvdiv_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvdiv_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvdiv_h(_1, _2); } -// CHECK-LABEL: @xvdiv_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvdiv_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvdiv_w(_1, _2); } -// CHECK-LABEL: @xvdiv_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvdiv_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvdiv_d(_1, _2); } -// CHECK-LABEL: @xvdiv_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvdiv_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvdiv_bu(_1, _2); } -// CHECK-LABEL: @xvdiv_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvdiv_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvdiv_hu(_1, _2); } -// CHECK-LABEL: @xvdiv_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvdiv_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvdiv_wu(_1, _2); } -// CHECK-LABEL: @xvdiv_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvdiv_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvdiv_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvdiv_du(_1, _2); } -// CHECK-LABEL: @xvhaddw_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvhaddw_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvhaddw_h_b(_1, _2); } -// CHECK-LABEL: @xvhaddw_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvhaddw_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvhaddw_w_h(_1, _2); } -// CHECK-LABEL: @xvhaddw_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhaddw_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvhaddw_d_w(_1, _2); } -// CHECK-LABEL: @xvhaddw_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvhaddw_hu_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvhaddw_hu_bu(_1, _2); } -// CHECK-LABEL: @xvhaddw_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvhaddw_wu_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvhaddw_wu_hu(_1, _2); } -// CHECK-LABEL: @xvhaddw_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvhaddw_du_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvhaddw_du_wu(_1, _2); } -// CHECK-LABEL: @xvhsubw_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvhsubw_h_b(v32i8 _1, 
v32i8 _2) { return __builtin_lasx_xvhsubw_h_b(_1, _2); } -// CHECK-LABEL: @xvhsubw_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvhsubw_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvhsubw_w_h(_1, _2); } -// CHECK-LABEL: @xvhsubw_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhsubw_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvhsubw_d_w(_1, _2); } -// CHECK-LABEL: @xvhsubw_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvhsubw_hu_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvhsubw_hu_bu(_1, _2); } -// CHECK-LABEL: @xvhsubw_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define 
dso_local void @xvhsubw_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvhsubw_wu_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvhsubw_wu_hu(_1, _2); } -// CHECK-LABEL: @xvhsubw_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhsubw_du_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvhsubw_du_wu(_1, _2); } -// CHECK-LABEL: @xvmod_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmod_b(_1, _2); } -// CHECK-LABEL: @xvmod_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmod_h(_1, _2); } -// CHECK-LABEL: @xvmod_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmod_w(_1, _2); } -// CHECK-LABEL: @xvmod_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, 
ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmod_d(_1, _2); } -// CHECK-LABEL: @xvmod_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmod_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmod_bu(_1, _2); } -// CHECK-LABEL: @xvmod_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmod_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmod_hu(_1, _2); } -// CHECK-LABEL: @xvmod_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmod_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmod_wu(_1, _2); } -// CHECK-LABEL: @xvmod_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmod_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmod_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmod_du(_1, _2); } -// CHECK-LABEL: @xvrepl128vei_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: 
define dso_local void @xvrepl128vei_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvrepl128vei_b(v32i8 _1) { return __builtin_lasx_xvrepl128vei_b(_1, 1); } -// CHECK-LABEL: @xvrepl128vei_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrepl128vei_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvrepl128vei_h(v16i16 _1) { return __builtin_lasx_xvrepl128vei_h(_1, 1); } -// CHECK-LABEL: @xvrepl128vei_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrepl128vei_w( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvrepl128vei_w(v8i32 _1) { return __builtin_lasx_xvrepl128vei_w(_1, 1); } -// CHECK-LABEL: @xvrepl128vei_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrepl128vei_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvrepl128vei_d(v4i64 _1) { return __builtin_lasx_xvrepl128vei_d(_1, 1); } -// CHECK-LABEL: @xvpickev_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickev_b( +// 
CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpickev_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpickev_b(_1, _2); } -// CHECK-LABEL: @xvpickev_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickev_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpickev_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpickev_h(_1, _2); } -// CHECK-LABEL: @xvpickev_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickev_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpickev_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpickev_w(_1, _2); } -// CHECK-LABEL: @xvpickev_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickev_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpickev_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpickev_d(_1, _2); } -// CHECK-LABEL: @xvpickod_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickod_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpickod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpickod_b(_1, _2); } -// CHECK-LABEL: @xvpickod_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickod_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpickod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpickod_h(_1, _2); } -// CHECK-LABEL: @xvpickod_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickod_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpickod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpickod_w(_1, _2); } -// CHECK-LABEL: @xvpickod_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickod_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpickod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpickod_d(_1, _2); } -// CHECK-LABEL: @xvilvh_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvh_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvilvh_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvilvh_b(_1, _2); } -// CHECK-LABEL: @xvilvh_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvh_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvilvh_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvilvh_h(_1, _2); } -// CHECK-LABEL: @xvilvh_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvh_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvilvh_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvilvh_w(_1, _2); } -// CHECK-LABEL: @xvilvh_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvh_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// 
CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvilvh_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvilvh_d(_1, _2); } -// CHECK-LABEL: @xvilvl_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvl_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvilvl_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvilvl_b(_1, _2); } -// CHECK-LABEL: @xvilvl_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvl_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvilvl_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvilvl_h(_1, _2); } -// CHECK-LABEL: @xvilvl_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvl_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvilvl_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvilvl_w(_1, _2); } -// CHECK-LABEL: @xvilvl_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x 
i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvilvl_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvilvl_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvilvl_d(_1, _2); } -// CHECK-LABEL: @xvpackev_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackev_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x 
i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpackev_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpackev_b(_1, _2); } -// CHECK-LABEL: @xvpackev_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackev_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpackev_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpackev_h(_1, _2); } -// CHECK-LABEL: @xvpackev_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackev_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpackev_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpackev_w(_1, _2); } -// CHECK-LABEL: @xvpackev_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackev_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpackev_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpackev_d(_1, _2); } -// CHECK-LABEL: 
@xvpackod_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackod_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpackod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpackod_b(_1, _2); } -// CHECK-LABEL: @xvpackod_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackod_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpackod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpackod_h(_1, _2); } -// CHECK-LABEL: @xvpackod_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackod_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpackod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpackod_w(_1, _2); } -// CHECK-LABEL: @xvpackod_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpackod_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpackod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpackod_d(_1, _2); } -// CHECK-LABEL: @xvshuf_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 
x i8> [[_3]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvshuf_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvshuf_b(_1, _2, _3); } -// CHECK-LABEL: @xvshuf_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvshuf_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvshuf_h(_1, _2, _3); } -// CHECK-LABEL: @xvshuf_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, 
ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvshuf_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvshuf_w(_1, _2, _3); } -// CHECK-LABEL: @xvshuf_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvshuf_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvshuf_d(_1, _2, _3); } -// CHECK-LABEL: @xvand_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvand_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvand_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvand_v(_1, _2); } -// CHECK-LABEL: @xvandi_b( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvandi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvandi_b(v32u8 _1) { return __builtin_lasx_xvandi_b(_1, 1); } -// CHECK-LABEL: @xvor_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvor_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v32u8 xvor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvor_v(_1, _2); } -// CHECK-LABEL: @xvori_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvori_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvori_b(v32u8 _1) { return __builtin_lasx_xvori_b(_1, 1); } -// CHECK-LABEL: @xvnor_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvnor_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvnor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvnor_v(_1, _2); } -// CHECK-LABEL: @xvnori_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvnori_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvnori_b(v32u8 _1) { return __builtin_lasx_xvnori_b(_1, 1); } -// CHECK-LABEL: @xvxor_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvxor_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = 
tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvxor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvxor_v(_1, _2); } -// CHECK-LABEL: @xvxori_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvxori_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvxori_b(v32u8 _1) { return __builtin_lasx_xvxori_b(_1, 1); } -// CHECK-LABEL: @xvbitsel_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitsel_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitsel_v(v32u8 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvbitsel_v(_1, _2, _3); } -// CHECK-LABEL: @xvbitseli_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbitseli_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvbitseli_b(v32u8 _1, v32u8 _2) { return 
__builtin_lasx_xvbitseli_b(_1, _2, 1); } -// CHECK-LABEL: @xvshuf4i_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf4i_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvshuf4i_b(v32i8 _1) { return __builtin_lasx_xvshuf4i_b(_1, 1); } -// CHECK-LABEL: @xvshuf4i_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf4i_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvshuf4i_h(v16i16 _1) { return __builtin_lasx_xvshuf4i_h(_1, 1); } -// CHECK-LABEL: @xvshuf4i_w( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf4i_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvshuf4i_w(v8i32 _1) { return __builtin_lasx_xvshuf4i_w(_1, 1); } -// CHECK-LABEL: @xvreplgr2vr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1:%.*]]) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplgr2vr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1]]) +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvreplgr2vr_b(int _1) { return __builtin_lasx_xvreplgr2vr_b(_1); } -// CHECK-LABEL: @xvreplgr2vr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1:%.*]]) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: 
define dso_local void @xvreplgr2vr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1]]) +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvreplgr2vr_h(int _1) { return __builtin_lasx_xvreplgr2vr_h(_1); } -// CHECK-LABEL: @xvreplgr2vr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1:%.*]]) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplgr2vr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1]]) +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvreplgr2vr_w(int _1) { return __builtin_lasx_xvreplgr2vr_w(_1); } -// CHECK-LABEL: @xvreplgr2vr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1:%.*]] to i64 +// CHECK-LABEL: define dso_local void @xvreplgr2vr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1]] to i64 // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 [[CONV]]) -// CHECK-NEXT: store <4 x i64> 
[[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvreplgr2vr_d(int _1) { return __builtin_lasx_xvreplgr2vr_d(_1); } -// CHECK-LABEL: @xvpcnt_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpcnt_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpcnt_b(v32i8 _1) { return __builtin_lasx_xvpcnt_b(_1); } -// CHECK-LABEL: @xvpcnt_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpcnt_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvpcnt_h(v16i16 _1) { return __builtin_lasx_xvpcnt_h(_1); } -// CHECK-LABEL: @xvpcnt_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpcnt_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpcnt_w(v8i32 _1) { return __builtin_lasx_xvpcnt_w(_1); } -// CHECK-LABEL: @xvpcnt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpcnt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpcnt_d(v4i64 _1) { return __builtin_lasx_xvpcnt_d(_1); } -// CHECK-LABEL: @xvclo_b( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclo_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvclo_b(v32i8 _1) { return __builtin_lasx_xvclo_b(_1); } -// CHECK-LABEL: @xvclo_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclo_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvclo_h(v16i16 _1) { return __builtin_lasx_xvclo_h(_1); } -// CHECK-LABEL: @xvclo_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvclo_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvclo_w(v8i32 _1) { return __builtin_lasx_xvclo_w(_1); } -// CHECK-LABEL: @xvclo_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclo_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvclo_d(v4i64 _1) { return __builtin_lasx_xvclo_d(_1); } -// CHECK-LABEL: @xvclz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclz_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvclz_b(v32i8 _1) { return __builtin_lasx_xvclz_b(_1); } -// CHECK-LABEL: @xvclz_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclz_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvclz_h(v16i16 _1) { return __builtin_lasx_xvclz_h(_1); } -// CHECK-LABEL: @xvclz_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclz_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] 
= load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvclz_w(v8i32 _1) { return __builtin_lasx_xvclz_w(_1); } -// CHECK-LABEL: @xvclz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvclz_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvclz_d(v4i64 _1) { return __builtin_lasx_xvclz_d(_1); } -// CHECK-LABEL: @xvfadd_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfadd_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfadd_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfadd_s(_1, _2); } -// CHECK-LABEL: @xvfadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfadd_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfadd_d(_1, _2); } -// CHECK-LABEL: @xvfsub_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x 
float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfsub_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfsub_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfsub_s(_1, _2); } -// CHECK-LABEL: @xvfsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 
x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfsub_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfsub_d(_1, _2); } -// CHECK-LABEL: @xvfmul_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmul_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmul_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmul_s(_1, _2); } -// CHECK-LABEL: @xvfmul_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmul_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmul_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmul_d(_1, _2); } -// CHECK-LABEL: @xvfdiv_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfdiv_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfdiv_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfdiv_s(_1, _2); } -// CHECK-LABEL: @xvfdiv_d( -// CHECK-NEXT: 
entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfdiv_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfdiv_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfdiv_d(_1, _2); } -// CHECK-LABEL: @xvfcvt_h_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvt_h_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvfcvt_h_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcvt_h_s(_1, _2); } -// CHECK-LABEL: @xvfcvt_s_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvt_s_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfcvt_s_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcvt_s_d(_1, _2); } -// CHECK-LABEL: @xvfmin_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmin_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly 
sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmin_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmin_s(_1, _2); } -// CHECK-LABEL: @xvfmin_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmin_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmin_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmin_d(_1, _2); } -// CHECK-LABEL: @xvfmina_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmina_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmina_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmina_s(_1, _2); } -// CHECK-LABEL: @xvfmina_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmina_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] 
= load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmina_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmina_d(_1, _2); } -// CHECK-LABEL: @xvfmax_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmax_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmax_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmax_s(_1, _2); } -// CHECK-LABEL: @xvfmax_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmax_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmax_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmax_d(_1, _2); } -// CHECK-LABEL: @xvfmaxa_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmaxa_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> 
[[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmaxa_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmaxa_s(_1, _2); } -// CHECK-LABEL: @xvfmaxa_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmaxa_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmaxa_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmaxa_d(_1, _2); } -// CHECK-LABEL: @xvfclass_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfclass_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load 
<8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfclass_s(v8f32 _1) { return __builtin_lasx_xvfclass_s(_1); } -// CHECK-LABEL: @xvfclass_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfclass_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfclass_d(v4f64 _1) { return __builtin_lasx_xvfclass_d(_1); } -// CHECK-LABEL: @xvfsqrt_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfsqrt_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> 
@llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfsqrt_s(v8f32 _1) { return __builtin_lasx_xvfsqrt_s(_1); } -// CHECK-LABEL: @xvfsqrt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfsqrt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfsqrt_d(v4f64 _1) { return __builtin_lasx_xvfsqrt_d(_1); } -// CHECK-LABEL: @xvfrecip_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrecip_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrecip_s(v8f32 _1) { return __builtin_lasx_xvfrecip_s(_1); } -// CHECK-LABEL: @xvfrecip_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrecip_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrecip_d(v4f64 _1) { return __builtin_lasx_xvfrecip_d(_1); } -// CHECK-LABEL: @xvfrint_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrint_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrint_s(v8f32 _1) { return __builtin_lasx_xvfrint_s(_1); } -// CHECK-LABEL: @xvfrint_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrint_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrint_d(v4f64 _1) { return __builtin_lasx_xvfrint_d(_1); } -// CHECK-LABEL: @xvfrsqrt_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrsqrt_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfrsqrt_s(v8f32 _1) 
{ return __builtin_lasx_xvfrsqrt_s(_1); } -// CHECK-LABEL: @xvfrsqrt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrsqrt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfrsqrt_d(v4f64 _1) { return __builtin_lasx_xvfrsqrt_d(_1); } -// CHECK-LABEL: @xvflogb_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvflogb_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvflogb_s(v8f32 _1) { return __builtin_lasx_xvflogb_s(_1); } -// CHECK-LABEL: @xvflogb_d( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvflogb_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvflogb_d(v4f64 _1) { return __builtin_lasx_xvflogb_d(_1); } -// CHECK-LABEL: @xvfcvth_s_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvth_s_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfcvth_s_h(v16i16 _1) { return __builtin_lasx_xvfcvth_s_h(_1); } -// CHECK-LABEL: @xvfcvth_d_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvfcvth_d_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfcvth_d_s(v8f32 _1) { return __builtin_lasx_xvfcvth_d_s(_1); } -// CHECK-LABEL: @xvfcvtl_s_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvtl_s_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfcvtl_s_h(v16i16 _1) { return __builtin_lasx_xvfcvtl_s_h(_1); } -// CHECK-LABEL: @xvfcvtl_d_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcvtl_d_s( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfcvtl_d_s(v8f32 _1) { return __builtin_lasx_xvfcvtl_d_s(_1); } -// CHECK-LABEL: @xvftint_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftint_w_s(v8f32 _1) { return __builtin_lasx_xvftint_w_s(_1); } -// CHECK-LABEL: @xvftint_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftint_l_d(v4f64 _1) { return __builtin_lasx_xvftint_l_d(_1); } -// CHECK-LABEL: @xvftint_wu_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_wu_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvftint_wu_s(v8f32 _1) { return __builtin_lasx_xvftint_wu_s(_1); } -// CHECK-LABEL: @xvftint_lu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_lu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvftint_lu_d(v4f64 _1) { return __builtin_lasx_xvftint_lu_d(_1); } -// CHECK-LABEL: @xvftintrz_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrz_w_s(v8f32 _1) { return __builtin_lasx_xvftintrz_w_s(_1); } -// CHECK-LABEL: @xvftintrz_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrz_l_d(v4f64 _1) { return __builtin_lasx_xvftintrz_l_d(_1); } -// CHECK-LABEL: @xvftintrz_wu_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_wu_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvftintrz_wu_s(v8f32 _1) { return __builtin_lasx_xvftintrz_wu_s(_1); } -// CHECK-LABEL: @xvftintrz_lu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_lu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] 
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvftintrz_lu_d(v4f64 _1) { return __builtin_lasx_xvftintrz_lu_d(_1); } -// CHECK-LABEL: @xvffint_s_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_s_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvffint_s_w(v8i32 _1) { return __builtin_lasx_xvffint_s_w(_1); } -// CHECK-LABEL: @xvffint_d_l( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_d_l( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> 
[[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvffint_d_l(v4i64 _1) { return __builtin_lasx_xvffint_d_l(_1); } -// CHECK-LABEL: @xvffint_s_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_s_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvffint_s_wu(v8u32 _1) { return __builtin_lasx_xvffint_s_wu(_1); } -// CHECK-LABEL: @xvffint_d_lu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_d_lu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, 
!tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvffint_d_lu(v4u64 _1) { return __builtin_lasx_xvffint_d_lu(_1); } -// CHECK-LABEL: @xvreplve_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1]], i32 [[_2:%.*]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1]], i32 [[_2]]) +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvreplve_b(v32i8 _1, int _2) { return __builtin_lasx_xvreplve_b(_1, _2); } -// CHECK-LABEL: @xvreplve_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1]], i32 [[_2:%.*]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] 
{ +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1]], i32 [[_2]]) +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvreplve_h(v16i16 _1, int _2) { return __builtin_lasx_xvreplve_h(_1, _2); } -// CHECK-LABEL: @xvreplve_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1]], i32 [[_2:%.*]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1]], i32 [[_2]]) +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvreplve_w(v8i32 _1, int _2) { return __builtin_lasx_xvreplve_w(_1, _2); } -// CHECK-LABEL: @xvreplve_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2:%.*]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve_d( +// CHECK-SAME: ptr dead_on_unwind noalias 
writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvreplve_d(v4i64 _1, int _2) { return __builtin_lasx_xvreplve_d(_1, _2); } -// CHECK-LABEL: @xvpermi_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpermi_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpermi_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpermi_w(_1, _2, 1); } -// CHECK-LABEL: @xvandn_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvandn_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvandn_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvandn_v(_1, _2); } -// CHECK-LABEL: @xvneg_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvneg_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvneg_b(v32i8 _1) { return __builtin_lasx_xvneg_b(_1); 
} -// CHECK-LABEL: @xvneg_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvneg_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvneg_h(v16i16 _1) { return __builtin_lasx_xvneg_h(_1); } -// CHECK-LABEL: @xvneg_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvneg_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvneg_w(v8i32 _1) { return __builtin_lasx_xvneg_w(_1); } -// CHECK-LABEL: @xvneg_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvneg_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvneg_d(v4i64 _1) { return __builtin_lasx_xvneg_d(_1); } -// CHECK-LABEL: @xvmuh_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmuh_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmuh_b(_1, _2); } -// CHECK-LABEL: @xvmuh_h( 
-// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmuh_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmuh_h(_1, _2); } -// CHECK-LABEL: @xvmuh_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
[[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmuh_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmuh_w(_1, _2); } -// CHECK-LABEL: @xvmuh_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmuh_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmuh_d(_1, _2); } -// CHECK-LABEL: @xvmuh_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvmuh_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmuh_bu(_1, _2); } -// CHECK-LABEL: @xvmuh_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmuh_hu(v16u16 _1, v16u16 _2) { return 
__builtin_lasx_xvmuh_hu(_1, _2); } -// CHECK-LABEL: @xvmuh_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmuh_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmuh_wu(_1, _2); } -// CHECK-LABEL: @xvmuh_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmuh_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmuh_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmuh_du(_1, _2); } -// CHECK-LABEL: @xvsllwil_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsllwil_h_b(v32i8 _1) { return __builtin_lasx_xvsllwil_h_b(_1, 1); } -// CHECK-LABEL: @xvsllwil_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call 
<8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsllwil_w_h(v16i16 _1) { return __builtin_lasx_xvsllwil_w_h(_1, 1); } -// CHECK-LABEL: @xvsllwil_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsllwil_d_w(v8i32 _1) { return __builtin_lasx_xvsllwil_d_w(_1, 1); } -// CHECK-LABEL: @xvsllwil_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1]], i32 1) -// 
CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvsllwil_hu_bu(v32u8 _1) { return __builtin_lasx_xvsllwil_hu_bu(_1, 1); } -// CHECK-LABEL: @xvsllwil_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvsllwil_wu_hu(v16u16 _1) { return __builtin_lasx_xvsllwil_wu_hu(_1, 1); } -// CHECK-LABEL: @xvsllwil_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsllwil_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvsllwil_du_wu(v8u32 _1) { return __builtin_lasx_xvsllwil_du_wu(_1, 1); } -// CHECK-LABEL: @xvsran_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsran_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsran_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsran_b_h(_1, _2); } -// CHECK-LABEL: @xvsran_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsran_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsran_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsran_h_w(_1, _2); } -// CHECK-LABEL: @xvsran_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsran_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsran_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsran_w_d(_1, _2); } -// CHECK-LABEL: @xvssran_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssran_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssran_b_h(_1, _2); } -// CHECK-LABEL: @xvssran_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssran_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssran_h_w(_1, _2); } -// CHECK-LABEL: @xvssran_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssran_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssran_w_d(_1, _2); } -// CHECK-LABEL: @xvssran_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssran_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssran_bu_h(_1, _2); } -// CHECK-LABEL: @xvssran_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssran_hu_w(v8u32 _1, v8u32 _2) { 
return __builtin_lasx_xvssran_hu_w(_1, _2); } -// CHECK-LABEL: @xvssran_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssran_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssran_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssran_wu_d(_1, _2); } -// CHECK-LABEL: @xvsrarn_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarn_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrarn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrarn_b_h(_1, _2); } -// CHECK-LABEL: @xvsrarn_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarn_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrarn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrarn_h_w(_1, _2); } -// CHECK-LABEL: @xvsrarn_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvsrarn_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrarn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrarn_w_d(_1, _2); } -// CHECK-LABEL: @xvssrarn_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> 
[[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrarn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrarn_b_h(_1, _2); } -// CHECK-LABEL: @xvssrarn_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrarn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrarn_h_w(_1, _2); } -// CHECK-LABEL: @xvssrarn_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrarn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrarn_w_d(_1, _2); } -// CHECK-LABEL: @xvssrarn_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrarn_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrarn_bu_h(_1, _2); } -// CHECK-LABEL: @xvssrarn_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrarn_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrarn_hu_w(_1, _2); } -// CHECK-LABEL: @xvssrarn_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarn_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> 
[[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrarn_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrarn_wu_d(_1, _2); } -// CHECK-LABEL: @xvsrln_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrln_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrln_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrln_b_h(_1, _2); } -// CHECK-LABEL: @xvsrln_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrln_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef 
readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrln_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrln_h_w(_1, _2); } -// CHECK-LABEL: @xvsrln_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrln_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrln_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrln_w_d(_1, _2); } -// CHECK-LABEL: 
@xvssrln_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrln_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrln_bu_h(_1, _2); } -// CHECK-LABEL: @xvssrln_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrln_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrln_hu_w(_1, _2); } -// CHECK-LABEL: @xvssrln_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrln_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrln_wu_d(_1, _2); } -// CHECK-LABEL: @xvsrlrn_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrn_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlrn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlrn_b_h(_1, _2); } -// CHECK-LABEL: @xvsrlrn_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrn_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v16i16 xvsrlrn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlrn_h_w(_1, _2); } -// CHECK-LABEL: @xvsrlrn_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrn_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlrn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlrn_w_d(_1, _2); } -// CHECK-LABEL: @xvssrlrn_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, 
ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrlrn_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrlrn_bu_h(_1, _2); } -// CHECK-LABEL: @xvssrlrn_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrlrn_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrlrn_hu_w(_1, _2); } -// CHECK-LABEL: @xvssrlrn_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrlrn_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrlrn_wu_d(_1, _2); } -// CHECK-LABEL: @xvfrstpi_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrstpi_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], 
ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvfrstpi_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvfrstpi_b(_1, _2, 1); } -// CHECK-LABEL: @xvfrstpi_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrstpi_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvfrstpi_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvfrstpi_h(_1, _2, 1); } -// CHECK-LABEL: @xvfrstp_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrstp_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvfrstp_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvfrstp_b(_1, _2, _3); } -// CHECK-LABEL: @xvfrstp_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrstp_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr 
[[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvfrstp_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvfrstp_h(_1, _2, _3); } -// CHECK-LABEL: @xvshuf4i_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvshuf4i_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvshuf4i_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvshuf4i_d(_1, _2, 1); } -// CHECK-LABEL: @xvbsrl_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbsrl_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvbsrl_v(v32i8 _1) { return __builtin_lasx_xvbsrl_v(_1, 1); } -// CHECK-LABEL: @xvbsll_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvbsll_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvbsll_v(v32i8 _1) { return __builtin_lasx_xvbsll_v(_1, 1); } -// CHECK-LABEL: @xvextrins_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextrins_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 
32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvextrins_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvextrins_b(_1, _2, 1); } -// CHECK-LABEL: @xvextrins_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextrins_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvextrins_h(v16i16 
_1, v16i16 _2) { return __builtin_lasx_xvextrins_h(_1, _2, 1); } -// CHECK-LABEL: @xvextrins_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextrins_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvextrins_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvextrins_w(_1, _2, 1); } -// CHECK-LABEL: @xvextrins_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextrins_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvextrins_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvextrins_d(_1, _2, 1); } -// CHECK-LABEL: @xvmskltz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskltz_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmskltz_b(v32i8 _1) { return __builtin_lasx_xvmskltz_b(_1); } -// CHECK-LABEL: @xvmskltz_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskltz_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 
32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmskltz_h(v16i16 _1) { return __builtin_lasx_xvmskltz_h(_1); } -// CHECK-LABEL: @xvmskltz_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskltz_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmskltz_w(v8i32 _1) { return __builtin_lasx_xvmskltz_w(_1); } -// CHECK-LABEL: @xvmskltz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskltz_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> 
[[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmskltz_d(v4i64 _1) { return __builtin_lasx_xvmskltz_d(_1); } -// CHECK-LABEL: @xvsigncov_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsigncov_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsigncov_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsigncov_b(_1, _2); } -// CHECK-LABEL: @xvsigncov_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsigncov_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsigncov_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsigncov_h(_1, _2); } -// CHECK-LABEL: @xvsigncov_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsigncov_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsigncov_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsigncov_w(_1, _2); } -// CHECK-LABEL: @xvsigncov_d( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsigncov_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsigncov_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsigncov_d(_1, _2); } -// CHECK-LABEL: @xvfmadd_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmadd_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x 
float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfmadd_s(_1, _2, _3); } -// CHECK-LABEL: @xvfmadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfmadd_d(_1, _2, _3); } -// CHECK-LABEL: @xvfmsub_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmsub_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfmsub_s(_1, _2, _3); } -// CHECK-LABEL: @xvfmsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x 
double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfmsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfmsub_d(_1, _2, _3); } -// CHECK-LABEL: @xvfnmadd_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfnmadd_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfnmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfnmadd_s(_1, _2, _3); } -// CHECK-LABEL: @xvfnmadd_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfnmadd_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x 
double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfnmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfnmadd_d(_1, _2, _3); } -// CHECK-LABEL: @xvfnmsub_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfnmsub_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvfnmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfnmsub_s(_1, _2, _3); } -// CHECK-LABEL: @xvfnmsub_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfnmsub_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvfnmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfnmsub_d(_1, _2, _3); } -// CHECK-LABEL: @xvftintrne_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrne_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> 
[[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrne_w_s(v8f32 _1) { return __builtin_lasx_xvftintrne_w_s(_1); } -// CHECK-LABEL: @xvftintrne_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrne_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrne_l_d(v4f64 _1) { return __builtin_lasx_xvftintrne_l_d(_1); } -// CHECK-LABEL: @xvftintrp_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrp_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrp_w_s(v8f32 _1) { return __builtin_lasx_xvftintrp_w_s(_1); } -// CHECK-LABEL: @xvftintrp_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrp_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrp_l_d(v4f64 _1) { return __builtin_lasx_xvftintrp_l_d(_1); } -// CHECK-LABEL: @xvftintrm_w_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrm_w_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrm_w_s(v8f32 _1) { return __builtin_lasx_xvftintrm_w_s(_1); } -// CHECK-LABEL: @xvftintrm_l_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrm_l_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrm_l_d(v4f64 _1) { return __builtin_lasx_xvftintrm_l_d(_1); } -// CHECK-LABEL: @xvftint_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftint_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftint_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftint_w_d(_1, _2); } -// CHECK-LABEL: @xvffint_s_l( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffint_s_l( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvffint_s_l(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvffint_s_l(_1, _2); } -// CHECK-LABEL: @xvftintrz_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrz_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrz_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrz_w_d(_1, _2); } -// CHECK-LABEL: @xvftintrp_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrp_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void 
// v8i32 xvftintrp_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrp_w_d(_1, _2); } -// CHECK-LABEL: @xvftintrm_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrm_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrm_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrm_w_d(_1, _2); } -// CHECK-LABEL: @xvftintrne_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrne_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = 
load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvftintrne_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrne_w_d(_1, _2); } -// CHECK-LABEL: @xvftinth_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftinth_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftinth_l_s(v8f32 _1) { return __builtin_lasx_xvftinth_l_s(_1); } -// CHECK-LABEL: @xvftintl_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintl_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintl_l_s(v8f32 _1) { return __builtin_lasx_xvftintl_l_s(_1); } -// CHECK-LABEL: @xvffinth_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffinth_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvffinth_d_w(v8i32 _1) { return __builtin_lasx_xvffinth_d_w(_1); } -// CHECK-LABEL: @xvffintl_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvffintl_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvffintl_d_w(v8i32 _1) { return __builtin_lasx_xvffintl_d_w(_1); } -// CHECK-LABEL: @xvftintrzh_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrzh_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrzh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrzh_l_s(_1); } -// CHECK-LABEL: @xvftintrzl_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrzl_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrzl_l_s(v8f32 _1) { return __builtin_lasx_xvftintrzl_l_s(_1); } -// CHECK-LABEL: @xvftintrph_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrph_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrph_l_s(v8f32 _1) { return __builtin_lasx_xvftintrph_l_s(_1); } -// CHECK-LABEL: @xvftintrpl_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrpl_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store 
<4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrpl_l_s(v8f32 _1) { return __builtin_lasx_xvftintrpl_l_s(_1); } -// CHECK-LABEL: @xvftintrmh_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrmh_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrmh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrmh_l_s(_1); } -// CHECK-LABEL: @xvftintrml_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrml_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrml_l_s(v8f32 _1) { return __builtin_lasx_xvftintrml_l_s(_1); } -// CHECK-LABEL: @xvftintrneh_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrneh_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrneh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrneh_l_s(_1); } -// CHECK-LABEL: @xvftintrnel_l_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvftintrnel_l_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvftintrnel_l_s(v8f32 _1) { return __builtin_lasx_xvftintrnel_l_s(_1); } -// CHECK-LABEL: @xvfrintrne_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrne_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfrintrne_s(v8f32 _1) { return __builtin_lasx_xvfrintrne_s(_1); } -// CHECK-LABEL: @xvfrintrne_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrne_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfrintrne_d(v4f64 
_1) { return __builtin_lasx_xvfrintrne_d(_1); } -// CHECK-LABEL: @xvfrintrz_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrz_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfrintrz_s(v8f32 _1) { return __builtin_lasx_xvfrintrz_s(_1); } -// CHECK-LABEL: @xvfrintrz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrz_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfrintrz_d(v4f64 _1) { return __builtin_lasx_xvfrintrz_d(_1); } -// CHECK-LABEL: @xvfrintrp_s( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrp_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfrintrp_s(v8f32 _1) { return __builtin_lasx_xvfrintrp_s(_1); } -// CHECK-LABEL: @xvfrintrp_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrp_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfrintrp_d(v4f64 _1) { return __builtin_lasx_xvfrintrp_d(_1); } -// CHECK-LABEL: @xvfrintrm_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrm_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfrintrm_s(v8f32 _1) { return __builtin_lasx_xvfrintrm_s(_1); } -// CHECK-LABEL: @xvfrintrm_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfrintrm_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfrintrm_d(v4f64 _1) { return __builtin_lasx_xvfrintrm_d(_1); } -// CHECK-LABEL: @xvld( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], 
align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvld( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvld(void *_1) { return __builtin_lasx_xvld(_1, 1); } -// CHECK-LABEL: @xvst( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1) +// CHECK-LABEL: define dso_local void @xvst( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2]], i32 1) // CHECK-NEXT: ret void // void xvst(v32i8 _1, void *_2) { return __builtin_lasx_xvst(_1, _2, 1); } -// CHECK-LABEL: @xvstelm_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1, i32 1) +// CHECK-LABEL: define dso_local void @xvstelm_b( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2]], i32 1, i32 1) // 
CHECK-NEXT: ret void // void xvstelm_b(v32i8 _1, void * _2) { return __builtin_lasx_xvstelm_b(_1, _2, 1, 1); } -// CHECK-LABEL: @xvstelm_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2:%.*]], i32 2, i32 1) +// CHECK-LABEL: define dso_local void @xvstelm_h( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2]], i32 2, i32 1) // CHECK-NEXT: ret void // void xvstelm_h(v16i16 _1, void * _2) { return __builtin_lasx_xvstelm_h(_1, _2, 2, 1); } -// CHECK-LABEL: @xvstelm_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2:%.*]], i32 4, i32 1) +// CHECK-LABEL: define dso_local void @xvstelm_w( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2]], i32 4, i32 1) // CHECK-NEXT: ret void // void xvstelm_w(v8i32 _1, void * _2) { return __builtin_lasx_xvstelm_w(_1, _2, 4, 1); } -// CHECK-LABEL: @xvstelm_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2:%.*]], i32 8, i32 1) +// CHECK-LABEL: define dso_local void @xvstelm_d( +// CHECK-SAME: ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2]], i32 8, i32 1) // CHECK-NEXT: ret void // void xvstelm_d(v4i64 _1, void * _2) { return __builtin_lasx_xvstelm_d(_1, _2, 8, 1); } -// CHECK-LABEL: @xvinsve0_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvinsve0_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvinsve0_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvinsve0_w(_1, _2, 1); } -// CHECK-LABEL: @xvinsve0_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvinsve0_d( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvinsve0_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvinsve0_d(_1, _2, 1); } -// CHECK-LABEL: @xvpickve_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickve_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvpickve_w(v8i32 _1) { return __builtin_lasx_xvpickve_w(_1, 1); } -// CHECK-LABEL: @xvpickve_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvpickve_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpickve_d(v4i64 _1) { return __builtin_lasx_xvpickve_d(_1, 1); } -// CHECK-LABEL: @xvssrlrn_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrlrn_b_h(v16i16 _1, v16i16 _2) { return 
__builtin_lasx_xvssrlrn_b_h(_1, _2); } -// CHECK-LABEL: @xvssrlrn_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrlrn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrn_h_w(_1, _2); } -// CHECK-LABEL: @xvssrlrn_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrn_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrlrn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrn_w_d(_1, _2); } -// CHECK-LABEL: @xvssrln_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrln_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrln_b_h(_1, _2); } -// CHECK-LABEL: @xvssrln_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvssrln_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrln_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrln_h_w(_1, _2); } -// CHECK-LABEL: @xvssrln_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrln_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> 
[[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrln_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrln_w_d(_1, _2); } -// CHECK-LABEL: @xvorn_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvorn_v( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvorn_v(_1, _2); } -// CHECK-LABEL: @xvldi( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvldi( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvldi() { return 
__builtin_lasx_xvldi(1); } -// CHECK-LABEL: @xvldx( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1:%.*]], i64 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldx( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1]], i64 1) +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvldx(void *_1) { return __builtin_lasx_xvldx(_1, 1); } -// CHECK-LABEL: @xvstx( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1]], ptr [[_2:%.*]], i64 1) +// CHECK-LABEL: define dso_local void @xvstx( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1]], ptr [[_2]], i64 1) // CHECK-NEXT: ret void // void xvstx(v32i8 _1, void *_2) { return __builtin_lasx_xvstx(_1, _2, 1); } -// CHECK-LABEL: @xvextl_qu_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextl_qu_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { 
+// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvextl_qu_du(v4u64 _1) { return __builtin_lasx_xvextl_qu_du(_1); } -// CHECK-LABEL: @xvinsgr2vr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvinsgr2vr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1]], i32 1, i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvinsgr2vr_w(v8i32 _1) { return __builtin_lasx_xvinsgr2vr_w(_1, 1, 1); } -// CHECK-LABEL: @xvinsgr2vr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvinsgr2vr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1]], i64 1, i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvinsgr2vr_d(v4i64 _1) { return __builtin_lasx_xvinsgr2vr_d(_1, 1, 1); } -// CHECK-LABEL: @xvreplve0_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvreplve0_b(v32i8 _1) { return __builtin_lasx_xvreplve0_b(_1); } -// CHECK-LABEL: @xvreplve0_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvreplve0_h(v16i16 _1) { return __builtin_lasx_xvreplve0_h(_1); } -// CHECK-LABEL: @xvreplve0_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvreplve0_w(v8i32 _1) { return __builtin_lasx_xvreplve0_w(_1); } -// CHECK-LABEL: @xvreplve0_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvreplve0_d(v4i64 _1) { return __builtin_lasx_xvreplve0_d(_1); } -// CHECK-LABEL: @xvreplve0_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvreplve0_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvreplve0_q(v32i8 _1) { return __builtin_lasx_xvreplve0_q(_1); } -// CHECK-LABEL: @vext2xv_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 
32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 vext2xv_h_b(v32i8 _1) { return __builtin_lasx_vext2xv_h_b(_1); } -// CHECK-LABEL: @vext2xv_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 vext2xv_w_h(v16i16 _1) { return __builtin_lasx_vext2xv_w_h(_1); } -// CHECK-LABEL: @vext2xv_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_d_w(v8i32 _1) { return 
__builtin_lasx_vext2xv_d_w(_1); } -// CHECK-LABEL: @vext2xv_w_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_w_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 vext2xv_w_b(v32i8 _1) { return __builtin_lasx_vext2xv_w_b(_1); } -// CHECK-LABEL: @vext2xv_d_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_d_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_d_h(v16i16 _1) { return __builtin_lasx_vext2xv_d_h(_1); } -// CHECK-LABEL: @vext2xv_d_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] 
= load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_d_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_d_b(v32i8 _1) { return __builtin_lasx_vext2xv_d_b(_1); } -// CHECK-LABEL: @vext2xv_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 vext2xv_hu_bu(v32i8 _1) { return __builtin_lasx_vext2xv_hu_bu(_1); } -// CHECK-LABEL: @vext2xv_wu_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@vext2xv_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 vext2xv_wu_hu(v16i16 _1) { return __builtin_lasx_vext2xv_wu_hu(_1); } -// CHECK-LABEL: @vext2xv_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_wu(v8i32 _1) { return __builtin_lasx_vext2xv_du_wu(_1); } -// CHECK-LABEL: @vext2xv_wu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_wu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) 
align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 vext2xv_wu_bu(v32i8 _1) { return __builtin_lasx_vext2xv_wu_bu(_1); } -// CHECK-LABEL: @vext2xv_du_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_du_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_hu(v16i16 _1) { return __builtin_lasx_vext2xv_du_hu(_1); } -// CHECK-LABEL: @vext2xv_du_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @vext2xv_du_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef 
readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_bu(v32i8 _1) { return __builtin_lasx_vext2xv_du_bu(_1); } -// CHECK-LABEL: @xvpermi_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpermi_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvpermi_q(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpermi_q(_1, _2, 1); } -// CHECK-LABEL: @xvpermi_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpermi_d( +// 
CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvpermi_d(v4i64 _1) { return __builtin_lasx_xvpermi_d(_1, 1); } -// CHECK-LABEL: @xvperm_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvperm_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvperm_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvperm_w(_1, _2); } -// CHECK-LABEL: @xvldrepl_b( -// CHECK-NEXT: entry: 
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1:%.*]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldrepl_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1]], i32 1) +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvldrepl_b(void *_1) { return __builtin_lasx_xvldrepl_b(_1, 1); } -// CHECK-LABEL: @xvldrepl_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1:%.*]], i32 2) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldrepl_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1]], i32 2) +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvldrepl_h(void *_1) { return __builtin_lasx_xvldrepl_h(_1, 2); } -// CHECK-LABEL: @xvldrepl_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1:%.*]], i32 4) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldrepl_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1]], i32 4) +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvldrepl_w(void *_1) { return __builtin_lasx_xvldrepl_w(_1, 4); } -// CHECK-LABEL: @xvldrepl_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1:%.*]], i32 8) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvldrepl_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1]], i32 8) +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvldrepl_d(void *_1) { return __builtin_lasx_xvldrepl_d(_1, 8); } -// CHECK-LABEL: @xvpickve2gr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xvpickve2gr_w( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1]], i32 1) // CHECK-NEXT: ret i32 [[TMP1]] // int xvpickve2gr_w(v8i32 _1) { return __builtin_lasx_xvpickve2gr_w(_1, 1); } -// CHECK-LABEL: @xvpickve2gr_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xvpickve2gr_wu( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1]], i32 1) // CHECK-NEXT: ret i32 [[TMP1]] // unsigned int xvpickve2gr_wu(v8i32 _1) { return __builtin_lasx_xvpickve2gr_wu(_1, 1); } -// CHECK-LABEL: @xvpickve2gr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local i64 @xvpickve2gr_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1]], i32 1) // CHECK-NEXT: ret i64 [[TMP1]] // long xvpickve2gr_d(v4i64 _1) { return __builtin_lasx_xvpickve2gr_d(_1, 1); } -// CHECK-LABEL: @xvpickve2gr_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local i64 @xvpickve2gr_du( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1]], i32 1) // CHECK-NEXT: ret i64 [[TMP1]] // unsigned long int xvpickve2gr_du(v4i64 _1) { return __builtin_lasx_xvpickve2gr_du(_1, 1); } -// CHECK-LABEL: @xvaddwev_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load 
<4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvaddwev_q_d(_1, _2); } -// CHECK-LABEL: @xvaddwev_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvaddwev_d_w(_1, _2); } -// CHECK-LABEL: @xvaddwev_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvaddwev_w_h(_1, _2); } -// CHECK-LABEL: @xvaddwev_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvaddwev_h_b(_1, _2); } -// CHECK-LABEL: @xvaddwev_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_du(v4u64 
_1, v4u64 _2) { return __builtin_lasx_xvaddwev_q_du(_1, _2); } -// CHECK-LABEL: @xvaddwev_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvaddwev_d_wu(_1, _2); } -// CHECK-LABEL: @xvaddwev_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvaddwev_w_hu(_1, _2); } -// CHECK-LABEL: @xvaddwev_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvaddwev_h_bu(_1, _2); } -// CHECK-LABEL: @xvsubwev_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvsubwev_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsubwev_q_d(_1, _2); } -// CHECK-LABEL: @xvsubwev_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] 
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsubwev_d_w(_1, _2); } -// CHECK-LABEL: @xvsubwev_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsubwev_w_h(_1, _2); } -// CHECK-LABEL: @xvsubwev_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsubwev_h_b(_1, _2); } -// CHECK-LABEL: @xvsubwev_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsubwev_q_du(_1, _2); } -// CHECK-LABEL: @xvsubwev_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsubwev_d_wu(_1, _2); } -// CHECK-LABEL: @xvsubwev_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsubwev_w_hu(_1, _2); } -// CHECK-LABEL: @xvsubwev_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwev_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsubwev_h_bu(_1, _2); } -// CHECK-LABEL: @xvmulwev_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmulwev_q_d(_1, _2); } -// CHECK-LABEL: @xvmulwev_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_w(v8i32 _1, 
v8i32 _2) { return __builtin_lasx_xvmulwev_d_w(_1, _2); } -// CHECK-LABEL: @xvmulwev_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmulwev_w_h(_1, _2); } -// CHECK-LABEL: @xvmulwev_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] 
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmulwev_h_b(_1, _2); } -// CHECK-LABEL: @xvmulwev_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmulwev_q_du(_1, _2); } -// CHECK-LABEL: @xvmulwev_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define 
dso_local void @xvmulwev_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmulwev_d_wu(_1, _2); } -// CHECK-LABEL: @xvmulwev_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmulwev_w_hu(_1, _2); } -// CHECK-LABEL: @xvmulwev_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmulwev_h_bu(_1, _2); } -// CHECK-LABEL: @xvaddwod_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvaddwod_q_d(_1, _2); } -// CHECK-LABEL: @xvaddwod_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvaddwod_d_w(_1, _2); } -// CHECK-LABEL: @xvaddwod_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvaddwod_w_h(_1, _2); } -// CHECK-LABEL: @xvaddwod_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> 
@llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvaddwod_h_b(_1, _2); } -// CHECK-LABEL: @xvaddwod_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvaddwod_q_du(_1, _2); } -// CHECK-LABEL: @xvaddwod_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvaddwod_d_wu(_1, _2); } -// CHECK-LABEL: @xvaddwod_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 
xvaddwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvaddwod_w_hu(_1, _2); } -// CHECK-LABEL: @xvaddwod_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvaddwod_h_bu(_1, _2); } -// CHECK-LABEL: @xvsubwod_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], 
align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsubwod_q_d(_1, _2); } -// CHECK-LABEL: @xvsubwod_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsubwod_d_w(_1, _2); } -// CHECK-LABEL: @xvsubwod_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-LABEL: define dso_local void @xvsubwod_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsubwod_w_h(_1, _2); } -// CHECK-LABEL: @xvsubwod_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsubwod_h_b(_1, _2); } -// CHECK-LABEL: @xvsubwod_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsubwod_q_du(_1, _2); } -// CHECK-LABEL: @xvsubwod_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsubwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsubwod_d_wu(_1, _2); } -// CHECK-LABEL: @xvsubwod_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsubwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsubwod_w_hu(_1, _2); } -// CHECK-LABEL: @xvsubwod_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = 
load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsubwod_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsubwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsubwod_h_bu(_1, _2); } -// CHECK-LABEL: @xvmulwod_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmulwod_q_d(_1, _2); } -// CHECK-LABEL: @xvmulwod_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmulwod_d_w(_1, _2); } -// CHECK-LABEL: @xvmulwod_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 
32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmulwod_w_h(_1, _2); } -// CHECK-LABEL: @xvmulwod_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwod_h_b(v32i8 _1, v32i8 _2) 
{ return __builtin_lasx_xvmulwod_h_b(_1, _2); } -// CHECK-LABEL: @xvmulwod_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmulwod_q_du(_1, _2); } -// CHECK-LABEL: @xvmulwod_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmulwod_d_wu(_1, _2); } -// CHECK-LABEL: @xvmulwod_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmulwod_w_hu(_1, _2); } -// CHECK-LABEL: @xvmulwod_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define 
dso_local void @xvmulwod_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmulwod_h_bu(_1, _2); } -// CHECK-LABEL: @xvaddwev_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// 
CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvaddwev_d_wu_w(_1, _2); } -// CHECK-LABEL: @xvaddwev_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvaddwev_w_hu_h(_1, _2); } -// CHECK-LABEL: @xvaddwev_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvaddwev_h_bu_b(_1, _2); } -// CHECK-LABEL: @xvmulwev_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvmulwev_d_wu_w(_1, _2); } -// CHECK-LABEL: @xvmulwev_w_hu_h( -// CHECK-NEXT: entry: 
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvmulwev_w_hu_h(_1, _2); } -// CHECK-LABEL: @xvmulwev_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvmulwev_h_bu_b(_1, _2); } -// CHECK-LABEL: @xvaddwod_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvaddwod_d_wu_w(_1, _2); } -// CHECK-LABEL: @xvaddwod_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvaddwod_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvaddwod_w_hu_h(_1, _2); } -// CHECK-LABEL: @xvaddwod_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvaddwod_h_bu_b(_1, _2); } -// CHECK-LABEL: @xvmulwod_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvmulwod_d_wu_w(_1, _2); } -// CHECK-LABEL: @xvmulwod_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmulwod_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvmulwod_w_hu_h(_1, _2); } -// CHECK-LABEL: @xvmulwod_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmulwod_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvmulwod_h_bu_b(_1, _2); } -// CHECK-LABEL: @xvhaddw_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, 
ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhaddw_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvhaddw_q_d(_1, _2); } -// CHECK-LABEL: @xvhaddw_qu_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhaddw_qu_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvhaddw_qu_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvhaddw_qu_du(_1, _2); } -// CHECK-LABEL: @xvhsubw_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvhsubw_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvhsubw_q_d(_1, _2); } -// CHECK-LABEL: @xvhsubw_qu_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvhsubw_qu_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvhsubw_qu_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvhsubw_qu_du(_1, _2); } -// CHECK-LABEL: @xvmaddwev_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> 
[[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwev_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwev_q_d(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwev_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwev_d_w(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = 
load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaddwev_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwev_w_h(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaddwev_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwev_h_b(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> 
[[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaddwev_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __builtin_lasx_xvmaddwev_q_du(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaddwev_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __builtin_lasx_xvmaddwev_d_wu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_w_hu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, 
!tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmaddwev_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __builtin_lasx_xvmaddwev_w_hu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmaddwev_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvmaddwev_h_bu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x 
i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwod_q_d(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwod_d_w(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = 
load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaddwod_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwod_w_h(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaddwod_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwod_h_b(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_q_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_q_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaddwod_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __builtin_lasx_xvmaddwod_q_du(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_d_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_d_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvmaddwod_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __builtin_lasx_xvmaddwod_d_wu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_w_hu( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_w_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvmaddwod_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __builtin_lasx_xvmaddwod_w_hu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_h_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_h_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvmaddwod_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvmaddwod_h_bu(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwev_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwev_q_du_d(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwev_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { 
return __builtin_lasx_xvmaddwev_d_wu_w(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaddwev_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwev_w_hu_h(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwev_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwev_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaddwev_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwev_h_bu_b(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwod_q_du_d(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_d_wu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_d_wu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwod_d_wu_w(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_w_hu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_w_hu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvmaddwod_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwod_w_hu_h(_1, _2, _3); } -// CHECK-LABEL: @xvmaddwod_h_bu_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmaddwod_h_bu_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvmaddwod_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwod_h_bu_b(_1, _2, _3); } -// CHECK-LABEL: @xvrotr_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotr_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvrotr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvrotr_b(_1, _2); } -// CHECK-LABEL: @xvrotr_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotr_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvrotr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvrotr_h(_1, _2); } -// CHECK-LABEL: @xvrotr_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotr_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 
32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvrotr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvrotr_w(_1, _2); } -// CHECK-LABEL: @xvrotr_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotr_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvrotr_d(v4i64 _1, v4i64 _2) { 
return __builtin_lasx_xvrotr_d(_1, _2); } -// CHECK-LABEL: @xvadd_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvadd_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvadd_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadd_q(_1, _2); } -// CHECK-LABEL: @xvsub_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsub_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], 
align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsub_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsub_q(_1, _2); } -// CHECK-LABEL: @xvaddwev_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwev_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvaddwev_q_du_d(_1, _2); } -// CHECK-LABEL: @xvaddwod_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvaddwod_q_du_d( +// CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvaddwod_q_du_d(_1, _2); } -// CHECK-LABEL: @xvmulwev_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwev_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 
32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvmulwev_q_du_d(_1, _2); } -// CHECK-LABEL: @xvmulwod_q_du_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmulwod_q_du_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvmulwod_q_du_d(_1, _2); } -// CHECK-LABEL: @xvmskgez_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmskgez_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmskgez_b(v32i8 _1) { return __builtin_lasx_xvmskgez_b(_1); } -// CHECK-LABEL: @xvmsknz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvmsknz_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvmsknz_b(v32i8 _1) { return __builtin_lasx_xvmsknz_b(_1); } -// CHECK-LABEL: @xvexth_h_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_h_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvexth_h_b(v32i8 _1) { return __builtin_lasx_xvexth_h_b(_1); } -// CHECK-LABEL: @xvexth_w_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_w_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvexth_w_h(v16i16 _1) { return __builtin_lasx_xvexth_w_h(_1); } -// CHECK-LABEL: @xvexth_d_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_d_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret 
void // v4i64 xvexth_d_w(v8i32 _1) { return __builtin_lasx_xvexth_d_w(_1); } -// CHECK-LABEL: @xvexth_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvexth_q_d(v4i64 _1) { return __builtin_lasx_xvexth_q_d(_1); } -// CHECK-LABEL: @xvexth_hu_bu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_hu_bu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvexth_hu_bu(v32u8 _1) { return __builtin_lasx_xvexth_hu_bu(_1); } -// CHECK-LABEL: @xvexth_wu_hu( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_wu_hu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvexth_wu_hu(v16u16 _1) { return __builtin_lasx_xvexth_wu_hu(_1); } -// CHECK-LABEL: @xvexth_du_wu( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_du_wu( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvexth_du_wu(v8u32 _1) { return __builtin_lasx_xvexth_du_wu(_1); } -// CHECK-LABEL: @xvexth_qu_du( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @xvexth_qu_du( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvexth_qu_du(v4u64 _1) { return __builtin_lasx_xvexth_qu_du(_1); } -// CHECK-LABEL: @xvrotri_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotri_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvrotri_b(v32i8 _1) { return __builtin_lasx_xvrotri_b(_1, 1); } -// CHECK-LABEL: @xvrotri_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotri_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvrotri_h(v16i16 _1) { return __builtin_lasx_xvrotri_h(_1, 1); } -// CHECK-LABEL: @xvrotri_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotri_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvrotri_w(v8i32 _1) { return __builtin_lasx_xvrotri_w(_1, 1); } -// CHECK-LABEL: @xvrotri_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvrotri_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvrotri_d(v4i64 _1) { return __builtin_lasx_xvrotri_d(_1, 1); } -// CHECK-LABEL: @xvextl_q_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvextl_q_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvextl_q_d(v4i64 _1) { return __builtin_lasx_xvextl_q_d(_1); } -// CHECK-LABEL: @xvsrlni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvsrlni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrlni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlni_h_w(_1, _2, 1); } -// 
CHECK-LABEL: @xvsrlni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvsrlni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 
32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrlni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvsrlrni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrlrni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlrni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvsrlrni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrni_h_w( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrlrni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlrni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvsrlrni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], 
ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrlrni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlrni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvsrlrni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrlrni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrlrni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlrni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrlni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrlni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrlni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], 
align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrlni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssrlni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrlni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrlni_bu_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrlni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrlni_hu_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v8u32 xvssrlni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrlni_wu_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrlni_du_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlni_du_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssrlni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrlni_du_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrlrni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrlrni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrlrni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// 
CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrlrni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1]], <4 x 
i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssrlrni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrlrni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrlrni_bu_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrlrni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrni_hu_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 
xvssrlrni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrni_wu_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrlrni_du_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrlrni_du_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssrlrni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrni_du_q(_1, _2, 1); } -// CHECK-LABEL: @xvsrani_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrani_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrani_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrani_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvsrani_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrani_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrani_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrani_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvsrani_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr 
[[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrani_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrani_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrani_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvsrani_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrani_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> 
[[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrani_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrani_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvsrarni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvsrarni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrarni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvsrarni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], 
ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvsrarni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrarni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvsrarni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvsrarni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrarni_w_d(_1, _2, 1); } -// CHECK-LABEL: 
@xvsrarni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvsrarni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvsrarni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrarni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrani_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrani_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrani_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrani_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_w_d( +// CHECK-SAME: 
ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrani_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrani_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvssrani_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrani_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrani_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrani_bu_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { 
+// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrani_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrani_hu_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrani_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrani_wu_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrani_du_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr 
[[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrani_du_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssrani_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrani_du_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_b_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_b_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvssrarni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrarni_b_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_h_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_h_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvssrarni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrarni_h_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_w_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_w_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) 
align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvssrarni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrarni_w_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_d_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_d_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: ret void // v4i64 xvssrarni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrarni_d_q(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_bu_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_bu_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32u8 xvssrarni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrarni_bu_h(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_hu_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_hu_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16u16 xvssrarni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrarni_hu_w(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_wu_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_wu_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8u32 xvssrarni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrarni_wu_d(_1, _2, 1); } -// CHECK-LABEL: @xvssrarni_du_q( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa 
[[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvssrarni_du_q( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4u64 xvssrarni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrarni_du_q(_1, _2, 1); } -// CHECK-LABEL: @xbnz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_b( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_b(v32u8 _1) { return __builtin_lasx_xbnz_b(_1); } -// CHECK-LABEL: @xbnz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) 
local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_d(v4u64 _1) { return __builtin_lasx_xbnz_d(_1); } -// CHECK-LABEL: @xbnz_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_h( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_h(v16u16 _1) { return __builtin_lasx_xbnz_h(_1); } -// CHECK-LABEL: @xbnz_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_v( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_v(v32u8 _1) { return __builtin_lasx_xbnz_v(_1); } -// CHECK-LABEL: @xbnz_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbnz_w( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // 
CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbnz_w(v8u32 _1) { return __builtin_lasx_xbnz_w(_1); } -// CHECK-LABEL: @xbz_b( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_b( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_b(v32u8 _1) { return __builtin_lasx_xbz_b(_1); } -// CHECK-LABEL: @xbz_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_d(v4u64 _1) { return __builtin_lasx_xbz_d(_1); } -// CHECK-LABEL: @xbz_h( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_h( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_h(v16u16 _1) { return __builtin_lasx_xbz_h(_1); } 
-// CHECK-LABEL: @xbz_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_v( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_v(v32u8 _1) { return __builtin_lasx_xbz_v(_1); } -// CHECK-LABEL: @xbz_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local signext i32 @xbz_w( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1]]) // CHECK-NEXT: ret i32 [[TMP1]] // int xbz_w(v8u32 _1) { return __builtin_lasx_xbz_w(_1); } -// CHECK-LABEL: @xvfcmp_caf_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_caf_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 
32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_caf_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_caf_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_caf_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_caf_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_caf_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_caf_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_ceq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_ceq_d( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_ceq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_ceq_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_ceq_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_ceq_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_ceq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_ceq_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cle_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cle_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cle_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cle_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cle_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cle_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cle_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cle_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_clt_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_clt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_clt_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_clt_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_clt_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 
32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_clt_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_clt_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_clt_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cne_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cne_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cne_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cne_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cne_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cne_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cne_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cne_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cor_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cor_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cor_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cor_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cor_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cor_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret 
void // v8i32 xvfcmp_cor_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cor_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cueq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cueq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cueq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cueq_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cueq_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cueq_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = 
load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cueq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cueq_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cule_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cule_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cule_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cule_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cule_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] 
= load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cule_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cule_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cule_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cult_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cult_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1]], <4 x double> 
[[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cult_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cult_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cult_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cult_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cult_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cult_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cun_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cun_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cun_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cun_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cune_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cune_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_cune_d(v4f64 _1, v4f64 _2) { 
return __builtin_lasx_xvfcmp_cune_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_cune_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cune_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cune_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cune_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_cun_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_cun_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_cun_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cun_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_saf_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_saf_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_saf_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_saf_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_saf_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_saf_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_saf_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_saf_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_seq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_seq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr 
[[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_seq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_seq_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_seq_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_seq_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_seq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_seq_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sle_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sle_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr 
dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sle_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sle_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sle_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sle_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sle_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sle_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_slt_d( 
-// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_slt_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_slt_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_slt_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_slt_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_slt_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_slt_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_slt_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sne_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sne_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sne_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sne_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sne_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sne_s( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sne_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sne_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sor_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sor_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sor_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sor_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sor_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sor_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sor_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sor_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sueq_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sueq_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sueq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sueq_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sueq_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sueq_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sueq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sueq_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sule_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], 
align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sule_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sule_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sule_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sule_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sule_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sule_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sule_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sult_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sult_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sult_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sult_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sult_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sult_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sult_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sult_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sun_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sun_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: 
ret void // v4i64 xvfcmp_sun_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sun_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sune_d( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sune_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sune_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sune_d(_1, _2); } -// CHECK-LABEL: @xvfcmp_sune_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sune_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = 
load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sune_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sune_s(_1, _2); } -// CHECK-LABEL: @xvfcmp_sun_s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvfcmp_sun_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sun_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sun_s(_1, _2); } -// CHECK-LABEL: @xvpickve_d_f( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void 
@xvpickve_d_f( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1]], i32 1) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4f64 xvpickve_d_f(v4f64 _1) { return __builtin_lasx_xvpickve_d_f(_1, 1); } -// CHECK-LABEL: @xvpickve_w_f( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @xvpickve_w_f( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1]], i32 1) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8f32 xvpickve_w_f(v8f32 _1) { return __builtin_lasx_xvpickve_w_f(_1, 1); } -// CHECK-LABEL: @xvrepli_b( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvrepli_b( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v32i8 xvrepli_b() { return __builtin_lasx_xvrepli_b(1); } -// CHECK-LABEL: @xvrepli_d( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvrepli_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v4i64 xvrepli_d() { return __builtin_lasx_xvrepli_d(1); } -// CHECK-LABEL: @xvrepli_h( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvrepli_h( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v16i16 xvrepli_h() { return __builtin_lasx_xvrepli_h(1); } -// CHECK-LABEL: @xvrepli_w( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @xvrepli_w( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 
32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __builtin_lasx_xvrepli_w(1); } +//. +// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c index cdbfdd6b7975a..59b71cd355813 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu pwr10 \ // RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-LE // RUN: %clang_cc1 -O3 -triple powerpc64-unknown-unknown -target-cpu pwr10 \ @@ -6,20 +6,23 @@ // RUN: %clang_cc1 -O0 -triple powerpc64le-unknown-unknown -target-cpu pwr10 \ // RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-LE-NOOPT -// CHECK-LE-LABEL: @test1( -// CHECK-LE-NEXT: entry: -// CHECK-LE-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC4:%.*]], <16 x i8> [[VC3:%.*]], <16 x i8> [[VC2:%.*]], <16 x i8> [[VC1:%.*]]) -// CHECK-LE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LE-LABEL: define dso_local void @test1( +// CHECK-LE-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone 
captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], <16 x i8> noundef [[VC3:%.*]], <16 x i8> noundef [[VC4:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-LE-NEXT: [[ENTRY:.*:]] +// CHECK-LE-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC4]], <16 x i8> [[VC3]], <16 x i8> [[VC2]], <16 x i8> [[VC1]]) +// CHECK-LE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2:![0-9]+]] // CHECK-LE-NEXT: ret void // -// CHECK-BE-LABEL: @test1( -// CHECK-BE-NEXT: entry: -// CHECK-BE-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC1:%.*]], <16 x i8> [[VC2:%.*]], <16 x i8> [[VC3:%.*]], <16 x i8> [[VC4:%.*]]) -// CHECK-BE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2:![0-9]+]] +// CHECK-BE-LABEL: define dso_local void @test1( +// CHECK-BE-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], <16 x i8> noundef [[VC3:%.*]], <16 x i8> noundef [[VC4:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-BE-NEXT: [[ENTRY:.*:]] +// CHECK-BE-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC1]], <16 x i8> [[VC2]], <16 x i8> [[VC3]], <16 x i8> [[VC4]]) +// CHECK-BE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2:![0-9]+]] // CHECK-BE-NEXT: ret void // -// CHECK-LE-NOOPT-LABEL: @test1( -// CHECK-LE-NOOPT-NEXT: entry: +// CHECK-LE-NOOPT-LABEL: define dso_local void @test1( +// CHECK-LE-NOOPT-SAME: ptr noundef [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], <16 x i8> noundef [[VC3:%.*]], <16 x i8> noundef [[VC4:%.*]], ptr noundef [[RESP:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK-LE-NOOPT-NEXT: [[ENTRY:.*:]] // CHECK-LE-NOOPT-NEXT: [[VQP_ADDR:%.*]] = alloca ptr, align 8 // CHECK-LE-NOOPT-NEXT: [[VPP_ADDR:%.*]] = alloca ptr, align 8 // CHECK-LE-NOOPT-NEXT: [[VC1_ADDR:%.*]] = alloca <16 x i8>, align 16 @@ -30,13 +33,13 @@ // CHECK-LE-NOOPT-NEXT: [[VQ:%.*]] = alloca <512 x i1>, align 64 // CHECK-LE-NOOPT-NEXT: [[VP:%.*]] = alloca <256 x i1>, align 32 // CHECK-LE-NOOPT-NEXT: [[RES:%.*]] = alloca <512 x i1>, align 64 -// CHECK-LE-NOOPT-NEXT: store ptr [[VQP:%.*]], ptr [[VQP_ADDR]], align 8 -// CHECK-LE-NOOPT-NEXT: store ptr [[VPP:%.*]], ptr [[VPP_ADDR]], align 8 -// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC1:%.*]], ptr [[VC1_ADDR]], align 16 -// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC2:%.*]], ptr [[VC2_ADDR]], align 16 -// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC3:%.*]], ptr [[VC3_ADDR]], align 16 -// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC4:%.*]], ptr [[VC4_ADDR]], align 16 -// CHECK-LE-NOOPT-NEXT: store ptr [[RESP:%.*]], ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store ptr [[VQP]], ptr [[VQP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store ptr [[VPP]], ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC1]], ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC2]], ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC3]], ptr [[VC3_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC4]], ptr [[VC4_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store ptr [[RESP]], ptr [[RESP_ADDR]], align 8 // CHECK-LE-NOOPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VQP_ADDR]], align 8 // CHECK-LE-NOOPT-NEXT: [[TMP1:%.*]] = load <512 x i1>, ptr [[TMP0]], align 64 // CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP1]], ptr [[VQ]], align 64 @@ -63,20 +66,23 @@ void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vec *((__vector_quad *)resp) = res; } -// CHECK-LE-LABEL: @test2( -// CHECK-LE-NEXT: entry: -// CHECK-LE-NEXT: [[TMP0:%.*]] = tail call 
<256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC2:%.*]], <16 x i8> [[VC1:%.*]]) -// CHECK-LE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]] +// CHECK-LE-LABEL: define dso_local void @test2( +// CHECK-LE-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-LE-NEXT: [[ENTRY:.*:]] +// CHECK-LE-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC2]], <16 x i8> [[VC1]]) +// CHECK-LE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]] // CHECK-LE-NEXT: ret void // -// CHECK-BE-LABEL: @test2( -// CHECK-BE-NEXT: entry: -// CHECK-BE-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC1:%.*]], <16 x i8> [[VC2:%.*]]) -// CHECK-BE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]] +// CHECK-BE-LABEL: define dso_local void @test2( +// CHECK-BE-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-BE-NEXT: [[ENTRY:.*:]] +// CHECK-BE-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC1]], <16 x i8> [[VC2]]) +// CHECK-BE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]] // CHECK-BE-NEXT: ret void // -// CHECK-LE-NOOPT-LABEL: @test2( -// CHECK-LE-NOOPT-NEXT: entry: +// CHECK-LE-NOOPT-LABEL: define dso_local void @test2( +// CHECK-LE-NOOPT-SAME: ptr noundef [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], ptr noundef [[RESP:%.*]]) 
#[[ATTR0]] { +// CHECK-LE-NOOPT-NEXT: [[ENTRY:.*:]] // CHECK-LE-NOOPT-NEXT: [[VQP_ADDR:%.*]] = alloca ptr, align 8 // CHECK-LE-NOOPT-NEXT: [[VPP_ADDR:%.*]] = alloca ptr, align 8 // CHECK-LE-NOOPT-NEXT: [[VC1_ADDR:%.*]] = alloca <16 x i8>, align 16 @@ -85,11 +91,11 @@ void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vec // CHECK-LE-NOOPT-NEXT: [[VQ:%.*]] = alloca <512 x i1>, align 64 // CHECK-LE-NOOPT-NEXT: [[VP:%.*]] = alloca <256 x i1>, align 32 // CHECK-LE-NOOPT-NEXT: [[RES:%.*]] = alloca <256 x i1>, align 32 -// CHECK-LE-NOOPT-NEXT: store ptr [[VQP:%.*]], ptr [[VQP_ADDR]], align 8 -// CHECK-LE-NOOPT-NEXT: store ptr [[VPP:%.*]], ptr [[VPP_ADDR]], align 8 -// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC1:%.*]], ptr [[VC1_ADDR]], align 16 -// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC2:%.*]], ptr [[VC2_ADDR]], align 16 -// CHECK-LE-NOOPT-NEXT: store ptr [[RESP:%.*]], ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store ptr [[VQP]], ptr [[VQP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store ptr [[VPP]], ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC1]], ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC2]], ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store ptr [[RESP]], ptr [[RESP_ADDR]], align 8 // CHECK-LE-NOOPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VQP_ADDR]], align 8 // CHECK-LE-NOOPT-NEXT: [[TMP1:%.*]] = load <512 x i1>, ptr [[TMP0]], align 64 // CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP1]], ptr [[VQ]], align 64 @@ -113,3 +119,18 @@ void test2(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, __builtin_vsx_build_pair(&res, vc1, vc2); *((__vector_pair *)resp) = res; } +//. 
+// CHECK-LE: [[__VECTOR_QUAD_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-LE: [[META3]] = !{!"__vector_quad", [[META4:![0-9]+]], i64 0} +// CHECK-LE: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-LE: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK-LE: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-LE: [[META7]] = !{!"__vector_pair", [[META4]], i64 0} +//. +// CHECK-BE: [[__VECTOR_QUAD_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-BE: [[META3]] = !{!"__vector_quad", [[META4:![0-9]+]], i64 0} +// CHECK-BE: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-BE: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK-BE: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-BE: [[META7]] = !{!"__vector_pair", [[META4]], i64 0} +//. diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c index c66f5e2a32919..f62656757c8c5 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c @@ -1,17 +1,26 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu future \ // RUN: -emit-llvm %s -o - | FileCheck %s // RUN: %clang_cc1 -O3 -triple powerpc64-ibm-aix -target-cpu future \ -// RUN: -emit-llvm %s -o - | FileCheck %s +// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=AIX -// CHECK-LABEL: @test_dmxvi8gerx4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6:![0-9]+]] +// CHECK-LABEL: define 
dso_local void @test_dmxvi8gerx4( +// CHECK-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6:![0-9]+]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmxvi8gerx4( +// AIX-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2:![0-9]+]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC]]) +// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6:![0-9]+]] +// AIX-NEXT: ret void +// void test_dmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -19,13 +28,22 @@ void test_dmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigned *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: @test_pmdmxvi8gerx4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) -// CHECK-NEXT: 
store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local void @test_pmdmxvi8gerx4( +// CHECK-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_pmdmxvi8gerx4( +// AIX-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// void test_pmdmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -33,14 +51,24 @@ void test_pmdmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigne *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: @test_dmxvi8gerx4pp( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr 
[[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local void @test_dmxvi8gerx4pp( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmxvi8gerx4pp( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// void test_dmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = 
*((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -48,14 +76,24 @@ void test_dmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigne *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: @test_pmdmxvi8gerx4pp( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local void @test_pmdmxvi8gerx4pp( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_pmdmxvi8gerx4pp( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = load 
<256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// void test_pmdmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -63,14 +101,24 @@ void test_pmdmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsig *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: @test_dmxvi8gerx4spp( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local void @test_dmxvi8gerx4spp( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] // CHECK-NEXT: ret void // +// AIX-LABEL: define 
void @test_dmxvi8gerx4spp( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// void test_dmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -78,14 +126,24 @@ void test_dmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsign *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: @test_pmdmxvi8gerx4spp( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local void @test_pmdmxvi8gerx4spp( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, 
ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_pmdmxvi8gerx4spp( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// void test_pmdmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -93,17 +151,30 @@ void test_pmdmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsi *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: @test_dmf_basic( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @test_dmf_basic( +// CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES1:%.*]], ptr noundef captures(none) [[RES2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> 
@llvm.ppc.mma.dmsetdmrz() // CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]]) -// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES1:%.*]], align 128 -// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[RES2:%.*]], align 128 -// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr [[P:%.*]], align 128 +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES1]], align 128 +// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[RES2]], align 128 +// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr [[P]], align 128 // CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]]) // CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr [[RES2]], align 128 // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmf_basic( +// AIX-SAME: ptr noundef readonly captures(none) [[P:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES1:%.*]], ptr noundef captures(none) [[RES2:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz() +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]]) +// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES1]], align 128 +// AIX-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[RES2]], align 128 +// AIX-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr [[P]], align 128 +// AIX-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]]) +// AIX-NEXT: store <1024 x i1> [[TMP4]], ptr [[RES2]], align 128 +// AIX-NEXT: ret void +// void test_dmf_basic(char *p, char *res1, char *res2) { __dmr1024 x[2]; __builtin_mma_dmsetdmrz(&x[0]); @@ -111,18 +182,46 @@ void test_dmf_basic(char *p, char *res1, char *res2) { __builtin_mma_dmxor((__dmr1024*)res2, (__dmr1024*)p); } -// CHECK-LABEL: @test_dmf_basic2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V:%.*]], align 16, !tbaa 
[[TBAA8:![0-9]+]] +// CHECK-LABEL: define dso_local void @test_dmf_basic2( +// CHECK-SAME: ptr noundef readonly captures(none) [[P1:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES1:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES2:%.*]], ptr noundef readonly captures(none) [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]]) -// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES2:%.*]], align 128 -// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[P1:%.*]], align 128 -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RES1:%.*]], align 128 +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES2]], align 128 +// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[P1]], align 128 +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RES1]], align 128 // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmf_basic2( +// AIX-SAME: ptr noundef readonly captures(none) [[P1:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES1:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES2:%.*]], ptr noundef readonly captures(none) [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]]) +// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES2]], align 128 +// AIX-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[P1]], 
align 128 +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RES1]], align 128 +// AIX-NEXT: ret void +// void test_dmf_basic2(char *p1, char *res1, char *res2, vector unsigned char *v) { vector unsigned char vv = *v; __builtin_mma_build_dmr((__dmr1024*)res2, vv, vv, vv, vv, vv, vv, vv, vv); __builtin_mma_disassemble_dmr(res1, (__dmr1024*)p1); } +//. +// CHECK: [[__VECTOR_PAIR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[__DMR1024_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__dmr1024", [[META4]], i64 0} +// CHECK: [[CHAR_TBAA8]] = !{[[META4]], [[META4]], i64 0} +//. +// AIX: [[__VECTOR_PAIR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// AIX: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} +// AIX: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// AIX: [[META5]] = !{!"Simple C/C++ TBAA"} +// AIX: [[__DMR1024_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// AIX: [[META7]] = !{!"__dmr1024", [[META4]], i64 0} +// AIX: [[CHAR_TBAA8]] = !{[[META4]], [[META4]], i64 0} +//. 
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c index 08ff936a0a797..5c7b222cb618e 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c @@ -1,13 +1,14 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu pwr10 \ // RUN: -emit-llvm %s -o - | FileCheck %s // RUN: %clang_cc1 -O3 -triple powerpc64-unknown-unknown -target-cpu pwr10 \ // RUN: -emit-llvm %s -o - | FileCheck %s -// CHECK-LABEL: @test1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @test1( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC]], <16 x i8> [[VC]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2:![0-9]+]] // CHECK-NEXT: ret void // void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -18,12 +19,13 @@ void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi *((__vector_quad *)resp) = res; } -// CHECK-LABEL: @test2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64 +// CHECK-LABEL: 
define dso_local void @test2( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64 // CHECK-NEXT: [[TMP1:%.*]] = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP1]], 0 -// CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP:%.*]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP]], align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP1]], 1 // CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[RESP]], i64 16 // CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 16 @@ -39,10 +41,11 @@ void test2(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi __builtin_mma_disassemble_acc(resp, (__vector_quad*)vqp); } -// CHECK-LABEL: @test3( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]] +// CHECK-LABEL: define dso_local void @test3( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]] // CHECK-NEXT: ret 
void // void test3(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -53,12 +56,13 @@ void test3(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi *((__vector_pair *)resp) = res; } -// CHECK-LABEL: @test4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32 +// CHECK-LABEL: define dso_local void @test4( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32 // CHECK-NEXT: [[TMP1:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP1]], 0 -// CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP:%.*]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP]], align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP1]], 1 // CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[RESP]], i64 16 // CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 16 @@ -68,11 +72,12 @@ void test4(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi __builtin_vsx_disassemble_pair(resp, (__vector_pair*)vpp); } -// CHECK-LABEL: @test5( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test5( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr 
[[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> [[TMP0]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test5(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -82,11 +87,12 @@ void test5(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test6( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test6( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> [[TMP0]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test6(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -96,10 +102,11 @@ void test6(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test7( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @test7( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) 
[[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test7(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -109,10 +116,11 @@ void test7(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test8( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test8(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -122,10 +130,11 @@ void test8(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test9( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test9( +// CHECK-SAME: ptr noundef readnone captures(none) 
[[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test9(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -135,10 +144,11 @@ void test9(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test10( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test10( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test10(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -148,10 +158,11 @@ void test10(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test11( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] 
+// CHECK-LABEL: define dso_local void @test11( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test11(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -161,10 +172,11 @@ void test11(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test12( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test12( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test12(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -174,10 +186,11 @@ void test12(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test13( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> [[VC:%.*]], <16 
x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test13( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test13(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -187,11 +200,12 @@ void test13(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test14( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test14( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP0]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test14(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char 
*resp) { @@ -201,10 +215,11 @@ void test14(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test15( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test15( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test15(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -214,10 +229,11 @@ void test15(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test16( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 
x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test16(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -227,10 +243,11 @@ void test16(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test17( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test17( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test17(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -240,10 +257,11 @@ void test17(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test18( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test18( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] 
{ +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test18(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -253,10 +271,11 @@ void test18(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test19( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test19( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test19(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -266,10 +285,11 @@ void test19(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test20( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test20( +// CHECK-SAME: ptr noundef readnone captures(none) 
[[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test20(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -279,11 +299,12 @@ void test20(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test21( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test21( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test21(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -293,11 +314,12 @@ void test21(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// 
CHECK-LABEL: @test22( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test22( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test22(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -307,11 +329,12 @@ void test22(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test23( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test23( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test23(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -321,11 +344,12 @@ void test23(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test24( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test24( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test24(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -335,11 +359,12 @@ void test24(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test25( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa 
[[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test25( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test25(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -349,11 +374,12 @@ void test25(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test26( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test26( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x 
i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test26(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -363,11 +389,12 @@ void test26(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test27( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test27( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test27(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -377,11 +404,12 @@ void test27(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test28( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> 
@llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test28( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test28(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -391,11 +419,12 @@ void test28(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test29( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test29( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
<512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test29(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -405,11 +434,12 @@ void test29(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test30( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test30( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test30(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -419,11 +449,12 @@ void test30(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test31( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x 
i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test31( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test31(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -433,11 +464,12 @@ void test31(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test32( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> 
@llvm.ppc.mma.xvf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test32(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -447,11 +479,12 @@ void test32(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test33( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test33( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test33(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -461,11 +494,12 @@ void test33(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test34( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x 
i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test34( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test34(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -475,11 +509,12 @@ void test34(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test35( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test35( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr 
[[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test35(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -489,11 +524,12 @@ void test35(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test36( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test36( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test36(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -503,11 +539,12 @@ void test36(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test37( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr 
[[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test37( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test37(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -517,11 +554,12 @@ void test37(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test38( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test38( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> 
[[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test38(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -531,11 +569,12 @@ void test38(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test39( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test39( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test39(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -545,11 +584,12 @@ void test39(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test40( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], 
align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test40( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test40(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -559,11 +599,12 @@ void test40(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test41( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test41( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // 
void test41(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -573,11 +614,12 @@ void test41(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test42( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test42( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test42(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -587,11 +629,12 @@ void test42(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test43( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test43( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], 
ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test43(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -601,11 +644,12 @@ void test43(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test44( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test44( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test44(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ 
-615,11 +659,12 @@ void test44(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test45( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test45( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test45(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -629,11 +674,12 @@ void test45(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test46( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test46( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x 
i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test46(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -643,11 +689,12 @@ void test46(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test47( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test47( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test47(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -657,12 +704,13 @@ void test47(unsigned char 
*vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test48( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test48( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test48(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -672,12 +720,13 @@ void test48(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test49( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr 
[[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test49( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test49(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -687,12 +736,13 @@ void test49(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test50( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test50( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, 
ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test50(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -702,12 +752,13 @@ void test50(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test51( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test51( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test51(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -717,12 +768,13 @@ void test51(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, 
uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test52( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test52( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test52(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -732,12 +784,13 @@ void test52(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test53( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr 
[[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test53( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test53(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -747,12 +800,13 @@ void test53(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test54( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test54( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: 
[[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test54(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -762,12 +816,13 @@ void test54(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test55( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test55( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test55(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -777,10 +832,11 @@ 
void test55(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test56( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test56( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test56(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -790,10 +846,11 @@ void test56(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test57( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test57( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa 
[[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test57(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -803,11 +860,12 @@ void test57(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test58( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test58( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test58(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -817,11 +875,12 @@ void test58(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test59( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test59( +// 
CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test59(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -831,11 +890,12 @@ void test59(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test60( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test60( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test60(unsigned char *vqp, unsigned char *vpp, vector unsigned char 
vc, unsigned char *resp) { @@ -845,11 +905,12 @@ void test60(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test61( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test61( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test61(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -859,11 +920,12 @@ void test61(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test62( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test62( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) 
[[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test62(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -873,11 +935,12 @@ void test62(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test63( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test63( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test63(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ 
-887,11 +950,12 @@ void test63(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test64( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test64(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -901,11 +965,12 @@ void test64(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test65( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test65( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone 
captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test65(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -915,10 +980,11 @@ void test65(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test66( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP:%.*]]) -// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP0]], ptr [[VP2:%.*]]) +// CHECK-LABEL: define dso_local void @test66( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP0]], ptr [[VP2]]) // CHECK-NEXT: ret void // void test66(const __vector_pair *vpp, __vector_pair *vp2) { @@ -926,11 +992,12 @@ void test66(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_vsx_stxvp(vp, 0L, vp2); } -// CHECK-LABEL: @test67( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 [[OFFSET:%.*]] +// CHECK-LABEL: define dso_local void @test67( +// CHECK-SAME: ptr noundef [[VPP:%.*]], i64 noundef [[OFFSET:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 
[[OFFSET]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 [[OFFSET]] +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 [[OFFSET]] // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -939,11 +1006,12 @@ void test67(const __vector_pair *vpp, signed long offset, __vector_pair *vp2) { __builtin_vsx_stxvp(vp, offset, vp2); } -// CHECK-LABEL: @test68( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 18 +// CHECK-LABEL: define dso_local void @test68( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 18 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 18 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 18 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -952,11 +1020,12 @@ void test68(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_vsx_stxvp(vp, 18L, vp2); } -// CHECK-LABEL: @test69( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 1 +// CHECK-LABEL: define dso_local void @test69( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 1 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 1 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr 
[[TMP2]]) // CHECK-NEXT: ret void // @@ -965,11 +1034,12 @@ void test69(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_vsx_stxvp(vp, 1L, vp2); } -// CHECK-LABEL: @test70( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 42 +// CHECK-LABEL: define dso_local void @test70( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 42 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 42 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 42 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -978,11 +1048,12 @@ void test70(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_vsx_stxvp(vp, 42L, vp2); } -// CHECK-LABEL: @test71( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 32768 +// CHECK-LABEL: define dso_local void @test71( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 32768 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 32768 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 32768 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -991,11 +1062,12 @@ void test71(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_vsx_stxvp(vp, 32768L, vp2); } -// CHECK-LABEL: @test72( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 32799 +// CHECK-LABEL: define dso_local void @test72( +// 
CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 32799 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 32799 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 32799 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -1004,13 +1076,14 @@ void test72(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_vsx_stxvp(vp, 32799L, vp2); } -// CHECK-LABEL: @test73( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 8 +// CHECK-LABEL: define dso_local void @test73( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP]], i64 8 // CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC:%.*]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test73(unsigned char *vqp, const 
__vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1020,12 +1093,13 @@ void test73(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char v *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test74( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP:%.*]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test74( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1035,13 +1109,14 @@ void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char v *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test75( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 [[OFFS:%.*]] +// CHECK-LABEL: define dso_local void @test75( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], i64 
noundef [[OFFS:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP]], i64 [[OFFS]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test75(unsigned char *vqp, signed long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1051,10 +1126,11 @@ void test75(unsigned char *vqp, signed long offs, const __vector_pair *vpp, vect *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test76( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP:%.*]], align 32, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local void @test76( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa 
[[__VECTOR_PAIR_TBAA6]] // CHECK-NEXT: ret void // void test76(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1065,12 +1141,13 @@ void test76(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns *((__vector_pair *)resp) = res; } -// CHECK-LABEL: @test77( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32 +// CHECK-LABEL: define dso_local void @test77( +// CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32 // CHECK-NEXT: [[TMP1:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP1]], 0 -// CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP:%.*]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP]], align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP1]], 1 // CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[RESP]], i64 16 // CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 16 @@ -1080,10 +1157,11 @@ void test77(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns __builtin_mma_disassemble_pair(resp, (__vector_pair*)vpp); } -// CHECK-LABEL: @test78( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP:%.*]]) -// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP0]], ptr [[VP2:%.*]]) +// CHECK-LABEL: define dso_local void @test78( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> 
@llvm.ppc.vsx.lxvp(ptr [[VPP]]) +// CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP0]], ptr [[VP2]]) // CHECK-NEXT: ret void // void test78(const __vector_pair *vpp, __vector_pair *vp2) { @@ -1091,11 +1169,12 @@ void test78(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_mma_stxvp(vp, 0L, vp2); } -// CHECK-LABEL: @test79( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 [[OFFSET:%.*]] +// CHECK-LABEL: define dso_local void @test79( +// CHECK-SAME: ptr noundef [[VPP:%.*]], i64 noundef [[OFFSET:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 [[OFFSET]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 [[OFFSET]] +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 [[OFFSET]] // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -1104,11 +1183,12 @@ void test79(const __vector_pair *vpp, signed long offset, __vector_pair *vp2) { __builtin_mma_stxvp(vp, offset, vp2); } -// CHECK-LABEL: @test80( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 18 +// CHECK-LABEL: define dso_local void @test80( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 18 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 18 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 18 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -1117,11 +1197,12 @@ void test80(const __vector_pair 
*vpp, __vector_pair *vp2) { __builtin_mma_stxvp(vp, 18L, vp2); } -// CHECK-LABEL: @test81( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 1 +// CHECK-LABEL: define dso_local void @test81( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 1 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 1 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -1130,11 +1211,12 @@ void test81(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_mma_stxvp(vp, 1L, vp2); } -// CHECK-LABEL: @test82( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 42 +// CHECK-LABEL: define dso_local void @test82( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 42 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 42 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 42 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -1143,11 +1225,12 @@ void test82(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_mma_stxvp(vp, 42L, vp2); } -// CHECK-LABEL: @test83( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 32768 +// CHECK-LABEL: define dso_local void @test83( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 32768 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 32768 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 32768 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -1156,11 +1239,12 @@ void test83(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_mma_stxvp(vp, 32768L, vp2); } -// CHECK-LABEL: @test84( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 32799 +// CHECK-LABEL: define dso_local void @test84( +// CHECK-SAME: ptr noundef [[VPP:%.*]], ptr noundef [[VP2:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[VPP]], i64 32799 // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2:%.*]], i64 32799 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VP2]], i64 32799 // CHECK-NEXT: tail call void @llvm.ppc.vsx.stxvp(<256 x i1> [[TMP1]], ptr [[TMP2]]) // CHECK-NEXT: ret void // @@ -1169,13 +1253,14 @@ void test84(const __vector_pair *vpp, __vector_pair *vp2) { __builtin_mma_stxvp(vp, 32799L, vp2); } -// CHECK-LABEL: @test85( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 8 +// CHECK-LABEL: define dso_local void @test85( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa 
[[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP]], i64 8 // CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC:%.*]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC]], i32 0, i32 0) +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test85(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1185,12 +1270,13 @@ void test85(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char v *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test86( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP:%.*]]) -// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @test86( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> 
[[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test86(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1200,13 +1286,14 @@ void test86(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char v *((__vector_quad *)resp) = vq; } -// CHECK-LABEL: @test87( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP:%.*]], align 64, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP:%.*]], i64 [[OFFS:%.*]] +// CHECK-LABEL: define dso_local void @test87( +// CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], i64 noundef [[OFFS:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP]], i64 [[OFFS]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] // CHECK-NEXT: ret void // void test87(unsigned char *vqp, signed long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1215,3 +1302,11 @@ void test87(unsigned char *vqp, signed long offs, const __vector_pair *vpp, vect __builtin_mma_xvf64gernp(&vq, vp, vc); *((__vector_quad *)resp) = vq; } 
+//. +// CHECK: [[__VECTOR_QUAD_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"__vector_quad", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__vector_pair", [[META4]], i64 0} +//. diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas-error.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas-error.c index c35c54d6b1858..669ce43e23d16 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas-error.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas-error.c @@ -6,7 +6,7 @@ void test_builtin_ppc_compare_and_swap() { volatile int a = 0; long b = 0, c = 0; - __compare_and_swap(&a, &b, c); // expected-warning {{incompatible pointer types passing 'long *' to parameter of type 'int *'}} + __compare_and_swap(&a, &b, c); // expected-error {{incompatible pointer types passing 'long *' to parameter of type 'int *'}} } @@ -14,6 +14,6 @@ void test_builtin_ppc_compare_and_swaplp() { volatile long a = 0; int b = 0, c = 0; - __compare_and_swaplp(&a, &b, c);// expected-warning {{incompatible pointer types passing 'int *' to parameter of type 'long *'}} + __compare_and_swaplp(&a, &b, c);// expected-error {{incompatible pointer types passing 'int *' to parameter of type 'long *'}} } diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c index 45a099dc9c678..1f0b3d4a560e7 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple 
riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=2 -mvscale-max=2 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128 // REQUIRES: riscv-registered-target @@ -53,10 +53,11 @@ DEFINE_STRUCT(bool64) // bool //===----------------------------------------------------------------------===// -// CHECK-128-LABEL: @read_bool32( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[TBAA6:![0-9]+]] +// CHECK-128-LABEL: define dso_local @read_bool32( +// CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 1 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-128-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[TMP1]], i64 0) @@ -66,23 +67,25 @@ vbool32_t read_bool32(struct struct_bool32 *s) { return s->y[0]; } -// CHECK-128-LABEL: @write_bool32( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv2i1( zeroinitializer, [[X:%.*]], i64 0) +// CHECK-128-LABEL: define dso_local void @write_bool32( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((1, 2)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv2i1( zeroinitializer, [[X]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], 
i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1 -// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 1 +// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // void write_bool32(struct struct_bool32 *s, vbool32_t x) { s->y[0] = x; } -// CHECK-128-LABEL: @read_bool64( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[TBAA6]] +// CHECK-128-LABEL: define dso_local @read_bool64( +// CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 1 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-128-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv1i1.nxv8i1( [[TMP1]], i64 0) @@ -92,15 +95,21 @@ vbool64_t read_bool64(struct struct_bool64 *s) { return s->y[0]; } -// CHECK-128-LABEL: @write_bool64( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv1i1( zeroinitializer, [[X:%.*]], i64 0) +// CHECK-128-LABEL: define dso_local void @write_bool64( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((1, 2)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv1i1( zeroinitializer, [[X]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to // CHECK-128-NEXT: 
[[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1 -// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 1 +// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // void write_bool64(struct struct_bool64 *s, vbool64_t x) { s->y[0] = x; } +//. +// CHECK-128: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-128: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// CHECK-128: [[META8]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c index ecde52eb3d762..b92e6dff31748 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64 // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=2 -mvscale-max=2 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128 // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256 @@ -67,24 +67,27 @@ DEFINE_STRUCT(bool64) // int64 //===----------------------------------------------------------------------===// -// CHECK-64-LABEL: 
@read_int64m1( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 8 -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[Y]], align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-64-LABEL: define dso_local @read_int64m1( +// CHECK-64-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v1i64( poison, <1 x i64> [[TMP0]], i64 0) // CHECK-64-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-128-LABEL: @read_int64m1( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-128-LABEL: define dso_local @read_int64m1( +// CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v2i64( poison, <2 x i64> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-256-LABEL: @read_int64m1( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-256-LABEL: define dso_local @read_int64m1( +// CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: 
[[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v4i64( poison, <4 x i64> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // @@ -92,25 +95,28 @@ vint64m1_t read_int64m1(struct struct_int64m1 *s) { return s->y[0]; } -// CHECK-64-LABEL: @write_int64m1( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64( [[X:%.*]], i64 0) -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 8 -// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-64-LABEL: define dso_local void @write_int64m1( +// CHECK-64-SAME: ptr noundef writeonly captures(none) initializes((8, 16)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64( [[X]], i64 0) +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 +// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: ret void // -// CHECK-128-LABEL: @write_int64m1( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv1i64( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-128-LABEL: define dso_local void @write_int64m1( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((16, 32)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv1i64( [[X]], i64 0) 
+// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // -// CHECK-256-LABEL: @write_int64m1( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-256-LABEL: define dso_local void @write_int64m1( +// CHECK-256-SAME: ptr noundef writeonly captures(none) initializes((32, 64)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64( [[X]], i64 0) +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // void write_int64m1(struct struct_int64m1 *s, vint64m1_t x) { @@ -121,24 +127,27 @@ void write_int64m1(struct struct_int64m1 *s, vint64m1_t x) { // float64 //===----------------------------------------------------------------------===// -// CHECK-64-LABEL: @read_float64m1( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 8 -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-64-LABEL: define dso_local @read_float64m1( +// CHECK-64-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call 
@llvm.vector.insert.nxv1f64.v1f64( poison, <1 x double> [[TMP0]], i64 0) // CHECK-64-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-128-LABEL: @read_float64m1( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-128-LABEL: define dso_local @read_float64m1( +// CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1f64.v2f64( poison, <2 x double> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-256-LABEL: @read_float64m1( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-256-LABEL: define dso_local @read_float64m1( +// CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1f64.v4f64( poison, <4 x double> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // @@ -146,25 +155,28 @@ vfloat64m1_t read_float64m1(struct struct_float64m1 *s) { return s->y[0]; } -// CHECK-64-LABEL: @write_float64m1( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x double> @llvm.vector.extract.v1f64.nxv1f64( [[X:%.*]], i64 0) -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds 
nuw i8, ptr [[S:%.*]], i64 8 -// CHECK-64-NEXT: store <1 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-64-LABEL: define dso_local void @write_float64m1( +// CHECK-64-SAME: ptr noundef writeonly captures(none) initializes((8, 16)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x double> @llvm.vector.extract.v1f64.nxv1f64( [[X]], i64 0) +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 +// CHECK-64-NEXT: store <1 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: ret void // -// CHECK-128-LABEL: @write_float64m1( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x double> @llvm.vector.extract.v2f64.nxv1f64( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-128-LABEL: define dso_local void @write_float64m1( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((16, 32)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x double> @llvm.vector.extract.v2f64.nxv1f64( [[X]], i64 0) +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // -// CHECK-256-LABEL: @write_float64m1( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x double> @llvm.vector.extract.v4f64.nxv1f64( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-256-LABEL: define dso_local void @write_float64m1( +// 
CHECK-256-SAME: ptr noundef writeonly captures(none) initializes((32, 64)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x double> @llvm.vector.extract.v4f64.nxv1f64( [[X]], i64 0) +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) { @@ -175,26 +187,29 @@ void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) { // bool //===----------------------------------------------------------------------===// -// CHECK-64-LABEL: @read_bool1( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 8 -// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-64-LABEL: define dso_local @read_bool1( +// CHECK-64-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) // CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-64-NEXT: ret [[TMP1]] // -// CHECK-128-LABEL: @read_bool1( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-128-LABEL: define dso_local @read_bool1( +// CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw 
i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v16i8( poison, <16 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-128-NEXT: ret [[TMP1]] // -// CHECK-256-LABEL: @read_bool1( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-256-LABEL: define dso_local @read_bool1( +// CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v32i8( poison, <32 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: ret [[TMP1]] @@ -203,30 +218,46 @@ vbool1_t read_bool1(struct struct_bool1 *s) { return s->y[0]; } -// CHECK-64-LABEL: @write_bool1( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to +// CHECK-64-LABEL: define dso_local void @write_bool1( +// CHECK-64-SAME: ptr noundef writeonly captures(none) initializes((8, 16)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv8i8( [[TMP0]], i64 0) -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 8 -// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 +// CHECK-64-NEXT: store <8 x 
i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: ret void // -// CHECK-128-LABEL: @write_bool1( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to +// CHECK-128-LABEL: define dso_local void @write_bool1( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((16, 32)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv8i8( [[TMP0]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: store <16 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: store <16 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // -// CHECK-256-LABEL: @write_bool1( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to +// CHECK-256-LABEL: define dso_local void @write_bool1( +// CHECK-256-SAME: ptr noundef writeonly captures(none) initializes((32, 64)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8( [[TMP0]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA6]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // void write_bool1(struct struct_bool1 *s, vbool1_t x) { s->y[0] = x; } +//. 
+// CHECK-64: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-64: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// CHECK-64: [[META8]] = !{!"Simple C/C++ TBAA"} +//. +// CHECK-128: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-128: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// CHECK-128: [[META8]] = !{!"Simple C/C++ TBAA"} +//. +// CHECK-256: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-256: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// CHECK-256: [[META8]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c index 0a50e41dda7e1..4517b52aefdfd 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s // REQUIRES: riscv-registered-target @@ -31,89 +31,100 @@ typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fi typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); -// CHECK-LABEL: @to_vint32m1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-LABEL: define dso_local noundef @to_vint32m1_t( +// CHECK-SAME: noundef returned [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE_COERCE]] // vint32m1_t to_vint32m1_t(fixed_int32m1_t type) { return type; } -// CHECK-LABEL: 
@from_vint32m1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE:%.*]] +// CHECK-LABEL: define dso_local @from_vint32m1_t( +// CHECK-SAME: returned [[TYPE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE]] // fixed_int32m1_t from_vint32m1_t(vint32m1_t type) { return type; } -// CHECK-LABEL: @to_vfloat64m1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-LABEL: define dso_local noundef @to_vfloat64m1_t( +// CHECK-SAME: noundef returned [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE_COERCE]] // vfloat64m1_t to_vfloat64m1_t(fixed_float64m1_t type) { return type; } -// CHECK-LABEL: @from_vfloat64m1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE:%.*]] +// CHECK-LABEL: define dso_local @from_vfloat64m1_t( +// CHECK-SAME: returned [[TYPE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE]] // fixed_float64m1_t from_vfloat64m1_t(vfloat64m1_t type) { return type; } -// CHECK-LABEL: @from_vbool1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE:%.*]] +// CHECK-LABEL: define dso_local @from_vbool1_t( +// CHECK-SAME: returned [[TYPE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE]] // fixed_bool1_t from_vbool1_t(vbool1_t type) { return type; } -// CHECK-LABEL: @to_vbool1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TMP0:%.*]] +// CHECK-LABEL: define dso_local noundef @to_vbool1_t( +// CHECK-SAME: noundef returned [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TMP0]] // vbool1_t to_vbool1_t(fixed_bool1_t type) { return type; } -// CHECK-LABEL: @from_vbool4_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE:%.*]] +// CHECK-LABEL: define dso_local @from_vbool4_t( +// CHECK-SAME: returned [[TYPE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret 
[[TYPE]] // fixed_bool4_t from_vbool4_t(vbool4_t type) { return type; } -// CHECK-LABEL: @to_vbool4_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TMP0:%.*]] +// CHECK-LABEL: define dso_local noundef @to_vbool4_t( +// CHECK-SAME: noundef returned [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TMP0]] // vbool4_t to_vbool4_t(fixed_bool4_t type) { return type; } -// CHECK-LABEL: @from_vbool32_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE:%.*]] +// CHECK-LABEL: define dso_local @from_vbool32_t( +// CHECK-SAME: returned [[TYPE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE]] // fixed_bool32_t from_vbool32_t(vbool32_t type) { return type; } -// CHECK-LABEL: @to_vbool32_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TMP0:%.*]] +// CHECK-LABEL: define dso_local noundef @to_vbool32_t( +// CHECK-SAME: noundef returned [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TMP0]] // vbool32_t to_vbool32_t(fixed_bool32_t type) { return type; } -// CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]] +// CHECK-LABEL: define dso_local @to_vint32m1_t__from_gnu_int32m1_t( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -121,19 +132,21 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { return type; } -// CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i32> 
@llvm.vector.extract.v8i32.nxv2i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local void @from_vint32m1_t__to_gnu_int32m1_t( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], [[TYPE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE]], i64 0) +// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { return type; } -// CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local @to_fixed_int32m1_t__from_gnu_int32m1_t( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -141,12 +154,18 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { return type; } -// CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA6]] +// CHECK-LABEL: define dso_local void @from_fixed_int32m1_t__to_gnu_int32m1_t( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], noundef [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE_COERCE]], i64 0) +// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) { return type; } +//. +// CHECK: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// CHECK: [[META8]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c index 92ba27fb65425..f3b91b23a73e4 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64 // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256 @@ -40,59 +40,66 @@ fixed_bool32_t global_bool32; // WRITES //===----------------------------------------------------------------------===// -// CHECK-64-LABEL: @write_global_i64( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64( [[V:%.*]], i64 0) -// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-64-LABEL: define 
dso_local void @write_global_i64( +// CHECK-64-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64( [[V]], i64 0) +// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-64-NEXT: ret void // -// CHECK-256-LABEL: @write_global_i64( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64( [[V:%.*]], i64 0) -// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-256-LABEL: define dso_local void @write_global_i64( +// CHECK-256-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64( [[V]], i64 0) +// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-256-NEXT: ret void // void write_global_i64(vint64m1_t v) { global_i64 = v; } -// CHECK-64-LABEL: @write_global_bool1( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to +// CHECK-64-LABEL: define dso_local void @write_global_bool1( +// CHECK-64-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[V]] to // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv8i8( [[TMP0]], i64 0) -// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[TBAA6]] +// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: ret void // -// CHECK-256-LABEL: @write_global_bool1( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to +// CHECK-256-LABEL: define dso_local void @write_global_bool1( +// 
CHECK-256-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[V]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8( [[TMP0]], i64 0) -// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[TBAA6]] +// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // void write_global_bool1(vbool1_t v) { global_bool1 = v; } -// CHECK-64-LABEL: @write_global_bool4( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to +// CHECK-64-LABEL: define dso_local void @write_global_bool4( +// CHECK-64-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[V]] to // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-64-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool4, align 2, !tbaa [[TBAA6]] +// CHECK-64-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool4, align 2, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: ret void // -// CHECK-256-LABEL: @write_global_bool4( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to +// CHECK-256-LABEL: define dso_local void @write_global_bool4( +// CHECK-256-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[V]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-256-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool4, align 8, !tbaa [[TBAA6]] +// CHECK-256-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool4, align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // void write_global_bool4(vbool4_t v) { global_bool4 = v; } #if __riscv_v_fixed_vlen >= 256 -// CHECK-256-LABEL: 
@write_global_bool32( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv2i1( zeroinitializer, [[V:%.*]], i64 0) +// CHECK-256-LABEL: define dso_local void @write_global_bool32( +// CHECK-256-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv2i1( zeroinitializer, [[V]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], i64 0) -// CHECK-256-NEXT: store <1 x i8> [[CAST_FIXED]], ptr @global_bool32, align 1, !tbaa [[TBAA6]] +// CHECK-256-NEXT: store <1 x i8> [[CAST_FIXED]], ptr @global_bool32, align 1, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // void write_global_bool32(vbool32_t v) { global_bool32 = v; } @@ -102,46 +109,52 @@ void write_global_bool32(vbool32_t v) { global_bool32 = v; } // READS //===----------------------------------------------------------------------===// -// CHECK-64-LABEL: @read_global_i64( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @global_i64, align 8, !tbaa [[TBAA6]] +// CHECK-64-LABEL: define dso_local @read_global_i64( +// CHECK-64-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @global_i64, align 8, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v1i64( poison, <1 x i64> [[TMP0]], i64 0) // CHECK-64-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-256-LABEL: @read_global_i64( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @global_i64, align 8, !tbaa [[TBAA6]] +// CHECK-256-LABEL: define dso_local @read_global_i64( +// CHECK-256-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr 
@global_i64, align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v4i64( poison, <4 x i64> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // vint64m1_t read_global_i64() { return global_i64; } -// CHECK-64-LABEL: @read_global_bool1( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA6]] +// CHECK-64-LABEL: define dso_local @read_global_bool1( +// CHECK-64-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) // CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-64-NEXT: ret [[TMP1]] // -// CHECK-256-LABEL: @read_global_bool1( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA6]] +// CHECK-256-LABEL: define dso_local @read_global_bool1( +// CHECK-256-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v32i8( poison, <32 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: ret [[TMP1]] // vbool1_t read_global_bool1() { return global_bool1; } -// CHECK-64-LABEL: @read_global_bool4( -// CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool4, align 2, !tbaa [[TBAA6]] +// CHECK-64-LABEL: define dso_local @read_global_bool4( +// CHECK-64-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-64-NEXT: [[ENTRY:.*:]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool4, align 2, !tbaa [[CHAR_TBAA6]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] 
= tail call @llvm.vector.insert.nxv2i8.v2i8( poison, <2 x i8> [[TMP0]], i64 0) // CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-64-NEXT: ret [[TMP1]] // -// CHECK-256-LABEL: @read_global_bool4( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool4, align 8, !tbaa [[TBAA6]] +// CHECK-256-LABEL: define dso_local @read_global_bool4( +// CHECK-256-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool4, align 8, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: ret [[TMP1]] @@ -149,9 +162,10 @@ vbool1_t read_global_bool1() { return global_bool1; } vbool4_t read_global_bool4() { return global_bool4; } #if __riscv_v_fixed_vlen >= 256 -// CHECK-256-LABEL: @read_global_bool32( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[TBAA6]] +// CHECK-256-LABEL: define dso_local @read_global_bool32( +// CHECK-256-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[TMP1]], i64 0) @@ -159,3 +173,12 @@ vbool4_t read_global_bool4() { return global_bool4; } // vbool32_t read_global_bool32() { return global_bool32; } #endif +//. +// CHECK-64: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-64: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// CHECK-64: [[META8]] = !{!"Simple C/C++ TBAA"} +//. 
+// CHECK-256: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-256: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// CHECK-256: [[META8]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-i128.c b/clang/test/CodeGen/SystemZ/builtins-systemz-i128.c index 896cef515743c..d25b8d84aa2d5 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-i128.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-i128.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: systemz-registered-target // RUN: %clang_cc1 -target-cpu z14 -triple s390x-linux-gnu \ // RUN: -O2 -fzvector -flax-vector-conversions=none \ @@ -14,124 +14,124 @@ volatile vector unsigned long long vul; // CHECK-LABEL: define dso_local void @test( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to i128 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 // CHECK-NEXT: [[ADD_I:%.*]] = add nsw i128 [[TMP3]], [[TMP2]] // CHECK-NEXT: [[TMP4:%.*]] = bitcast i128 [[ADD_I]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP4]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP5:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP6:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> 
[[TMP4]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP5:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP6:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP5]] to i128 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to i128 // CHECK-NEXT: [[TMP9:%.*]] = tail call i128 @llvm.s390.vaccq(i128 [[TMP7]], i128 [[TMP8]]) // CHECK-NEXT: [[TMP10:%.*]] = bitcast i128 [[TMP9]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP10]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP11:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP12:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP13:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP10]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP11:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP12:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP13:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP11]] to i128 // CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP12]] to i128 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP13]] to i128 // CHECK-NEXT: [[TMP17:%.*]] = tail call i128 @llvm.s390.vacq(i128 [[TMP14]], i128 [[TMP15]], i128 [[TMP16]]) // CHECK-NEXT: [[TMP18:%.*]] = bitcast i128 [[TMP17]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP18]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP19:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP20:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP21:%.*]] = load volatile <16 x i8>, ptr @vuc, align 
8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP18]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP19:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP20:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP21:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i8> [[TMP19]] to i128 // CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i8> [[TMP20]] to i128 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i8> [[TMP21]] to i128 // CHECK-NEXT: [[TMP25:%.*]] = tail call i128 @llvm.s390.vacccq(i128 [[TMP22]], i128 [[TMP23]], i128 [[TMP24]]) // CHECK-NEXT: [[TMP26:%.*]] = bitcast i128 [[TMP25]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP26]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP27:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP28:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP26]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP27:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP28:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i8> [[TMP27]] to i128 // CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i8> [[TMP28]] to i128 // CHECK-NEXT: [[SUB_I:%.*]] = sub nsw i128 [[TMP29]], [[TMP30]] // CHECK-NEXT: [[TMP31:%.*]] = bitcast i128 [[SUB_I]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP31]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP32:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP33:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP31]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP32:%.*]] = 
load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP33:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x i8> [[TMP32]] to i128 // CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP33]] to i128 // CHECK-NEXT: [[TMP36:%.*]] = tail call i128 @llvm.s390.vscbiq(i128 [[TMP34]], i128 [[TMP35]]) // CHECK-NEXT: [[TMP37:%.*]] = bitcast i128 [[TMP36]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP37]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP38:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP39:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP40:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP37]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP38:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP39:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP40:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i8> [[TMP38]] to i128 // CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i8> [[TMP39]] to i128 // CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i8> [[TMP40]] to i128 // CHECK-NEXT: [[TMP44:%.*]] = tail call i128 @llvm.s390.vsbiq(i128 [[TMP41]], i128 [[TMP42]], i128 [[TMP43]]) // CHECK-NEXT: [[TMP45:%.*]] = bitcast i128 [[TMP44]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP45]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP46:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP47:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP48:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP45]], 
ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP46:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP47:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP48:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP49:%.*]] = bitcast <16 x i8> [[TMP46]] to i128 // CHECK-NEXT: [[TMP50:%.*]] = bitcast <16 x i8> [[TMP47]] to i128 // CHECK-NEXT: [[TMP51:%.*]] = bitcast <16 x i8> [[TMP48]] to i128 // CHECK-NEXT: [[TMP52:%.*]] = tail call i128 @llvm.s390.vsbcbiq(i128 [[TMP49]], i128 [[TMP50]], i128 [[TMP51]]) // CHECK-NEXT: [[TMP53:%.*]] = bitcast i128 [[TMP52]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP53]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP54:%.*]] = load volatile <4 x i32>, ptr @vui, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP55:%.*]] = load volatile <4 x i32>, ptr @vui, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP53]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP54:%.*]] = load volatile <4 x i32>, ptr @vui, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP55:%.*]] = load volatile <4 x i32>, ptr @vui, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP56:%.*]] = tail call i128 @llvm.s390.vsumqf(<4 x i32> [[TMP54]], <4 x i32> [[TMP55]]) // CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP57]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP58:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP59:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP57]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] 
// CHECK-NEXT: [[TMP60:%.*]] = tail call i128 @llvm.s390.vsumqg(<2 x i64> [[TMP58]], <2 x i64> [[TMP59]]) // CHECK-NEXT: [[TMP61:%.*]] = bitcast i128 [[TMP60]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP61]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP62:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP63:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP61]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP64:%.*]] = tail call i128 @llvm.s390.vgfmg(<2 x i64> [[TMP62]], <2 x i64> [[TMP63]]) // CHECK-NEXT: [[TMP65:%.*]] = bitcast i128 [[TMP64]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP65]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP66:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP67:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP68:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP65]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x i8> [[TMP68]] to i128 // CHECK-NEXT: [[TMP70:%.*]] = tail call i128 @llvm.s390.vgfmag(<2 x i64> [[TMP66]], <2 x i64> [[TMP67]], i128 [[TMP69]]) // CHECK-NEXT: [[TMP71:%.*]] = bitcast i128 [[TMP70]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP71]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// 
CHECK-NEXT: [[TMP72:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP73:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP74:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP71]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP72:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP73:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP74:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP75:%.*]] = bitcast <16 x i8> [[TMP74]] to i128 // CHECK-NEXT: [[TMP76:%.*]] = tail call i128 @llvm.s390.vmslg(<2 x i64> [[TMP72]], <2 x i64> [[TMP73]], i128 [[TMP75]], i32 0) // CHECK-NEXT: [[TMP77:%.*]] = bitcast i128 [[TMP76]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP77]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP78:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP79:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP80:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP77]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP78:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP79:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP80:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP81:%.*]] = bitcast <16 x i8> [[TMP80]] to i128 // CHECK-NEXT: [[TMP82:%.*]] = tail call i128 @llvm.s390.vmslg(<2 x i64> [[TMP78]], <2 x i64> [[TMP79]], i128 [[TMP81]], i32 4) // CHECK-NEXT: [[TMP83:%.*]] = bitcast i128 [[TMP82]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP83]], ptr @vuc, align 8, 
!tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP84:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP85:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP86:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP83]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP84:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP85:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP86:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP87:%.*]] = bitcast <16 x i8> [[TMP86]] to i128 // CHECK-NEXT: [[TMP88:%.*]] = tail call i128 @llvm.s390.vmslg(<2 x i64> [[TMP84]], <2 x i64> [[TMP85]], i128 [[TMP87]], i32 8) // CHECK-NEXT: [[TMP89:%.*]] = bitcast i128 [[TMP88]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP89]], ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP90:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP91:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP92:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP89]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP90:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP91:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP92:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP93:%.*]] = bitcast <16 x i8> [[TMP92]] to i128 // CHECK-NEXT: [[TMP94:%.*]] = tail call i128 @llvm.s390.vmslg(<2 x i64> [[TMP90]], <2 x i64> [[TMP91]], i128 [[TMP93]], i32 12) // CHECK-NEXT: [[TMP95:%.*]] = bitcast i128 [[TMP94]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP95]], 
ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP96:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP97:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP95]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP96:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP97:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[TMP98:%.*]] = tail call <2 x i64> @llvm.s390.vbperm(<16 x i8> [[TMP96]], <16 x i8> [[TMP97]]) -// CHECK-NEXT: store volatile <2 x i64> [[TMP98]], ptr @vul, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <2 x i64> [[TMP98]], ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test(void) { @@ -159,7 +159,7 @@ void test(void) { vul = vec_bperm_u128(vuc, vuc); } //. -// CHECK: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[CHAR_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} // CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} //. 
diff --git a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-16Al.c b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-16Al.c index e3db2063312d2..5f3b0ec546462 100644 --- a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-16Al.c +++ b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-16Al.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple s390x-linux-gnu -O1 -emit-llvm %s -o - | FileCheck %s // // Test GNU atomic builtins for __int128 aligned to 16 bytes, which should be @@ -13,21 +13,23 @@ __int128 Val __attribute__((aligned(16))); __int128 Exp __attribute__((aligned(16))); __int128 Des __attribute__((aligned(16))); -// CHECK-LABEL: @f1( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f1( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2:![0-9]+]] // CHECK-NEXT: ret void // __int128 f1() { return __atomic_load_n(&Ptr, memory_order_seq_cst); } -// CHECK-LABEL: @f2( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f2( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 16 // CHECK-NEXT: store i128 [[TMP0]], ptr @Ret, align 16 -// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store 
i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f2() { @@ -35,9 +37,10 @@ __int128 f2() { return Ret; } -// CHECK-LABEL: @f3( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f3( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 16 // CHECK-NEXT: ret void // @@ -45,8 +48,9 @@ void f3() { __atomic_store_n(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f4( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f4( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16 // CHECK-NEXT: store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 16 // CHECK-NEXT: ret void @@ -55,23 +59,25 @@ void f4() { __atomic_store(&Ptr, &Val, memory_order_seq_cst); } -// CHECK-LABEL: @f5( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f5( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f5() { return __atomic_exchange_n(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f6( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void 
@f6( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16 // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: store i128 [[TMP1]], ptr @Ret, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f6() { @@ -79,18 +85,19 @@ __int128 f6() { return Ret; } -// CHECK-LABEL: @f7( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local noundef zeroext i1 @f7( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Exp, align 16 // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP1]], i128 [[TMP0]] seq_cst seq_cst, align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1 -// CHECK-NEXT: br i1 [[TMP3]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]] -// CHECK: cmpxchg.store_expected: +// CHECK-NEXT: br i1 [[TMP3]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// CHECK: [[CMPXCHG_STORE_EXPECTED]]: // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0 // CHECK-NEXT: store i128 [[TMP4]], ptr @Exp, align 16 -// CHECK-NEXT: br label [[CMPXCHG_CONTINUE]] -// CHECK: cmpxchg.continue: +// CHECK-NEXT: br label %[[CMPXCHG_CONTINUE]] +// CHECK: [[CMPXCHG_CONTINUE]]: // CHECK-NEXT: ret i1 [[TMP3]] // _Bool f7() { @@ -98,18 +105,19 @@ _Bool f7() { memory_order_seq_cst, memory_order_seq_cst); } -// CHECK-LABEL: @f8( -// CHECK-NEXT: entry: +// CHECK-LABEL: 
define dso_local noundef zeroext i1 @f8( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Exp, align 16 // CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Des, align 16 // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1 -// CHECK-NEXT: br i1 [[TMP3]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]] -// CHECK: cmpxchg.store_expected: +// CHECK-NEXT: br i1 [[TMP3]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// CHECK: [[CMPXCHG_STORE_EXPECTED]]: // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0 // CHECK-NEXT: store i128 [[TMP4]], ptr @Exp, align 16 -// CHECK-NEXT: br label [[CMPXCHG_CONTINUE]] -// CHECK: cmpxchg.continue: +// CHECK-NEXT: br label %[[CMPXCHG_CONTINUE]] +// CHECK: [[CMPXCHG_CONTINUE]]: // CHECK-NEXT: ret i1 [[TMP3]] // _Bool f8() { @@ -117,141 +125,159 @@ _Bool f8() { memory_order_seq_cst, memory_order_seq_cst); } -// CHECK-LABEL: @f9( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f9( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f9() { return __atomic_add_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f10( 
-// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f10( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f10() { return __atomic_sub_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f11( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f11( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f11() { return __atomic_and_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f12( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f12( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 
captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f12() { return __atomic_xor_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f13( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f13( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f13() { return __atomic_or_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f14( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f14( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = 
atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP3:%.*]] = xor i128 [[TMP2]], -1 -// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f14() { return __atomic_nand_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f15( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f15( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f15() { return __atomic_fetch_add(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // 
CHECK-NEXT: ret void // __int128 f16() { return __atomic_fetch_sub(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f17( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f17( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f17() { return __atomic_fetch_and(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f18( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f18( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f18() { return __atomic_fetch_xor(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f19( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f19( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 
8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f19() { return __atomic_fetch_or(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f20( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f20( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f20() { return __atomic_fetch_nand(&Ptr, Val, memory_order_seq_cst); } +//. +// CHECK: [[__INT128_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"__int128", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c index 8759df7b19c63..3ac5959a29dcb 100644 --- a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c +++ b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple s390x-linux-gnu -O1 -emit-llvm %s -o - | FileCheck %s // // Test GNU atomic builtins for __int128 (with default alignment of 8 bytes @@ -18,21 +18,23 @@ __int128 Des; // pass. It seems that a 'writable' attribute should now be added to the argument // in order for this optimization to proceed. -// CHECK-LABEL: @f1( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f1( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2:![0-9]+]] // CHECK-NEXT: ret void // __int128 f1() { return __atomic_load_n(&Ptr, memory_order_seq_cst); } -// CHECK-LABEL: @f2( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f2( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 8 // CHECK-NEXT: store i128 [[TMP0]], ptr @Ret, align 8 -// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: 
store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f2() { @@ -40,9 +42,10 @@ __int128 f2() { return Ret; } -// CHECK-LABEL: @f3( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f3( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 8 // CHECK-NEXT: ret void // @@ -50,8 +53,9 @@ void f3() { __atomic_store_n(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f4( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f4( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8 // CHECK-NEXT: store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 8 // CHECK-NEXT: ret void @@ -60,23 +64,25 @@ void f4() { __atomic_store(&Ptr, &Val, memory_order_seq_cst); } -// CHECK-LABEL: @f5( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f5( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f5() { return __atomic_exchange_n(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f6( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f6( 
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8 // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: store i128 [[TMP1]], ptr @Ret, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f6() { @@ -84,18 +90,19 @@ __int128 f6() { return Ret; } -// CHECK-LABEL: @f7( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local noundef zeroext i1 @f7( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Exp, align 8 // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP1]], i128 [[TMP0]] seq_cst seq_cst, align 8 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1 -// CHECK-NEXT: br i1 [[TMP3]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]] -// CHECK: cmpxchg.store_expected: +// CHECK-NEXT: br i1 [[TMP3]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// CHECK: [[CMPXCHG_STORE_EXPECTED]]: // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0 // CHECK-NEXT: store i128 [[TMP4]], ptr @Exp, align 8 -// CHECK-NEXT: br label [[CMPXCHG_CONTINUE]] -// CHECK: cmpxchg.continue: +// CHECK-NEXT: br label %[[CMPXCHG_CONTINUE]] +// CHECK: [[CMPXCHG_CONTINUE]]: // CHECK-NEXT: ret i1 [[TMP3]] // _Bool f7() { @@ -103,18 +110,19 @@ _Bool f7() { memory_order_seq_cst, memory_order_seq_cst); } -// CHECK-LABEL: @f8( -// CHECK-NEXT: entry: +// CHECK-LABEL: define 
dso_local noundef zeroext i1 @f8( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Exp, align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Des, align 8 // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 8 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1 -// CHECK-NEXT: br i1 [[TMP3]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]] -// CHECK: cmpxchg.store_expected: +// CHECK-NEXT: br i1 [[TMP3]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// CHECK: [[CMPXCHG_STORE_EXPECTED]]: // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0 // CHECK-NEXT: store i128 [[TMP4]], ptr @Exp, align 8 -// CHECK-NEXT: br label [[CMPXCHG_CONTINUE]] -// CHECK: cmpxchg.continue: +// CHECK-NEXT: br label %[[CMPXCHG_CONTINUE]] +// CHECK: [[CMPXCHG_CONTINUE]]: // CHECK-NEXT: ret i1 [[TMP3]] // _Bool f8() { @@ -122,141 +130,159 @@ _Bool f8() { memory_order_seq_cst, memory_order_seq_cst); } -// CHECK-LABEL: @f9( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f9( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f9() { return __atomic_add_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f10( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f10( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f10() { return __atomic_sub_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f11( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f11( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f11() { return __atomic_and_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f12( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f12( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) 
initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f12() { return __atomic_xor_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f13( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f13( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f13() { return __atomic_or_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f14( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f14( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, 
i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP3:%.*]] = xor i128 [[TMP2]], -1 -// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f14() { return __atomic_nand_fetch(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f15( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f15( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f15() { return __atomic_fetch_add(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 
f16() { return __atomic_fetch_sub(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f17( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f17( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f17() { return __atomic_fetch_and(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f18( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f18( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f18() { return __atomic_fetch_xor(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f19( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f19( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) 
[[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f19() { return __atomic_fetch_or(&Ptr, Val, memory_order_seq_cst); } -// CHECK-LABEL: @f20( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f20( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f20() { return __atomic_fetch_nand(&Ptr, Val, memory_order_seq_cst); } +//. +// CHECK: [[__INT128_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"__int128", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c b/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c index e80f2b6920845..601bd7fa16153 100644 --- a/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c +++ b/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple s390x-linux-gnu -O1 -emit-llvm %s -o - \ // RUN: | FileCheck %s // @@ -10,149 +10,162 @@ __int128 Ptr __attribute__((aligned(16))); __int128 Val __attribute__((aligned(16))); __int128 OldVal __attribute__((aligned(16))); -// CHECK-LABEL: @f1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @f1( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f1() { return __sync_fetch_and_add(&Ptr, Val); } -// CHECK-LABEL: @f2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f2( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa 
[[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f2() { return __sync_fetch_and_sub(&Ptr, Val); } -// CHECK-LABEL: @f3( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f3( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f3() { return __sync_fetch_and_or(&Ptr, Val); } -// CHECK-LABEL: @f4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f4( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f4() { return __sync_fetch_and_and(&Ptr, Val); } -// CHECK-LABEL: @f5( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f5( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f5() { return __sync_fetch_and_xor(&Ptr, Val); } -// CHECK-LABEL: @f6( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f6( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f6() { return __sync_fetch_and_nand(&Ptr, Val); } -// CHECK-LABEL: @f7( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f7( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, 
align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f7() { return __sync_add_and_fetch(&Ptr, Val); } -// CHECK-LABEL: @f8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f8( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f8() { return __sync_sub_and_fetch(&Ptr, Val); } -// CHECK-LABEL: @f9( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f9( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: 
store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f9() { return __sync_or_and_fetch(&Ptr, Val); } -// CHECK-LABEL: @f10( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f10( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f10() { return __sync_and_and_fetch(&Ptr, Val); } -// CHECK-LABEL: @f11( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f11( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f11() { return __sync_xor_and_fetch(&Ptr, Val); } -// CHECK-LABEL: @f12( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa 
[[TBAA2]] +// CHECK-LABEL: define dso_local void @f12( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP3:%.*]] = xor i128 [[TMP2]], -1 -// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f12() { return __sync_nand_and_fetch(&Ptr, Val); } -// CHECK-LABEL: @f13( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local zeroext i1 @f13( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1 // CHECK-NEXT: ret i1 [[TMP3]] @@ -161,32 +174,35 @@ _Bool f13() { return __sync_bool_compare_and_swap(&Ptr, OldVal, Val); } -// CHECK-LABEL: @f14( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f14( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) 
[[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0 -// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f14() { return __sync_val_compare_and_swap(&Ptr, OldVal, Val); } -// CHECK-LABEL: @f15( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @f15( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f15() { return __sync_lock_test_and_set(&Ptr, Val); } -// CHECK-LABEL: @f16( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f16( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: store atomic i128 0, ptr @Ptr release, align 16 // CHECK-NEXT: ret void // @@ -194,11 +210,12 @@ void f16() { return __sync_lock_release(&Ptr); } -// CHECK-LABEL: @f17( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define 
dso_local void @f17( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: ret void // __int128 f17() { @@ -206,14 +223,21 @@ __int128 f17() { } // Test that a statement expression compiles. -// CHECK-LABEL: @f18( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @f18( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[T_ADDR:%.*]] = alloca i128, align 8 -// CHECK-NEXT: [[T:%.*]] = load i128, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA2]] -// CHECK-NEXT: store i128 [[T]], ptr [[T_ADDR]], align 8, !tbaa [[TBAA2]] +// CHECK-NEXT: [[T:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[T]], ptr [[T_ADDR]], align 8, !tbaa [[__INT128_TBAA2]] // CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[T_ADDR]], i128 [[T]], i128 [[T]] seq_cst seq_cst, align 16 // CHECK-NEXT: ret void // void f18(__int128 t) { __sync_bool_compare_and_swap(({int x = 1; &t;}), t, t); } +//. +// CHECK: [[__INT128_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"__int128", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/CodeGen/SystemZ/zvector2.c b/clang/test/CodeGen/SystemZ/zvector2.c index b021ae8534353..f00fcdd52c401 100644 --- a/clang/test/CodeGen/SystemZ/zvector2.c +++ b/clang/test/CodeGen/SystemZ/zvector2.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu z14 -fzvector \ // RUN: -O -emit-llvm -o - -W -Wall -Werror %s | FileCheck %s @@ -8,8 +8,8 @@ volatile vector bool int bi; // CHECK-LABEL: define dso_local void @test_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3:![0-9]+]] -// CHECK-NEXT: store volatile <4 x float> [[TMP0]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3:![0-9]+]] +// CHECK-NEXT: store volatile <4 x float> [[TMP0]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_assign (void) @@ -20,8 +20,8 @@ void test_assign (void) // CHECK-LABEL: define dso_local void @test_pos( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: store volatile <4 x float> [[TMP0]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[TMP0]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_pos (void) @@ -32,9 +32,9 @@ void test_pos (void) // CHECK-LABEL: define dso_local void @test_neg( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x 
float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[TMP0]] -// CHECK-NEXT: store volatile <4 x float> [[FNEG]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[FNEG]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_neg (void) @@ -45,9 +45,9 @@ void test_neg (void) // CHECK-LABEL: define dso_local void @test_preinc( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[INC:%.*]] = fadd <4 x float> [[TMP0]], splat (float 1.000000e+00) -// CHECK-NEXT: store volatile <4 x float> [[INC]], ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[INC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_preinc (void) @@ -58,9 +58,9 @@ void test_preinc (void) // CHECK-LABEL: define dso_local void @test_postinc( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[INC:%.*]] = fadd <4 x float> [[TMP0]], splat (float 1.000000e+00) -// CHECK-NEXT: store volatile <4 x float> [[INC]], ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[INC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_postinc (void) @@ -71,9 +71,9 @@ void test_postinc (void) // CHECK-LABEL: define dso_local void @test_predec( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x 
float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[DEC:%.*]] = fadd <4 x float> [[TMP0]], splat (float -1.000000e+00) -// CHECK-NEXT: store volatile <4 x float> [[DEC]], ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[DEC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_predec (void) @@ -84,9 +84,9 @@ void test_predec (void) // CHECK-LABEL: define dso_local void @test_postdec( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[DEC:%.*]] = fadd <4 x float> [[TMP0]], splat (float -1.000000e+00) -// CHECK-NEXT: store volatile <4 x float> [[DEC]], ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[DEC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_postdec (void) @@ -97,10 +97,10 @@ void test_postdec (void) // CHECK-LABEL: define dso_local void @test_add( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[ADD]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[ADD]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_add (void) @@ -111,10 +111,10 @@ 
void test_add (void) // CHECK-LABEL: define dso_local void @test_add_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[ADD]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[ADD]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_add_assign (void) @@ -125,10 +125,10 @@ void test_add_assign (void) // CHECK-LABEL: define dso_local void @test_sub( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[SUB]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[SUB]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_sub (void) @@ -139,10 +139,10 @@ void test_sub (void) // CHECK-LABEL: define dso_local void @test_sub_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x 
float>, ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[TMP1]], [[TMP0]] -// CHECK-NEXT: store volatile <4 x float> [[SUB]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[SUB]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_sub_assign (void) @@ -153,10 +153,10 @@ void test_sub_assign (void) // CHECK-LABEL: define dso_local void @test_mul( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[MUL]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[MUL]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_mul (void) @@ -167,10 +167,10 @@ void test_mul (void) // CHECK-LABEL: define dso_local void @test_mul_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[TMP0]], 
[[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[MUL]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[MUL]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_mul_assign (void) @@ -181,10 +181,10 @@ void test_mul_assign (void) // CHECK-LABEL: define dso_local void @test_div( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[DIV:%.*]] = fdiv <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[DIV]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[DIV]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_div (void) @@ -195,10 +195,10 @@ void test_div (void) // CHECK-LABEL: define dso_local void @test_div_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[DIV:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP0]] -// CHECK-NEXT: store volatile <4 x float> [[DIV]], ptr @ff, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[DIV]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_div_assign (void) @@ -209,11 +209,11 @@ void test_div_assign (void) // 
CHECK-LABEL: define dso_local void @test_cmpeq( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_cmpeq (void) @@ -224,11 +224,11 @@ void test_cmpeq (void) // CHECK-LABEL: define dso_local void @test_cmpne( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[CMP:%.*]] = fcmp une <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_cmpne (void) @@ -239,11 +239,11 @@ void test_cmpne (void) // CHECK-LABEL: define dso_local void @test_cmpge( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 
x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[CMP:%.*]] = fcmp oge <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_cmpge (void) @@ -254,11 +254,11 @@ void test_cmpge (void) // CHECK-LABEL: define dso_local void @test_cmpgt( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[CMP:%.*]] = fcmp ogt <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_cmpgt (void) @@ -269,11 +269,11 @@ void test_cmpgt (void) // CHECK-LABEL: define dso_local void @test_cmple( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load 
volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[CMP:%.*]] = fcmp ole <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_cmple (void) @@ -284,11 +284,11 @@ void test_cmple (void) // CHECK-LABEL: define dso_local void @test_cmplt( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: [[CMP:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] // CHECK-NEXT: ret void // void test_cmplt (void) @@ -297,7 +297,7 @@ void test_cmplt (void) } //. -// CHECK: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[CHAR_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} // CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} //. 
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 8223ab2b52cac..7b1a9cc4d9a7f 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -87,12 +87,20 @@ __m256d test_mm256_blend_pd(__m256d A, __m256d B) { // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> return _mm256_blend_pd(A, B, 0x05); } +TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x00), 1.0, 2.0, 3.0, 4.0)); +TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x05), 5.0, 2.0, 7.0, 4.0)); +TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x0A), 1.0, 6.0, 3.0, 8.0)); +TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x0F), 5.0, 6.0, 7.0, 8.0)); __m256 test_mm256_blend_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_blend_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> return _mm256_blend_ps(A, B, 0x35); } +TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0x00), 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f)); +TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0x35), -1.0f, 2.0f, -3.0f, 4.0f, -5.0f, -6.0f, 7.0f, 8.0f)); +TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0xAA), 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f)); +TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, 
-5.0f, -6.0f, -7.0f, -8.0f}), 0xFF), -1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f)); __m256d test_mm256_blendv_pd(__m256d V1, __m256d V2, __m256d V3) { // CHECK-LABEL: test_mm256_blendv_pd diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index aeb1aee4ea946..17ab47c72ad4b 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -146,6 +146,10 @@ __m256i test_mm256_blend_epi16(__m256i a, __m256i b) { // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> return _mm256_blend_epi16(a, b, 2); } +TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x00), 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)); +TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x5A), 1,-2,3,-4,-5,6,-7,8,9,-10,11,-12,-13,14,-15,16)); +TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x94), 1,2,-3,4,-5,6,7,-8,9,10,-11,12,-13,14,15,-16)); +TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0xFF), -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16)); __m128i test_mm_blend_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_blend_epi32 @@ -153,6 +157,10 @@ __m128i test_mm_blend_epi32(__m128i a, __m128i b) { // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> return _mm_blend_epi32(a, b, 0x05); } +TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0x0), 1,2,3,4)); 
+TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0x5), -1,2,-3,4)); +TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0xA), 1,-2,3,-4)); +TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0xF), -1,-2,-3,-4)); __m256i test_mm256_blend_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_blend_epi32 @@ -160,6 +168,10 @@ __m256i test_mm256_blend_epi32(__m256i a, __m256i b) { // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> return _mm256_blend_epi32(a, b, 0x35); } +TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0x00), 1,2,3,4,5,6,7,8)); +TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0xA5), -1,2,-3,4,5,-6,7,-8)); +TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0x94), 1,2,-3,4,-5,6,7,-8)); +TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0xFF), -1,-2,-3,-4,-5,-6,-7,-8)); __m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) { // CHECK-LABEL: test_mm256_blendv_epi8 diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 9d605efcbd758..3be708aea8a4d 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -854,11 +854,57 @@ __m512i test_mm512_mask_blend_epi8(__mmask64 __U, __m512i __A, __m512i __W) { // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_blend_epi8(__U,__A,__W); } +TEST_CONSTEXPR(match_v64qi( + _mm512_mask_blend_epi8( + (__mmask64) 0x00000001, + (__m512i)(__v64qi) {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + (__m512i)(__v64qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25, 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25} + ), + 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +)); __m512i test_mm512_mask_blend_epi16(__mmask32 __U, __m512i __A, __m512i __W) { // CHECK-LABEL: test_mm512_mask_blend_epi16 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_blend_epi16(__U,__A,__W); } +TEST_CONSTEXPR(match_v32hi( + _mm512_mask_blend_epi16( + (__mmask32) 0x00000001, + (__m512i)(__v32hi) {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + (__m512i)(__v32hi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25} + ), + 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +)); + +__m512i test_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) { + // CHECK-LABEL: test_mm512_mask_blend_epi32 + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_blend_epi32(__U, __A, __W); +} +TEST_CONSTEXPR(match_v16si( + _mm512_mask_blend_epi32( + (__mmask16) 0x0001, + (__m512i)(__v16si) {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + (__m512i)(__v16si){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25} + ), + 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +)); + +__m512i test_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) { + // CHECK-LABEL: test_mm512_mask_blend_epi64 + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} + 
return _mm512_mask_blend_epi64(__U, __A, __W); +} + +TEST_CONSTEXPR(match_v8di( + _mm512_mask_blend_epi64( + (__mmask8)0x01, + (__m512i)(__v8di){2, 2, 2, 2, 2, 2, 2, 2}, + (__m512i)(__v8di){10, 11, 12, 13, 14, 15, 16, 17} + ), + 10, 2, 2, 2, 2, 2, 2, 2 +)); + __m512i test_mm512_abs_epi8(__m512i __A) { // CHECK-LABEL: test_mm512_abs_epi8 // CHECK: [[ABS:%.*]] = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %{{.*}}, i1 false) diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 9daecd0d9875f..8cef11b12fb93 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -3622,41 +3622,140 @@ __m128i test_mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) { // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_blend_epi32(__U,__A,__W); } +TEST_CONSTEXPR(match_v4si( + _mm_mask_blend_epi32( + (__mmask8)0x01, + (__m128i)(__v4si){2, 2, 2, 2}, + (__m128i)(__v4si){ 10,11,12,13 } + ), + 10, 2, 2, 2 +)); __m256i test_mm256_mask_blend_epi32(__mmask8 __U, __m256i __A, __m256i __W) { // CHECK-LABEL: test_mm256_mask_blend_epi32 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_blend_epi32(__U,__A,__W); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_blend_epi32( + (__mmask8)0x01, + (__m256i)(__v8si){2, 2, 2, 2, 2, 2, 2, 2}, + (__m256i)(__v8si){ 10,11,12,13,14,15,16,17 } + ), + 10, 2, 2, 2, 2, 2, 2, 2 +)); __m128d test_mm_mask_blend_pd(__mmask8 __U, __m128d __A, __m128d __W) { // CHECK-LABEL: test_mm_mask_blend_pd // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_blend_pd(__U,__A,__W); } +TEST_CONSTEXPR(match_m128d( + _mm_mask_blend_pd( + (__mmask8)0x01, + (__m128d)(__v2df){2.0, 2.0}, + (__m128d)(__v2df){10.0, 20.0} + ), + 10.0, 2.0 +)); __m256d test_mm256_mask_blend_pd(__mmask8 __U, __m256d __A, __m256d __W) { // CHECK-LABEL: test_mm256_mask_blend_pd // CHECK: select 
<4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_blend_pd(__U,__A,__W); } +TEST_CONSTEXPR(match_m256d( + _mm256_mask_blend_pd( + (__mmask8)0x01, + (__m256d)(__v4df){2.0, 2.0, 2.0, 2.0}, + (__m256d)(__v4df){10.0, 11.0, 12.0, 13.0} + ), + 10.0, 2.0, 2.0, 2.0 +)); + +__m512d test_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) { + // CHECK-LABEL: test_mm512_mask_blend_pd + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + return _mm512_mask_blend_pd(__U, __A, __W); +} + +TEST_CONSTEXPR(match_m512d( + _mm512_mask_blend_pd( + (__mmask8)0x01, + (__m512d)(__v8df){2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}, + (__m512d)(__v8df){10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0} + ), + 10.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0 +)); + __m128 test_mm_mask_blend_ps(__mmask8 __U, __m128 __A, __m128 __W) { // CHECK-LABEL: test_mm_mask_blend_ps // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_blend_ps(__U,__A,__W); } +TEST_CONSTEXPR(match_m128( + _mm_mask_blend_ps( + (__mmask8)0x01, + (__m128)(__v4sf){2.0f, 2.0f, 2.0f, 2.0f}, + (__m128)(__v4sf){10.0f, 11.0f, 12.0f, 13.0f} + ), + 10.0f, 2.0f, 2.0f, 2.0f +)); + __m256 test_mm256_mask_blend_ps(__mmask8 __U, __m256 __A, __m256 __W) { // CHECK-LABEL: test_mm256_mask_blend_ps // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_blend_ps(__U,__A,__W); } +TEST_CONSTEXPR(match_m256( + _mm256_mask_blend_ps( + (__mmask8)0x01, + (__m256)(__v8sf){2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}, + (__m256)(__v8sf){10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f} + ), + 10.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f +)); + +__m512 test_mm512_mask_blend_ps(__mmask8 __U, __m512 __A, __m512 __W) { + // CHECK-LABEL: test_mm512_mask_blend_ps + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask_blend_ps(__U, __A, __W); +} 
+TEST_CONSTEXPR(match_m512( + _mm512_mask_blend_ps( + (__mmask16)0x01, + (__m512)(__v16sf){2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}, + (__m512)(__v16sf){10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, + 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f} + ), + 10.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f +)); + __m128i test_mm_mask_blend_epi64(__mmask8 __U, __m128i __A, __m128i __W) { // CHECK-LABEL: test_mm_mask_blend_epi64 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_blend_epi64(__U,__A,__W); } +TEST_CONSTEXPR(match_v2di( + _mm_mask_blend_epi64( + (__mmask8)0x01, + (__m128i)(__v2di){2, 2}, + (__m128i)(__v2di){ 10,11 } + ), + 10, 2 +)); __m256i test_mm256_mask_blend_epi64(__mmask8 __U, __m256i __A, __m256i __W) { // CHECK-LABEL: test_mm256_mask_blend_epi64 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_blend_epi64(__U,__A,__W); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask_blend_epi64( + (__mmask8)0x01, + (__m256i)(__v4di){2, 2, 2, 2}, + (__m256i)(__v4di){ 10,11,12,13 } + ), + 10, 2, 2, 2 +)); __m128d test_mm_mask_compress_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_compress_pd // CHECK: @llvm.x86.avx512.mask.compress diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index d62235a630fd8..d8f9a3ace6f38 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -893,23 +893,56 @@ __m128i test_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_blend_epi8(__U,__A,__W); } +TEST_CONSTEXPR(match_v16qi( + _mm_mask_blend_epi8( + (__mmask16)0x0001, + (__m128i)(__v16qi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + 
(__m128i)(__v16qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 } + ), + 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +)); + __m256i test_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) { // CHECK-LABEL: test_mm256_mask_blend_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_blend_epi8(__U,__A,__W); } +TEST_CONSTEXPR(match_v32qi( + _mm256_mask_blend_epi8( + (__mmask32) 0x00000001, + (__m256i)(__v32qi) {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + (__m256i)(__v32qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25} + ), + 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +)); __m128i test_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) { // CHECK-LABEL: test_mm_mask_blend_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_blend_epi16(__U,__A,__W); } +TEST_CONSTEXPR(match_v8hi( + _mm_mask_blend_epi16( + (__mmask8)0x01, + (__m128i)(__v8hi){2, 2, 2, 2, 2, 2, 2, 2}, + (__m128i)(__v8hi){ 10,11,12,13,14,15,16,17 } + ), + 10, 2, 2, 2, 2, 2, 2, 2 +)); __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) { // CHECK-LABEL: test_mm256_mask_blend_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_blend_epi16(__U,__A,__W); } +TEST_CONSTEXPR(match_v16hi( + _mm256_mask_blend_epi16( + (__mmask16)0x0001, + (__m256i)(__v16hi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + (__m256i)(__v16hi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 } + ), + 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +)); __m128i test_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_abs_epi8 diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c index 
fd6ea8fe6056d..badfa301e429d 100644 --- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -3016,6 +3016,14 @@ __m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} return _mm_mask_blend_ph(__U, __A, __W); } +TEST_CONSTEXPR(match_m128h( + _mm_mask_blend_ph( + (__mmask8)0x01, + (__m128h)(__v8hf){2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}, + (__m128h)(__v8hf){10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f} + ), + 10.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f +)); __m256h test_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { // CHECK-LABEL: test_mm256_mask_blend_ph @@ -3023,6 +3031,41 @@ __m256h test_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} return _mm256_mask_blend_ph(__U, __A, __W); } +TEST_CONSTEXPR(match_m256h( + _mm256_mask_blend_ph( + (__mmask16)0x0001, + (__m256h)(__v16hf){2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}, + (__m256h)(__v16hf){10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, + 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f} + ), + 10.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f +)); + +__m512h test_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { + // CHECK-LABEL: test_mm512_mask_blend_ph + // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1> + // CHECK: %{{.*}} = select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_blend_ph(__U, __A, __W); +} +TEST_CONSTEXPR(match_m512h( + _mm512_mask_blend_ph( + (__mmask32)0x00000001, + (__m512h)(__v32hf){2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}, + (__m512h)(__v32hf){10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, + 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, + 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f, + 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f} + ), + 10.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f +)); __m128h test_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { // CHECK-LABEL: test_mm_permutex2var_ph diff --git a/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c b/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c index f7ecf12d0becf..d931c0eae01d4 100644 --- a/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c +++ b/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c @@ -12,5 +12,5 @@ long long test_cmpccxadd64(void *__A, long long __B, long long __C) { } long long test_cmpccxadd64_2(int *__A, long long __B, long long __C) { - return _cmpccxadd_epi64(__A, __B, __C, 3); // expected-warning {{incompatible pointer types passing 'int *' to parameter of type 'long long *'}} + return _cmpccxadd_epi64(__A, __B, __C, 3); // expected-error {{incompatible pointer types passing 'int *' to parameter of type 'long long *'}} } diff --git a/clang/test/CodeGen/X86/cygwin-varargs.c b/clang/test/CodeGen/X86/cygwin-varargs.c deleted file mode 100644 index 4eea7d64bcb35..0000000000000 --- a/clang/test/CodeGen/X86/cygwin-varargs.c +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm < %s | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm < %s | FileCheck %s - -struct foo { - int x; - float y; - char z; -}; -// CHECK: %[[STRUCT_FOO:.*]] = type { i32, float, i8 } - -void f(int a, ...) 
{ - // CHECK-LABEL: define dso_local void @f - __builtin_va_list ap; - __builtin_va_start(ap, a); - // CHECK: %[[AP:.*]] = alloca ptr - // CHECK: call void @llvm.va_start - int b = __builtin_va_arg(ap, int); - // CHECK: %[[AP_CUR:.*]] = load ptr, ptr %[[AP]] - // CHECK-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR]], i64 8 - // CHECK-NEXT: store ptr %[[AP_NEXT]], ptr %[[AP]] - double _Complex c = __builtin_va_arg(ap, double _Complex); - // CHECK: %[[AP_CUR2:.*]] = load ptr, ptr %[[AP]] - // CHECK-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR2]], i64 8 - // CHECK-NEXT: store ptr %[[AP_NEXT2]], ptr %[[AP]] - // CHECK-NEXT: load ptr, ptr %[[AP_CUR2]] - struct foo d = __builtin_va_arg(ap, struct foo); - // CHECK: %[[AP_CUR3:.*]] = load ptr, ptr %[[AP]] - // CHECK-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR3]], i64 8 - // CHECK-NEXT: store ptr %[[AP_NEXT3]], ptr %[[AP]] - __builtin_va_list ap2; - __builtin_va_copy(ap2, ap); - // CHECK: call void @llvm.va_copy - __builtin_va_end(ap); - // CHECK: call void @llvm.va_end -} diff --git a/clang/test/CodeGen/X86/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c old mode 100644 new mode 100755 index 6a696273cb3c8..c08ef76d56981 --- a/clang/test/CodeGen/X86/f16c-builtins.c +++ b/clang/test/CodeGen/X86/f16c-builtins.c @@ -10,6 +10,7 @@ #include +#include "builtin_test_helpers.h" float test_cvtsh_ss(unsigned short a) { // CHECK-LABEL: test_cvtsh_ss @@ -18,6 +19,10 @@ float test_cvtsh_ss(unsigned short a) { return _cvtsh_ss(a); } +TEST_CONSTEXPR(_cvtsh_ss(0x0000) == 0.0f); +TEST_CONSTEXPR(_cvtsh_ss(0x4500) == 5.0f); +TEST_CONSTEXPR(_cvtsh_ss(0xC000) == -2.0f); + unsigned short test_cvtss_sh(float a) { // CHECK-LABEL: test_cvtss_sh // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0 @@ -29,6 +34,11 @@ unsigned short test_cvtss_sh(float a) { return _cvtss_sh(a, 0); } +TEST_CONSTEXPR(match_m128( + _mm_cvtph_ps(_mm_setr_epi16(0x3C00, 0x4000, 0x4200, 
0x4400, 0, 0, 0, 0)), + 1.0f, 2.0f, 3.0f, 4.0f +)); + __m128 test_mm_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm_cvtph_ps // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> @@ -41,6 +51,10 @@ __m256 test_mm256_cvtph_ps(__m128i a) { // CHECK: fpext <8 x half> %{{.*}} to <8 x float> return _mm256_cvtph_ps(a); } +TEST_CONSTEXPR(match_m256( + _mm256_cvtph_ps(_mm_setr_epi16(0x3C00, 0x4000, 0x4200, 0x4400, 0x4500, 0x3800, 0xC000, 0x0000)), + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 0.5f, -2.0f, 0.0f +)); __m128i test_mm_cvtps_ph(__m128 a) { // CHECK-LABEL: test_mm_cvtps_ph diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index 8a85d1f6c3a76..a56f8ba1ee385 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm %s | FileCheck %s -check-prefix=NO__ERRNO -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -disable-llvm-passes -O2 %s | FileCheck %s -check-prefix=NO__ERRNO -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -disable-llvm-passes -O2 -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO -// RUN: %clang_cc1 -triple x86_64-unknown-unknown-gnu -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU -// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -triple x86_64-unknown-unknown -w -o - -emit-llvm %s | FileCheck %s -check-prefix=NO__ERRNO +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -triple x86_64-unknown-unknown -w -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -triple 
x86_64-unknown-unknown -w -o - -emit-llvm -disable-llvm-passes -O2 %s | FileCheck %s -check-prefix=NO__ERRNO +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -triple x86_64-unknown-unknown -w -o - -emit-llvm -disable-llvm-passes -O2 -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -triple x86_64-unknown-unknown-gnu -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -triple x86_64-unknown-windows-msvc -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN // Test attributes and codegen of math builtins. diff --git a/clang/test/CodeGen/X86/sse.c b/clang/test/CodeGen/X86/sse.c index 017bdd7846fa3..38cc7179543d5 100644 --- a/clang/test/CodeGen/X86/sse.c +++ b/clang/test/CodeGen/X86/sse.c @@ -32,6 +32,7 @@ __m128i test_mm_slli_si128_0(__m128i a) { // CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_16( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> zeroinitializer // __m128i test_mm_slli_si128_16(__m128i a) { @@ -65,6 +66,7 @@ __m128i test_mm_srli_si128_0(__m128i a) { // CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_16( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> zeroinitializer // __m128i test_mm_srli_si128_16(__m128i a) { diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index dca161c8038a2..c7265b188d572 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -27,18 +27,30 @@ __m128i test_mm_blend_epi16(__m128i V1, __m128i V2) { // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> 
return _mm_blend_epi16(V1, V2, 42); } +TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x00),1,2,3,4,5,6,7,8)); +TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x5A),1,-2,3,-4,-5,6,-7,8)); +TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x94),1,2,-3,4,-5,6,7,-8)); +TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0xFF),-1,-2,-3,-4,-5,-6,-7,-8)); __m128d test_mm_blend_pd(__m128d V1, __m128d V2) { // CHECK-LABEL: test_mm_blend_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> return _mm_blend_pd(V1, V2, 2); } +TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 0), 1.0, 2.0)); +TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 1), 3.0, 2.0)); +TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 2), 1.0, 4.0)); +TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 3), 3.0, 4.0)); __m128 test_mm_blend_ps(__m128 V1, __m128 V2) { // CHECK-LABEL: test_mm_blend_ps // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> return _mm_blend_ps(V1, V2, 6); } +TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0x0), 1.0f, 2.0f, 3.0f, 4.0f)); +TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0x5), 5.0f, 2.0f, 7.0f, 4.0f)); +TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0xA), 1.0f, 6.0f, 3.0f, 8.0f)); +TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 
0xF), 5.0f, 6.0f, 7.0f, 8.0f)); __m128i test_mm_blendv_epi8(__m128i V1, __m128i V2, __m128i V3) { // CHECK-LABEL: test_mm_blendv_epi8 @@ -459,4 +471,3 @@ int test_mm_testz_si128(__m128i x, __m128i y) { // CHECK: call {{.*}}i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) return _mm_testz_si128(x, y); } - diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index 6de7676951c90..8d30e29886046 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // // We can't use -fsanitize-skip-hot-cutoff because that includes both -ubsan-guard-checks and //-lower-allow-check-percentile-cutoff. @@ -98,7 +98,7 @@ int div(int x, int y) { // CHECK-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] // CHECK: [[CONT]]: -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA5:![0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] // CHECK-NEXT: ret i32 [[TMP2]] // // TR-LABEL: define dso_local i32 @null( @@ -112,7 +112,7 @@ int div(int x, int y) { // TR-NEXT: tail call void @llvm.ubsantrap(i8 22) #[[ATTR7]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] // TR: [[CONT]]: -// TR-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA5:![0-9]+]] +// TR-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] // TR-NEXT: ret i32 [[TMP2]] // // REC-LABEL: define dso_local i32 @null( @@ -126,7 +126,7 @@ int div(int x, int y) { // REC-NEXT: tail call void @__ubsan_handle_type_mismatch_v1(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], 
!nosanitize [[META2]] // REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] // REC: [[CONT]]: -// REC-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA5:![0-9]+]] +// REC-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] // REC-NEXT: ret i32 [[TMP2]] // int null(int* x) { @@ -205,7 +205,7 @@ void use(double*); // CHECK-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // CHECK: [[BB4]]: // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA9:![0-9]+]] +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA9:![0-9]+]] // CHECK-NEXT: ret double [[TMP5]] // CHECK: [[TRAP]]: // CHECK-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]], !nosanitize [[META2]] @@ -224,7 +224,7 @@ void use(double*); // TR-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // TR: [[BB4]]: // TR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// TR-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA9:![0-9]+]] +// TR-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA9:![0-9]+]] // TR-NEXT: ret double [[TMP5]] // TR: [[TRAP]]: // TR-NEXT: call void @llvm.ubsantrap(i8 71) #[[ATTR7]], !nosanitize [[META2]] @@ -243,7 +243,7 @@ void use(double*); // REC-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // REC: [[BB4]]: // REC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// REC-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA9:![0-9]+]] +// REC-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA9:![0-9]+]] // REC-NEXT: ret double [[TMP5]] // REC: [[TRAP]]: // REC-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]], !nosanitize [[META2]] @@ -259,30 +259,30 @@ 
double lbounds(int b, int i) { // CHECK: [[META2]] = !{} // CHECK: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} // CHECK: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} -// CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +// CHECK: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} // CHECK: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} // CHECK: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} // CHECK: [[META8]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK: [[DOUBLE_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} // CHECK: [[META10]] = !{!"double", [[META7]], i64 0} //. // TR: [[META2]] = !{} // TR: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} // TR: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} -// TR: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +// TR: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} // TR: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} // TR: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} // TR: [[META8]] = !{!"Simple C/C++ TBAA"} -// TR: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// TR: [[DOUBLE_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} // TR: [[META10]] = !{!"double", [[META7]], i64 0} //. // REC: [[META2]] = !{} // REC: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} // REC: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} -// REC: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +// REC: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} // REC: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} // REC: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} // REC: [[META8]] = !{!"Simple C/C++ TBAA"} -// REC: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// REC: [[DOUBLE_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} // REC: [[META10]] = !{!"double", [[META7]], i64 0} //. 
diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c b/clang/test/CodeGen/arm64-microsoft-intrinsics.c index 51e0038b64cde..c0ff785883c71 100644 --- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c +++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c @@ -4,14 +4,21 @@ // RUN: not %clang_cc1 -triple arm64-linux -Werror -S -o /dev/null %s 2>&1 \ // RUN: | FileCheck %s -check-prefix CHECK-LINUX -// RUN: %clang_cc1 -triple arm64-darwin -Wno-implicit-function-declaration -fms-compatibility -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple arm64-darwin -Wno-implicit-function-declaration -fms-compatibility -emit-llvm -o - -DARM64_DARWIN %s \ // RUN: | FileCheck %s -check-prefix CHECK-MSCOMPAT -long test_InterlockedAdd(long volatile *Addend, long Value) { +// For some reason '_InterlockedAdd` on arm64-darwin takes an 'int*' rather than a 'long*'. +#ifdef ARM64_DARWIN +typedef int int32_t; +#else +typedef long int32_t; +#endif + +long test_InterlockedAdd(int32_t volatile *Addend, long Value) { return _InterlockedAdd(Addend, Value); } -long test_InterlockedAdd_constant(long volatile *Addend) { +long test_InterlockedAdd_constant(int32_t volatile *Addend) { return _InterlockedAdd(Addend, -1); } @@ -21,7 +28,7 @@ long test_InterlockedAdd_constant(long volatile *Addend) { // CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]] // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd' -long test_InterlockedAdd_acq(long volatile *Addend, long Value) { +long test_InterlockedAdd_acq(int32_t volatile *Addend, long Value) { return _InterlockedAdd_acq(Addend, Value); } @@ -31,7 +38,7 @@ long test_InterlockedAdd_acq(long volatile *Addend, long Value) { // CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]] // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd_acq' -long test_InterlockedAdd_nf(long volatile *Addend, long Value) { +long test_InterlockedAdd_nf(int32_t volatile *Addend, long Value) { return _InterlockedAdd_nf(Addend, Value); } @@ -41,7 +48,7 @@ long 
test_InterlockedAdd_nf(long volatile *Addend, long Value) { // CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]] // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd_nf' -long test_InterlockedAdd_rel(long volatile *Addend, long Value) { +long test_InterlockedAdd_rel(int32_t volatile *Addend, long Value) { return _InterlockedAdd_rel(Addend, Value); } diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c index c5a410193bfb7..847ce67fcc31b 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=1 -mvscale-max=1 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=2 -mvscale-max=2 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512 @@ -28,24 +28,27 @@ DEFINE_STRUCT(bool) // int64 //===----------------------------------------------------------------------===// -// CHECK-128-LABEL: @read_int64( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-128-LABEL: define dso_local @read_int64( +// CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// 
CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-256-LABEL: @read_int64( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-256-LABEL: define dso_local @read_int64( +// CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v4i64( poison, <4 x i64> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-512-LABEL: @read_int64( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 64 -// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-512-LABEL: define dso_local @read_int64( +// CHECK-512-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v8i64( poison, <8 x i64> [[TMP0]], i64 0) // CHECK-512-NEXT: ret [[CAST_SCALABLE]] // @@ -53,25 +56,28 @@ svint64_t read_int64(struct struct_int64 *s) { return s->y[0]; } -// CHECK-128-LABEL: @write_int64( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: 
[[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-128-LABEL: define dso_local void @write_int64( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((16, 32)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[X]], i64 0) +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: ret void // -// CHECK-256-LABEL: @write_int64( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-256-LABEL: define dso_local void @write_int64( +// CHECK-256-SAME: ptr noundef writeonly captures(none) initializes((32, 64)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64( [[X]], i64 0) +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-256-NEXT: ret void // -// CHECK-512-LABEL: @write_int64( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64( [[X:%.*]], i64 0) -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 64 -// CHECK-512-NEXT: 
store <8 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-512-LABEL: define dso_local void @write_int64( +// CHECK-512-SAME: ptr noundef writeonly captures(none) initializes((64, 128)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64( [[X]], i64 0) +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 +// CHECK-512-NEXT: store <8 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: ret void // void write_int64(struct struct_int64 *s, svint64_t x) { @@ -82,24 +88,27 @@ void write_int64(struct struct_int64 *s, svint64_t x) { // float64 //===----------------------------------------------------------------------===// -// CHECK-128-LABEL: @read_float64( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-128-LABEL: define dso_local @read_float64( +// CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2f64.v2f64( poison, <2 x double> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-256-LABEL: @read_float64( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-256-LABEL: define dso_local @read_float64( +// CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2f64.v4f64( poison, <4 x double> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-512-LABEL: @read_float64( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 64 -// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-512-LABEL: define dso_local @read_float64( +// CHECK-512-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2f64.v8f64( poison, <8 x double> [[TMP0]], i64 0) // CHECK-512-NEXT: ret [[CAST_SCALABLE]] // @@ -107,25 +116,28 @@ svfloat64_t read_float64(struct struct_float64 *s) { return s->y[0]; } -// CHECK-128-LABEL: @write_float64( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x double> @llvm.vector.extract.v2f64.nxv2f64( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-128-LABEL: define dso_local void @write_float64( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((16, 32)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x double> @llvm.vector.extract.v2f64.nxv2f64( [[X]], i64 0) +// CHECK-128-NEXT: [[Y:%.*]] = 
getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: ret void // -// CHECK-256-LABEL: @write_float64( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x double> @llvm.vector.extract.v4f64.nxv2f64( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-256-LABEL: define dso_local void @write_float64( +// CHECK-256-SAME: ptr noundef writeonly captures(none) initializes((32, 64)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x double> @llvm.vector.extract.v4f64.nxv2f64( [[X]], i64 0) +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-256-NEXT: ret void // -// CHECK-512-LABEL: @write_float64( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x double> @llvm.vector.extract.v8f64.nxv2f64( [[X:%.*]], i64 0) -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 64 -// CHECK-512-NEXT: store <8 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-512-LABEL: define dso_local void @write_float64( +// CHECK-512-SAME: ptr noundef writeonly captures(none) initializes((64, 128)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x double> @llvm.vector.extract.v8f64.nxv2f64( [[X]], i64 0) +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 +// CHECK-512-NEXT: store <8 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: ret void // void 
write_float64(struct struct_float64 *s, svfloat64_t x) { @@ -136,24 +148,27 @@ void write_float64(struct struct_float64 *s, svfloat64_t x) { // bfloat16 //===----------------------------------------------------------------------===// -// CHECK-128-LABEL: @read_bfloat16( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-128-LABEL: define dso_local @read_bfloat16( +// CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-256-LABEL: @read_bfloat16( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <16 x bfloat>, ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-256-LABEL: define dso_local @read_bfloat16( +// CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <16 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v16bf16( poison, <16 x bfloat> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-512-LABEL: @read_bfloat16( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 64 -// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr 
[[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-512-LABEL: define dso_local @read_bfloat16( +// CHECK-512-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 +// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v32bf16( poison, <32 x bfloat> [[TMP0]], i64 0) // CHECK-512-NEXT: ret [[CAST_SCALABLE]] // @@ -161,25 +176,28 @@ svbfloat16_t read_bfloat16(struct struct_bfloat16 *s) { return s->y[0]; } -// CHECK-128-LABEL: @write_bfloat16( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x bfloat> @llvm.vector.extract.v8bf16.nxv8bf16( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 16 -// CHECK-128-NEXT: store <8 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-128-LABEL: define dso_local void @write_bfloat16( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((16, 32)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x bfloat> @llvm.vector.extract.v8bf16.nxv8bf16( [[X]], i64 0) +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 +// CHECK-128-NEXT: store <8 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: ret void // -// CHECK-256-LABEL: @write_bfloat16( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 32 -// CHECK-256-NEXT: store <16 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-256-LABEL: define dso_local void 
@write_bfloat16( +// CHECK-256-SAME: ptr noundef writeonly captures(none) initializes((32, 64)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16( [[X]], i64 0) +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 +// CHECK-256-NEXT: store <16 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-256-NEXT: ret void // -// CHECK-512-LABEL: @write_bfloat16( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x bfloat> @llvm.vector.extract.v32bf16.nxv8bf16( [[X:%.*]], i64 0) -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 64 -// CHECK-512-NEXT: store <32 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] +// CHECK-512-LABEL: define dso_local void @write_bfloat16( +// CHECK-512-SAME: ptr noundef writeonly captures(none) initializes((64, 128)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x bfloat> @llvm.vector.extract.v32bf16.nxv8bf16( [[X]], i64 0) +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 +// CHECK-512-NEXT: store <32 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: ret void // void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { @@ -190,26 +208,29 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { // bool //===----------------------------------------------------------------------===// -// CHECK-128-LABEL: @read_bool( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 2 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr [[Y]], align 2, !tbaa [[TBAA2]] +// CHECK-128-LABEL: define dso_local @read_bool( +// CHECK-128-SAME: ptr noundef readonly 
captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 2 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v2i8( poison, <2 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-128-NEXT: ret [[TMP1]] // -// CHECK-256-LABEL: @read_bool( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 4 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[Y]], align 2, !tbaa [[TBAA2]] +// CHECK-256-LABEL: define dso_local @read_bool( +// CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v4i8( poison, <4 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: ret [[TMP1]] // -// CHECK-512-LABEL: @read_bool( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 8 -// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 2, !tbaa [[TBAA2]] +// CHECK-512-LABEL: define dso_local @read_bool( +// CHECK-512-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) // 
CHECK-512-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-512-NEXT: ret [[TMP1]] @@ -218,30 +239,46 @@ svbool_t read_bool(struct struct_bool *s) { return s->y[0]; } -// CHECK-128-LABEL: @write_bool( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to +// CHECK-128-LABEL: define dso_local void @write_bool( +// CHECK-128-SAME: ptr noundef writeonly captures(none) initializes((2, 4)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 2 -// CHECK-128-NEXT: store <2 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[TBAA2]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 2 +// CHECK-128-NEXT: store <2 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: ret void // -// CHECK-256-LABEL: @write_bool( -// CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to +// CHECK-256-LABEL: define dso_local void @write_bool( +// CHECK-256-SAME: ptr noundef writeonly captures(none) initializes((4, 8)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-256-NEXT: [[ENTRY:.*:]] +// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i8> @llvm.vector.extract.v4i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 4 -// CHECK-256-NEXT: store <4 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[TBAA2]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4 +// CHECK-256-NEXT: store <4 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] // CHECK-256-NEXT: ret void // -// CHECK-512-LABEL: @write_bool( -// CHECK-512-NEXT: entry: -// 
CHECK-512-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to +// CHECK-512-LABEL: define dso_local void @write_bool( +// CHECK-512-SAME: ptr noundef writeonly captures(none) initializes((8, 16)) [[S:%.*]], [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 8 -// CHECK-512-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[TBAA2]] +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 +// CHECK-512-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: ret void // void write_bool(struct struct_bool *s, svbool_t x) { s->y[0] = x; } +//. +// CHECK-128: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-128: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK-128: [[META4]] = !{!"Simple C/C++ TBAA"} +//. +// CHECK-256: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-256: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK-256: [[META4]] = !{!"Simple C/C++ TBAA"} +//. +// CHECK-512: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-512: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK-512: [[META4]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c index fcd4314249ff8..bdaebf7ec1da7 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s // REQUIRES: aarch64-registered-target @@ -12,66 +12,74 @@ typedef svfloat64_t fixed_float64_t __attribute__((arm_sve_vector_bits(N))); typedef svbool_t fixed_bool_t __attribute__((arm_sve_vector_bits(N))); typedef int32_t gnu_int32_t __attribute__((vector_size(N / 8))); -// CHECK-LABEL: @to_svint32_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-LABEL: define dso_local noundef @to_svint32_t( +// CHECK-SAME: noundef returned [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE_COERCE]] // svint32_t to_svint32_t(fixed_int32_t type) { return type; } -// CHECK-LABEL: @from_svint32_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE:%.*]] +// CHECK-LABEL: define dso_local @from_svint32_t( +// CHECK-SAME: returned [[TYPE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE]] // fixed_int32_t from_svint32_t(svint32_t type) { return type; } -// CHECK-LABEL: @to_svfloat64_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-LABEL: define dso_local noundef @to_svfloat64_t( +// CHECK-SAME: noundef returned [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE_COERCE]] // svfloat64_t to_svfloat64_t(fixed_float64_t type) { return type; } -// CHECK-LABEL: @from_svfloat64_t( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE:%.*]] +// CHECK-LABEL: define dso_local @from_svfloat64_t( +// CHECK-SAME: returned [[TYPE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE]] // fixed_float64_t from_svfloat64_t(svfloat64_t type) { return type; } -// CHECK-LABEL: @to_svbool_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TMP0:%.*]] +// CHECK-LABEL: define dso_local noundef @to_svbool_t( +// CHECK-SAME: noundef returned [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TMP0]] // svbool_t to_svbool_t(fixed_bool_t type) { return type; } -// CHECK-LABEL: @from_svbool_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE:%.*]] +// CHECK-LABEL: define dso_local @from_svbool_t( +// CHECK-SAME: returned [[TYPE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[TYPE]] // fixed_bool_t from_svbool_t(svbool_t type) { return type; } -// CHECK-LABEL: @lax_cast( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[TYPE_COERCE:%.*]] to +// CHECK-LABEL: define dso_local noundef @lax_cast( +// CHECK-SAME: noundef [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[TYPE_COERCE]] to // CHECK-NEXT: ret [[TMP0]] // svint64_t lax_cast(fixed_int32_t type) { return type; } -// CHECK-LABEL: @to_svint32_t__from_gnu_int32_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local @to_svint32_t__from_gnu_int32_t( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> 
[[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -79,19 +87,21 @@ svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { return type; } -// CHECK-LABEL: @from_svint32_t__to_gnu_int32_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @from_svint32_t__to_gnu_int32_t( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i32>) align 16 captures(none) initializes((0, 64)) [[AGG_RESULT:%.*]], [[TYPE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE]], i64 0) +// CHECK-NEXT: store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { return type; } -// CHECK-LABEL: @to_fixed_int32_t__from_gnu_int32_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local @to_fixed_int32_t__from_gnu_int32_t( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -99,12 +109,18 @@ fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { return type; } -// CHECK-LABEL: @from_fixed_int32_t__to_gnu_int32_t( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -// 
CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @from_fixed_int32_t__to_gnu_int32_t( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i32>) align 16 captures(none) initializes((0, 64)) [[AGG_RESULT:%.*]], noundef [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE]], i64 0) +// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // gnu_int32_t from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) { return type; } +//. +// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c index 011518c60f52f..b604a06d76a30 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=1 -mvscale-max=1 -O1 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefix=CHECK-128 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefix=CHECK-512 @@ -20,46 +20,52 @@ fixed_bool_t global_bool; // WRITES //===----------------------------------------------------------------------===// -// CHECK-128-LABEL: @write_global_i64( -// CHECK-128-NEXT: 
entry: -// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[V:%.*]], i64 0) -// CHECK-128-NEXT: store <2 x i64> [[CASTFIXEDSVE]], ptr @global_i64, align 16, !tbaa [[TBAA6:![0-9]+]] +// CHECK-128-LABEL: define void @write_global_i64( +// CHECK-128-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[V]], i64 0) +// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr @global_i64, align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-128-NEXT: ret void // -// CHECK-512-LABEL: @write_global_i64( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64( [[V:%.*]], i64 0) -// CHECK-512-NEXT: store <8 x i64> [[CASTFIXEDSVE]], ptr @global_i64, align 16, !tbaa [[TBAA6:![0-9]+]] +// CHECK-512-LABEL: define void @write_global_i64( +// CHECK-512-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64( [[V]], i64 0) +// CHECK-512-NEXT: store <8 x i64> [[CAST_FIXED]], ptr @global_i64, align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-512-NEXT: ret void // void write_global_i64(svint64_t v) { global_i64 = v; } -// CHECK-128-LABEL: @write_global_bf16( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <8 x bfloat> @llvm.vector.extract.v8bf16.nxv8bf16( [[V:%.*]], i64 0) -// CHECK-128-NEXT: store <8 x bfloat> [[CASTFIXEDSVE]], ptr @global_bf16, align 16, !tbaa [[TBAA6]] +// CHECK-128-LABEL: define void @write_global_bf16( +// CHECK-128-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x bfloat> @llvm.vector.extract.v8bf16.nxv8bf16( [[V]], i64 0) +// CHECK-128-NEXT: store <8 x bfloat> [[CAST_FIXED]], 
ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: ret void // -// CHECK-512-LABEL: @write_global_bf16( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <32 x bfloat> @llvm.vector.extract.v32bf16.nxv8bf16( [[V:%.*]], i64 0) -// CHECK-512-NEXT: store <32 x bfloat> [[CASTFIXEDSVE]], ptr @global_bf16, align 16, !tbaa [[TBAA6]] +// CHECK-512-LABEL: define void @write_global_bf16( +// CHECK-512-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x bfloat> @llvm.vector.extract.v32bf16.nxv8bf16( [[V]], i64 0) +// CHECK-512-NEXT: store <32 x bfloat> [[CAST_FIXED]], ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: ret void // void write_global_bf16(svbfloat16_t v) { global_bf16 = v; } -// CHECK-128-LABEL: @write_global_bool( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to -// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-128-NEXT: store <2 x i8> [[CASTFIXEDSVE]], ptr @global_bool, align 2, !tbaa [[TBAA6]] +// CHECK-128-LABEL: define void @write_global_bool( +// CHECK-128-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast [[V]] to +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP0]], i64 0) +// CHECK-128-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool, align 2, !tbaa [[CHAR_TBAA2]] // CHECK-128-NEXT: ret void // -// CHECK-512-LABEL: @write_global_bool( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to -// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-512-NEXT: store <8 x i8> [[CASTFIXEDSVE]], ptr @global_bool, align 2, !tbaa [[TBAA6]] +// CHECK-512-LABEL: define void @write_global_bool( 
+// CHECK-512-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast [[V]] to +// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8( [[TMP0]], i64 0) +// CHECK-512-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool, align 2, !tbaa [[CHAR_TBAA2]] // CHECK-512-NEXT: ret void // void write_global_bool(svbool_t v) { global_bool = v; } @@ -68,46 +74,61 @@ void write_global_bool(svbool_t v) { global_bool = v; } // READS //===----------------------------------------------------------------------===// -// CHECK-128-LABEL: @read_global_i64( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @global_i64, align 16, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> [[TMP0]], i64 0) -// CHECK-128-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-128-LABEL: define @read_global_i64( +// CHECK-128-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @global_i64, align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> [[TMP0]], i64 0) +// CHECK-128-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-512-LABEL: @read_global_i64( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @global_i64, align 16, !tbaa [[TBAA6]] -// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v8i64( poison, <8 x i64> [[TMP0]], i64 0) -// CHECK-512-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-512-LABEL: define @read_global_i64( +// CHECK-512-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @global_i64, align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call 
@llvm.vector.insert.nxv2i64.v8i64( poison, <8 x i64> [[TMP0]], i64 0) +// CHECK-512-NEXT: ret [[CAST_SCALABLE]] // svint64_t read_global_i64() { return global_i64; } -// CHECK-128-LABEL: @read_global_bf16( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr @global_bf16, align 16, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> [[TMP0]], i64 0) -// CHECK-128-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-128-LABEL: define @read_global_bf16( +// CHECK-128-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> [[TMP0]], i64 0) +// CHECK-128-NEXT: ret [[CAST_SCALABLE]] // -// CHECK-512-LABEL: @read_global_bf16( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr @global_bf16, align 16, !tbaa [[TBAA6]] -// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v32bf16( poison, <32 x bfloat> [[TMP0]], i64 0) -// CHECK-512-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-512-LABEL: define @read_global_bf16( +// CHECK-512-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v32bf16( poison, <32 x bfloat> [[TMP0]], i64 0) +// CHECK-512-NEXT: ret [[CAST_SCALABLE]] // svbfloat16_t read_global_bf16() { return global_bf16; } -// CHECK-128-LABEL: @read_global_bool( -// CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool, align 2, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v2i8( poison, <2 x i8> 
[[TMP0]], i64 0) -// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CASTSCALABLESVE]] to +// CHECK-128-LABEL: define @read_global_bool( +// CHECK-128-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-128-NEXT: [[ENTRY:.*:]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool, align 2, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v2i8( poison, <2 x i8> [[TMP0]], i64 0) +// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-128-NEXT: ret [[TMP1]] // -// CHECK-512-LABEL: @read_global_bool( -// CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool, align 2, !tbaa [[TBAA6]] -// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) -// CHECK-512-NEXT: [[TMP1:%.*]] = bitcast [[CASTSCALABLESVE]] to +// CHECK-512-LABEL: define @read_global_bool( +// CHECK-512-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-512-NEXT: [[ENTRY:.*:]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool, align 2, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) +// CHECK-512-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-512-NEXT: ret [[TMP1]] // svbool_t read_global_bool() { return global_bool; } +//. +// CHECK-128: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-128: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK-128: [[META4]] = !{!"Simple C/C++ TBAA"} +//. +// CHECK-512: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-512: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK-512: [[META4]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/CodeGen/attr-counted-by-for-pointers.c b/clang/test/CodeGen/attr-counted-by-for-pointers.c index 0d72b58c78fd1..f7b737d5c5039 100644 --- a/clang/test/CodeGen/attr-counted-by-for-pointers.c +++ b/clang/test/CodeGen/attr-counted-by-for-pointers.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -DWITH_ATTRS -Wall -fsanitize=array-bounds,object-size,local-bounds -fstrict-flex-arrays=3 -fexperimental-late-parse-attributes -emit-llvm -o - %s | FileCheck --check-prefix=SANITIZE-WITH-ATTR %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -DWITH_ATTRS -Wall -fstrict-flex-arrays=3 -fexperimental-late-parse-attributes -emit-llvm -o - %s | FileCheck --check-prefix=NO-SANITIZE-WITH-ATTR %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -fsanitize=array-bounds,object-size,local-bounds -fstrict-flex-arrays=3 -fexperimental-late-parse-attributes -emit-llvm -o - %s | FileCheck --check-prefix=SANITIZE-WITHOUT-ATTR %s @@ -29,51 +29,51 @@ struct annotated_ptr { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize 
[[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3:[0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont10: +// SANITIZE-WITH-ATTR: [[CONT10]]: // SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA4:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA4:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA13:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA13:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa 
[[_ZTS3FOOPTR_TBAA2:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: 
[[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test1(struct annotated_ptr *p, int index, struct foo *value) { @@ -82,51 +82,51 @@ void test1(struct annotated_ptr *p, int index, struct foo *value) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void 
@__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont10: +// SANITIZE-WITH-ATTR: [[CONT10]]: // SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA13]] +// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA13]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( // 
SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] // 
NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test2(struct annotated_ptr *p, int index, struct foo *value) { @@ -135,51 +135,51 @@ void test2(struct annotated_ptr *p, int index, struct foo *value) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT10:%.*]], !prof [[PROF15:![0-9]+]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT10:.*]], !prof [[PROF15:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont10: +// SANITIZE-WITH-ATTR: [[CONT10]]: // SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds ptr, ptr [[TMP1]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA13]] +// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA13]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // 
SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test3(struct annotated_ptr *p, int index, struct foo *value) { @@ -188,7 +188,7 @@ void test3(struct annotated_ptr *p, int index, struct foo *value) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869184, 17179869177) i64 @test4( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = 
load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -199,7 +199,7 @@ void test3(struct annotated_ptr *p, int index, struct foo *value) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869184, 17179869177) i64 @test4( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -210,12 +210,12 @@ void test3(struct annotated_ptr *p, int index, struct foo *value) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test4(struct annotated_ptr *p) { @@ -224,7 +224,7 @@ size_t test4(struct annotated_ptr *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869184, 17179869177) i64 @test5( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // 
SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -235,7 +235,7 @@ size_t test4(struct annotated_ptr *p) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869184, 17179869177) i64 @test5( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -246,12 +246,12 @@ size_t test4(struct annotated_ptr *p) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test5( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test5( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test5(struct annotated_ptr *p, int index) { @@ -260,17 +260,17 @@ size_t test5(struct annotated_ptr *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 17179869177) i64 @test6( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = 
sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT8:%.*]], !prof [[PROF15]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont8: +// SANITIZE-WITH-ATTR: [[CONT8]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0) @@ -279,7 +279,7 @@ size_t test5(struct annotated_ptr *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -34359738360, 34359738361) i64 @test6( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: 
[[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -294,12 +294,12 @@ size_t test5(struct annotated_ptr *p, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test6( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test6( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test6(struct annotated_ptr *p, int index) { @@ -308,32 +308,32 @@ size_t test6(struct annotated_ptr *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// 
SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont10: +// SANITIZE-WITH-ATTR: [[CONT10]]: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test7( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test7( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test7(struct annotated_ptr *p, int index) { @@ -348,7 +348,7 @@ struct annotated_sized_ptr { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test8( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 
@llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) @@ -357,7 +357,7 @@ struct annotated_sized_ptr { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test8( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) @@ -366,12 +366,12 @@ struct annotated_sized_ptr { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test8( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test8( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test8(struct annotated_sized_ptr *p, int index) { @@ -380,17 +380,17 @@ size_t test8(struct annotated_sized_ptr *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test9( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], 
i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT8:%.*]], !prof [[PROF15]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont8: +// SANITIZE-WITH-ATTR: [[CONT8]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.smax.i64(i64 [[RESULT]], i64 0) @@ -398,7 +398,7 @@ size_t test8(struct annotated_sized_ptr *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -4294967295, 4294967296) i64 @test9( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -412,12 +412,12 @@ size_t test8(struct annotated_sized_ptr *p, int 
index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test9( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test9( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test9(struct annotated_sized_ptr *p, int index) { @@ -426,17 +426,17 @@ size_t test9(struct annotated_sized_ptr *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT8:%.*]], !prof [[PROF15]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void 
@__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont8: +// SANITIZE-WITH-ATTR: [[CONT8]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[INDEX_SIZE:%.*]] = shl nuw nsw i64 [[IDXPROM]], 2 // SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[COUNT]], [[INDEX_SIZE]] @@ -445,7 +445,7 @@ size_t test9(struct annotated_sized_ptr *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -10737418236, 10737418240) i64 @test10( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -460,12 +460,12 @@ size_t test9(struct annotated_sized_ptr *p, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test10( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test10( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test10(struct annotated_sized_ptr *p, int index) { @@ -479,7 +479,7 @@ struct 
pr151236_struct { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -262144, 262137) i64 @test11( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i16, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i16 [[COUNTED_BY_LOAD]], -1 @@ -490,7 +490,7 @@ struct pr151236_struct { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -262144, 262137) i64 @test11( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i16, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i16 [[COUNTED_BY_LOAD]] to i64 @@ -501,12 +501,12 @@ struct pr151236_struct { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local range(i64 0, -1) i64 @test11( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -2 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local range(i64 0, -1) i64 @test11( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -2 // size_t test11(struct pr151236_struct *p) { @@ -515,7 +515,7 @@ size_t test11(struct pr151236_struct *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local 
range(i64 -262144, 262137) i64 @test12( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i16, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i16 [[COUNTED_BY_LOAD]], -1 @@ -526,7 +526,7 @@ size_t test11(struct pr151236_struct *p) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -262144, 262137) i64 @test12( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i16, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i16 [[COUNTED_BY_LOAD]] to i64 @@ -537,14 +537,66 @@ size_t test11(struct pr151236_struct *p) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local range(i64 0, -1) i64 @test12( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -2 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local range(i64 0, -1) i64 @test12( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -2 // size_t test12(struct pr151236_struct *p) { return __bdos(p->a) + __bdos(((int *)p->a)); } +//. 
+// SANITIZE-WITH-ATTR: [[META2]] = !{} +// SANITIZE-WITH-ATTR: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} +// SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA4]] = !{[[META5:![0-9]+]], [[META9:![0-9]+]], i64 8} +// SANITIZE-WITH-ATTR: [[META5]] = !{!"annotated_ptr", [[META6:![0-9]+]], i64 0, [[META9]], i64 8, [[META12:![0-9]+]], i64 16} +// SANITIZE-WITH-ATTR: [[META6]] = !{!"long", [[META7:![0-9]+]], i64 0} +// SANITIZE-WITH-ATTR: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// SANITIZE-WITH-ATTR: [[META8]] = !{!"Simple C/C++ TBAA"} +// SANITIZE-WITH-ATTR: [[META9]] = !{!"p2 _ZTS3foo", [[META10:![0-9]+]], i64 0} +// SANITIZE-WITH-ATTR: [[META10]] = !{!"any p2 pointer", [[META11:![0-9]+]], i64 0} +// SANITIZE-WITH-ATTR: [[META11]] = !{!"any pointer", [[META7]], i64 0} +// SANITIZE-WITH-ATTR: [[META12]] = !{!"int", [[META7]], i64 0} +// SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} +// SANITIZE-WITH-ATTR: [[META14]] = !{!"p1 _ZTS3foo", [[META11]], i64 0} +// SANITIZE-WITH-ATTR: [[PROF15]] = !{!"branch_weights", i32 1, i32 1048575} +//. 
+// NO-SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8} +// NO-SANITIZE-WITH-ATTR: [[META3]] = !{!"annotated_ptr", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META10:![0-9]+]], i64 16} +// NO-SANITIZE-WITH-ATTR: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META6]] = !{!"Simple C/C++ TBAA"} +// NO-SANITIZE-WITH-ATTR: [[META7]] = !{!"p2 _ZTS3foo", [[META8:![0-9]+]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META8]] = !{!"any p2 pointer", [[META9:![0-9]+]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META9]] = !{!"any pointer", [[META5]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META10]] = !{!"int", [[META5]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META12]] = !{!"p1 _ZTS3foo", [[META9]], i64 0} +//. +// SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8} +// SANITIZE-WITHOUT-ATTR: [[META3]] = !{!"annotated_ptr", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META10:![0-9]+]], i64 16} +// SANITIZE-WITHOUT-ATTR: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META6]] = !{!"Simple C/C++ TBAA"} +// SANITIZE-WITHOUT-ATTR: [[META7]] = !{!"p2 _ZTS3foo", [[META8:![0-9]+]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META8]] = !{!"any p2 pointer", [[META9:![0-9]+]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META9]] = !{!"any pointer", [[META5]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META10]] = !{!"int", [[META5]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META12]] = !{!"p1 _ZTS3foo", [[META9]], i64 0} +//. 
+// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8} +// NO-SANITIZE-WITHOUT-ATTR: [[META3]] = !{!"annotated_ptr", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META10:![0-9]+]], i64 16} +// NO-SANITIZE-WITHOUT-ATTR: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META6]] = !{!"Simple C/C++ TBAA"} +// NO-SANITIZE-WITHOUT-ATTR: [[META7]] = !{!"p2 _ZTS3foo", [[META8:![0-9]+]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META8]] = !{!"any p2 pointer", [[META9:![0-9]+]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META9]] = !{!"any pointer", [[META5]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META10]] = !{!"int", [[META5]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META12]] = !{!"p1 _ZTS3foo", [[META9]], i64 0} +//. diff --git a/clang/test/CodeGen/attr-counted-by-pr110385.c b/clang/test/CodeGen/attr-counted-by-pr110385.c index 412c12cb687c4..32ee1c8eb5dbe 100644 --- a/clang/test/CodeGen/attr-counted-by-pr110385.c +++ b/clang/test/CodeGen/attr-counted-by-pr110385.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wno-missing-declarations -emit-llvm -o - %s | FileCheck %s // See #110385 @@ -27,17 +27,17 @@ void init(void * __attribute__((pass_dynamic_object_size(0)))); // CHECK-LABEL: define dso_local void @test1( // CHECK-SAME: ptr noundef readonly captures(none) [[FOO:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[GROWABLE:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr 
[[GROWABLE]], align 8, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GROWABLE]], align 8, !tbaa [[_ZTS8VARIABLEPTR_TBAA2:![0-9]+]] // CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12 // CHECK-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 // CHECK-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 -// CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP1]], 1 -// CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 -// CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 0 -// CHECK-NEXT: tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef [[TMP4]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// CHECK-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0 +// CHECK-NEXT: tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef [[TMP2]]) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret void // void test1(struct bucket *foo) { @@ -46,22 +46,22 @@ void test1(struct bucket *foo) { // CHECK-LABEL: define dso_local void @test2( // CHECK-SAME: ptr noundef [[FOO:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 16 // CHECK-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 12 // CHECK-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 -// CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 
-// CHECK-NEXT: tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef [[TMP3]]) #[[ATTR2]] +// CHECK-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// CHECK-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0 +// CHECK-NEXT: tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef [[TMP1]]) #[[ATTR2]] // CHECK-NEXT: ret void // void test2(struct bucket2 *foo) { init(foo->growable.array); } //. -// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8} +// CHECK: [[_ZTS8VARIABLEPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8} // CHECK: [[META3]] = !{!"bucket", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META4]], i64 16} // CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index cb23efdb8f263..9675fe21be366 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -DCOUNTED_BY -O2 -Wall -fsanitize=array-bounds,object-size,local-bounds -fstrict-flex-arrays=3 -emit-llvm -o - %s | FileCheck --check-prefix=SANITIZE-WITH-ATTR %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -DCOUNTED_BY -O2 -Wall -fstrict-flex-arrays=3 -emit-llvm -o - %s | FileCheck --check-prefix=NO-SANITIZE-WITH-ATTR %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -fsanitize=array-bounds,object-size,local-bounds -fstrict-flex-arrays=3 -emit-llvm -o - %s | FileCheck 
--check-prefix=SANITIZE-WITHOUT-ATTR %s @@ -60,47 +60,47 @@ struct anon_struct { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8:[0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret void // // 
NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef writeonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef writeonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // 
NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test1(struct annotated *p, int index, int val) { @@ -109,49 +109,49 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT6:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont6: +// SANITIZE-WITH-ATTR: [[CONT6]]: // 
SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP2]], 2 -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP0]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] 
= getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test2(struct annotated *p, size_t index) { @@ -160,7 +160,7 @@ void test2(struct annotated *p, size_t index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -171,7 +171,7 @@ void test2(struct annotated *p, size_t index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) 
[[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -182,12 +182,12 @@ void test2(struct annotated *p, size_t index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test2_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test2_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test2_bdos(struct annotated *p) { @@ -196,7 +196,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos_cast( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -207,7 +207,7 @@ size_t test2_bdos(struct annotated *p) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos_cast( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr 
#[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -218,12 +218,12 @@ size_t test2_bdos(struct annotated *p) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test2_bdos_cast( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test2_bdos_cast( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test2_bdos_cast(struct annotated *p) { @@ -232,43 +232,43 @@ size_t test2_bdos_cast(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: 
handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test3(struct annotated *p, size_t index) { @@ -279,22 +279,22 @@ void test3(struct annotated *p, size_t index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test3_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // 
SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test3_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test3_bdos(struct annotated *p) { @@ -303,22 +303,22 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos_cast( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos_cast( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test3_bdos_cast( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test3_bdos_cast( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test3_bdos_cast(struct annotated *p) { @@ -327,68 +327,68 @@ size_t test3_bdos_cast(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// 
SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 2 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 3) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR: [[CONT1]]: // SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[DOTCOUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT12:%.*]], label [[HANDLER_OUT_OF_BOUNDS8:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds8: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT12:.*]], label %[[HANDLER_OUT_OF_BOUNDS8:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS8]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize 
[[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont12: +// SANITIZE-WITH-ATTR: [[CONT12]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[DOTCOUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 244 // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = and i32 [[RESULT]], 252 // SANITIZE-WITH-ATTR-NEXT: [[CONV2:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 0 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV2]], ptr [[ARRAYIDX10]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV2]], ptr [[ARRAYIDX10]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT81:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 3 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT81]], label [[HANDLER_OUT_OF_BOUNDS18:%.*]], label [[CONT19:%.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds18: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT81]], label %[[HANDLER_OUT_OF_BOUNDS18:.*]], label %[[CONT19:.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS18]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 4) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont19: +// SANITIZE-WITH-ATTR: [[CONT19]]: // SANITIZE-WITH-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = icmp ult i64 [[IDXPROM31]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP5]], label [[CONT38:%.*]], label [[HANDLER_OUT_OF_BOUNDS34:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds34: +// 
SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP5]], label %[[CONT38:.*]], label %[[HANDLER_OUT_OF_BOUNDS34:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS34]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM31]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont38: +// SANITIZE-WITH-ATTR: [[CONT38]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[DOTCOUNTED_BY_LOAD]], 3 // SANITIZE-WITH-ATTR-NEXT: [[RESULT25:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 240 // SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = and i32 [[RESULT25]], 252 // SANITIZE-WITH-ATTR-NEXT: [[CONV27:%.*]] = select i1 [[TMP6]], i32 [[TMP7]], i32 0 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM31]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV27]], ptr [[ARRAYIDX36]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV27]], ptr [[ARRAYIDX36]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM42:%.*]] = sext i32 [[FAM_IDX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD44:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD44]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM42]], [[TMP8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS45:%.*]], label [[CONT46:%.*]], !prof [[PROF8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds45: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS45:.*]], label %[[CONT46:.*]], !prof [[PROF8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS45]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr 
nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM42]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont46: +// SANITIZE-WITH-ATTR: [[CONT46]]: // SANITIZE-WITH-ATTR-NEXT: [[ADD59:%.*]] = add nsw i32 [[INDEX]], 2 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM60:%.*]] = sext i32 [[ADD59]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = icmp ult i64 [[IDXPROM60]], [[TMP8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP9]], label [[CONT67:%.*]], label [[HANDLER_OUT_OF_BOUNDS63:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds63: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP9]], label %[[CONT67:.*]], label %[[HANDLER_OUT_OF_BOUNDS63:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS63]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM60]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont67: +// SANITIZE-WITH-ATTR: [[CONT67]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM60]] // SANITIZE-WITH-ATTR-NEXT: [[COUNT50:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD44]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP10:%.*]] = sub nsw i64 [[COUNT50]], [[IDXPROM42]] @@ -396,12 +396,12 @@ size_t test3_bdos_cast(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc nuw nsw i64 [[TMP11]] to i32 // SANITIZE-WITH-ATTR-NEXT: [[CONV54:%.*]] = shl i32 [[DOTTR]], 2 // SANITIZE-WITH-ATTR-NEXT: [[CONV55:%.*]] = and i32 [[CONV54]], 252 -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV55]], ptr [[ARRAYIDX65]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV55]], ptr [[ARRAYIDX65]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void 
@test4( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -412,7 +412,7 @@ size_t test3_bdos_cast(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV1:%.*]] = select i1 [[TMP0]], i32 [[TMP1]], i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV1]], ptr [[ARRAYIDX3]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV1]], ptr [[ARRAYIDX3]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD7:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE9:%.*]] = shl i32 [[COUNTED_BY_LOAD7]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT10:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE9]], 240 @@ -420,7 +420,7 @@ size_t test3_bdos_cast(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i32 [[RESULT10]], 252 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV12:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX15:%.*]] = getelementptr i8, ptr [[ARRAYIDX3]], i64 4 -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV12]], ptr [[ARRAYIDX15]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV12]], ptr [[ARRAYIDX15]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM17:%.*]] = sext i32 [[FAM_IDX]] to i64 // 
NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD20:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT21:%.*]] = sext i32 [[COUNTED_BY_LOAD20]] to i64 @@ -433,33 +433,33 @@ size_t test3_bdos_cast(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 252 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV26:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX30:%.*]] = getelementptr i8, ptr [[ARRAYIDX3]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV26]], ptr [[ARRAYIDX30]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV26]], ptr [[ARRAYIDX30]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test4( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX5]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX5]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr i8, ptr [[ARRAYIDX5]], i64 4 -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX18]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX18]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX33:%.*]] = getelementptr i8, ptr [[ARRAYIDX5]], i64 8 -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX33]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: 
store i32 255, ptr [[ARRAYIDX33]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test4( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX3]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX3]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX10:%.*]] = getelementptr i8, ptr [[ARRAYIDX3]], i64 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX10]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX10]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX19:%.*]] = getelementptr i8, ptr [[ARRAYIDX3]], i64 8 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX19]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX19]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test4(struct annotated *p, int index, int fam_idx) { @@ -471,17 +471,17 @@ void test4(struct annotated *p, int index, int fam_idx) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934589) i64 @test4_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] 
to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT1:%.*]], !prof [[PROF8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT1:.*]], !prof [[PROF8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR: [[CONT1]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0) @@ -490,7 +490,7 @@ void test4(struct annotated *p, int index, int fam_idx) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load 
i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -505,12 +505,12 @@ void test4(struct annotated *p, int index, int fam_idx) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test4_bdos(struct annotated *p, int index) { @@ -519,7 +519,7 @@ size_t test4_bdos(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -12884901886, 12884901885) i64 @test4_bdos_cast1( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -535,7 +535,7 @@ size_t test4_bdos(struct annotated *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -12884901886, 12884901885) i64 @test4_bdos_cast1( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = 
getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -551,12 +551,12 @@ size_t test4_bdos(struct annotated *p, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos_cast1( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos_cast1( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test4_bdos_cast1(struct annotated *p, int index) { @@ -565,7 +565,7 @@ size_t test4_bdos_cast1(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -10737418239, 10737418237) i64 @test4_bdos_cast2( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -580,7 +580,7 @@ size_t test4_bdos_cast1(struct annotated *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -10737418239, 10737418237) i64 @test4_bdos_cast2( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // 
NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -595,12 +595,12 @@ size_t test4_bdos_cast1(struct annotated *p, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos_cast2( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos_cast2( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test4_bdos_cast2(struct annotated *p, int index) { @@ -609,46 +609,46 @@ size_t test4_bdos_cast2(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOTCOUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// 
SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test5( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr 
[[P]], i64 16 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test5( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test5(struct anon_struct *p, int index) { @@ -657,22 +657,22 @@ void test5(struct anon_struct *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test5_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr 
noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test5_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test5_bdos(struct anon_struct *p) { @@ -681,27 +681,27 @@ size_t test5_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT6:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont6: +// SANITIZE-WITH-ATTR: [[CONT6]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nuw i64 [[COUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0) // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i64 [[TMP2]] to i32 -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 @@ -710,25 +710,25 @@ size_t test5_bdos(struct anon_struct *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test6( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: 
[[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test6( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test6(struct anon_struct *p, int index) { @@ -737,7 +737,7 @@ void test6(struct anon_struct *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test6_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nuw i64 [[COUNTED_BY_LOAD]], 2 @@ -746,7 +746,7 
@@ void test6(struct anon_struct *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test6_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nuw i64 [[COUNTED_BY_LOAD]], 2 @@ -755,12 +755,12 @@ void test6(struct anon_struct *p, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test6_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test6_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test6_bdos(struct anon_struct *p) { @@ -769,47 +769,47 @@ size_t test6_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test7( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i8 [[DOTCOUNTED_BY_LOAD]] to 
i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT7:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont7: +// SANITIZE-WITH-ATTR: [[CONT7]]: // SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[INTS]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA9:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA9:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test7( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[INTS]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: ret 
void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test7( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[INTS]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test7( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[INTS]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test7(struct union_of_fams *p, int index) { @@ -818,22 +818,22 @@ void test7(struct union_of_fams *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // 
NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test7_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test7_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test7_bdos(struct union_of_fams *p) { @@ -842,49 +842,49 @@ size_t test7_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test8( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT14:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT14:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof 
[[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont14: +// SANITIZE-WITH-ATTR: [[CONT14]]: // SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[INTS]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA9]] +// SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA9]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test8( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[INTS]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test8( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// 
SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[INTS]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test8( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[INTS]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test8(struct union_of_fams *p, int index) { @@ -893,7 +893,7 @@ void test8(struct union_of_fams *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64 @@ -901,7 
+901,7 @@ void test8(struct union_of_fams *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64 @@ -909,12 +909,12 @@ void test8(struct union_of_fams *p, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test8_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test8_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test8_bdos(struct union_of_fams *p) { @@ -923,47 +923,47 @@ size_t test8_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test9( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // 
SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT7:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont7: +// SANITIZE-WITH-ATTR: [[CONT7]]: // SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[BYTES]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA9]] +// SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA9]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test9( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local 
void @test9( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test9( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test9(struct union_of_fams *p, int index) { @@ -972,22 +972,22 @@ void test9(struct union_of_fams *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test9_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test9_bdos( // NO-SANITIZE-WITH-ATTR-SAME: 
ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test9_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test9_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test9_bdos(struct union_of_fams *p) { @@ -996,27 +996,27 @@ size_t test9_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test10( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT14:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT14:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // 
SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB19:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont14: +// SANITIZE-WITH-ATTR: [[CONT14]]: // SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[BYTES]], i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i32 [[NARROW]] to i8 -// SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA9]] +// SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA9]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test10( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1024,25 +1024,25 @@ size_t test9_bdos(struct union_of_fams *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i32 [[NARROW]] to i8 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // NO-SANITIZE-WITH-ATTR-NEXT: 
ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test10( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test10( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test10(struct union_of_fams *p, int index) { @@ -1051,7 +1051,7 @@ void test10(struct union_of_fams *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr 
inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) @@ -1060,7 +1060,7 @@ void test10(struct union_of_fams *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) @@ -1069,12 +1069,12 @@ void test10(struct union_of_fams *p, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test10_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test10_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test10_bdos(struct union_of_fams *p) { @@ -1083,29 +1083,29 @@ size_t test10_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test11( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr 
[[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT6:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont6: +// SANITIZE-WITH-ATTR: [[CONT6]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -3 // SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 8 // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[TMP2]], i32 [[RESULT]], i32 0 -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test11( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) 
local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2 @@ -1115,25 +1115,25 @@ size_t test10_bdos(struct union_of_fams *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test11( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test11( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// 
NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test11(struct annotated *p, int index) { @@ -1142,7 +1142,7 @@ void test11(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934584, 8589934597) i64 @test11_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[COUNT1:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -1154,7 +1154,7 @@ void test11(struct annotated *p, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934584, 8589934597) i64 @test11_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT1:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -1166,12 +1166,12 @@ void test11(struct annotated *p, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local 
i64 @test11_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test11_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test11_bdos(struct annotated *p) { @@ -1195,87 +1195,87 @@ int test12_a, test12_b; // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12( // SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4 // SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[BAZ]]) #[[ATTR9:[0-9]+]] // SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT10:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 6 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB22:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: 
unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont: +// SANITIZE-WITH-ATTR: [[CONT]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[BAZ]], i64 [[TMP1]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP2]], ptr @test12_b, align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP2]], ptr @test12_b, align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr @test12_foo, align 4 // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 0 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS4:%.*]], label [[HANDLER_TYPE_MISMATCH6:%.*]], !prof [[PROF8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds4: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS4:.*]], label %[[HANDLER_TYPE_MISMATCH6:.*]], !prof [[PROF8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS4]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 0) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.type_mismatch6: +// SANITIZE-WITH-ATTR: [[HANDLER_TYPE_MISMATCH6]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: 
entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[BAZ]]) #[[ATTR12:[0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT7:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[BAZ]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP0]], ptr @test12_b, align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4), align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP1]], ptr @test12_a, align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: br label [[FOR_COND:%.*]] -// NO-SANITIZE-WITH-ATTR: for.cond: -// NO-SANITIZE-WITH-ATTR-NEXT: br label [[FOR_COND]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP0]], ptr @test12_b, align 4, !tbaa [[INT_TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4), align 4, !tbaa [[INT_TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP1]], ptr @test12_a, align 4, !tbaa [[INT_TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: br label %[[FOR_COND:.*]] +// NO-SANITIZE-WITH-ATTR: [[FOR_COND]]: +// NO-SANITIZE-WITH-ATTR-NEXT: br label %[[FOR_COND]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i32 @test12( // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr 
#[[ATTR3:[0-9]+]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4 // SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[BAZ]]) #[[ATTR7:[0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT7:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 6 // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META9:![0-9]+]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META9:![0-9]+]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[TMP1]]) #[[ATTR8:[0-9]+]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont: +// SANITIZE-WITHOUT-ATTR: [[CONT]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[BAZ]], i64 [[TMP1]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[TMP2]], ptr @test12_b, align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[TMP2]], ptr @test12_b, align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr @test12_foo, align 4 // 
SANITIZE-WITHOUT-ATTR-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 0 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS4:%.*]], label [[HANDLER_TYPE_MISMATCH6:%.*]], !prof [[PROF10:![0-9]+]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds4: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS4:.*]], label %[[HANDLER_TYPE_MISMATCH6:.*]], !prof [[PROF10:![0-9]+]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS4]]: // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 0) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.type_mismatch6: +// SANITIZE-WITHOUT-ATTR: [[HANDLER_TYPE_MISMATCH6]]: // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i32 @test12( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4 // NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[BAZ]]) #[[ATTR10:[0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT7:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i32, ptr [[BAZ]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[TMP0]], ptr @test12_b, align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4), align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[TMP1]], ptr @test12_a, align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: br label [[FOR_COND:%.*]] -// NO-SANITIZE-WITHOUT-ATTR: for.cond: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: br label [[FOR_COND]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[TMP0]], ptr @test12_b, align 4, !tbaa [[INT_TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4), align 4, !tbaa [[INT_TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[TMP1]], ptr @test12_a, align 4, !tbaa [[INT_TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: br label %[[FOR_COND:.*]] +// NO-SANITIZE-WITHOUT-ATTR: [[FOR_COND]]: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: br label %[[FOR_COND]] // int test12(int index) { struct hang baz = test12_bar; @@ -1298,56 +1298,56 @@ struct test13_bar { // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test13( // SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA11:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[_ZTS10TEST13_BARPTR_TBAA11:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = 
load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont5: +// SANITIZE-WITH-ATTR: [[CONT5]]: // SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[REVMAP]], i64 [[INDEX]] -// SANITIZE-WITH-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA15:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS10TEST13_FOOPTR_TBAA15:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret i32 0 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test13( // NO-SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA8:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[_ZTS10TEST13_BARPTR_TBAA8:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[REVMAP]], i64 [[INDEX]] -// NO-SANITIZE-WITH-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA12:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS10TEST13_FOOPTR_TBAA12:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 0 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i32 @test13( // SANITIZE-WITHOUT-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA11:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[_ZTS10TEST13_BARPTR_TBAA11:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 // SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF8]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont5: +// SANITIZE-WITHOUT-ATTR: [[CONT5]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr 
inbounds nuw i8, ptr [[TMP0]], i64 16 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[REVMAP]], i64 [[INDEX]] -// SANITIZE-WITHOUT-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA15:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS10TEST13_FOOPTR_TBAA15:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 0 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i32 @test13( // NO-SANITIZE-WITHOUT-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA8:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[_ZTS10TEST13_BARPTR_TBAA8:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[REVMAP]], i64 [[INDEX]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA12:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS10TEST13_FOOPTR_TBAA12:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 0 // int test13(long index) { @@ -1362,52 +1362,52 @@ struct test14_foo { // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test14( // SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IDX]], 0 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label 
%[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: ret i32 undef // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test14( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca [[STRUCT_TEST14_FOO:%.*]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[DOTCOMPOUNDLITERAL]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[DOTCOMPOUNDLITERAL]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTCOMPOUNDLITERAL]], i64 4 -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 2, ptr [[Y]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 2, ptr [[Y]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BLAH:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTCOMPOUNDLITERAL]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[BLAH]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test14( // SANITIZE-WITHOUT-ATTR-SAME: 
i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IDX]], 0 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF8]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont3: +// SANITIZE-WITHOUT-ATTR: [[CONT3]]: // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 undef // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test14( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca [[STRUCT_TEST14_FOO:%.*]], align 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 1, ptr [[DOTCOMPOUNDLITERAL]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 1, ptr [[DOTCOMPOUNDLITERAL]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTCOMPOUNDLITERAL]], i64 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 2, ptr [[Y]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 2, ptr [[Y]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BLAH:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTCOMPOUNDLITERAL]], i64 8 // 
NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[BLAH]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // int test14(int idx) { @@ -1416,42 +1416,42 @@ int test14(int idx) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test15( // SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IDX]], 0 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB31:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR: [[CONT1]]: // SANITIZE-WITH-ATTR-NEXT: ret i32 undef // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test15( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr getelementptr 
inbounds nuw (i8, ptr @__const.test15.foo, i64 8), i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test15( // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IDX]], 0 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF8]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont1: +// SANITIZE-WITHOUT-ATTR: [[CONT1]]: // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 undef // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test15( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @__const.test15.foo, i64 8), i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // int test15(int idx) { @@ -1465,30 +1465,30 @@ int test15(int idx) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test19( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 680 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 1 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 2) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR: [[CONT1]]: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test19( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test19( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: 
[[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test19( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test19(struct annotated *p) { @@ -1498,22 +1498,22 @@ size_t test19(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test20( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test20( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test20( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test20( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test20(struct annotated *p) { @@ -1523,22 +1523,22 @@ size_t test20(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test21( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: 
entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test21( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test21( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test21( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test21(struct annotated *p) { @@ -1548,22 +1548,22 @@ size_t test21(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test22( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test22( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test22( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: 
[[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test22( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test22(struct annotated *p) { @@ -1573,22 +1573,22 @@ size_t test22(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test23( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test23( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test23( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test23( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test23(struct annotated *p) { @@ -1603,38 +1603,38 @@ struct tests_foo { // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test24( // SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[VAR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: 
entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[VAR]], i64 40 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT4:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB33:[0-9]+]], i64 10) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont4: +// SANITIZE-WITH-ATTR: [[CONT4]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[VAR]], i64 84 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test24( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAR]], i64 84 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]] // // 
SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test24( // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[VAR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAR]], i64 84 -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test24( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAR]], i64 84 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // int test24(int c, struct tests_foo *var) { @@ -1644,41 +1644,41 @@ int test24(int c, struct tests_foo *var) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test25( // SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[VAR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA17:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[_ZTS9TESTS_FOOPTR_TBAA17:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp 
ugt i32 [[DOTCOUNTED_BY_LOAD]], 10 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 10) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont5: +// SANITIZE-WITH-ATTR: [[CONT5]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 44 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test25( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA14:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[_ZTS9TESTS_FOOPTR_TBAA14:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 44 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test25( // 
SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[VAR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA17:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[_ZTS9TESTS_FOOPTR_TBAA17:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 44 -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP1]] // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test25( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA14:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[_ZTS9TESTS_FOOPTR_TBAA14:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 44 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP1]] // int test25(int c, struct tests_foo **var) { @@ -1694,47 +1694,47 @@ struct test26_foo { // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test26( // SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[FOO:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: 
[[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[S:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[C]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[S]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB35:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont5: +// SANITIZE-WITH-ATTR: [[CONT5]]: // SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARR]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test26( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly captures(none) [[FOO:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: 
[[IDXPROM:%.*]] = sext i32 [[C]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test26( // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[FOO:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[C]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test26( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly captures(none) [[FOO:%.*]]) local_unnamed_addr #[[ATTR6]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[C]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // 
NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // int test26(int c, struct test26_foo *foo) { @@ -1765,53 +1765,53 @@ struct test27_foo { // SANITIZE-WITH-ATTR-LABEL: define dso_local ptr @test27( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 24 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[ENTRIES]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA19:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS10TEST27_BARPTR_TBAA19:![0-9]+]] 
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[J]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP2]], i64 [[IDXPROM4]] // SANITIZE-WITH-ATTR-NEXT: ret ptr [[ARRAYIDX5]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local ptr @test27( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 24 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[ENTRIES]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA16:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS10TEST27_BARPTR_TBAA16:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[J]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP0]], i64 [[IDXPROM1]] // NO-SANITIZE-WITH-ATTR-NEXT: ret ptr [[ARRAYIDX2]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local ptr @test27( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 24 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[ENTRIES]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA19:![0-9]+]] 
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS10TEST27_BARPTR_TBAA19:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM3:%.*]] = sext i32 [[J]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP0]], i64 [[IDXPROM3]] // SANITIZE-WITHOUT-ATTR-NEXT: ret ptr [[ARRAYIDX4]] // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local ptr @test27( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr #[[ATTR6]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 24 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[ENTRIES]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA16:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS10TEST27_BARPTR_TBAA16:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[J]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP0]], i64 [[IDXPROM1]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret ptr [[ARRAYIDX2]] @@ -1828,59 +1828,59 @@ struct test28_foo { // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test28( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA21:![0-9]+]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA21]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr 
[[TMP1]], align 8, !tbaa [[TBAA21]] +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA21:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA21]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA21]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT17:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label %[[CONT17:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont17: +// SANITIZE-WITH-ATTR: [[CONT17]]: // SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARR]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // 
SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP5]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test28( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR8]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA18:![0-9]+]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA18]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA18]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA18:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA18]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA18]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP3]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test28( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA21:![0-9]+]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA21]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, 
!tbaa [[TBAA21]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA21:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA21]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA21]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP3]] // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test28( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR7]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA18:![0-9]+]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA18]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA18]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA18:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA18]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[_ZTS10TEST28_FOOPTR_TBAA18]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr 
[[TMP2]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP3]] // int test28(struct test28_foo *p, int i) { @@ -1896,39 +1896,39 @@ struct annotated_struct_array { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test29( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX1]], 10 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB41:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[ANN]], i64 [[TMP1]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA23:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS9ANNOTATEDPTR_TBAA23:![0-9]+]] // 
SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM27:%.*]] = sext i32 [[IDX2]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM27]], [[TMP3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT32:%.*]], label [[HANDLER_OUT_OF_BOUNDS28:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds28: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label %[[CONT32:.*]], label %[[HANDLER_OUT_OF_BOUNDS28:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS28]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB42:[0-9]+]], i64 [[IDXPROM27]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont32: +// SANITIZE-WITH-ATTR: [[CONT32]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM27]] // SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP5]], 2 -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX30]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX30]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test29( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr 
#[[ATTR9:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[ANN]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA20:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS9ANNOTATEDPTR_TBAA20:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -1936,37 +1936,37 @@ struct annotated_struct_array { // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP1]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM8:%.*]] = sext i32 [[IDX2]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM8]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX9]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX9]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test29( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX1]], 10 // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT21:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: 
handler.out_of_bounds: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT21:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF8]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont21: +// SANITIZE-WITHOUT-ATTR: [[CONT21]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[ANN]], i64 [[TMP1]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA23:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS9ANNOTATEDPTR_TBAA23:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM18:%.*]] = sext i32 [[IDX2]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM18]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX19]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX19]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test29( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[ANN]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa 
[[TBAA20:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS9ANNOTATEDPTR_TBAA20:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM5:%.*]] = sext i32 [[IDX2]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IDXPROM5]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX6]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test29(struct annotated_struct_array *ann, int idx1, int idx2) { @@ -1986,34 +1986,34 @@ struct test30_struct { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB44:[0-9]+]], i64 [[TMP0]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[PCPU_REFCNT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[PCPU_REFCNT]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] 
+// NO-SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test30( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[TMP0]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test30( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[PCPU_REFCNT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[PCPU_REFCNT]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA6]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test30(struct test30_struct *ptr, int idx) { @@ -2030,22 +2030,22 @@ struct test31_struct { // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i32 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31( // 
NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test31( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test31( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 -1 // int test31(struct test31_struct *ptr, int idx) { @@ -2060,24 +2060,24 @@ struct annotated_with_array { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test32( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX2]], 43 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX2]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB46:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize 
[[META2]] -// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR: [[CONT1]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX1]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = icmp ult i64 [[IDXPROM4]], [[TMP2]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP3]], label [[CONT9:%.*]], label [[HANDLER_OUT_OF_BOUNDS5:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds5: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP3]], label %[[CONT9:.*]], label %[[HANDLER_OUT_OF_BOUNDS5:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS5]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB48:[0-9]+]], i64 [[IDXPROM4]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont9: +// SANITIZE-WITH-ATTR: [[CONT9]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i64, ptr [[ARRAY]], i64 [[IDXPROM4]] // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -2087,12 +2087,12 @@ struct annotated_with_array { // SANITIZE-WITH-ATTR-NEXT: [[REASS_SUB:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[FIELD_OFFSET]] // SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.smax.i64(i64 [[REASS_SUB]], i64 -344) // SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], 344 -// SANITIZE-WITH-ATTR-NEXT: store i64 [[TMP6]], ptr [[ARRAYIDX7]], align 8, !tbaa [[TBAA25:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: store i64 
[[TMP6]], ptr [[ARRAYIDX7]], align 8, !tbaa [[LONG_TBAA25:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test32( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX2]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -2107,32 +2107,32 @@ struct annotated_with_array { // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[IDX1]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[ARRAY]], i64 [[IDXPROM1]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i64 [[TMP4]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA22:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i64 [[TMP4]], ptr [[ARRAYIDX2]], align 8, !tbaa [[LONG_TBAA22:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test32( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX2]], 43 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT7:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF8]], !nosanitize [[META9]] +// 
SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX2]] to i64, !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB17:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont7: +// SANITIZE-WITHOUT-ATTR: [[CONT7]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX1]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, ptr [[ARRAY]], i64 [[IDXPROM4]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i64 -1, ptr [[ARRAYIDX5]], align 8, !tbaa [[TBAA25:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i64 -1, ptr [[ARRAYIDX5]], align 8, !tbaa [[LONG_TBAA25:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test32( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[IDX1]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[ARRAY]], i64 [[IDXPROM1]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i64 -1, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA22:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i64 -1, ptr [[ARRAYIDX2]], align 8, !tbaa [[LONG_TBAA22:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test32(struct annotated_with_array *ptr, int idx1, int idx2) { @@ -2141,14 +2141,14 @@ void test32(struct annotated_with_array *ptr, int idx1, int idx2) { // 
SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 17179869521) i64 @test32_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 43 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB49:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR: [[CONT1]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -2162,7 +2162,7 @@ void test32(struct annotated_with_array *ptr, int idx1, int idx2) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -34359738016, 34359738705) i64 @test32_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336 // 
NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -2178,19 +2178,19 @@ void test32(struct annotated_with_array *ptr, int idx1, int idx2) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test32_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 43 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF8]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont1: +// SANITIZE-WITHOUT-ATTR: [[CONT1]]: // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test32_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test32_bdos(struct annotated_with_array *ptr, int index) { @@ -2199,7 +2199,7 @@ size_t test32_bdos(struct annotated_with_array *ptr, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -21474836134, 21474836817) i64 @test32_bdos_cast( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 
noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -2216,7 +2216,7 @@ size_t test32_bdos(struct annotated_with_array *ptr, int index) { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -21474836134, 21474836817) i64 @test32_bdos_cast( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 @@ -2233,12 +2233,12 @@ size_t test32_bdos(struct annotated_with_array *ptr, int index) { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test32_bdos_cast( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test32_bdos_cast( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test32_bdos_cast(struct annotated_with_array *ptr, int index) { @@ -2247,22 +2247,22 @@ size_t test32_bdos_cast(struct 
annotated_with_array *ptr, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test33( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test33( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[PTR:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test33( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test33( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test33(struct annotated *ptr) { @@ -2278,50 +2278,50 @@ struct multi_subscripts { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test34( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX1]], 42 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // 
SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB51:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR: [[CONT1]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i32 [[IDX2]], 43 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS2:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds2: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS2:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS2]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[IDX2]] to i64 // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB52:[0-9]+]], i64 [[TMP3]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test34( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test34( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX1]], 42 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], 
label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF8]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64, !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont1: +// SANITIZE-WITHOUT-ATTR: [[CONT1]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i32 [[IDX2]], 43 -// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP2]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS2:%.*]], !prof [[PROF8]], !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds2: +// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS2:.*]], !prof [[PROF8]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR: [[HANDLER_OUT_OF_BOUNDS2]]: // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[IDX2]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 [[TMP3]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR: cont3: +// SANITIZE-WITHOUT-ATTR: [[CONT3]]: // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test34( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test34(struct multi_subscripts 
*ptr, int idx1, int idx2) { @@ -2330,43 +2330,43 @@ size_t test34(struct multi_subscripts *ptr, int idx1, int idx2) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test35( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB53:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// SANITIZE-WITH-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test35( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef writeonly captures(none) 
[[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test35( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// SANITIZE-WITHOUT-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test35( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef writeonly captures(none) [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // 
NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test35(struct annotated *p, size_t index) { @@ -2375,22 +2375,22 @@ void test35(struct annotated *p, size_t index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test35_bdos( // SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 0 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test35_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 0 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test35_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 0 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test35_bdos( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 0 // size_t test35_bdos(struct annotated *p) { @@ -2412,22 +2412,22 @@ struct { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test36( // SANITIZE-WITH-ATTR-SAME: ) local_unnamed_addr #[[ATTR6:[0-9]+]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test36( // NO-SANITIZE-WITH-ATTR-SAME: ) local_unnamed_addr #[[ATTR10:[0-9]+]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: ret 
i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test36( // SANITIZE-WITHOUT-ATTR-SAME: ) local_unnamed_addr #[[ATTR6:[0-9]+]] { -// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test36( // NO-SANITIZE-WITHOUT-ATTR-SAME: ) local_unnamed_addr #[[ATTR9:[0-9]+]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test36() { @@ -2436,7 +2436,7 @@ size_t test36() { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test37( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -2447,7 +2447,7 @@ size_t test36() { // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test37( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[PTR:%.*]]) local_unnamed_addr #[[ATTR2]] { -// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 @@ -2458,14 +2458,113 @@ size_t test36() { // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test37( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// 
SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test37( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]]) local_unnamed_addr #[[ATTR1]] { -// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test37(struct annotated *ptr) { return __builtin_dynamic_object_size((1, 2, (4, 5, (7, 8, 9, (10, ptr->array)))), 1); } +//. +// SANITIZE-WITH-ATTR: [[META2]] = !{} +// SANITIZE-WITH-ATTR: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} +// SANITIZE-WITH-ATTR: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// SANITIZE-WITH-ATTR: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// SANITIZE-WITH-ATTR: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// SANITIZE-WITH-ATTR: [[META7]] = !{!"Simple C/C++ TBAA"} +// SANITIZE-WITH-ATTR: [[PROF8]] = !{!"branch_weights", i32 1, i32 1048575} +// SANITIZE-WITH-ATTR: [[CHAR_TBAA9]] = !{[[META6]], [[META6]], i64 0} +// SANITIZE-WITH-ATTR: [[TBAA_STRUCT10]] = !{i64 0, i64 24, [[CHAR_TBAA9]]} +// SANITIZE-WITH-ATTR: [[_ZTS10TEST13_BARPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META13:![0-9]+]], i64 0} +// SANITIZE-WITH-ATTR: [[META12]] = !{!"test13_foo", [[META13]], i64 0} +// SANITIZE-WITH-ATTR: [[META13]] = !{!"p1 _ZTS10test13_bar", [[META14:![0-9]+]], i64 0} +// SANITIZE-WITH-ATTR: [[META14]] = !{!"any pointer", [[META6]], i64 0} +// SANITIZE-WITH-ATTR: [[_ZTS10TEST13_FOOPTR_TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0} +// SANITIZE-WITH-ATTR: [[META16]] = !{!"p1 _ZTS10test13_foo", [[META14]], i64 0} +// SANITIZE-WITH-ATTR: [[_ZTS9TESTS_FOOPTR_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// SANITIZE-WITH-ATTR: [[META18]] = !{!"p1 _ZTS9tests_foo", [[META14]], i64 0} +// SANITIZE-WITH-ATTR: [[_ZTS10TEST27_BARPTR_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} +// 
SANITIZE-WITH-ATTR: [[META20]] = !{!"p1 _ZTS10test27_bar", [[META14]], i64 0} +// SANITIZE-WITH-ATTR: [[_ZTS10TEST28_FOOPTR_TBAA21]] = !{[[META22:![0-9]+]], [[META22]], i64 0} +// SANITIZE-WITH-ATTR: [[META22]] = !{!"p1 _ZTS10test28_foo", [[META14]], i64 0} +// SANITIZE-WITH-ATTR: [[_ZTS9ANNOTATEDPTR_TBAA23]] = !{[[META24:![0-9]+]], [[META24]], i64 0} +// SANITIZE-WITH-ATTR: [[META24]] = !{!"p1 _ZTS9annotated", [[META14]], i64 0} +// SANITIZE-WITH-ATTR: [[LONG_TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0} +// SANITIZE-WITH-ATTR: [[META26]] = !{!"long", [[META6]], i64 0} +//. +// NO-SANITIZE-WITH-ATTR: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META5]] = !{!"Simple C/C++ TBAA"} +// NO-SANITIZE-WITH-ATTR: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[TBAA_STRUCT7]] = !{i64 0, i64 24, [[CHAR_TBAA6]]} +// NO-SANITIZE-WITH-ATTR: [[_ZTS10TEST13_BARPTR_TBAA8]] = !{[[META9:![0-9]+]], [[META10:![0-9]+]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META9]] = !{!"test13_foo", [[META10]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META10]] = !{!"p1 _ZTS10test13_bar", [[META11:![0-9]+]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META11]] = !{!"any pointer", [[META4]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[_ZTS10TEST13_FOOPTR_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META13]] = !{!"p1 _ZTS10test13_foo", [[META11]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[_ZTS9TESTS_FOOPTR_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META15]] = !{!"p1 _ZTS9tests_foo", [[META11]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[_ZTS10TEST27_BARPTR_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META17]] = !{!"p1 _ZTS10test27_bar", [[META11]], i64 0} +// NO-SANITIZE-WITH-ATTR: 
[[_ZTS10TEST28_FOOPTR_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META19]] = !{!"p1 _ZTS10test28_foo", [[META11]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[_ZTS9ANNOTATEDPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META21]] = !{!"p1 _ZTS9annotated", [[META11]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[LONG_TBAA22]] = !{[[META23:![0-9]+]], [[META23]], i64 0} +// NO-SANITIZE-WITH-ATTR: [[META23]] = !{!"long", [[META4]], i64 0} +//. +// SANITIZE-WITHOUT-ATTR: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META5]] = !{!"Simple C/C++ TBAA"} +// SANITIZE-WITHOUT-ATTR: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[TBAA_STRUCT7]] = !{i64 0, i64 24, [[CHAR_TBAA6]]} +// SANITIZE-WITHOUT-ATTR: [[PROF8]] = !{!"branch_weights", i32 1048575, i32 1} +// SANITIZE-WITHOUT-ATTR: [[META9]] = !{} +// SANITIZE-WITHOUT-ATTR: [[PROF10]] = !{!"branch_weights", i32 1, i32 1048575} +// SANITIZE-WITHOUT-ATTR: [[_ZTS10TEST13_BARPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META13:![0-9]+]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META12]] = !{!"test13_foo", [[META13]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META13]] = !{!"p1 _ZTS10test13_bar", [[META14:![0-9]+]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META14]] = !{!"any pointer", [[META4]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[_ZTS10TEST13_FOOPTR_TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META16]] = !{!"p1 _ZTS10test13_foo", [[META14]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[_ZTS9TESTS_FOOPTR_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META18]] = !{!"p1 _ZTS9tests_foo", [[META14]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[_ZTS10TEST27_BARPTR_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} +// 
SANITIZE-WITHOUT-ATTR: [[META20]] = !{!"p1 _ZTS10test27_bar", [[META14]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[_ZTS10TEST28_FOOPTR_TBAA21]] = !{[[META22:![0-9]+]], [[META22]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META22]] = !{!"p1 _ZTS10test28_foo", [[META14]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[_ZTS9ANNOTATEDPTR_TBAA23]] = !{[[META24:![0-9]+]], [[META24]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META24]] = !{!"p1 _ZTS9annotated", [[META14]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[LONG_TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0} +// SANITIZE-WITHOUT-ATTR: [[META26]] = !{!"long", [[META4]], i64 0} +//. +// NO-SANITIZE-WITHOUT-ATTR: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META5]] = !{!"Simple C/C++ TBAA"} +// NO-SANITIZE-WITHOUT-ATTR: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[TBAA_STRUCT7]] = !{i64 0, i64 24, [[CHAR_TBAA6]]} +// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS10TEST13_BARPTR_TBAA8]] = !{[[META9:![0-9]+]], [[META10:![0-9]+]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META9]] = !{!"test13_foo", [[META10]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META10]] = !{!"p1 _ZTS10test13_bar", [[META11:![0-9]+]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META11]] = !{!"any pointer", [[META4]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS10TEST13_FOOPTR_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META13]] = !{!"p1 _ZTS10test13_foo", [[META11]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS9TESTS_FOOPTR_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META15]] = !{!"p1 _ZTS9tests_foo", [[META11]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS10TEST27_BARPTR_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META17]] = !{!"p1 
_ZTS10test27_bar", [[META11]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS10TEST28_FOOPTR_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META19]] = !{!"p1 _ZTS10test28_foo", [[META11]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS9ANNOTATEDPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META21]] = !{!"p1 _ZTS9annotated", [[META11]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[LONG_TBAA22]] = !{[[META23:![0-9]+]], [[META23]], i64 0} +// NO-SANITIZE-WITHOUT-ATTR: [[META23]] = !{!"long", [[META4]], i64 0} +//. diff --git a/clang/test/CodeGen/builtin-attributes.c b/clang/test/CodeGen/builtin-attributes.c index 506b165fcf36e..212ff008e12a5 100644 --- a/clang/test/CodeGen/builtin-attributes.c +++ b/clang/test/CodeGen/builtin-attributes.c @@ -57,12 +57,15 @@ long double modfl(long double x, long double*) asm("modfl"); // CHECK: ret int f3(double x) { int e; + float f; + double d; + long double ld; frexp(x, &e); frexpf(x, &e); frexpl(x, &e); - modf(x, &e); - modff(x, &e); - modfl(x, &e); + modf(x, &d); + modff(x, &f); + modfl(x, &ld); __builtin_remquo(x, x, &e); __builtin_remquof(x, x, &e); __builtin_remquol(x, x, &e); diff --git a/clang/test/CodeGen/builtin-maxnum-minnum.c b/clang/test/CodeGen/builtin-maxnum-minnum.c index 69cec72495d30..2455f3b616ce7 100644 --- a/clang/test/CodeGen/builtin-maxnum-minnum.c +++ b/clang/test/CodeGen/builtin-maxnum-minnum.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -x c++ -std=c++20 -disable-llvm-passes -O3 -triple x86_64 %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK typedef _Float16 half8 __attribute__((ext_vector_type(8))); @@ -12,10 +12,10 @@ typedef long double ldouble2 __attribute__((ext_vector_type(2))); // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = 
alloca <8 x half>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <8 x half> @llvm.minnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) // CHECK-NEXT: ret <8 x half> [[ELT_MINNUM]] // @@ -27,10 +27,10 @@ half8 pfmin16(half8 a, half8 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16 -// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> 
[[TMP0]], <8 x bfloat> [[TMP1]]) // CHECK-NEXT: ret <8 x bfloat> [[ELT_MINNUM]] // @@ -42,10 +42,10 @@ bf16x8 pfmin16b(bf16x8 a, bf16x8 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: ret <4 x float> [[ELT_MINNUM]] // @@ -57,10 +57,10 @@ float4 pfmin32(float4 a, float4 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 -// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: ret <2 x double> [[ELT_MINNUM]] // @@ -72,12 +72,12 @@ double2 pfmin64(double2 a, double2 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 -// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <2 x x86_fp80> @llvm.minnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]]) // CHECK-NEXT: ret <2 x x86_fp80> [[ELT_MINNUM]] // @@ -90,10 +90,10 @@ ldouble2 pfmin80(ldouble2 a, ldouble2 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16 -// 
CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MAXNUM:%.*]] = call <8 x half> @llvm.maxnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) // CHECK-NEXT: ret <8 x half> [[ELT_MAXNUM]] // @@ -105,10 +105,10 @@ half8 pfmax16(half8 a, half8 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16 -// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MAXNUM:%.*]] = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]]) // CHECK-NEXT: ret <8 x bfloat> [[ELT_MAXNUM]] // @@ -120,10 +120,10 @@ bf16x8 
pfmax16b(bf16x8 a, bf16x8 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MAXNUM:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: ret <4 x float> [[ELT_MAXNUM]] // @@ -135,10 +135,10 @@ float4 pfmax32(float4 a, float4 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 -// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] 
// CHECK-NEXT: [[ELT_MAXNUM:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: ret <2 x double> [[ELT_MAXNUM]] // @@ -151,12 +151,12 @@ double2 pfmax64(double2 a, double2 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 -// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <2 x x86_fp80> @llvm.minnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]]) // CHECK-NEXT: ret <2 x x86_fp80> [[ELT_MINNUM]] // @@ -165,7 +165,7 @@ ldouble2 pfmax80(ldouble2 a, ldouble2 b) { } //. -// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} // CHECK: [[META4]] = !{!"Simple C++ TBAA"} //. 
diff --git a/clang/test/CodeGen/builtin-rename.c b/clang/test/CodeGen/builtin-rename.c index 0b71d88806237..0092e54ef37d9 100644 --- a/clang/test/CodeGen/builtin-rename.c +++ b/clang/test/CodeGen/builtin-rename.c @@ -4,5 +4,5 @@ int printf(const char *, ...); int foo(void) { - return printf(printf); + return printf((const char*)printf); } diff --git a/clang/test/CodeGen/calling-conv-ignored.c b/clang/test/CodeGen/calling-conv-ignored.c index 9c47f641eaacb..5dbc7e4084c88 100644 --- a/clang/test/CodeGen/calling-conv-ignored.c +++ b/clang/test/CodeGen/calling-conv-ignored.c @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -triple i686-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=X86 // RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=X64 +// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X64 +// RUN: %clang_cc1 -triple x86_64-cygwin -emit-llvm -o - %s | FileCheck %s --check-prefix=X64 // RUN: %clang_cc1 -triple i686-windows-msvc -emit-llvm -o - %s -fdefault-calling-conv=vectorcall | FileCheck %s --check-prefix=X86-VEC // RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -o - %s -fdefault-calling-conv=vectorcall | FileCheck %s --check-prefix=X64-VEC diff --git a/clang/test/CodeGen/cfi-icall-generalize.c b/clang/test/CodeGen/cfi-icall-generalize.c index 0af17e5760cc6..5995540ba33fb 100644 --- a/clang/test/CodeGen/cfi-icall-generalize.c +++ b/clang/test/CodeGen/cfi-icall-generalize.c @@ -15,5 +15,21 @@ void g(int** (*fp)(const char *, const char **)) { fp(0, 0); } +union Union { + char *c; + long *n; +} __attribute__((transparent_union)); + +// CHECK: define{{.*}} void @uni({{.*}} !type [[TYPE2:![0-9]+]] !type [[TYPE2_GENERALIZED:![0-9]+]] +void uni(void (*fn)(union Union), union Union arg1) { + // UNGENERALIZED: call i1 @llvm.type.test(ptr {{.*}}, metadata !"_ZTSFvPcE") + // GENERALIZED: call i1 @llvm.type.test(ptr {{.*}}, metadata !"_ZTSFvPvE.generalized") + fn(arg1); +} + 
// CHECK: [[TYPE]] = !{i64 0, !"_ZTSFPPiPKcPS2_E"} // CHECK: [[TYPE_GENERALIZED]] = !{i64 0, !"_ZTSFPvPKvS_E.generalized"} + +// CHECK: [[TYPE2]] = !{i64 0, !"_ZTSFvPFv5UnionEPcE"} +// CHECK: [[TYPE2_GENERALIZED]] = !{i64 0, !"_ZTSFvPvS_E.generalized"} + diff --git a/clang/test/CodeGen/cfi-icall-normalize2.c b/clang/test/CodeGen/cfi-icall-normalize2.c index 93893065cf903..9fa6f95e523d7 100644 --- a/clang/test/CodeGen/cfi-icall-normalize2.c +++ b/clang/test/CodeGen/cfi-icall-normalize2.c @@ -24,6 +24,20 @@ void baz(void (*fn)(int, int, int), int arg1, int arg2, int arg3) { fn(arg1, arg2, arg3); } +union Union { + char *c; + long *n; +} __attribute__((transparent_union)); + +void uni(void (*fn)(union Union), union Union arg1) { + // CHECK-LABEL: define{{.*}}uni + // CHECK-SAME: {{.*}}!type ![[TYPE4:[0-9]+]] !type !{{[0-9]+}} + // CHECK: call i1 @llvm.type.test({{i8\*|ptr}} {{%f|%0}}, metadata !"_ZTSFvPu2i8E.normalized") + fn(arg1); +} + // CHECK: ![[TYPE1]] = !{i64 0, !"_ZTSFvPFvu3i32ES_E.normalized"} // CHECK: ![[TYPE2]] = !{i64 0, !"_ZTSFvPFvu3i32S_ES_S_E.normalized"} // CHECK: ![[TYPE3]] = !{i64 0, !"_ZTSFvPFvu3i32S_S_ES_S_S_E.normalized"} +// CHECK: ![[TYPE4]] = !{i64 0, !"_ZTSFvPFv5UnionEPu2i8E.normalized"} + diff --git a/clang/test/CodeGen/cleanup-destslot-simple.c b/clang/test/CodeGen/cleanup-destslot-simple.c index 8ace33254723c..23a70d4a7da25 100644 --- a/clang/test/CodeGen/cleanup-destslot-simple.c +++ b/clang/test/CodeGen/cleanup-destslot-simple.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -O1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck %s --check-prefix=CHECK-LIFETIME // We shouldn't have markers at -O0 or with msan. 
@@ -9,22 +9,24 @@ // There is no exception to handle here, lifetime.end is not a destructor, // so there is no need have cleanup dest slot related code -// CHECK-LIFETIME-LABEL: @test( -// CHECK-LIFETIME-NEXT: entry: +// CHECK-LIFETIME-LABEL: define dso_local i32 @test( +// CHECK-LIFETIME-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG5:![0-9]+]] { +// CHECK-LIFETIME-NEXT: [[ENTRY:.*:]] // CHECK-LIFETIME-NEXT: [[X:%.*]] = alloca i32, align 4 // CHECK-LIFETIME-NEXT: [[P:%.*]] = alloca ptr, align 8 // CHECK-LIFETIME-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[X]]) #[[ATTR2:[0-9]+]], !dbg [[DBG9:![0-9]+]] -// CHECK-LIFETIME-NEXT: store i32 3, ptr [[X]], align 4, !dbg [[DBG10:![0-9]+]], !tbaa [[TBAA11:![0-9]+]] +// CHECK-LIFETIME-NEXT: store i32 3, ptr [[X]], align 4, !dbg [[DBG10:![0-9]+]], !tbaa [[INT_TBAA11:![0-9]+]] // CHECK-LIFETIME-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[P]]), !dbg [[DBG15:![0-9]+]] -// CHECK-LIFETIME-NEXT: store volatile ptr [[X]], ptr [[P]], align 8, !dbg [[DBG16:![0-9]+]], !tbaa [[TBAA17:![0-9]+]] -// CHECK-LIFETIME-NEXT: [[P_0_P_0_P_0_P_0_:%.*]] = load volatile ptr, ptr [[P]], align 8, !dbg [[DBG19:![0-9]+]], !tbaa [[TBAA17]] -// CHECK-LIFETIME-NEXT: [[TMP0:%.*]] = load i32, ptr [[P_0_P_0_P_0_P_0_]], align 4, !dbg [[DBG20:![0-9]+]], !tbaa [[TBAA11]] -// CHECK-LIFETIME-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[P]]), !dbg [[DBG21:![0-9]+]] -// CHECK-LIFETIME-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG21]] -// CHECK-LIFETIME-NEXT: ret i32 [[TMP0]], !dbg [[DBG22:![0-9]+]] +// CHECK-LIFETIME-NEXT: store volatile ptr [[X]], ptr [[P]], align 8, !dbg [[DBG16:![0-9]+]], !tbaa [[INTPTR_TBAA17:![0-9]+]] +// CHECK-LIFETIME-NEXT: [[P_0_P_0_P_0_P_0_:%.*]] = load volatile ptr, ptr [[P]], align 8, !dbg [[DBG20:![0-9]+]], !tbaa [[INTPTR_TBAA17]] +// CHECK-LIFETIME-NEXT: [[TMP0:%.*]] = load i32, ptr [[P_0_P_0_P_0_P_0_]], align 4, !dbg [[DBG21:![0-9]+]], !tbaa [[INT_TBAA11]] +// 
CHECK-LIFETIME-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[P]]), !dbg [[DBG22:![0-9]+]] +// CHECK-LIFETIME-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG22]] +// CHECK-LIFETIME-NEXT: ret i32 [[TMP0]], !dbg [[DBG23:![0-9]+]] // -// CHECK-OPTNONE-LABEL: @test( -// CHECK-OPTNONE-NEXT: entry: +// CHECK-OPTNONE-LABEL: define dso_local i32 @test( +// CHECK-OPTNONE-SAME: ) #[[ATTR0:[0-9]+]] !dbg [[DBG5:![0-9]+]] { +// CHECK-OPTNONE-NEXT: [[ENTRY:.*:]] // CHECK-OPTNONE-NEXT: [[X:%.*]] = alloca i32, align 4 // CHECK-OPTNONE-NEXT: [[P:%.*]] = alloca ptr, align 8 // CHECK-OPTNONE-NEXT: store i32 3, ptr [[X]], align 4, !dbg [[DBG9:![0-9]+]] @@ -33,87 +35,89 @@ // CHECK-OPTNONE-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !dbg [[DBG12:![0-9]+]] // CHECK-OPTNONE-NEXT: ret i32 [[TMP1]], !dbg [[DBG13:![0-9]+]] // -// CHECK-MSAN-LABEL: @test( -// CHECK-MSAN-NEXT: entry: +// CHECK-MSAN-LABEL: define dso_local noundef i32 @test( +// CHECK-MSAN-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG6:![0-9]+]] { +// CHECK-MSAN-NEXT: [[ENTRY:.*:]] // CHECK-MSAN-NEXT: [[X:%.*]] = alloca i32, align 4 // CHECK-MSAN-NEXT: [[P:%.*]] = alloca ptr, align 8 -// CHECK-MSAN-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[X]]) #[[ATTR2:[0-9]+]], !dbg [[DBG9:![0-9]+]] -// CHECK-MSAN-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG9]] -// CHECK-MSAN-NEXT: [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG9]] -// CHECK-MSAN-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr, !dbg [[DBG9]] -// CHECK-MSAN-NEXT: store i32 0, ptr [[TMP2]], align 4, !dbg [[DBG10:![0-9]+]] -// CHECK-MSAN-NEXT: store i32 3, ptr [[X]], align 4, !dbg [[DBG10]], !tbaa [[TBAA11:![0-9]+]] -// CHECK-MSAN-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[P]]), !dbg [[DBG15:![0-9]+]] -// CHECK-MSAN-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64, !dbg [[DBG15]] -// CHECK-MSAN-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG15]] -// 
CHECK-MSAN-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG15]] -// CHECK-MSAN-NEXT: store i64 0, ptr [[TMP5]], align 8, !dbg [[DBG16:![0-9]+]] -// CHECK-MSAN-NEXT: store volatile ptr [[X]], ptr [[P]], align 8, !dbg [[DBG16]], !tbaa [[TBAA17:![0-9]+]] -// CHECK-MSAN-NEXT: [[P_0_P_0_P_0_P_0_:%.*]] = load volatile ptr, ptr [[P]], align 8, !dbg [[DBG19:![0-9]+]], !tbaa [[TBAA17]] -// CHECK-MSAN-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 8, !dbg [[DBG19]] -// CHECK-MSAN-NEXT: [[_MSCMP_NOT:%.*]] = icmp eq i64 [[_MSLD]], 0, !dbg [[DBG20:![0-9]+]] -// CHECK-MSAN-NEXT: br i1 [[_MSCMP_NOT]], label [[TMP7:%.*]], label [[TMP6:%.*]], !dbg [[DBG20]], !prof [[PROF21:![0-9]+]] -// CHECK-MSAN: 6: -// CHECK-MSAN-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]], !dbg [[DBG20]] -// CHECK-MSAN-NEXT: unreachable, !dbg [[DBG20]] -// CHECK-MSAN: 7: -// CHECK-MSAN-NEXT: [[TMP8:%.*]] = load i32, ptr [[P_0_P_0_P_0_P_0_]], align 4, !dbg [[DBG20]], !tbaa [[TBAA11]] -// CHECK-MSAN-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[P_0_P_0_P_0_P_0_]] to i64, !dbg [[DBG20]] -// CHECK-MSAN-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080, !dbg [[DBG20]] -// CHECK-MSAN-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr, !dbg [[DBG20]] -// CHECK-MSAN-NEXT: [[_MSLD1:%.*]] = load i32, ptr [[TMP11]], align 4, !dbg [[DBG20]] -// CHECK-MSAN-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[P]]), !dbg [[DBG22:![0-9]+]] -// CHECK-MSAN-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG22]] -// CHECK-MSAN-NEXT: [[_MSCMP2_NOT:%.*]] = icmp eq i32 [[_MSLD1]], 0, !dbg [[DBG23:![0-9]+]] -// CHECK-MSAN-NEXT: br i1 [[_MSCMP2_NOT]], label [[TMP13:%.*]], label [[TMP12:%.*]], !dbg [[DBG23]], !prof [[PROF21]] -// CHECK-MSAN: 12: -// CHECK-MSAN-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]], !dbg [[DBG23]] -// CHECK-MSAN-NEXT: unreachable, !dbg [[DBG23]] -// CHECK-MSAN: 13: -// CHECK-MSAN-NEXT: ret i32 [[TMP8]], !dbg [[DBG23]] +// CHECK-MSAN-NEXT: call 
void @llvm.lifetime.start.p0(ptr nonnull [[X]]) #[[ATTR3:[0-9]+]], !dbg [[DBG10:![0-9]+]] +// CHECK-MSAN-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]] +// CHECK-MSAN-NEXT: [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG10]] +// CHECK-MSAN-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr, !dbg [[DBG10]] +// CHECK-MSAN-NEXT: store i32 0, ptr [[TMP2]], align 4, !dbg [[DBG11:![0-9]+]] +// CHECK-MSAN-NEXT: store i32 3, ptr [[X]], align 4, !dbg [[DBG11]], !tbaa [[INT_TBAA12:![0-9]+]] +// CHECK-MSAN-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[P]]), !dbg [[DBG16:![0-9]+]] +// CHECK-MSAN-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64, !dbg [[DBG16]] +// CHECK-MSAN-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG16]] +// CHECK-MSAN-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG16]] +// CHECK-MSAN-NEXT: store i64 0, ptr [[TMP5]], align 8, !dbg [[DBG17:![0-9]+]] +// CHECK-MSAN-NEXT: store volatile ptr [[X]], ptr [[P]], align 8, !dbg [[DBG17]], !tbaa [[INTPTR_TBAA18:![0-9]+]] +// CHECK-MSAN-NEXT: [[P_0_P_0_P_0_P_0_:%.*]] = load volatile ptr, ptr [[P]], align 8, !dbg [[DBG21:![0-9]+]], !tbaa [[INTPTR_TBAA18]] +// CHECK-MSAN-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 8, !dbg [[DBG21]] +// CHECK-MSAN-NEXT: [[_MSCMP_NOT:%.*]] = icmp eq i64 [[_MSLD]], 0, !dbg [[DBG22:![0-9]+]] +// CHECK-MSAN-NEXT: br i1 [[_MSCMP_NOT]], label %[[BB7:.*]], label %[[BB6:.*]], !dbg [[DBG22]], !prof [[PROF23:![0-9]+]] +// CHECK-MSAN: [[BB6]]: +// CHECK-MSAN-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]], !dbg [[DBG22]] +// CHECK-MSAN-NEXT: unreachable, !dbg [[DBG22]] +// CHECK-MSAN: [[BB7]]: +// CHECK-MSAN-NEXT: [[TMP8:%.*]] = load i32, ptr [[P_0_P_0_P_0_P_0_]], align 4, !dbg [[DBG22]], !tbaa [[INT_TBAA12]] +// CHECK-MSAN-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[P_0_P_0_P_0_P_0_]] to i64, !dbg [[DBG22]] +// CHECK-MSAN-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080, !dbg [[DBG22]] +// CHECK-MSAN-NEXT: 
[[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr, !dbg [[DBG22]] +// CHECK-MSAN-NEXT: [[_MSLD1:%.*]] = load i32, ptr [[TMP11]], align 4, !dbg [[DBG22]] +// CHECK-MSAN-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[P]]), !dbg [[DBG24:![0-9]+]] +// CHECK-MSAN-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X]]) #[[ATTR3]], !dbg [[DBG24]] +// CHECK-MSAN-NEXT: [[_MSCMP2_NOT:%.*]] = icmp eq i32 [[_MSLD1]], 0, !dbg [[DBG25:![0-9]+]] +// CHECK-MSAN-NEXT: br i1 [[_MSCMP2_NOT]], label %[[BB13:.*]], label %[[BB12:.*]], !dbg [[DBG25]], !prof [[PROF23]] +// CHECK-MSAN: [[BB12]]: +// CHECK-MSAN-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]], !dbg [[DBG25]] +// CHECK-MSAN-NEXT: unreachable, !dbg [[DBG25]] +// CHECK-MSAN: [[BB13]]: +// CHECK-MSAN-NEXT: ret i32 [[TMP8]], !dbg [[DBG25]] // -// CHECK-KMSAN-LABEL: @test( -// CHECK-KMSAN-NEXT: entry: +// CHECK-KMSAN-LABEL: define dso_local i32 @test( +// CHECK-KMSAN-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG6:![0-9]+]] { +// CHECK-KMSAN-NEXT: [[ENTRY:.*:]] // CHECK-KMSAN-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() #[[ATTR2:[0-9]+]] // CHECK-KMSAN-NEXT: [[X:%.*]] = alloca i32, align 4 // CHECK-KMSAN-NEXT: [[P:%.*]] = alloca ptr, align 8 -// CHECK-KMSAN-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG9:![0-9]+]] -// CHECK-KMSAN-NEXT: call void @__msan_poison_alloca(ptr nonnull [[X]], i64 4, ptr nonnull @[[GLOB0:[0-9]+]]) #[[ATTR2]], !dbg [[DBG9]] -// CHECK-KMSAN-NEXT: [[TMP1:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_4(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG10:![0-9]+]] -// CHECK-KMSAN-NEXT: [[TMP2:%.*]] = extractvalue { ptr, ptr } [[TMP1]], 0, !dbg [[DBG10]] -// CHECK-KMSAN-NEXT: store i32 0, ptr [[TMP2]], align 4, !dbg [[DBG10]] -// CHECK-KMSAN-NEXT: store i32 3, ptr [[X]], align 4, !dbg [[DBG10]], !tbaa [[TBAA11:![0-9]+]] -// CHECK-KMSAN-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[P]]), !dbg [[DBG15:![0-9]+]] -// CHECK-KMSAN-NEXT: call 
void @__msan_poison_alloca(ptr nonnull [[P]], i64 8, ptr nonnull @[[GLOB1:[0-9]+]]) #[[ATTR2]], !dbg [[DBG15]] -// CHECK-KMSAN-NEXT: [[TMP3:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_8(ptr nonnull [[P]]) #[[ATTR2]], !dbg [[DBG16:![0-9]+]] -// CHECK-KMSAN-NEXT: [[TMP4:%.*]] = extractvalue { ptr, ptr } [[TMP3]], 0, !dbg [[DBG16]] -// CHECK-KMSAN-NEXT: store i64 0, ptr [[TMP4]], align 8, !dbg [[DBG16]] -// CHECK-KMSAN-NEXT: store volatile ptr [[X]], ptr [[P]], align 8, !dbg [[DBG16]], !tbaa [[TBAA17:![0-9]+]] -// CHECK-KMSAN-NEXT: [[P_0_P_0_P_0_P_0_:%.*]] = load volatile ptr, ptr [[P]], align 8, !dbg [[DBG19:![0-9]+]], !tbaa [[TBAA17]] -// CHECK-KMSAN-NEXT: [[TMP5:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_8(ptr nonnull [[P]]) #[[ATTR2]], !dbg [[DBG19]] -// CHECK-KMSAN-NEXT: [[TMP6:%.*]] = extractvalue { ptr, ptr } [[TMP5]], 0, !dbg [[DBG19]] -// CHECK-KMSAN-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8, !dbg [[DBG19]] -// CHECK-KMSAN-NEXT: [[_MSCMP_NOT:%.*]] = icmp eq i64 [[_MSLD]], 0, !dbg [[DBG20:![0-9]+]] -// CHECK-KMSAN-NEXT: br i1 [[_MSCMP_NOT]], label [[TMP10:%.*]], label [[TMP7:%.*]], !dbg [[DBG20]], !prof [[PROF21:![0-9]+]] -// CHECK-KMSAN: 7: -// CHECK-KMSAN-NEXT: [[TMP8:%.*]] = extractvalue { ptr, ptr } [[TMP5]], 1, !dbg [[DBG19]] -// CHECK-KMSAN-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 8, !dbg [[DBG19]] -// CHECK-KMSAN-NEXT: call void @__msan_warning(i32 [[TMP9]]) #[[ATTR3:[0-9]+]], !dbg [[DBG20]] -// CHECK-KMSAN-NEXT: br label [[TMP10]], !dbg [[DBG20]] -// CHECK-KMSAN: 10: +// CHECK-KMSAN-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG10:![0-9]+]] +// CHECK-KMSAN-NEXT: call void @__msan_poison_alloca(ptr nonnull [[X]], i64 4, ptr nonnull @[[GLOB0:[0-9]+]]) #[[ATTR2]], !dbg [[DBG10]] +// CHECK-KMSAN-NEXT: [[TMP1:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_4(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG11:![0-9]+]] +// CHECK-KMSAN-NEXT: [[TMP2:%.*]] = extractvalue { ptr, 
ptr } [[TMP1]], 0, !dbg [[DBG11]] +// CHECK-KMSAN-NEXT: store i32 0, ptr [[TMP2]], align 4, !dbg [[DBG11]] +// CHECK-KMSAN-NEXT: store i32 3, ptr [[X]], align 4, !dbg [[DBG11]], !tbaa [[INT_TBAA12:![0-9]+]] +// CHECK-KMSAN-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[P]]), !dbg [[DBG16:![0-9]+]] +// CHECK-KMSAN-NEXT: call void @__msan_poison_alloca(ptr nonnull [[P]], i64 8, ptr nonnull @[[GLOB1:[0-9]+]]) #[[ATTR2]], !dbg [[DBG16]] +// CHECK-KMSAN-NEXT: [[TMP3:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_8(ptr nonnull [[P]]) #[[ATTR2]], !dbg [[DBG17:![0-9]+]] +// CHECK-KMSAN-NEXT: [[TMP4:%.*]] = extractvalue { ptr, ptr } [[TMP3]], 0, !dbg [[DBG17]] +// CHECK-KMSAN-NEXT: store i64 0, ptr [[TMP4]], align 8, !dbg [[DBG17]] +// CHECK-KMSAN-NEXT: store volatile ptr [[X]], ptr [[P]], align 8, !dbg [[DBG17]], !tbaa [[INTPTR_TBAA18:![0-9]+]] +// CHECK-KMSAN-NEXT: [[P_0_P_0_P_0_P_0_:%.*]] = load volatile ptr, ptr [[P]], align 8, !dbg [[DBG21:![0-9]+]], !tbaa [[INTPTR_TBAA18]] +// CHECK-KMSAN-NEXT: [[TMP5:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_8(ptr nonnull [[P]]) #[[ATTR2]], !dbg [[DBG21]] +// CHECK-KMSAN-NEXT: [[TMP6:%.*]] = extractvalue { ptr, ptr } [[TMP5]], 0, !dbg [[DBG21]] +// CHECK-KMSAN-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8, !dbg [[DBG21]] +// CHECK-KMSAN-NEXT: [[_MSCMP_NOT:%.*]] = icmp eq i64 [[_MSLD]], 0, !dbg [[DBG22:![0-9]+]] +// CHECK-KMSAN-NEXT: br i1 [[_MSCMP_NOT]], label %[[BB10:.*]], label %[[BB7:.*]], !dbg [[DBG22]], !prof [[PROF23:![0-9]+]] +// CHECK-KMSAN: [[BB7]]: +// CHECK-KMSAN-NEXT: [[TMP8:%.*]] = extractvalue { ptr, ptr } [[TMP5]], 1, !dbg [[DBG21]] +// CHECK-KMSAN-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 8, !dbg [[DBG21]] +// CHECK-KMSAN-NEXT: call void @__msan_warning(i32 [[TMP9]]) #[[ATTR3:[0-9]+]], !dbg [[DBG22]] +// CHECK-KMSAN-NEXT: br label %[[BB10]], !dbg [[DBG22]] +// CHECK-KMSAN: [[BB10]]: // CHECK-KMSAN-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4008 // 
CHECK-KMSAN-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr i8, ptr [[TMP0]], i64 800 -// CHECK-KMSAN-NEXT: [[TMP11:%.*]] = load i32, ptr [[P_0_P_0_P_0_P_0_]], align 4, !dbg [[DBG20]], !tbaa [[TBAA11]] -// CHECK-KMSAN-NEXT: [[TMP12:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_4(ptr nonnull [[P_0_P_0_P_0_P_0_]]) #[[ATTR2]], !dbg [[DBG20]] -// CHECK-KMSAN-NEXT: [[TMP13:%.*]] = extractvalue { ptr, ptr } [[TMP12]], 0, !dbg [[DBG20]] -// CHECK-KMSAN-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP12]], 1, !dbg [[DBG20]] -// CHECK-KMSAN-NEXT: [[_MSLD1:%.*]] = load i32, ptr [[TMP13]], align 4, !dbg [[DBG20]] -// CHECK-KMSAN-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !dbg [[DBG20]] -// CHECK-KMSAN-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[P]]), !dbg [[DBG22:![0-9]+]] -// CHECK-KMSAN-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG22]] -// CHECK-KMSAN-NEXT: store i32 [[_MSLD1]], ptr [[RETVAL_SHADOW]], align 8, !dbg [[DBG23:![0-9]+]] -// CHECK-KMSAN-NEXT: store i32 [[TMP15]], ptr [[RETVAL_ORIGIN]], align 4, !dbg [[DBG23]] -// CHECK-KMSAN-NEXT: ret i32 [[TMP11]], !dbg [[DBG23]] +// CHECK-KMSAN-NEXT: [[TMP11:%.*]] = load i32, ptr [[P_0_P_0_P_0_P_0_]], align 4, !dbg [[DBG22]], !tbaa [[INT_TBAA12]] +// CHECK-KMSAN-NEXT: [[TMP12:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_4(ptr nonnull [[P_0_P_0_P_0_P_0_]]) #[[ATTR2]], !dbg [[DBG22]] +// CHECK-KMSAN-NEXT: [[TMP13:%.*]] = extractvalue { ptr, ptr } [[TMP12]], 0, !dbg [[DBG22]] +// CHECK-KMSAN-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP12]], 1, !dbg [[DBG22]] +// CHECK-KMSAN-NEXT: [[_MSLD1:%.*]] = load i32, ptr [[TMP13]], align 4, !dbg [[DBG22]] +// CHECK-KMSAN-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !dbg [[DBG22]] +// CHECK-KMSAN-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[P]]), !dbg [[DBG24:![0-9]+]] +// CHECK-KMSAN-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X]]) #[[ATTR2]], !dbg [[DBG24]] +// CHECK-KMSAN-NEXT: 
store i32 [[_MSLD1]], ptr [[RETVAL_SHADOW]], align 8, !dbg [[DBG25:![0-9]+]] +// CHECK-KMSAN-NEXT: store i32 [[TMP15]], ptr [[RETVAL_ORIGIN]], align 4, !dbg [[DBG25]] +// CHECK-KMSAN-NEXT: ret i32 [[TMP11]], !dbg [[DBG25]] // int test(void) { int x = 3; diff --git a/clang/test/CodeGen/isfpclass.c b/clang/test/CodeGen/isfpclass.c index ee3a22b40fefd..8a631c471c329 100644 --- a/clang/test/CodeGen/isfpclass.c +++ b/clang/test/CodeGen/isfpclass.c @@ -1,9 +1,9 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple aarch64-linux-gnu -O1 -emit-llvm %s -o - | FileCheck %s -// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_finite -// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_finite( +// CHECK-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X]]) // CHECK-NEXT: [[TMP1:%.*]] = fcmp one float [[TMP0]], 0x7FF0000000000000 // CHECK-NEXT: ret i1 [[TMP1]] @@ -12,9 +12,9 @@ _Bool check_isfpclass_finite(float x) { return __builtin_isfpclass(x, 504 /*Finite*/); } -// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_finite_strict -// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_finite_strict( +// CHECK-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 504) #[[ATTR5:[0-9]+]] // CHECK-NEXT: ret i1 [[TMP0]] // @@ -23,9 +23,9 @@ _Bool check_isfpclass_finite_strict(float x) { return __builtin_isfpclass(x, 504 
/*Finite*/); } -// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_nan_f32 -// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_nan_f32( +// CHECK-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = fcmp uno float [[X]], 0.000000e+00 // CHECK-NEXT: ret i1 [[TMP0]] // @@ -33,9 +33,9 @@ _Bool check_isfpclass_nan_f32(float x) { return __builtin_isfpclass(x, 3 /*NaN*/); } -// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_nan_f32_strict -// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_nan_f32_strict( +// CHECK-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 3) #[[ATTR5]] // CHECK-NEXT: ret i1 [[TMP0]] // @@ -44,9 +44,9 @@ _Bool check_isfpclass_nan_f32_strict(float x) { return __builtin_isfpclass(x, 3 /*NaN*/); } -// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_snan_f64 -// CHECK-SAME: (double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_snan_f64( +// CHECK-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f64(double [[X]], i32 1) // CHECK-NEXT: ret i1 [[TMP0]] // @@ -54,9 +54,9 @@ _Bool check_isfpclass_snan_f64(double x) { return __builtin_isfpclass(x, 1 /*SNaN*/); } -// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_snan_f64_strict -// CHECK-SAME: (double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_snan_f64_strict( +// CHECK-SAME: double 
noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f64(double [[X]], i32 1) #[[ATTR5]] // CHECK-NEXT: ret i1 [[TMP0]] // @@ -65,9 +65,9 @@ _Bool check_isfpclass_snan_f64_strict(double x) { return __builtin_isfpclass(x, 1 /*NaN*/); } -// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_zero_f16 -// CHECK-SAME: (half noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_zero_f16( +// CHECK-SAME: half noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq half [[X]], 0xH0000 // CHECK-NEXT: ret i1 [[TMP0]] // @@ -75,9 +75,9 @@ _Bool check_isfpclass_zero_f16(_Float16 x) { return __builtin_isfpclass(x, 96 /*Zero*/); } -// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_zero_f16_strict -// CHECK-SAME: (half noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_zero_f16_strict( +// CHECK-SAME: half noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f16(half [[X]], i32 96) #[[ATTR5]] // CHECK-NEXT: ret i1 [[TMP0]] // @@ -86,9 +86,9 @@ _Bool check_isfpclass_zero_f16_strict(_Float16 x) { return __builtin_isfpclass(x, 96 /*Zero*/); } -// CHECK-LABEL: define dso_local noundef i1 @check_isnan -// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isnan( +// CHECK-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 3) #[[ATTR5]] // CHECK-NEXT: ret i1 [[TMP0]] // @@ -97,9 +97,9 @@ _Bool check_isnan(float x) { return __builtin_isnan(x); } -// 
CHECK-LABEL: define dso_local noundef i1 @check_isinf -// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isinf( +// CHECK-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 516) #[[ATTR5]] // CHECK-NEXT: ret i1 [[TMP0]] // @@ -108,9 +108,9 @@ _Bool check_isinf(float x) { return __builtin_isinf(x); } -// CHECK-LABEL: define dso_local noundef i1 @check_isfinite -// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isfinite( +// CHECK-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 504) #[[ATTR5]] // CHECK-NEXT: ret i1 [[TMP0]] // @@ -119,9 +119,9 @@ _Bool check_isfinite(float x) { return __builtin_isfinite(x); } -// CHECK-LABEL: define dso_local noundef i1 @check_isnormal -// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i1 @check_isnormal( +// CHECK-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 264) #[[ATTR5]] // CHECK-NEXT: ret i1 [[TMP0]] // @@ -136,9 +136,9 @@ typedef double __attribute__((ext_vector_type(4))) double4; typedef int __attribute__((ext_vector_type(4))) int4; typedef long __attribute__((ext_vector_type(4))) long4; -// CHECK-LABEL: define dso_local range(i32 0, 2) <4 x i32> @check_isfpclass_nan_v4f32 -// CHECK-SAME: (<4 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local range(i32 0, 2) <4 x i32> @check_isfpclass_nan_v4f32( +// CHECK-SAME: 
<4 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = fcmp uno <4 x float> [[X]], zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -147,9 +147,9 @@ int4 check_isfpclass_nan_v4f32(float4 x) { return __builtin_isfpclass(x, 3 /*NaN*/); } -// CHECK-LABEL: define dso_local range(i32 0, 2) <4 x i32> @check_isfpclass_nan_strict_v4f32 -// CHECK-SAME: (<4 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local range(i32 0, 2) <4 x i32> @check_isfpclass_nan_strict_v4f32( +// CHECK-SAME: <4 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[X]], i32 3) #[[ATTR5]] // CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -159,15 +159,20 @@ int4 check_isfpclass_nan_strict_v4f32(float4 x) { return __builtin_isfpclass(x, 3 /*NaN*/); } -// CHECK-LABEL: define dso_local void @check_isfpclass_nan_v4f64 -// CHECK-SAME: (ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[X:%.*]] = load <4 x double>, ptr [[TMP0]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define dso_local void @check_isfpclass_nan_v4f64( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[X:%.*]] = load <4 x double>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-NEXT: 
[[TMP1:%.*]] = fcmp uno <4 x double> [[X]], zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i64> -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // long4 check_isfpclass_nan_v4f64(double4 x) { return __builtin_isfpclass(x, 3 /*NaN*/); } +//. +// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGen/kcfi-generalize.c b/clang/test/CodeGen/kcfi-generalize.c index 4e32f4f35057c..5a44d97412af9 100644 --- a/clang/test/CodeGen/kcfi-generalize.c +++ b/clang/test/CodeGen/kcfi-generalize.c @@ -26,8 +26,23 @@ void g(int** (*fp)(const char *, const char **)) { fp(0, 0); } +union Union { + char *c; + long *n; +} __attribute__((transparent_union)); + +// CHECK: define{{.*}} void @uni({{.*}} !kcfi_type [[TYPE4:![0-9]+]] +void uni(void (*fn)(union Union), union Union arg1) { + // UNGENERALIZED: call {{.*}} [ "kcfi"(i32 -587217045) ] + // GENERALIZED: call {{.*}} [ "kcfi"(i32 2139530422) ] + fn(arg1); +} + // UNGENERALIZED: [[TYPE]] = !{i32 1296635908} // GENERALIZED: [[TYPE]] = !{i32 -49168686} // UNGENERALIZED: [[TYPE3]] = !{i32 874141567} // GENERALIZED: [[TYPE3]] = !{i32 954385378} + +// UNGENERALIZED: [[TYPE4]] = !{i32 -1619636625} +// GENERALIZED: [[TYPE4]] = !{i32 -125078496} diff --git a/clang/test/CodeGen/kcfi-normalize.c b/clang/test/CodeGen/kcfi-normalize.c index b9150e88f6ab5..bd87f4af534a1 100644 --- a/clang/test/CodeGen/kcfi-normalize.c +++ b/clang/test/CodeGen/kcfi-normalize.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi 
-fsanitize-cfi-icall-experimental-normalize-integers -x c++ -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers -o - %s | FileCheck %s --check-prefixes=CHECK,C +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers -x c++ -o - %s | FileCheck %s --check-prefixes=CHECK,CPP #if !__has_feature(kcfi) #error Missing kcfi? #endif @@ -28,7 +28,22 @@ void baz(void (*fn)(int, int, int), int arg1, int arg2, int arg3) { fn(arg1, arg2, arg3); } +union Union { + char *c; + long *n; +} __attribute__((transparent_union)); + +void uni(void (*fn)(union Union), union Union arg1) { + // CHECK-LABEL: define{{.*}}uni + // CHECK-SAME: {{.*}}!kcfi_type ![[TYPE4:[0-9]+]] + // C: call void %0(ptr %1) [ "kcfi"(i32 1819770848) ] + // CPP: call void %0(ptr %1) [ "kcfi"(i32 -1430221633) ] + fn(arg1); +} + // CHECK: ![[#]] = !{i32 4, !"cfi-normalize-integers", i32 1} // CHECK: ![[TYPE1]] = !{i32 -1143117868} // CHECK: ![[TYPE2]] = !{i32 -460921415} // CHECK: ![[TYPE3]] = !{i32 -333839615} +// C: ![[TYPE4]] = !{i32 -650530463} +// CPP: ![[TYPE4]] = !{i32 1766237188} diff --git a/clang/test/CodeGen/mangle-windows.c b/clang/test/CodeGen/mangle-windows.c index 046b1e8815a8a..e1b06e72a9635 100644 --- a/clang/test/CodeGen/mangle-windows.c +++ b/clang/test/CodeGen/mangle-windows.c @@ -1,8 +1,10 @@ // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s -// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 | FileCheck %s +// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 | FileCheck %s +// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-cygwin | FileCheck %s // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-windows-msvc-elf | FileCheck %s --check-prefix=ELF32 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64 -// RUN: %clang_cc1 -emit-llvm %s 
-o - -triple=x86_64-mingw32 | FileCheck %s --check-prefix=X64 +// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 | FileCheck %s --check-prefix=X64 +// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-cygwin | FileCheck %s --check-prefix=X64 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc-elf | FileCheck %s --check-prefix=ELF64 // CHECK: target datalayout = "e-m:x-{{.*}}" diff --git a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c index 0124cc5c06d43..20a31003fe915 100644 --- a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c +++ b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --filter "(@powl|@cargl|@ilogbl|!|load|store)" --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --filter "(@powl|@cargl|@ilogbl|!|load|store)" --version 6 // RUN: %clang_cc1 %s -O3 -fmath-errno -emit-llvm -triple x86_64-unknown-unknown -o - | FileCheck %s -check-prefixes=CHECK // RUN: %clang_cc1 %s -O3 -fmath-errno -emit-llvm -triple x86_64-pc-win64 -o - | FileCheck %s -check-prefixes=CHECK-WIN64 // RUN: %clang_cc1 %s -O3 -fmath-errno -emit-llvm -triple i686-unknown-unknown -o - | FileCheck %s -check-prefixes=CHECK-I686 @@ -18,49 +18,49 @@ long double powl(long double a, long double b); // CHECK-LABEL: define dso_local x86_fp80 @test_powl( // CHECK-SAME: x86_fp80 noundef [[A:%.*]], x86_fp80 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK: [[CALL:%.*]] = tail call x86_fp80 @powl(x86_fp80 noundef [[A]], x86_fp80 noundef [[B]]) #[[ATTR5:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK: [[CALL:%.*]] = tail call x86_fp80 @powl(x86_fp80 noundef [[A]], x86_fp80 noundef [[B]]) #[[ATTR5:[0-9]+]], !tbaa [[INT_TBAA2:![0-9]+]] // // CHECK-WIN64-LABEL: define dso_local x86_fp80 @test_powl( // CHECK-WIN64-SAME: x86_fp80 noundef 
[[A:%.*]], x86_fp80 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-WIN64: [[CALL:%.*]] = tail call x86_fp80 @powl(x86_fp80 noundef [[A]], x86_fp80 noundef [[B]]) #[[ATTR5:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-WIN64: [[CALL:%.*]] = tail call x86_fp80 @powl(x86_fp80 noundef [[A]], x86_fp80 noundef [[B]]) #[[ATTR5:[0-9]+]], !tbaa [[INT_TBAA2:![0-9]+]] // // CHECK-I686-LABEL: define dso_local x86_fp80 @test_powl( // CHECK-I686-SAME: x86_fp80 noundef [[A:%.*]], x86_fp80 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-I686: [[CALL:%.*]] = tail call x86_fp80 @powl(x86_fp80 noundef [[A]], x86_fp80 noundef [[B]]) #[[ATTR5:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-I686: [[CALL:%.*]] = tail call x86_fp80 @powl(x86_fp80 noundef [[A]], x86_fp80 noundef [[B]]) #[[ATTR5:[0-9]+]], !tbaa [[INT_TBAA3:![0-9]+]] // // CHECK-PPC-LABEL: define dso_local ppc_fp128 @test_powl( // CHECK-PPC-SAME: ppc_fp128 noundef [[A:%.*]], ppc_fp128 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @powl(ppc_fp128 noundef [[A]], ppc_fp128 noundef [[B]]) #[[ATTR4:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @powl(ppc_fp128 noundef [[A]], ppc_fp128 noundef [[B]]) #[[ATTR4:[0-9]+]], !tbaa [[INT_TBAA2:![0-9]+]] // // CHECK-ARM-LABEL: define dso_local double @test_powl( // CHECK-ARM-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-ARM: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-ARM: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[INT_TBAA3:![0-9]+]] // // CHECK-ARM-HF-LABEL: define dso_local double @test_powl( // CHECK-ARM-HF-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail 
call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[INT_TBAA3:![0-9]+]] // // CHECK-THUMB-LABEL: define double @test_powl( // CHECK-THUMB-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-THUMB: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[INT_TBAA3:![0-9]+]] // // CHECK-AARCH-LABEL: define dso_local fp128 @test_powl( // CHECK-AARCH-SAME: fp128 noundef [[A:%.*]], fp128 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @powl(fp128 noundef [[A]], fp128 noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @powl(fp128 noundef [[A]], fp128 noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[INT_TBAA2:![0-9]+]] // // CHECK-SPIR-LABEL: define dso_local spir_func double @test_powl( // CHECK-SPIR-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR4:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR4:[0-9]+]], !tbaa [[INT_TBAA2:![0-9]+]] // // CHECK-MINGW32-LABEL: define dso_local void @test_powl( // CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret(x86_fp80) align 16 captures(none) initializes((0, 10)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA6:![0-9]+]] -// CHECK-MINGW32: [[B:%.*]] = load x86_fp80, ptr [[TMP1]], align 16, !tbaa [[TBAA6]] -// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA6]] -// CHECK-MINGW32: store x86_fp80 [[B]], ptr [[BYVAL_TEMP1:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[LONG_DOUBLE_TBAA6:![0-9]+]] +// CHECK-MINGW32: [[B:%.*]] = load x86_fp80, ptr [[TMP1]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[B]], ptr [[BYVAL_TEMP1:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] // CHECK-MINGW32: call void @powl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP1]]) #[[ATTR3:[0-9]+]] -// CHECK-MINGW32: [[TMP2:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA6]] -// CHECK-MINGW32: store x86_fp80 [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: [[TMP2:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] // long double test_powl(long double a, long double b) { return powl(a, b); @@ -104,21 +104,21 @@ long double test_powl(long double a, long double b) { // // CHECK-ARM-LABEL: define dso_local void @test_cargl( // CHECK-ARM-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], [2 x i64] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// CHECK-ARM: [[CALL:%.*]] = tail call double @cargl([2 x i64] noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] +// CHECK-ARM: 
[[CALL:%.*]] = tail call double @cargl([2 x i64] noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[INT_TBAA3]] // CHECK-ARM: store double [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 8 // CHECK-ARM: store double [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 8 // // CHECK-ARM-HF-LABEL: define dso_local { double, double } @test_cargl( // CHECK-ARM-HF-SAME: { double, double } noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @cargl({ double, double } noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] +// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @cargl({ double, double } noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[INT_TBAA3]] // // CHECK-THUMB-LABEL: define { double, double } @test_cargl( // CHECK-THUMB-SAME: [2 x double] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call double @cargl([2 x double] noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] +// CHECK-THUMB: [[CALL:%.*]] = tail call double @cargl([2 x double] noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[INT_TBAA3]] // // CHECK-AARCH-LABEL: define dso_local { fp128, fp128 } @test_cargl( // CHECK-AARCH-SAME: [2 x fp128] noundef alignstack(16) [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @cargl([2 x fp128] noundef alignstack(16) [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA2]] +// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @cargl([2 x fp128] noundef alignstack(16) [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[INT_TBAA2]] // // CHECK-SPIR-LABEL: define dso_local spir_func void @test_cargl( // CHECK-SPIR-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr noundef readonly byval({ double, double }) align 8 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { @@ -137,7 +137,7 @@ long double test_powl(long double a, long double b) { // 
CHECK-MINGW32: store x86_fp80 [[CLD_REAL]], ptr [[BYVAL_TEMP:%.*]], align 16 // CHECK-MINGW32: store x86_fp80 [[CLD_IMAG]], ptr [[BYVAL_TEMP_IMAGP:%.*]], align 16 // CHECK-MINGW32: call void @cargl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] -// CHECK-MINGW32: [[TMP0:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: [[TMP0:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] // CHECK-MINGW32: [[CLD_REAL3:%.*]] = load x86_fp80, ptr [[CLD]], align 16 // CHECK-MINGW32: [[CLD_IMAG5:%.*]] = load x86_fp80, ptr [[CLD_IMAGP]], align 16 // CHECK-MINGW32: store x86_fp80 [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 16 @@ -154,96 +154,96 @@ int ilogbl(long double a); // CHECK-LABEL: define dso_local i32 @test_ilogb( // CHECK-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[TBAA2]] +// CHECK: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[INT_TBAA2]] // // CHECK-WIN64-LABEL: define dso_local i32 @test_ilogb( // CHECK-WIN64-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-WIN64: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[TBAA2]] +// CHECK-WIN64: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[INT_TBAA2]] // // CHECK-I686-LABEL: define dso_local i32 @test_ilogb( // CHECK-I686-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-I686: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[TBAA3]] +// CHECK-I686: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[INT_TBAA3]] // // CHECK-PPC-LABEL: define dso_local i32 @test_ilogb( // CHECK-PPC-SAME: ppc_fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-PPC: 
[[CALL:%.*]] = tail call i32 @ilogbl(ppc_fp128 noundef [[A]]) #[[ATTR4]], !tbaa [[TBAA2]] +// CHECK-PPC: [[CALL:%.*]] = tail call i32 @ilogbl(ppc_fp128 noundef [[A]]) #[[ATTR4]], !tbaa [[INT_TBAA2]] // // CHECK-ARM-LABEL: define dso_local i32 @test_ilogb( // CHECK-ARM-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-ARM: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] +// CHECK-ARM: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[INT_TBAA3]] // // CHECK-ARM-HF-LABEL: define dso_local i32 @test_ilogb( // CHECK-ARM-HF-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] +// CHECK-ARM-HF: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[INT_TBAA3]] // // CHECK-THUMB-LABEL: define i32 @test_ilogb( // CHECK-THUMB-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] +// CHECK-THUMB: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[INT_TBAA3]] // // CHECK-AARCH-LABEL: define dso_local i32 @test_ilogb( // CHECK-AARCH-SAME: fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call i32 @ilogbl(fp128 noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA2]] +// CHECK-AARCH: [[CALL:%.*]] = tail call i32 @ilogbl(fp128 noundef [[A]]) #[[ATTR3]], !tbaa [[INT_TBAA2]] // // CHECK-SPIR-LABEL: define dso_local spir_func i32 @test_ilogb( // CHECK-SPIR-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func i32 @ilogbl(double noundef [[A]]) #[[ATTR4]], !tbaa [[TBAA2]] +// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func i32 @ilogbl(double noundef [[A]]) #[[ATTR4]], !tbaa [[INT_TBAA2]] // // CHECK-MINGW32-LABEL: define 
dso_local i32 @test_ilogb( // CHECK-MINGW32-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA6]] -// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] // CHECK-MINGW32: [[CALL:%.*]] = call i32 @ilogbl(ptr dead_on_return noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] // int test_ilogb(long double a) { return ilogbl(a); } //. -// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} // CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-WIN64: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-WIN64: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK-WIN64: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} // CHECK-WIN64: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-WIN64: [[META5]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-I686: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK-I686: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} // CHECK-I686: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} // CHECK-I686: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} // CHECK-I686: [[META6]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-PPC: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-PPC: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK-PPC: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} // CHECK-PPC: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-PPC: [[META5]] = !{!"Simple C/C++ TBAA"} //. 
-// CHECK-ARM: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK-ARM: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} // CHECK-ARM: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} // CHECK-ARM: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} // CHECK-ARM: [[META6]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-ARM-HF: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK-ARM-HF: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} // CHECK-ARM-HF: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} // CHECK-ARM-HF: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} // CHECK-ARM-HF: [[META6]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-THUMB: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK-THUMB: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} // CHECK-THUMB: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} // CHECK-THUMB: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} // CHECK-THUMB: [[META6]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-AARCH: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-AARCH: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK-AARCH: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} // CHECK-AARCH: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-AARCH: [[META5]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-SPIR: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK-SPIR: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK-SPIR: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} // CHECK-SPIR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-SPIR: [[META5]] = !{!"Simple C/C++ TBAA"} //. 
-// CHECK-MINGW32: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-MINGW32: [[LONG_DOUBLE_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} // CHECK-MINGW32: [[META7]] = !{!"long double", [[META8:![0-9]+]], i64 0} // CHECK-MINGW32: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} // CHECK-MINGW32: [[META9]] = !{!"Simple C/C++ TBAA"} diff --git a/clang/test/CodeGen/math-libcalls-tbaa.c b/clang/test/CodeGen/math-libcalls-tbaa.c index b2f502e5b4729..53ca7963b27c1 100644 --- a/clang/test/CodeGen/math-libcalls-tbaa.c +++ b/clang/test/CodeGen/math-libcalls-tbaa.c @@ -1,7 +1,7 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 -// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,NONEWSTRUCTPATHTBAA -// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,NEWSTRUCTPATHTBAA +// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes=NONEWSTRUCTPATHTBAA +// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s -check-prefixes=NEWSTRUCTPATHTBAA float expf(float); double remainder(double, double); @@ -13,14 +13,23 @@ float crealf(float _Complex); // Emit int TBAA metadata on FP math libcalls, which is useful for alias analysis -// CHECK-LABEL: define dso_local float @test_expf( -// CHECK-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40 -// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: 
[[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[TBAA6:![0-9]+]] -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]] -// CHECK-NEXT: ret float [[MUL]] +// NONEWSTRUCTPATHTBAA-LABEL: define dso_local float @test_expf( +// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40 +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2:![0-9]+]] +// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[INT_TBAA6:![0-9]+]] +// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]] +// NONEWSTRUCTPATHTBAA-NEXT: ret float [[MUL]] +// +// NEWSTRUCTPATHTBAA-LABEL: define dso_local float @test_expf( +// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40 +// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] +// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[TBAA6:![0-9]+]] +// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]] +// NEWSTRUCTPATHTBAA-NEXT: ret float [[MUL]] // float test_expf (float num[]) { const float expm2 = expf(num[10]); // Emit TBAA metadata on @expf @@ -28,14 +37,23 @@ float test_expf (float num[]) { return tmp; } -// CHECK-LABEL: define dso_local float @test_builtin_expf( -// CHECK-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, 
ptr [[NUM]], i64 40 -// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[TBAA6]] -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]] -// CHECK-NEXT: ret float [[MUL]] +// NONEWSTRUCTPATHTBAA-LABEL: define dso_local float @test_builtin_expf( +// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40 +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]] +// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[INT_TBAA6]] +// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]] +// NONEWSTRUCTPATHTBAA-NEXT: ret float [[MUL]] +// +// NEWSTRUCTPATHTBAA-LABEL: define dso_local float @test_builtin_expf( +// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40 +// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[TBAA6]] +// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]] +// NEWSTRUCTPATHTBAA-NEXT: ret float [[MUL]] // float test_builtin_expf (float num[]) { const float expm2 = __builtin_expf(num[10]); // Emit TBAA metadata on @expf @@ -45,14 +63,23 @@ float test_builtin_expf (float num[]) { // // Negative test: fabs cannot set errno -// CHECK-LABEL: define dso_local double @test_fabs( -// CHECK-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) 
local_unnamed_addr #[[ATTR3:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 80 -// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call double @llvm.fabs.f64(double [[TMP0]]) -// CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[TMP1]] -// CHECK-NEXT: ret double [[MUL]] +// NONEWSTRUCTPATHTBAA-LABEL: define dso_local double @test_fabs( +// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 80 +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA8:![0-9]+]] +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = tail call double @llvm.fabs.f64(double [[TMP0]]) +// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[TMP1]] +// NONEWSTRUCTPATHTBAA-NEXT: ret double [[MUL]] +// +// NEWSTRUCTPATHTBAA-LABEL: define dso_local double @test_fabs( +// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 80 +// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// NEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = tail call double @llvm.fabs.f64(double [[TMP0]]) +// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[TMP1]] +// NEWSTRUCTPATHTBAA-NEXT: ret double [[MUL]] // double test_fabs (double num[]) { const double expm2 = fabs(num[10]); // Don't emit TBAA metadata @@ -60,14 +87,23 @@ double test_fabs (double num[]) { return tmp; } -// CHECK-LABEL: define dso_local double @test_remainder( -// CHECK-SAME: ptr noundef readonly 
captures(none) [[NUM:%.*]], double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 80 -// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[TBAA6]] -// CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]] -// CHECK-NEXT: ret double [[MUL]] +// NONEWSTRUCTPATHTBAA-LABEL: define dso_local double @test_remainder( +// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]], double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 80 +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA8]] +// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[INT_TBAA6]] +// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]] +// NONEWSTRUCTPATHTBAA-NEXT: ret double [[MUL]] +// +// NEWSTRUCTPATHTBAA-LABEL: define dso_local double @test_remainder( +// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]], double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 80 +// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]] +// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[TBAA6]] +// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]] +// NEWSTRUCTPATHTBAA-NEXT: ret double [[MUL]] // double test_remainder 
(double num[], double a) { const double expm2 = remainder(num[10], a); // Emit TBAA metadata @@ -78,17 +114,29 @@ double test_remainder (double num[], double a) { // // TODO: frexp is not subject to any errors, but also writes to // its int pointer out argument, so it could emit int TBAA metadata. -// CHECK-LABEL: define dso_local double @test_frexp( -// CHECK-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[E:%.*]] = alloca i32, align 4 -// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[E]]) #[[ATTR9]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 16 -// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[CALL:%.*]] = call double @frexp(double noundef [[TMP0]], ptr noundef nonnull [[E]]) #[[ATTR9]] -// CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[E]]) #[[ATTR9]] -// CHECK-NEXT: ret double [[MUL]] +// NONEWSTRUCTPATHTBAA-LABEL: define dso_local double @test_frexp( +// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NONEWSTRUCTPATHTBAA-NEXT: [[E:%.*]] = alloca i32, align 4 +// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[E]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 16 +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA8]] +// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = call double @frexp(double noundef [[TMP0]], ptr noundef nonnull [[E]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]] +// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[E]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: ret 
double [[MUL]] +// +// NEWSTRUCTPATHTBAA-LABEL: define dso_local double @test_frexp( +// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NEWSTRUCTPATHTBAA-NEXT: [[E:%.*]] = alloca i32, align 4 +// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[E]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 16 +// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]] +// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = call double @frexp(double noundef [[TMP0]], ptr noundef nonnull [[E]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]] +// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[E]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: ret double [[MUL]] // double test_frexp (double num[]) { int e; @@ -100,24 +148,43 @@ double test_frexp (double num[]) { // // Negative test: sincos is a library function, but is not a builtin function // checked in CodeGenFunction::EmitCallExpr. 
-// CHECK-LABEL: define dso_local float @test_sincos( -// CHECK-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SIN:%.*]] = alloca float, align 4 -// CHECK-NEXT: [[COS:%.*]] = alloca float, align 4 -// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[SIN]]) #[[ATTR9]] -// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[COS]]) #[[ATTR9]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8 -// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: call void @sincos(float noundef [[TMP0]], ptr noundef nonnull [[SIN]], ptr noundef nonnull [[COS]]) #[[ATTR9]] -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP1]], [[TMP2]] -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL]], [[TMP3]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[COS]]) #[[ATTR9]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[SIN]]) #[[ATTR9]] -// CHECK-NEXT: ret float [[ADD]] +// NONEWSTRUCTPATHTBAA-LABEL: define dso_local float @test_sincos( +// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NONEWSTRUCTPATHTBAA-NEXT: [[SIN:%.*]] = alloca float, align 4 +// NONEWSTRUCTPATHTBAA-NEXT: [[COS:%.*]] = alloca float, align 4 +// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[SIN]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[COS]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8 +// NONEWSTRUCTPATHTBAA-NEXT: 
[[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]] +// NONEWSTRUCTPATHTBAA-NEXT: call void @sincos(float noundef [[TMP0]], ptr noundef nonnull [[SIN]], ptr noundef nonnull [[COS]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[FLOAT_TBAA2]] +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[FLOAT_TBAA2]] +// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP1]], [[TMP2]] +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ADD:%.*]] = fadd float [[MUL]], [[TMP3]] +// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[COS]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[SIN]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: ret float [[ADD]] +// +// NEWSTRUCTPATHTBAA-LABEL: define dso_local float @test_sincos( +// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NEWSTRUCTPATHTBAA-NEXT: [[SIN:%.*]] = alloca float, align 4 +// NEWSTRUCTPATHTBAA-NEXT: [[COS:%.*]] = alloca float, align 4 +// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[SIN]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[COS]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8 +// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NEWSTRUCTPATHTBAA-NEXT: call void @sincos(float noundef [[TMP0]], ptr noundef nonnull [[SIN]], ptr noundef nonnull [[COS]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[TBAA2]] +// NEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[TBAA2]] +// NEWSTRUCTPATHTBAA-NEXT: 
[[MUL:%.*]] = fmul float [[TMP1]], [[TMP2]] +// NEWSTRUCTPATHTBAA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NEWSTRUCTPATHTBAA-NEXT: [[ADD:%.*]] = fadd float [[MUL]], [[TMP3]] +// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[COS]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[SIN]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: ret float [[ADD]] // float test_sincos (float num[]) { float sin, cos; @@ -127,18 +194,31 @@ float test_sincos (float num[]) { } // TODO: The builtin return a complex type -// CHECK-LABEL: define dso_local float @test_cacoshf( -// CHECK-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR7]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8 -// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x float] [[DOTFCA_0_INSERT]], float 0.000000e+00, 1 -// CHECK-NEXT: [[CALL:%.*]] = tail call { float, float } @cacoshf([2 x float] noundef alignstack(8) [[DOTFCA_1_INSERT]]) #[[ATTR9]] -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[CALL]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]] -// CHECK-NEXT: ret float [[ADD]] +// NONEWSTRUCTPATHTBAA-LABEL: define dso_local float @test_cacoshf( +// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR7]] { +// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8 +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]] +// NONEWSTRUCTPATHTBAA-NEXT: 
[[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[TMP0]], 0 +// NONEWSTRUCTPATHTBAA-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x float] [[DOTFCA_0_INSERT]], float 0.000000e+00, 1 +// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call { float, float } @cacoshf([2 x float] noundef alignstack(8) [[DOTFCA_1_INSERT]]) #[[ATTR9]] +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[CALL]], 0 +// NONEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]] +// NONEWSTRUCTPATHTBAA-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]] +// NONEWSTRUCTPATHTBAA-NEXT: ret float [[ADD]] +// +// NEWSTRUCTPATHTBAA-LABEL: define dso_local float @test_cacoshf( +// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR7]] { +// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]] +// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8 +// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NEWSTRUCTPATHTBAA-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[TMP0]], 0 +// NEWSTRUCTPATHTBAA-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x float] [[DOTFCA_0_INSERT]], float 0.000000e+00, 1 +// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call { float, float } @cacoshf([2 x float] noundef alignstack(8) [[DOTFCA_1_INSERT]]) #[[ATTR9]] +// NEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[CALL]], 0 +// NEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NEWSTRUCTPATHTBAA-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]] +// NEWSTRUCTPATHTBAA-NEXT: ret float [[ADD]] // float test_cacoshf (float num[]) { float _Complex z = cacoshf(num[2]); // Don't emit TBAA metadata @@ -147,13 +227,13 @@ float test_cacoshf (float num[]) { } //. 
-// NONEWSTRUCTPATHTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// NONEWSTRUCTPATHTBAA: [[FLOAT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // NONEWSTRUCTPATHTBAA: [[META3]] = !{!"float", [[META4:![0-9]+]], i64 0} // NONEWSTRUCTPATHTBAA: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // NONEWSTRUCTPATHTBAA: [[META5]] = !{!"Simple C/C++ TBAA"} -// NONEWSTRUCTPATHTBAA: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// NONEWSTRUCTPATHTBAA: [[INT_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} // NONEWSTRUCTPATHTBAA: [[META7]] = !{!"int", [[META4]], i64 0} -// NONEWSTRUCTPATHTBAA: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// NONEWSTRUCTPATHTBAA: [[DOUBLE_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} // NONEWSTRUCTPATHTBAA: [[META9]] = !{!"double", [[META4]], i64 0} //. // NEWSTRUCTPATHTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4} @@ -165,6 +245,3 @@ float test_cacoshf (float num[]) { // NEWSTRUCTPATHTBAA: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0, i64 8} // NEWSTRUCTPATHTBAA: [[META9]] = !{[[META4]], i64 8, !"double"} //. -//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -// NEWSTRUCTPATHTBAA: {{.*}} -// NONEWSTRUCTPATHTBAA: {{.*}} diff --git a/clang/test/CodeGen/ms-intrinsics-underaligned.c b/clang/test/CodeGen/ms-intrinsics-underaligned.c index 34e2afb09f4b9..5be8ed8c6a600 100644 --- a/clang/test/CodeGen/ms-intrinsics-underaligned.c +++ b/clang/test/CodeGen/ms-intrinsics-underaligned.c @@ -111,6 +111,6 @@ long test_InterlockedAdd(X *x) { // CHECK-AARCH64-LABEL: @test_InterlockedAdd64( // CHECK-AARCH64: atomicrmw {{.*}} align 8 long test_InterlockedAdd64(X *x) { - return _InterlockedAdd64(&x->c, 4); + return _InterlockedAdd64((volatile long long*)&x->c, 4); } #endif diff --git a/clang/test/CodeGen/ms_abi.c b/clang/test/CodeGen/ms_abi.c index 5d58c9816da78..2047febabdb11 100644 --- a/clang/test/CodeGen/ms_abi.c +++ b/clang/test/CodeGen/ms_abi.c @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -triple x86_64-unknown-freebsd10.0 -emit-llvm < %s | FileCheck -check-prefix=FREEBSD %s // RUN: %clang_cc1 -triple x86_64-pc-win32 -emit-llvm < %s | FileCheck -check-prefix=WIN64 %s +// RUN: %clang_cc1 -triple x86_64-mingw -emit-llvm < %s | FileCheck -check-prefix=WIN64 %s +// RUN: %clang_cc1 -triple x86_64-cygwin -emit-llvm < %s | FileCheck -check-prefix=WIN64 %s // RUN: %clang_cc1 -triple x86_64-uefi -emit-llvm < %s | FileCheck -check-prefix=WIN64 %s struct foo { diff --git a/clang/test/CodeGen/sanitize-metadata-nosanitize.c b/clang/test/CodeGen/sanitize-metadata-nosanitize.c index eabcbd1409fe2..22ed25bd3b670 100644 --- a/clang/test/CodeGen/sanitize-metadata-nosanitize.c +++ b/clang/test/CodeGen/sanitize-metadata-nosanitize.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6 // RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=covered -fexperimental-sanitize-metadata=atomics 
-fexperimental-sanitize-metadata=uar -triple x86_64-gnu-linux -x c -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK //. @@ -11,9 +11,9 @@ // CHECK: @llvm.global_dtors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_covered2.module_dtor, ptr @__sanitizer_metadata_covered2.module_dtor }, { i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_atomics2.module_dtor, ptr @__sanitizer_metadata_atomics2.module_dtor }] //. // CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) -// CHECK-LABEL: define dso_local void @escape -// CHECK-SAME: (ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections [[META2:![0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @escape( +// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections [[META2:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret void // __attribute__((noinline, not_tail_called)) void escape(const volatile void *p) { @@ -22,14 +22,14 @@ __attribute__((noinline, not_tail_called)) void escape(const volatile void *p) { } // CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) -// CHECK-LABEL: define dso_local i32 @normal_function -// CHECK-SAME: (ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !pcsections [[META4:![0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local i32 @normal_function( +// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !pcsections [[META4:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa 
[[INTPTR_TBAA6:![0-9]+]] // CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11:![0-9]+]] // CHECK-NEXT: notail call void @escape(ptr noundef nonnull [[X_ADDR]]) -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA12:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] // CHECK-NEXT: ret i32 [[TMP0]] // int normal_function(int *x, int *y) { @@ -39,14 +39,14 @@ int normal_function(int *x, int *y) { } // CHECK: Function Attrs: disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) -// CHECK-LABEL: define dso_local i32 @test_disable_sanitize_instrumentation -// CHECK-SAME: (ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local i32 @test_disable_sanitize_instrumentation( +// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA6]] // CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4 // CHECK-NEXT: notail call void @escape(ptr noundef nonnull [[X_ADDR]]) -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] // CHECK-NEXT: ret i32 [[TMP0]] // __attribute__((disable_sanitizer_instrumentation)) int test_disable_sanitize_instrumentation(int *x, int *y) { @@ -56,14 +56,14 @@ __attribute__((disable_sanitizer_instrumentation)) int test_disable_sanitize_ins } // CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, 
inaccessiblemem: none) -// CHECK-LABEL: define dso_local i32 @test_no_sanitize_thread -// CHECK-SAME: (ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !pcsections [[META14:![0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local i32 @test_no_sanitize_thread( +// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !pcsections [[META14:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA6]] // CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11]] // CHECK-NEXT: notail call void @escape(ptr noundef nonnull [[X_ADDR]]) -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] // CHECK-NEXT: ret i32 [[TMP0]] // __attribute__((no_sanitize("thread"))) int test_no_sanitize_thread(int *x, int *y) { @@ -73,14 +73,14 @@ __attribute__((no_sanitize("thread"))) int test_no_sanitize_thread(int *x, int * } // CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) -// CHECK-LABEL: define dso_local i32 @test_no_sanitize_all -// CHECK-SAME: (ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] !pcsections [[META14]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local i32 @test_no_sanitize_all( +// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] !pcsections [[META14]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: 
store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA6]] // CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11]] // CHECK-NEXT: notail call void @escape(ptr noundef nonnull [[X_ADDR]]) -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] // CHECK-NEXT: ret i32 [[TMP0]] // __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) { @@ -101,13 +101,13 @@ __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) { // CHECK: [[META3]] = !{i64 0} // CHECK: [[META4]] = !{!"sanmd_covered2!C", [[META5:![0-9]+]]} // CHECK: [[META5]] = !{i64 3} -// CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[INTPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} // CHECK: [[META7]] = !{!"p1 int", [[META8:![0-9]+]], i64 0} // CHECK: [[META8]] = !{!"any pointer", [[META9:![0-9]+]], i64 0} // CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0} // CHECK: [[META10]] = !{!"Simple C/C++ TBAA"} // CHECK: [[META11]] = !{!"sanmd_atomics2!C"} -// CHECK: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// CHECK: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} // CHECK: [[META13]] = !{!"int", [[META9]], i64 0} // CHECK: [[META14]] = !{!"sanmd_covered2!C", [[META15:![0-9]+]]} // CHECK: [[META15]] = !{i64 2} diff --git a/clang/test/CodeGen/sysv_abi.c b/clang/test/CodeGen/sysv_abi.c index 29ea819c2aa26..a66ecc6e26242 100644 --- a/clang/test/CodeGen/sysv_abi.c +++ b/clang/test/CodeGen/sysv_abi.c @@ -1,7 +1,11 @@ // RUN: %clang_cc1 -triple x86_64-pc-win32 -emit-llvm -target-cpu skylake-avx512 < %s | FileCheck %s --check-prefixes=CHECK,AVX +// RUN: %clang_cc1 -triple x86_64-mingw -emit-llvm -target-cpu skylake-avx512 < %s | FileCheck %s --check-prefixes=CHECK,AVX +// RUN: %clang_cc1 -triple x86_64-cygwin -emit-llvm -target-cpu skylake-avx512 < %s | FileCheck %s 
--check-prefixes=CHECK,AVX // RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -target-cpu skylake-avx512 < %s | FileCheck %s --check-prefixes=CHECK,AVX // RUN: %clang_cc1 -triple x86_64-uefi -emit-llvm -target-cpu skylake-avx512 < %s | FileCheck %s --check-prefixes=CHECK,AVX // RUN: %clang_cc1 -triple x86_64-pc-win32 -emit-llvm < %s | FileCheck %s --check-prefixes=CHECK,NOAVX +// RUN: %clang_cc1 -triple x86_64-mingw -emit-llvm < %s | FileCheck %s --check-prefixes=CHECK,NOAVX +// RUN: %clang_cc1 -triple x86_64-cygwin -emit-llvm < %s | FileCheck %s --check-prefixes=CHECK,NOAVX // RUN: %clang_cc1 -triple x86_64-linux -emit-llvm < %s | FileCheck %s --check-prefixes=CHECK,NOAVX // RUN: %clang_cc1 -triple x86_64-uefi -emit-llvm < %s | FileCheck %s --check-prefixes=CHECK,NOAVX diff --git a/clang/test/CodeGen/ubsan-pass-object-size.c b/clang/test/CodeGen/ubsan-pass-object-size.c index b36b8bb409aef..c606d33128322 100644 --- a/clang/test/CodeGen/ubsan-pass-object-size.c +++ b/clang/test/CodeGen/ubsan-pass-object-size.c @@ -14,7 +14,8 @@ int foo(int *const p __attribute__((pass_object_size(0))), int n) { // CHECK: __ubsan_handle_out_of_bounds { - int **p = &p; // Shadow the parameter. The pass_object_size info is lost. + int **q = &p; + int **p = q; // Shadow the parameter. The pass_object_size info is lost. 
// CHECK-NOT: __ubsan_handle_out_of_bounds x = *p[n]; } diff --git a/clang/test/CodeGen/vector-convert-boolean.cpp b/clang/test/CodeGen/vector-convert-boolean.cpp new file mode 100644 index 0000000000000..f47f608833645 --- /dev/null +++ b/clang/test/CodeGen/vector-convert-boolean.cpp @@ -0,0 +1,70 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s + +using v1i = int [[clang::ext_vector_type(1)]]; +using v1b = bool [[clang::ext_vector_type(1)]]; +using v8i = int [[clang::ext_vector_type(8)]]; +using v8b = bool [[clang::ext_vector_type(8)]]; +using v16i = short [[clang::ext_vector_type(16)]]; +using v16b = bool [[clang::ext_vector_type(16)]]; +using v32i = char [[clang::ext_vector_type(32)]]; +using v32b = bool [[clang::ext_vector_type(32)]]; + +// CHECK-LABEL: define dso_local noundef i8 @_Z3fooDv1_i( +// CHECK-SAME: i32 noundef [[V_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <1 x i1>, align 1 +// CHECK-NEXT: [[V:%.*]] = alloca <1 x i32>, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <1 x i32>, align 4 +// CHECK-NEXT: store i32 [[V_COERCE]], ptr [[V]], align 4 +// CHECK-NEXT: [[V1:%.*]] = load <1 x i32>, ptr [[V]], align 4 +// CHECK-NEXT: store <1 x i32> [[V1]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i32>, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne <1 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: store <1 x i1> [[TOBOOL]], ptr [[RETVAL]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[RETVAL]], align 1 +// CHECK-NEXT: ret i8 [[TMP1]] +// +v1b foo(v1i v) { return v; } +// CHECK-LABEL: define dso_local noundef i8 @_Z3fooDv8_i( +// CHECK-SAME: ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <8 x i1>, align 1 +// CHECK-NEXT: 
[[V_ADDR:%.*]] = alloca <8 x i32>, align 32 +// CHECK-NEXT: [[V:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32 +// CHECK-NEXT: store <8 x i32> [[V]], ptr [[V_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32 +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: store <8 x i1> [[TOBOOL]], ptr [[RETVAL]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[RETVAL]], align 1 +// CHECK-NEXT: ret i8 [[TMP2]] +// +v8b foo(v8i v) { return v; } +// CHECK-LABEL: define dso_local noundef i16 @_Z3fooDv16_s( +// CHECK-SAME: ptr noundef byval(<16 x i16>) align 32 [[TMP0:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <16 x i1>, align 2 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <16 x i16>, align 32 +// CHECK-NEXT: [[V:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32 +// CHECK-NEXT: store <16 x i16> [[V]], ptr [[V_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr [[V_ADDR]], align 32 +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: store <16 x i1> [[TOBOOL]], ptr [[RETVAL]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[RETVAL]], align 2 +// CHECK-NEXT: ret i16 [[TMP2]] +// +v16b foo(v16i v) { return v; } +// CHECK-LABEL: define dso_local noundef i32 @_Z3fooDv32_c( +// CHECK-SAME: ptr noundef byval(<32 x i8>) align 32 [[TMP0:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <32 x i1>, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <32 x i8>, align 32 +// CHECK-NEXT: [[V:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32 +// CHECK-NEXT: store <32 x i8> [[V]], ptr [[V_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[V_ADDR]], align 32 +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer +// CHECK-NEXT: store <32 x i1> [[TOBOOL]], ptr [[RETVAL]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[RETVAL]], 
align 4 +// CHECK-NEXT: ret i32 [[TMP2]] +// +v32b foo(v32i v) { return v; } diff --git a/clang/test/CodeGen/vla.c b/clang/test/CodeGen/vla.c index a22ba727df2fe..18aa744b5f6fe 100644 --- a/clang/test/CodeGen/vla.c +++ b/clang/test/CodeGen/vla.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -Wno-int-conversion -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,NULL-INVALID -// RUN: %clang_cc1 -Wno-int-conversion -triple i386-unknown-unknown %s -emit-llvm -fno-delete-null-pointer-checks -o - | FileCheck %s -check-prefixes=CHECK,NULL-VALID +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -Wno-int-conversion -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,NULL-INVALID +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -Wno-int-conversion -triple i386-unknown-unknown %s -emit-llvm -fno-delete-null-pointer-checks -o - | FileCheck %s -check-prefixes=CHECK,NULL-VALID int b(char* x); diff --git a/clang/test/CodeGen/vlt_to_pointer.c b/clang/test/CodeGen/vlt_to_pointer.c index f06a1607ef469..7b9e60ab51ae7 100644 --- a/clang/test/CodeGen/vlt_to_pointer.c +++ b/clang/test/CodeGen/vlt_to_pointer.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types %s -emit-llvm -o - | FileCheck %s int c[1][3*2]; // CHECK: @{{.+}} ={{.*}}global [1 x [6 x {{i[0-9]+}}]] zeroinitializer diff --git a/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp b/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp index a77593f5df738..8969e12f8f797 100644 --- a/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp +++ b/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -O1 -disable-llvm-passes -emit-llvm %s -o - 
-triple=x86_64-- | FileCheck %s extern volatile bool b; @@ -6,22 +6,23 @@ extern volatile int i; extern bool A(); extern bool B(); -// CHECK-LABEL: @_Z1fv( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z1fv( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2:![0-9]+]], !range [[RNG6:![0-9]+]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 true) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -// CHECK: if.then: +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2:![0-9]+]], !range [[RNG6:![0-9]+]], !noundef [[META7:![0-9]+]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 true) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK: [[IF_THEN]]: // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Av() // CHECK-NEXT: store i1 [[CALL]], ptr [[RETVAL]], align 1 -// CHECK-NEXT: br label [[RETURN:%.*]] -// CHECK: if.end: +// CHECK-NEXT: br label %[[RETURN:.*]] +// CHECK: [[IF_END]]: // CHECK-NEXT: [[CALL1:%.*]] = call noundef zeroext i1 @_Z1Bv() // CHECK-NEXT: store i1 [[CALL1]], ptr [[RETVAL]], align 1 -// CHECK-NEXT: br label [[RETURN]] -// CHECK: return: +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[RETURN]]: // CHECK-NEXT: [[TMP1:%.*]] = load i1, ptr [[RETVAL]], align 1 // CHECK-NEXT: ret i1 [[TMP1]] // @@ -33,22 +34,23 @@ bool f() { return B(); } -// CHECK-LABEL: @_Z1gv( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z1gv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 1 -// CHECK-NEXT: 
[[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -// CHECK: if.then: +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK: [[IF_THEN]]: // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Av() // CHECK-NEXT: store i1 [[CALL]], ptr [[RETVAL]], align 1 -// CHECK-NEXT: br label [[RETURN:%.*]] -// CHECK: if.end: +// CHECK-NEXT: br label %[[RETURN:.*]] +// CHECK: [[IF_END]]: // CHECK-NEXT: [[CALL1:%.*]] = call noundef zeroext i1 @_Z1Bv() // CHECK-NEXT: store i1 [[CALL1]], ptr [[RETVAL]], align 1 -// CHECK-NEXT: br label [[RETURN]] -// CHECK: return: +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[RETURN]]: // CHECK-NEXT: [[TMP1:%.*]] = load i1, ptr [[RETVAL]], align 1 // CHECK-NEXT: ret i1 [[TMP1]] // @@ -61,22 +63,23 @@ bool g() { return B(); } -// CHECK-LABEL: @_Z1hv( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z1hv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -// CHECK: if.then: +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], 
!range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK: [[IF_THEN]]: // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Av() // CHECK-NEXT: store i1 [[CALL]], ptr [[RETVAL]], align 1 -// CHECK-NEXT: br label [[RETURN:%.*]] -// CHECK: if.end: +// CHECK-NEXT: br label %[[RETURN:.*]] +// CHECK: [[IF_END]]: // CHECK-NEXT: [[CALL1:%.*]] = call noundef zeroext i1 @_Z1Bv() // CHECK-NEXT: store i1 [[CALL1]], ptr [[RETVAL]], align 1 -// CHECK-NEXT: br label [[RETURN]] -// CHECK: return: +// CHECK-NEXT: br label %[[RETURN]] +// CHECK: [[RETURN]]: // CHECK-NEXT: [[TMP1:%.*]] = load i1, ptr [[RETVAL]], align 1 // CHECK-NEXT: ret i1 [[TMP1]] // @@ -87,18 +90,19 @@ bool h() { return B(); } -// CHECK-LABEL: @_Z8NullStmtv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -// CHECK: if.then: -// CHECK-NEXT: br label [[IF_END:%.*]] -// CHECK: if.else: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: +// CHECK-LABEL: define dso_local void @_Z8NullStmtv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +// CHECK: 
[[IF_THEN]]: +// CHECK-NEXT: br label %[[IF_END:.*]] +// CHECK: [[IF_ELSE]]: +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: // CHECK-NEXT: ret void // void NullStmt() { @@ -110,33 +114,34 @@ void NullStmt() { } } -// CHECK-LABEL: @_Z6IfStmtv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_END2:%.*]] -// CHECK: if.then: +// CHECK-LABEL: define dso_local void @_Z6IfStmtv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END2:.*]] +// CHECK: [[IF_THEN]]: // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Bv() -// CHECK-NEXT: br i1 [[CALL]], label [[IF_THEN1:%.*]], label [[IF_END:%.*]] -// CHECK: if.then1: -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: -// CHECK-NEXT: br label [[IF_END2]] -// CHECK: if.end2: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK-NEXT: br i1 [[TOBOOL3]], label [[IF_THEN4:%.*]], label [[IF_END8:%.*]] -// CHECK: if.then4: +// CHECK-NEXT: br i1 [[CALL]], label %[[IF_THEN1:.*]], label %[[IF_END:.*]] +// CHECK: [[IF_THEN1]]: +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: +// CHECK-NEXT: br label %[[IF_END2]] +// CHECK: [[IF_END2]]: +// CHECK-NEXT: [[TMP1:%.*]] = load 
volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK-NEXT: br i1 [[LOADEDV3]], label %[[IF_THEN4:.*]], label %[[IF_END8:.*]] +// CHECK: [[IF_THEN4]]: // CHECK-NEXT: [[CALL5:%.*]] = call noundef zeroext i1 @_Z1Bv() // CHECK-NEXT: [[CALL5_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CALL5]], i1 false) -// CHECK-NEXT: br i1 [[CALL5_EXPVAL]], label [[IF_THEN6:%.*]], label [[IF_END7:%.*]] -// CHECK: if.then6: -// CHECK-NEXT: store volatile i8 0, ptr @b, align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[IF_END7]] -// CHECK: if.end7: -// CHECK-NEXT: br label [[IF_END8]] -// CHECK: if.end8: +// CHECK-NEXT: br i1 [[CALL5_EXPVAL]], label %[[IF_THEN6:.*]], label %[[IF_END7:.*]] +// CHECK: [[IF_THEN6]]: +// CHECK-NEXT: store volatile i8 0, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: br label %[[IF_END7]] +// CHECK: [[IF_END7]]: +// CHECK-NEXT: br label %[[IF_END8]] +// CHECK: [[IF_END8]]: // CHECK-NEXT: ret void // void IfStmt() { @@ -149,37 +154,38 @@ void IfStmt() { } } -// CHECK-LABEL: @_Z9WhileStmtv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -// CHECK: if.then: -// CHECK-NEXT: br label [[WHILE_COND:%.*]] -// CHECK: while.cond: +// CHECK-LABEL: define dso_local void @_Z9WhileStmtv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label 
%[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: br label %[[WHILE_COND:.*]] +// CHECK: [[WHILE_COND]]: // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Bv() -// CHECK-NEXT: br i1 [[CALL]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]] -// CHECK: while.body: -// CHECK-NEXT: br label [[WHILE_COND]], !llvm.loop [[LOOP7:![0-9]+]] -// CHECK: while.end: -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK-NEXT: br i1 [[TOBOOL1]], label [[IF_THEN2:%.*]], label [[IF_END7:%.*]] -// CHECK: if.then2: -// CHECK-NEXT: br label [[WHILE_COND3:%.*]] -// CHECK: while.cond3: +// CHECK-NEXT: br i1 [[CALL]], label %[[WHILE_BODY:.*]], label %[[WHILE_END:.*]] +// CHECK: [[WHILE_BODY]]: +// CHECK-NEXT: br label %[[WHILE_COND]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK: [[WHILE_END]]: +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV1:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK-NEXT: br i1 [[LOADEDV1]], label %[[IF_THEN2:.*]], label %[[IF_END7:.*]] +// CHECK: [[IF_THEN2]]: +// CHECK-NEXT: br label %[[WHILE_COND3:.*]] +// CHECK: [[WHILE_COND3]]: // CHECK-NEXT: [[CALL4:%.*]] = call noundef zeroext i1 @_Z1Bv() // CHECK-NEXT: [[CALL4_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CALL4]], i1 false) -// CHECK-NEXT: br i1 [[CALL4_EXPVAL]], label [[WHILE_BODY5:%.*]], label [[WHILE_END6:%.*]] -// CHECK: while.body5: -// CHECK-NEXT: store volatile i8 0, ptr @b, align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[WHILE_COND3]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK: while.end6: -// CHECK-NEXT: br label [[IF_END7]] -// CHECK: if.end7: +// CHECK-NEXT: br i1 [[CALL4_EXPVAL]], label %[[WHILE_BODY5:.*]], label %[[WHILE_END6:.*]] +// CHECK: 
[[WHILE_BODY5]]: +// CHECK-NEXT: store volatile i8 0, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: br label %[[WHILE_COND3]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK: [[WHILE_END6]]: +// CHECK-NEXT: br label %[[IF_END7]] +// CHECK: [[IF_END7]]: // CHECK-NEXT: ret void // void WhileStmt() { @@ -191,35 +197,36 @@ void WhileStmt() { [[unlikely]] { b = false; } } -// CHECK-LABEL: @_Z6DoStmtv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -// CHECK: if.then: -// CHECK-NEXT: br label [[DO_BODY:%.*]] -// CHECK: do.body: -// CHECK-NEXT: br label [[DO_COND:%.*]] -// CHECK: do.cond: +// CHECK-LABEL: define dso_local void @_Z6DoStmtv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: br label %[[DO_BODY:.*]] +// CHECK: [[DO_BODY]]: +// CHECK-NEXT: br label %[[DO_COND:.*]] +// CHECK: [[DO_COND]]: // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Bv() -// CHECK-NEXT: br i1 [[CALL]], label [[DO_BODY]], label [[DO_END:%.*]], !llvm.loop [[LOOP11:![0-9]+]] -// CHECK: do.end: -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK-NEXT: br i1 [[TOBOOL1]], label [[IF_THEN2:%.*]], label [[IF_END7:%.*]] 
-// CHECK: if.then2: -// CHECK-NEXT: br label [[DO_BODY3:%.*]] -// CHECK: do.body3: -// CHECK-NEXT: br label [[DO_COND4:%.*]] -// CHECK: do.cond4: +// CHECK-NEXT: br i1 [[CALL]], label %[[DO_BODY]], label %[[DO_END:.*]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK: [[DO_END]]: +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV1:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK-NEXT: br i1 [[LOADEDV1]], label %[[IF_THEN2:.*]], label %[[IF_END7:.*]] +// CHECK: [[IF_THEN2]]: +// CHECK-NEXT: br label %[[DO_BODY3:.*]] +// CHECK: [[DO_BODY3]]: +// CHECK-NEXT: br label %[[DO_COND4:.*]] +// CHECK: [[DO_COND4]]: // CHECK-NEXT: [[CALL5:%.*]] = call noundef zeroext i1 @_Z1Bv() -// CHECK-NEXT: br i1 [[CALL5]], label [[DO_BODY3]], label [[DO_END6:%.*]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK: do.end6: -// CHECK-NEXT: br label [[IF_END7]] -// CHECK: if.end7: +// CHECK-NEXT: br i1 [[CALL5]], label %[[DO_BODY3]], label %[[DO_END6:.*]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK: [[DO_END6]]: +// CHECK-NEXT: br label %[[IF_END7]] +// CHECK: [[IF_END7]]: // CHECK-NEXT: ret void // void DoStmt() { @@ -234,36 +241,37 @@ void DoStmt() { while (B()); } -// CHECK-LABEL: @_Z7ForStmtv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -// CHECK: if.then: -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: +// CHECK-LABEL: define dso_local void @_Z7ForStmtv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] 
+// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Bv() -// CHECK-NEXT: br i1 [[CALL]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -// CHECK: for.body: -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK: for.end: -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK-NEXT: br i1 [[TOBOOL1]], label [[IF_THEN2:%.*]], label [[IF_END7:%.*]] -// CHECK: if.then2: -// CHECK-NEXT: br label [[FOR_COND3:%.*]] -// CHECK: for.cond3: +// CHECK-NEXT: br i1 [[CALL]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK: [[FOR_END]]: +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV1:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK-NEXT: br i1 [[LOADEDV1]], label %[[IF_THEN2:.*]], label %[[IF_END7:.*]] +// CHECK: [[IF_THEN2]]: +// CHECK-NEXT: br label %[[FOR_COND3:.*]] +// CHECK: [[FOR_COND3]]: // CHECK-NEXT: [[CALL4:%.*]] = call noundef zeroext i1 @_Z1Bv() // CHECK-NEXT: [[CALL4_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CALL4]], i1 false) -// CHECK-NEXT: br i1 [[CALL4_EXPVAL]], label [[FOR_BODY5:%.*]], label [[FOR_END6:%.*]] -// CHECK: for.body5: -// CHECK-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK: for.end6: -// CHECK-NEXT: br label [[IF_END7]] -// CHECK: if.end7: +// CHECK-NEXT: br 
i1 [[CALL4_EXPVAL]], label %[[FOR_BODY5:.*]], label %[[FOR_END6:.*]] +// CHECK: [[FOR_BODY5]]: +// CHECK-NEXT: br label %[[FOR_COND3]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK: [[FOR_END6]]: +// CHECK-NEXT: br label %[[IF_END7]] +// CHECK: [[IF_END7]]: // CHECK-NEXT: ret void // void ForStmt() { @@ -275,20 +283,21 @@ void ForStmt() { [[unlikely]] {} } -// CHECK-LABEL: @_Z8GotoStmtv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -// CHECK: if.then: -// CHECK-NEXT: br label [[END:%.*]] -// CHECK: if.else: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[IF_END:%.*]] -// CHECK: if.end: -// CHECK-NEXT: br label [[END]] -// CHECK: end: +// CHECK-LABEL: define dso_local void @_Z8GotoStmtv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: br label %[[END:.*]] +// CHECK: [[IF_ELSE]]: +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: br label %[[IF_END:.*]] +// CHECK: [[IF_END]]: +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]]: // CHECK-NEXT: ret void // void GotoStmt() { @@ -301,18 +310,19 @@ void GotoStmt() { end:; } -// CHECK-LABEL: @_Z10ReturnStmtv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] 
-// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -// CHECK: if.then: -// CHECK-NEXT: br label [[IF_END:%.*]] -// CHECK: if.else: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: +// CHECK-LABEL: define dso_local void @_Z10ReturnStmtv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: br label %[[IF_END:.*]] +// CHECK: [[IF_ELSE]]: +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: // CHECK-NEXT: ret void // void ReturnStmt() { @@ -324,35 +334,36 @@ void ReturnStmt() { } } -// CHECK-LABEL: @_Z10SwitchStmtv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA15:![0-9]+]] -// CHECK-NEXT: switch i32 [[TMP1]], label [[SW_EPILOG:%.*]] [ +// CHECK-LABEL: define dso_local void @_Z10SwitchStmtv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa 
[[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) +// CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA16:![0-9]+]] +// CHECK-NEXT: switch i32 [[TMP1]], label %[[SW_EPILOG:.*]] [ // CHECK-NEXT: ] -// CHECK: sw.epilog: -// CHECK-NEXT: br label [[IF_END:%.*]] -// CHECK: if.else: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: -// CHECK-NEXT: [[TMP2:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[TBAA2]], !range [[RNG6]] -// CHECK-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK-NEXT: br i1 [[TOBOOL1]], label [[IF_THEN2:%.*]], label [[IF_ELSE4:%.*]] -// CHECK: if.then2: -// CHECK-NEXT: [[TMP3:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA15]] -// CHECK-NEXT: switch i32 [[TMP3]], label [[SW_EPILOG3:%.*]] [ +// CHECK: [[SW_EPILOG]]: +// CHECK-NEXT: br label %[[IF_END:.*]] +// CHECK: [[IF_ELSE]]: +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: +// CHECK-NEXT: [[TMP2:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[LOADEDV1:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK-NEXT: br i1 [[LOADEDV1]], label %[[IF_THEN2:.*]], label %[[IF_ELSE4:.*]] +// CHECK: [[IF_THEN2]]: +// CHECK-NEXT: [[TMP3:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA16]] +// CHECK-NEXT: switch i32 [[TMP3]], label %[[SW_EPILOG3:.*]] [ // CHECK-NEXT: ] -// CHECK: sw.epilog3: -// CHECK-NEXT: br label [[IF_END5:%.*]] -// CHECK: if.else4: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[IF_END5]] -// CHECK: if.end5: 
+// CHECK: [[SW_EPILOG3]]: +// CHECK-NEXT: br label %[[IF_END5:.*]] +// CHECK: [[IF_ELSE4]]: +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: br label %[[IF_END5]] +// CHECK: [[IF_END5]]: // CHECK-NEXT: ret void // void SwitchStmt() { @@ -371,3 +382,21 @@ void SwitchStmt() { } } +//. +// CHECK: [[BOOL_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"bool", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK: [[RNG6]] = !{i8 0, i8 2} +// CHECK: [[META7]] = !{} +// CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]} +// CHECK: [[META9]] = !{!"llvm.loop.mustprogress"} +// CHECK: [[META10]] = !{!"llvm.loop.unroll.disable"} +// CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META9]], [[META10]]} +// CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META9]], [[META10]]} +// CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META9]], [[META10]]} +// CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META9]], [[META10]]} +// CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META9]], [[META10]]} +// CHECK: [[INT_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// CHECK: [[META17]] = !{!"int", [[META4]], i64 0} +//. 
diff --git a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp index 151b77ac1007b..441faac6bdd3b 100644 --- a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp +++ b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp @@ -1,61 +1,64 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -O1 -disable-llvm-passes -emit-llvm %s -o - -triple=x86_64-linux-gnu -verify // RUN: %clang_cc1 -O1 -disable-llvm-passes -emit-llvm %s -o - -triple=x86_64-linux-gnu | FileCheck %s -// CHECK-LABEL: @_Z2wli( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @_Z2wli( +// CHECK-SAME: i32 noundef [[E:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[E_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[E:%.*]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: br label [[WHILE_COND:%.*]] -// CHECK: while.cond: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: store i32 [[E]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] +// CHECK-NEXT: br label %[[WHILE_COND:.*]] +// CHECK: [[WHILE_COND]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 // CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 true) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]] -// CHECK: while.body: -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label %[[WHILE_BODY:.*]], label %[[WHILE_END:.*]] +// CHECK: [[WHILE_BODY]]: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[INC:%.*]] = add nsw i32 
[[TMP1]], 1 -// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[WHILE_COND]], !llvm.loop [[LOOP6:![0-9]+]] -// CHECK: while.end: +// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[WHILE_COND]], !llvm.loop [[LOOP6:![0-9]+]] +// CHECK: [[WHILE_END]]: // CHECK-NEXT: ret void // void wl(int e){ while(e) [[likely]] ++e; } -// CHECK-LABEL: @_Z2wui( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @_Z2wui( +// CHECK-SAME: i32 noundef [[E:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[E_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[E:%.*]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[WHILE_COND:%.*]] -// CHECK: while.cond: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: store i32 [[E]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[WHILE_COND:.*]] +// CHECK: [[WHILE_COND]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 // CHECK-NEXT: [[TOBOOL_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[TOBOOL]], i1 false) -// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]] -// CHECK: while.body: -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: br i1 [[TOBOOL_EXPVAL]], label %[[WHILE_BODY:.*]], label %[[WHILE_END:.*]] +// CHECK: [[WHILE_BODY]]: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[WHILE_COND]], !llvm.loop [[LOOP9:![0-9]+]] -// CHECK: while.end: +// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label 
%[[WHILE_COND]], !llvm.loop [[LOOP9:![0-9]+]] +// CHECK: [[WHILE_END]]: // CHECK-NEXT: ret void // void wu(int e){ while(e) [[unlikely]] ++e; } -// CHECK-LABEL: @_Z15w_branch_elidedj( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @_Z15w_branch_elidedj( +// CHECK-SAME: i32 noundef [[E:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[E_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[E:%.*]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[WHILE_BODY:%.*]] -// CHECK: while.body: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: store i32 [[E]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[WHILE_BODY:.*]] +// CHECK: [[WHILE_BODY]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[INC:%.*]] = add i32 [[TMP0]], 1 -// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[WHILE_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[WHILE_BODY]], !llvm.loop [[LOOP10:![0-9]+]] // void w_branch_elided(unsigned e){ // expected-warning@+2 {{attribute 'likely' has no effect when annotating an infinite loop}} @@ -63,31 +66,32 @@ void w_branch_elided(unsigned e){ while(1) [[likely]] ++e; } -// CHECK-LABEL: @_Z2flj( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @_Z2flj( +// CHECK-SAME: i32 noundef [[E:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[E_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[E:%.*]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: store i32 [[E]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR3:[0-9]+]] -// CHECK-NEXT: store i32 0, ptr [[I]], align 4, !tbaa 
[[TBAA2]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: store i32 0, ptr [[I]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP0]], [[TMP1]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 true) -// CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: +// CHECK-NEXT: br i1 [[CMP_EXPVAL]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +// CHECK: [[FOR_COND_CLEANUP]]: // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: br label [[FOR_END:%.*]] -// CHECK: for.body: -// CHECK-NEXT: br label [[FOR_INC:%.*]] -// CHECK: for.inc: -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: br label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: br label %[[FOR_INC:.*]] +// CHECK: [[FOR_INC]]: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[INC:%.*]] = add i32 [[TMP2]], 1 -// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]] -// CHECK: for.end: +// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK: [[FOR_END]]: // CHECK-NEXT: ret void // void fl(unsigned e) @@ -95,31 +99,32 @@ void fl(unsigned e) for(int i = 0; i != e; ++e) [[likely]]; } -// CHECK-LABEL: @_Z2fui( -// CHECK-NEXT: entry: +// CHECK-LABEL: 
define dso_local void @_Z2fui( +// CHECK-SAME: i32 noundef [[E:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[E_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[E:%.*]], ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: store i32 [[E]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: store i32 0, ptr [[I]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: store i32 0, ptr [[I]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP0]], [[TMP1]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 false) -// CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: +// CHECK-NEXT: br i1 [[CMP_EXPVAL]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +// CHECK: [[FOR_COND_CLEANUP]]: // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: br label [[FOR_END:%.*]] -// CHECK: for.body: -// CHECK-NEXT: br label [[FOR_INC:%.*]] -// CHECK: for.inc: -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: br label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: br label %[[FOR_INC:.*]] +// CHECK: [[FOR_INC]]: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 -// CHECK-NEXT: store i32 [[INC]], 
ptr [[E_ADDR]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK: for.end: +// CHECK-NEXT: store i32 [[INC]], ptr [[E_ADDR]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK: [[FOR_END]]: // CHECK-NEXT: ret void // void fu(int e) @@ -127,62 +132,64 @@ void fu(int e) for(int i = 0; i != e; ++e) [[unlikely]]; } -// CHECK-LABEL: @_Z15f_branch_elidedv( -// CHECK-NEXT: entry: -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK-LABEL: define dso_local void @_Z15f_branch_elidedv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] // void f_branch_elided() { for(;;) [[likely]]; } -// CHECK-LABEL: @_Z3frlOA4_i( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @_Z3frlOA4_i( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(16) [[E:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[__BEGIN1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[__END1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store ptr [[E:%.*]], ptr [[E_ADDR]], align 8, !tbaa [[TBAA14:![0-9]+]] +// CHECK-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8, !tbaa [[INTPTR_TBAA14:![0-9]+]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[__RANGE1]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: store ptr [[TMP0]], ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !tbaa [[INTPTR_TBAA14]], !nonnull [[META17:![0-9]+]], !align [[META18:![0-9]+]] +// CHECK-NEXT: 
store ptr [[TMP0]], ptr [[__RANGE1]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[__BEGIN1]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[INTPTR_TBAA14]], !nonnull [[META17]], !align [[META18]] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[__END1]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[INTPTR_TBAA14]], !nonnull [[META17]], !align [[META18]] // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4 -// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[INTPTR_TBAA14]] +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 true) -// CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label 
[[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: +// CHECK-NEXT: br i1 [[CMP_EXPVAL]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +// CHECK: [[FOR_COND_CLEANUP]]: // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[__END1]]) #[[ATTR3]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[__BEGIN1]]) #[[ATTR3]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[__RANGE1]]) #[[ATTR3]] -// CHECK-NEXT: br label [[FOR_END:%.*]] -// CHECK: for.body: +// CHECK-NEXT: br label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: br label [[FOR_INC:%.*]] -// CHECK: for.inc: -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: br label %[[FOR_INC:.*]] +// CHECK: [[FOR_INC]]: +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 1 -// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK: for.end: +// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK: [[FOR_END]]: // CHECK-NEXT: ret void // void frl(int (&&e) [4]) @@ -190,54 +197,76 @@ void 
frl(int (&&e) [4]) for(int i : e) [[likely]]; } -// CHECK-LABEL: @_Z3fruOA4_i( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @_Z3fruOA4_i( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(16) [[E:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[__BEGIN1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[__END1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store ptr [[E:%.*]], ptr [[E_ADDR]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[__RANGE1]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: store ptr [[TMP0]], ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !tbaa [[INTPTR_TBAA14]], !nonnull [[META17]], !align [[META18]] +// CHECK-NEXT: store ptr [[TMP0]], ptr [[__RANGE1]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[__BEGIN1]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[INTPTR_TBAA14]], !nonnull [[META17]], !align [[META18]] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[__END1]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[INTPTR_TBAA14]], !nonnull [[META17]], 
!align [[META18]] // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4 -// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[INTPTR_TBAA14]] +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 false) -// CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: +// CHECK-NEXT: br i1 [[CMP_EXPVAL]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +// CHECK: [[FOR_COND_CLEANUP]]: // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[__END1]]) #[[ATTR3]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[__BEGIN1]]) #[[ATTR3]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[__RANGE1]]) #[[ATTR3]] -// CHECK-NEXT: br label [[FOR_END:%.*]] -// CHECK: for.body: +// CHECK-NEXT: br label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] 
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: br label [[FOR_INC:%.*]] -// CHECK: for.inc: -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: br label %[[FOR_INC:.*]] +// CHECK: [[FOR_INC]]: +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] // CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 1 -// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK: for.end: +// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[INTPTR_TBAA14]] +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]] +// CHECK: [[FOR_END]]: // CHECK-NEXT: ret void // void fru(int (&&e) [4]) { for(int i : e) [[unlikely]]; } +//. 
+// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +// CHECK: [[META7]] = !{!"llvm.loop.mustprogress"} +// CHECK: [[META8]] = !{!"llvm.loop.unroll.disable"} +// CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]], [[META8]]} +// CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META7]], [[META8]]} +// CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META7]], [[META8]]} +// CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META7]], [[META8]]} +// CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META7]], [[META8]]} +// CHECK: [[INTPTR_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} +// CHECK: [[META15]] = !{!"p1 int", [[META16:![0-9]+]], i64 0} +// CHECK: [[META16]] = !{!"any pointer", [[META4]], i64 0} +// CHECK: [[META17]] = !{} +// CHECK: [[META18]] = !{i64 4} +// CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META8]]} +// CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META8]]} +//. 
diff --git a/clang/test/CodeGenCXX/attr-likelihood-switch-branch-weights.cpp b/clang/test/CodeGenCXX/attr-likelihood-switch-branch-weights.cpp index 328d1bcc76208..bb6f5bb248e3e 100644 --- a/clang/test/CodeGenCXX/attr-likelihood-switch-branch-weights.cpp +++ b/clang/test/CodeGenCXX/attr-likelihood-switch-branch-weights.cpp @@ -1,15 +1,16 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -O1 -disable-llvm-passes -emit-llvm %s -o - -triple=x86_64-linux-gnu | FileCheck %s extern volatile int i; -// CHECK-LABEL: @_Z8OneCaseLv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG]] -// CHECK-NEXT: ], !prof !6 -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z8OneCaseLv( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2:![0-9]+]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_EPILOG]] +// CHECK-NEXT: ], !prof [[PROF6:![0-9]+]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void OneCaseL() { @@ -18,18 +19,19 @@ void OneCaseL() { } } -// CHECK-LABEL: @_Z8OneCaseUv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_BB:%.*]] -// CHECK-NEXT: ], !prof !7 -// CHECK: sw.bb: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @_Z8OneCaseUv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa 
[[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_BB:.*]] +// CHECK-NEXT: ], !prof [[PROF7:![0-9]+]] +// CHECK: [[SW_BB]]: +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-NEXT: store volatile i32 [[INC]], ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-NEXT: store volatile i32 [[INC]], ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void OneCaseU() { @@ -38,14 +40,15 @@ void OneCaseU() { } } -// CHECK-LABEL: @_Z10TwoCasesLNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG]] -// CHECK-NEXT: i32 2, label [[SW_EPILOG]] -// CHECK-NEXT: ], !prof !8 -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z10TwoCasesLNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_EPILOG]] +// CHECK-NEXT: i32 2, label %[[SW_EPILOG]] +// CHECK-NEXT: ], !prof [[PROF8:![0-9]+]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void TwoCasesLN() { @@ -55,14 +58,15 @@ void TwoCasesLN() { } } -// CHECK-LABEL: @_Z10TwoCasesUNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG]] -// CHECK-NEXT: i32 2, label [[SW_EPILOG]] -// CHECK-NEXT: ], !prof !9 -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z10TwoCasesUNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_EPILOG]] +// CHECK-NEXT: i32 2, label %[[SW_EPILOG]] +// CHECK-NEXT: ], !prof [[PROF9:![0-9]+]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void TwoCasesUN() { @@ -72,14 +76,15 @@ void TwoCasesUN() { } } -// CHECK-LABEL: @_Z10TwoCasesLUv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG]] -// CHECK-NEXT: i32 2, label [[SW_EPILOG]] -// CHECK-NEXT: ], !prof !10 -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z10TwoCasesLUv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_EPILOG]] +// CHECK-NEXT: i32 2, label %[[SW_EPILOG]] +// CHECK-NEXT: ], !prof [[PROF10:![0-9]+]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void TwoCasesLU() { @@ -89,20 +94,21 @@ void TwoCasesLU() { } } -// CHECK-LABEL: @_Z20CasesFallthroughNNLNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_BB:%.*]] -// CHECK-NEXT: i32 2, label [[SW_BB]] -// CHECK-NEXT: i32 3, label [[SW_BB1:%.*]] -// CHECK-NEXT: i32 4, label [[SW_BB1]] -// CHECK-NEXT: ], !prof !11 -// CHECK: sw.bb: -// CHECK-NEXT: br label [[SW_BB1]] -// CHECK: sw.bb1: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z20CasesFallthroughNNLNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, 
align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_BB:.*]] +// CHECK-NEXT: i32 2, label %[[SW_BB]] +// CHECK-NEXT: i32 3, label %[[SW_BB1:.*]] +// CHECK-NEXT: i32 4, label %[[SW_BB1]] +// CHECK-NEXT: ], !prof [[PROF11:![0-9]+]] +// CHECK: [[SW_BB]]: +// CHECK-NEXT: br label %[[SW_BB1]] +// CHECK: [[SW_BB1]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void CasesFallthroughNNLN() { @@ -114,20 +120,21 @@ void CasesFallthroughNNLN() { } } -// CHECK-LABEL: @_Z20CasesFallthroughNNUNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_BB:%.*]] -// CHECK-NEXT: i32 2, label [[SW_BB]] -// CHECK-NEXT: i32 3, label [[SW_BB1:%.*]] -// CHECK-NEXT: i32 4, label [[SW_BB1]] -// CHECK-NEXT: ], !prof !12 -// CHECK: sw.bb: -// CHECK-NEXT: br label [[SW_BB1]] -// CHECK: sw.bb1: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z20CasesFallthroughNNUNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_BB:.*]] +// CHECK-NEXT: i32 2, label %[[SW_BB]] +// CHECK-NEXT: i32 3, label %[[SW_BB1:.*]] +// CHECK-NEXT: i32 4, label %[[SW_BB1]] +// CHECK-NEXT: ], !prof [[PROF12:![0-9]+]] +// CHECK: [[SW_BB]]: +// CHECK-NEXT: br label %[[SW_BB1]] +// CHECK: [[SW_BB1]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void CasesFallthroughNNUN() { @@ -139,29 +146,30 @@ void CasesFallthroughNNUN() { } } -// CHECK-LABEL: @_Z28CasesFallthroughRangeSmallLNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 
4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_BB:%.*]] -// CHECK-NEXT: i32 2, label [[SW_BB]] -// CHECK-NEXT: i32 3, label [[SW_BB]] -// CHECK-NEXT: i32 4, label [[SW_BB]] -// CHECK-NEXT: i32 5, label [[SW_BB]] -// CHECK-NEXT: i32 102, label [[SW_BB1:%.*]] -// CHECK-NEXT: i32 103, label [[SW_BB2:%.*]] -// CHECK-NEXT: i32 104, label [[SW_BB2]] -// CHECK-NEXT: ], !prof !13 -// CHECK: sw.bb: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @_Z28CasesFallthroughRangeSmallLNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_BB:.*]] +// CHECK-NEXT: i32 2, label %[[SW_BB]] +// CHECK-NEXT: i32 3, label %[[SW_BB]] +// CHECK-NEXT: i32 4, label %[[SW_BB]] +// CHECK-NEXT: i32 5, label %[[SW_BB]] +// CHECK-NEXT: i32 102, label %[[SW_BB1:.*]] +// CHECK-NEXT: i32 103, label %[[SW_BB2:.*]] +// CHECK-NEXT: i32 104, label %[[SW_BB2]] +// CHECK-NEXT: ], !prof [[PROF13:![0-9]+]] +// CHECK: [[SW_BB]]: +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-NEXT: store volatile i32 [[INC]], ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[SW_BB1]] -// CHECK: sw.bb1: -// CHECK-NEXT: br label [[SW_BB2]] -// CHECK: sw.bb2: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-NEXT: store volatile i32 [[INC]], ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[SW_BB1]] +// CHECK: [[SW_BB1]]: +// CHECK-NEXT: br label %[[SW_BB2]] +// CHECK: [[SW_BB2]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void CasesFallthroughRangeSmallLN() { @@ -173,29 +181,30 @@ void 
CasesFallthroughRangeSmallLN() { } } -// CHECK-LABEL: @_Z28CasesFallthroughRangeSmallUNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_EPILOG:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_BB:%.*]] -// CHECK-NEXT: i32 2, label [[SW_BB]] -// CHECK-NEXT: i32 3, label [[SW_BB]] -// CHECK-NEXT: i32 4, label [[SW_BB]] -// CHECK-NEXT: i32 5, label [[SW_BB]] -// CHECK-NEXT: i32 102, label [[SW_BB1:%.*]] -// CHECK-NEXT: i32 103, label [[SW_BB2:%.*]] -// CHECK-NEXT: i32 104, label [[SW_BB2]] -// CHECK-NEXT: ], !prof !14 -// CHECK: sw.bb: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] +// CHECK-LABEL: define dso_local void @_Z28CasesFallthroughRangeSmallUNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_EPILOG:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_BB:.*]] +// CHECK-NEXT: i32 2, label %[[SW_BB]] +// CHECK-NEXT: i32 3, label %[[SW_BB]] +// CHECK-NEXT: i32 4, label %[[SW_BB]] +// CHECK-NEXT: i32 5, label %[[SW_BB]] +// CHECK-NEXT: i32 102, label %[[SW_BB1:.*]] +// CHECK-NEXT: i32 103, label %[[SW_BB2:.*]] +// CHECK-NEXT: i32 104, label %[[SW_BB2]] +// CHECK-NEXT: ], !prof [[PROF14:![0-9]+]] +// CHECK: [[SW_BB]]: +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-NEXT: store volatile i32 [[INC]], ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: br label [[SW_BB1]] -// CHECK: sw.bb1: -// CHECK-NEXT: br label [[SW_BB2]] -// CHECK: sw.bb2: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-NEXT: store volatile i32 [[INC]], ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: br label %[[SW_BB1]] +// CHECK: [[SW_BB1]]: +// CHECK-NEXT: br label %[[SW_BB2]] +// CHECK: 
[[SW_BB2]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void CasesFallthroughRangeSmallUN() { @@ -207,23 +216,24 @@ void CasesFallthroughRangeSmallUN() { } } -// CHECK-LABEL: @_Z29CasesFallthroughRangeLargeLLNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_CASERANGE:%.*]] [ -// CHECK-NEXT: i32 1003, label [[SW_BB1:%.*]] -// CHECK-NEXT: i32 104, label [[SW_BB1]] -// CHECK-NEXT: ], !prof !8 -// CHECK: sw.bb: -// CHECK-NEXT: br label [[SW_BB1]] -// CHECK: sw.bb1: -// CHECK-NEXT: br label [[SW_EPILOG:%.*]] -// CHECK: sw.caserange: +// CHECK-LABEL: define dso_local void @_Z29CasesFallthroughRangeLargeLLNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_CASERANGE:.*]] [ +// CHECK-NEXT: i32 1003, label %[[SW_BB1:.*]] +// CHECK-NEXT: i32 104, label %[[SW_BB1]] +// CHECK-NEXT: ], !prof [[PROF8]] +// CHECK: [[SW_BB:.*]]: +// CHECK-NEXT: br label %[[SW_BB1]] +// CHECK: [[SW_BB1]]: +// CHECK-NEXT: br label %[[SW_EPILOG:.*]] +// CHECK: [[SW_CASERANGE]]: // CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], 0 // CHECK-NEXT: [[INBOUNDS:%.*]] = icmp ule i32 [[TMP1]], 64 // CHECK-NEXT: [[INBOUNDS_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[INBOUNDS]], i1 true) -// CHECK-NEXT: br i1 [[INBOUNDS_EXPVAL]], label [[SW_BB:%.*]], label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-NEXT: br i1 [[INBOUNDS_EXPVAL]], label %[[SW_BB]], label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void CasesFallthroughRangeLargeLLN() { @@ -234,23 +244,24 @@ void CasesFallthroughRangeLargeLLN() { } } -// CHECK-LABEL: @_Z29CasesFallthroughRangeLargeUUNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 
[[TMP0]], label [[SW_CASERANGE:%.*]] [ -// CHECK-NEXT: i32 1003, label [[SW_BB1:%.*]] -// CHECK-NEXT: i32 104, label [[SW_BB1]] -// CHECK-NEXT: ], !prof !9 -// CHECK: sw.bb: -// CHECK-NEXT: br label [[SW_BB1]] -// CHECK: sw.bb1: -// CHECK-NEXT: br label [[SW_EPILOG:%.*]] -// CHECK: sw.caserange: +// CHECK-LABEL: define dso_local void @_Z29CasesFallthroughRangeLargeUUNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_CASERANGE:.*]] [ +// CHECK-NEXT: i32 1003, label %[[SW_BB1:.*]] +// CHECK-NEXT: i32 104, label %[[SW_BB1]] +// CHECK-NEXT: ], !prof [[PROF9]] +// CHECK: [[SW_BB:.*]]: +// CHECK-NEXT: br label %[[SW_BB1]] +// CHECK: [[SW_BB1]]: +// CHECK-NEXT: br label %[[SW_EPILOG:.*]] +// CHECK: [[SW_CASERANGE]]: // CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], 0 // CHECK-NEXT: [[INBOUNDS:%.*]] = icmp ule i32 [[TMP1]], 64 // CHECK-NEXT: [[INBOUNDS_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[INBOUNDS]], i1 false) -// CHECK-NEXT: br i1 [[INBOUNDS_EXPVAL]], label [[SW_BB:%.*]], label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-NEXT: br i1 [[INBOUNDS_EXPVAL]], label %[[SW_BB]], label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void CasesFallthroughRangeLargeUUN() { @@ -261,15 +272,16 @@ void CasesFallthroughRangeLargeUUN() { } } -// CHECK-LABEL: @_Z15OneCaseDefaultLv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG:%.*]] -// CHECK-NEXT: ], !prof !15 -// CHECK: sw.default: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z15OneCaseDefaultLv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// 
CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_EPILOG:.*]] +// CHECK-NEXT: ], !prof [[PROF15:![0-9]+]] +// CHECK: [[SW_DEFAULT]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void OneCaseDefaultL() { @@ -279,15 +291,16 @@ void OneCaseDefaultL() { } } -// CHECK-LABEL: @_Z15OneCaseDefaultUv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG:%.*]] -// CHECK-NEXT: ], !prof !16 -// CHECK: sw.default: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z15OneCaseDefaultUv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_EPILOG:.*]] +// CHECK-NEXT: ], !prof [[PROF16:![0-9]+]] +// CHECK: [[SW_DEFAULT]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void OneCaseDefaultU() { @@ -297,16 +310,17 @@ void OneCaseDefaultU() { } } -// CHECK-LABEL: @_Z18TwoCasesDefaultLNLv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG:%.*]] -// CHECK-NEXT: i32 2, label [[SW_EPILOG]] -// CHECK-NEXT: ], !prof !17 -// CHECK: sw.default: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z18TwoCasesDefaultLNLv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// CHECK-NEXT: i32 1, label 
%[[SW_EPILOG:.*]] +// CHECK-NEXT: i32 2, label %[[SW_EPILOG]] +// CHECK-NEXT: ], !prof [[PROF17:![0-9]+]] +// CHECK: [[SW_DEFAULT]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void TwoCasesDefaultLNL() { @@ -317,16 +331,17 @@ void TwoCasesDefaultLNL() { } } -// CHECK-LABEL: @_Z18TwoCasesDefaultLNNv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG:%.*]] -// CHECK-NEXT: i32 2, label [[SW_EPILOG]] -// CHECK-NEXT: ], !prof !8 -// CHECK: sw.default: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z18TwoCasesDefaultLNNv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_EPILOG:.*]] +// CHECK-NEXT: i32 2, label %[[SW_EPILOG]] +// CHECK-NEXT: ], !prof [[PROF8]] +// CHECK: [[SW_DEFAULT]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void TwoCasesDefaultLNN() { @@ -337,16 +352,17 @@ void TwoCasesDefaultLNN() { } } -// CHECK-LABEL: @_Z18TwoCasesDefaultLNUv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [ -// CHECK-NEXT: i32 1, label [[SW_EPILOG:%.*]] -// CHECK-NEXT: i32 2, label [[SW_EPILOG]] -// CHECK-NEXT: ], !prof !18 -// CHECK: sw.default: -// CHECK-NEXT: br label [[SW_EPILOG]] -// CHECK: sw.epilog: +// CHECK-LABEL: define dso_local void @_Z18TwoCasesDefaultLNUv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: switch i32 [[TMP0]], label 
%[[SW_DEFAULT:.*]] [ +// CHECK-NEXT: i32 1, label %[[SW_EPILOG:.*]] +// CHECK-NEXT: i32 2, label %[[SW_EPILOG]] +// CHECK-NEXT: ], !prof [[PROF18:![0-9]+]] +// CHECK: [[SW_DEFAULT]]: +// CHECK-NEXT: br label %[[SW_EPILOG]] +// CHECK: [[SW_EPILOG]]: // CHECK-NEXT: ret void // void TwoCasesDefaultLNU() { @@ -356,3 +372,22 @@ void TwoCasesDefaultLNU() { [[unlikely]] default: break; } } +//. +// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK: [[PROF6]] = !{!"branch_weights", i32 357913942, i32 715827883} +// CHECK: [[PROF7]] = !{!"branch_weights", i32 536870912, i32 1} +// CHECK: [[PROF8]] = !{!"branch_weights", i32 238609295, i32 715827883, i32 238609295} +// CHECK: [[PROF9]] = !{!"branch_weights", i32 357913942, i32 1, i32 357913942} +// CHECK: [[PROF10]] = !{!"branch_weights", i32 357913942, i32 715827883, i32 1} +// CHECK: [[PROF11]] = !{!"branch_weights", i32 143165577, i32 143165577, i32 143165577, i32 715827883, i32 143165577} +// CHECK: [[PROF12]] = !{!"branch_weights", i32 214748365, i32 214748365, i32 214748365, i32 1, i32 214748365} +// CHECK: [[PROF13]] = !{!"branch_weights", i32 79536432, i32 79536432, i32 79536432, i32 79536432, i32 79536432, i32 79536432, i32 79536432, i32 715827883, i32 79536432} +// CHECK: [[PROF14]] = !{!"branch_weights", i32 119304648, i32 119304648, i32 119304648, i32 119304648, i32 119304648, i32 119304648, i32 119304648, i32 1, i32 119304648} +// CHECK: [[PROF15]] = !{!"branch_weights", i32 715827883, i32 357913942} +// CHECK: [[PROF16]] = !{!"branch_weights", i32 1, i32 536870912} +// CHECK: [[PROF17]] = !{!"branch_weights", i32 536870912, i32 536870912, i32 268435456} +// CHECK: [[PROF18]] = !{!"branch_weights", i32 1, i32 715827883, i32 357913942} +//. 
diff --git a/clang/test/CodeGenCXX/cfi-mfcall-nomerge.cpp b/clang/test/CodeGenCXX/cfi-mfcall-nomerge.cpp index c1ee5af7254a0..d4b4f3030d117 100644 --- a/clang/test/CodeGenCXX/cfi-mfcall-nomerge.cpp +++ b/clang/test/CodeGenCXX/cfi-mfcall-nomerge.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 6 // with MERGE/NO-MERGE assertions added manually. // N.B. although the clang driver defaults to merge, clang_cc1 defaults to non-merge. @@ -29,7 +29,7 @@ void f(S *s, void (S::*p)()) { // NO-MERGE-NEXT: [[MEMPTR_ISVIRTUAL_NOT:%.*]] = icmp eq i64 [[TMP1]], 0 // NO-MERGE-NEXT: br i1 [[MEMPTR_ISVIRTUAL_NOT]], label %[[MEMPTR_NONVIRTUAL:.*]], label %[[MEMPTR_VIRTUAL:.*]] // NO-MERGE: [[MEMPTR_VIRTUAL]]: -// NO-MERGE-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA2:![0-9]+]] +// NO-MERGE-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2:![0-9]+]] // NO-MERGE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VTABLE]], i64 [[P_COERCE0]] // NO-MERGE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -1 // NO-MERGE-NEXT: [[TMP4:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP3]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5:![0-9]+]] @@ -49,7 +49,7 @@ void f(S *s, void (S::*p)()) { // NO-MERGE: [[MEMPTR_VIRTUAL7]]: // NO-MERGE-NEXT: [[MEMPTR_VIRTUALFN:%.*]] = load ptr, ptr [[TMP3]], align 8, !nosanitize [[META5]] // NO-MERGE-NEXT: tail call void [[MEMPTR_VIRTUALFN]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR5:[0-9]+]] -// NO-MERGE-NEXT: [[VTABLE8:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NO-MERGE-NEXT: [[VTABLE8:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2]] // NO-MERGE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[VTABLE8]], i64 
[[P_COERCE0]] // NO-MERGE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 -1 // NO-MERGE-NEXT: [[TMP10:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP9]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5]] @@ -60,7 +60,7 @@ void f(S *s, void (S::*p)()) { // NO-MERGE: [[MEMPTR_VIRTUAL19]]: // NO-MERGE-NEXT: [[MEMPTR_VIRTUALFN9:%.*]] = load ptr, ptr [[TMP9]], align 8, !nosanitize [[META5]] // NO-MERGE-NEXT: tail call void [[MEMPTR_VIRTUALFN9]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR5]] -// NO-MERGE-NEXT: [[VTABLE20:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NO-MERGE-NEXT: [[VTABLE20:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2]] // NO-MERGE-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[VTABLE20]], i64 [[P_COERCE0]] // NO-MERGE-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP11]], i64 -1 // NO-MERGE-NEXT: [[TMP13:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP12]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5]] @@ -84,7 +84,7 @@ void f(S *s, void (S::*p)()) { // MERGE-NEXT: [[MEMPTR_ISVIRTUAL_NOT:%.*]] = icmp eq i64 [[TMP1]], 0 // MERGE-NEXT: br i1 [[MEMPTR_ISVIRTUAL_NOT]], label %[[MEMPTR_NONVIRTUAL:.*]], label %[[MEMPTR_VIRTUAL:.*]] // MERGE: [[MEMPTR_VIRTUAL]]: -// MERGE-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA2:![0-9]+]] +// MERGE-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2:![0-9]+]] // MERGE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VTABLE]], i64 [[P_COERCE0]] // MERGE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -1 // MERGE-NEXT: [[TMP4:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP3]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5:![0-9]+]] @@ -101,7 +101,7 @@ void f(S *s, void (S::*p)()) { // MERGE: [[MEMPTR_VIRTUAL6]]: // MERGE-NEXT: [[MEMPTR_VIRTUALFN:%.*]] = load ptr, ptr [[TMP3]], align 8, !nosanitize [[META5]] // MERGE-NEXT: tail call void [[MEMPTR_VIRTUALFN]](ptr noundef 
nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR4:[0-9]+]] -// MERGE-NEXT: [[VTABLE7:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// MERGE-NEXT: [[VTABLE7:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2]] // MERGE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[VTABLE7]], i64 [[P_COERCE0]] // MERGE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 -1 // MERGE-NEXT: [[TMP10:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP9]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5]] @@ -109,7 +109,7 @@ void f(S *s, void (S::*p)()) { // MERGE: [[MEMPTR_VIRTUAL17]]: // MERGE-NEXT: [[MEMPTR_VIRTUALFN8:%.*]] = load ptr, ptr [[TMP9]], align 8, !nosanitize [[META5]] // MERGE-NEXT: tail call void [[MEMPTR_VIRTUALFN8]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR4]] -// MERGE-NEXT: [[VTABLE18:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// MERGE-NEXT: [[VTABLE18:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2]] // MERGE-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[VTABLE18]], i64 [[P_COERCE0]] // MERGE-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP11]], i64 -1 // MERGE-NEXT: [[TMP13:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP12]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5]] @@ -123,9 +123,17 @@ void f(S *s, void (S::*p)()) { // MERGE-NEXT: [[TMP14:%.*]] = phi ptr [ [[MEMPTR_VIRTUALFN19]], %[[MEMPTR_VIRTUAL17]] ], [ [[MEMPTR_NONVIRTUALFN]], %[[MEMPTR_NONVIRTUAL21]] ] // MERGE-NEXT: tail call void [[TMP14]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR4]] // MERGE-NEXT: ret void - -// MERGE: [[ATTR3]] = { noreturn nounwind } -// MERGE: [[ATTR4]] = { nounwind } - -// NO-MERGE: [[ATTR4]] = { nomerge noreturn nounwind } -// NO-MERGE: [[ATTR5]] = { nounwind } +// +//. 
+// NO-MERGE: [[VTABLE_POINTER_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// NO-MERGE: [[META3]] = !{!"vtable pointer", [[META4:![0-9]+]], i64 0} +// NO-MERGE: [[META4]] = !{!"Simple C++ TBAA"} +// NO-MERGE: [[META5]] = !{} +// NO-MERGE: [[PROF6]] = !{!"branch_weights", i32 1048575, i32 1} +//. +// MERGE: [[VTABLE_POINTER_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// MERGE: [[META3]] = !{!"vtable pointer", [[META4:![0-9]+]], i64 0} +// MERGE: [[META4]] = !{!"Simple C++ TBAA"} +// MERGE: [[META5]] = !{} +// MERGE: [[PROF6]] = !{!"branch_weights", i32 1048575, i32 1} +//. diff --git a/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp b/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp index f6f4a2ff735cc..24b1a4dd42977 100644 --- a/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp +++ b/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp @@ -1,6 +1,8 @@ // RUN: %clang_cc1 -std=c++2a -fexceptions -emit-llvm %s -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefixes=CHECK,CHECK-ITANIUM,CHECK-64BIT -// RUN: %clang_cc1 -std=c++2a -fexceptions -emit-llvm %s -triple x86_64-windows -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MSABI,CHECK-MSABI64,CHECK-64BIT -// RUN: %clang_cc1 -std=c++2a -fexceptions -emit-llvm %s -triple i386-windows -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MSABI,CHECK-MSABI32,CHECK-32BIT +// RUN: %clang_cc1 -std=c++2a -fexceptions -emit-llvm %s -triple x86_64-windows -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MSABI,CHECK-MSABI64,CHECK-64BIT,CLANG22-MSABI,CLANG22-MSABI64 +// RUN: %clang_cc1 -std=c++2a -fexceptions -emit-llvm %s -triple i386-windows -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MSABI,CHECK-MSABI32,CHECK-32BIT,CLANG22-MSABI,CLANG22-MSABI32 +// RUN: %clang_cc1 -std=c++2a -fexceptions -emit-llvm %s -triple i386-windows -fclang-abi-compat=20 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MSABI,CHECK-32BIT,CHECK-MSABI32,CLANG21-MSABI,CLANG21-MSABI32 +// RUN: %clang_cc1 -std=c++2a -fexceptions -emit-llvm 
%s -triple x86_64-windows -fclang-abi-compat=20 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MSABI,CHECK-MSABI64,CHECK-64BIT,CLANG21-MSABI,CLANG21-MSABI64 // PR46908: ensure the IR passes the verifier with optimizations enabled. // RUN: %clang_cc1 -std=c++2a -fexceptions -emit-llvm-only %s -triple x86_64-linux-gnu -O2 @@ -32,6 +34,20 @@ void delete_A(A *a) { delete a; } // CHECK-NOT: call // CHECK: } +void glob_delete_A(A *a) { ::delete a; } + +// CHECK-LABEL: define {{.*}}glob_delete_A +// CHECK: %[[a:.*]] = load +// CHECK: icmp eq ptr %[[a]], null +// CHECK: br i1 + +// CHECK-ITANIUM: call void @_ZN1AD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %[[a]]) +// CHECK-ITANIUM-NEXT: call void @_ZdlPvm(ptr noundef %[[a]], i64 noundef 8) +// CHECK-MSABI64: call void @"??1A@@QEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(8) %[[a]]) +// CHECK-MSABI64-NEXT: call void @"??3@YAXPEAX_K@Z"(ptr noundef %[[a]], i64 noundef 8) +// CHECK-MSABI32: call x86_thiscallcc void @"??1A@@QAE@XZ"(ptr noundef nonnull align 4 dereferenceable(4) %[[a]]) +// CHECK-MSABI32-NEXT: call void @"??3@YAXPAXI@Z"(ptr noundef %[[a]], i32 noundef 4) + struct B { virtual ~B(); void operator delete(B*, std::destroying_delete_t); @@ -51,6 +67,31 @@ void delete_B(B *b) { delete b; } // CHECK-NOT: call // CHECK: } +void glob_delete_B(B *b) { ::delete b; } +// CHECK-LABEL: define {{.*}}glob_delete_B +// CHECK: %[[b:.*]] = load +// CHECK: icmp eq ptr %[[b]], null +// CHECK: br i1 + +// CHECK-NOT: call +// CHECK-MSABI: %[[VTABLE:.*]] = load +// CHECK-MSABI: %[[DTOR:.*]] = load +// CHECK-ITANIUM: %[[VTABLE:.*]] = load ptr, ptr %[[b]], align 8 +// CHECK-ITANIUM: %[[COMPLETEOFFSETPTR:.*]] = getelementptr inbounds i64, ptr %[[VTABLE]], i64 -2 +// CHECK-ITANIUM: %[[OFFSET:.*]] = load i64, ptr %[[COMPLETEOFFSETPTR]], align 8 +// CHECK-ITANIUM: %[[ALLOCATED:.*]] = getelementptr inbounds i8, ptr %[[b]], i64 %[[OFFSET]] +// CHECK-ITANIUM: %[[VTABLE1:.*]] = load ptr, ptr %[[b]], align 8 +// CHECK-ITANIUM: 
%[[DTOR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE1]], i64 0 +// CHECK-ITANIUM: %[[DTOR:.*]] = load ptr, ptr %[[DTOR_ADDR]], align 8 +// CHECK: call {{void|noundef ptr|x86_thiscallcc noundef ptr}} %[[DTOR]](ptr {{[^,]*}} %[[b]] +// CLANG22-MSABI-SAME: , i32 noundef 5) +// CLANG21-MSABI-SAME: , i32 noundef 0) +// CLANG22-MSABI-NOT: call +// CLANG21-MSABI64: call void @"??3@YAXPEAX_K@Z"({{.*}}) +// CLANG21-MSABI32: call void @"??3@YAXPAXI@Z"({{.*}}) +// CHECK-ITANIUM: call void @_ZdlPvm({{.*}}) +// CHECK: } + struct Padding { virtual void f(); }; @@ -159,21 +200,50 @@ H::~H() { call_in_dtor(); } // CHECK-ITANIUM-NOT: call // CHECK-ITANIUM: } -// CHECK-MSABI64-LABEL: define {{.*}} @"??_GH@@UEAAPEAXI@Z"( +// CHECK-MSABI64-LABEL: define {{.*}} @"??_GH@@UEAAPEAXI@Z"({{.*}}, // CHECK-MSABI32-LABEL: define {{.*}} @"??_GH@@UAEPAXI@Z"( +// CHECK-MSABI-SAME: i32 noundef %[[IP:.*]]) // CHECK-MSABI-NOT: call{{ }} -// CHECK-MSABI: load i32 -// CHECK-MSABI: icmp eq i32 {{.*}}, 0 -// CHECK-MSABI: br i1 +// CHECK-MSABI: store i32 %[[IP]], ptr %[[IP_ALLOCA:.*]] +// CHECK-MSABI: %[[IMP_PARAM:.*]] = load i32, ptr %[[IP_ALLOCA]] +// CLANG22: %[[THIRDBIT:.*]] = and i32 %[[IMP_PARAM]], 4 +// CLANG22-NEXT: %[[CHCK:.*]] = icmp eq i32 %[[THIRDBIT]], 0 +// CLANG22-NEXT: br i1 %[[CHCK]], label %dtor.entry_cont, label %dtor.call_dtor +// CLANG21-MSABI: %[[FIRSTBIT:.*]] = and i32 %[[IMP_PARAM]], 1 +// CLANG21-MSABI: %[[CHCK:.*]] = icmp eq i32 %[[FIRSTBIT]], 0 +// CLANG21-MSABI: br i1 %[[CHCK]], label %dtor.continue, label %dtor.call_delete // -// CHECK-MSABI-NOT: call{{ }} -// CHECK-MSABI64: getelementptr {{.*}}, i64 24 -// CHECK-MSABI32: getelementptr {{.*}}, i32 20 -// CHECK-MSABI-NOT: call{{ }} +// CLANG22-MSABI: dtor.call_dtor: +// CLANG22-MSABI64-NEXT: call void @"??1H@@UEAA@XZ"({{.*}}) +// CLANG22-MSABI32-NEXT: call x86_thiscallcc void @"??1H@@UAE@XZ"({{.*}}) +// CLANG22-MSABI-NEXT: br label %dtor.entry_cont +// +// CLANG22-MSABI-LABEL: dtor.entry_cont: +// CLANG22-MSABI-NEXT: 
%[[FIRSTBIT:.*]] = and i32 %[[IMP_PARAM]], 1 +// CLANG22-MSABI-NEXT: %[[CHCK1:.*]] = icmp eq i32 %[[FIRSTBIT]], 0 +// CLANG22-MSABI-NEXT: br i1 %[[CHCK1]], label %dtor.continue, label %dtor.call_delete +// +// CLANG22-MSABI-LABEL: dtor.call_delete: +// CLANG22-MSABI: %[[THIRDBIT1:.*]] = and i32 %[[IMP_PARAM]], 4 +// CLANG22-MSABI-NEXT: %[[CHCK2:.*]] = icmp eq i32 %[[THIRDBIT1]], 0 +// CLANG22-MSABI-NEXT: br i1 %[[CHCK2]], label %dtor.call_class_delete, label %dtor.call_glob_delete +// +// CLANG22-MSABI-LABEL: dtor.call_glob_delete: +// CLANG22-MSABI64: call void @"??3@YAXPEAX_K@Z"(ptr noundef %{{.*}}, i64 noundef 48) +// CLANG22-MSABI32: call void @"??3@YAXPAXIW4align_val_t@std@@@Z"(ptr noundef %{{.*}}, i32 noundef 32, i32 noundef 16) +// CLANG22-MSABI-NEXT: br label %[[RETURN:.*]] +// +// CLANG21-MSABI: dtor.call_delete: +// CLANG22-MSABI: dtor.call_class_delete: +// CLANG22-MSABI-NOT: call{{ }} +// CLANG22-MSABI64: getelementptr {{.*}}, i64 24 +// CLANG22-MSABI32: getelementptr {{.*}}, i32 20 +// CLANG22-MSABI-NOT: call{{ }} // CHECK-MSABI64: call void @"??3F@@SAXPEAU0@Udestroying_delete_t@std@@_KW4align_val_t@2@@Z"({{.*}}, i64 noundef 48, i64 noundef 16) // CHECK-MSABI32: call void @"??3F@@SAXPAU0@Udestroying_delete_t@std@@IW4align_val_t@2@@Z"({{.*}}, i32 noundef 32, i32 noundef 16) -// CHECK-MSABI: br label %[[RETURN:.*]] +// CHECK-MSABI: br label %[[RETURN:]] // +// CHECK-MSABI: dtor.continue: // CHECK-MSABI64: call void @"??1H@@UEAA@XZ"( // CHECK-MSABI32: call x86_thiscallcc void @"??1H@@UAE@XZ"( // CHECK-MSABI: br label %[[RETURN]] @@ -194,9 +264,32 @@ I::~I() { call_in_dtor(); } // CHECK-MSABI32-LABEL: define {{.*}} @"??_GI@@UAEPAXI@Z"( // CHECK-MSABI-NOT: call{{ }} // CHECK-MSABI: load i32 -// CHECK-MSABI: icmp eq i32 {{.*}}, 0 -// CHECK-MSABI: br i1 +// CLANG22-MSABI-NEXT: and i32 %[[IMP_PARAM:.*]], 4 +// CLANG22-MSABI-NEXT: icmp eq i32 {{.*}}, 0 +// CLANG22-MSABI-NEXT: br i1 %{{.*}}, label %dtor.entry_cont, label %dtor.call_dtor +// +// CLANG22-MSABI: 
dtor.call_dtor: +// CLANG22-MSABI64-NEXT: call void @"??1I@@UEAA@XZ"({{.*}}) +// CLANG22-MSABI32-NEXT: call x86_thiscallcc void @"??1I@@UAE@XZ"({{.*}}) +// CLANG22-MSABI-NEXT: br label %dtor.entry_cont +// +// CLANG22-MSABI: dtor.entry_cont: +// CLANG22-MSABI-NEXT: and i32 %[[IMP_PARAM]], 1 +// CLANG22-MSABI-NEXT: icmp eq i32 %{{.*}}, 0 +// CLANG22-MSABI-NEXT: br i1 %{{.*}}, label %dtor.continue, label %dtor.call_delete +// +// CLANG22-MSABI: dtor.call_delete: +// CLANG22-MSABI-NEXT: %[[THIRDBIT1:.*]] = and i32 %[[IMP_PARAM]], 4 +// CLANG22-MSABI-NEXT: %[[CHCK2:.*]] = icmp eq i32 %[[THIRDBIT1]], 0 +// CLANG22-MSABI-NEXT: br i1 %[[CHCK2]], label %dtor.call_class_delete, label %dtor.call_glob_delete +// +// CLANG22-MSABI: dtor.call_glob_delete: +// CLANG22-MSABI64: call void @"??3@YAXPEAX_KW4align_val_t@std@@@Z"(ptr noundef %{{.*}}, i64 noundef 96, i64 noundef 32) +// CLANG22-MSABI32: call void @"??3@YAXPAXIW4align_val_t@std@@@Z"(ptr noundef %{{.*}}, i32 noundef 64, i32 noundef 32) +// CLANG22-MSABI-NEXT: br label %[[RETURN:.*]] // +// CLANG21-MSABI: dtor.call_delete: +// CLANG22-MSABI: dtor.call_class_delete: // CHECK-MSABI-NOT: call{{ }} // CHECK-MSABI64: getelementptr {{.*}}, i64 24 // CHECK-MSABI32: getelementptr {{.*}}, i32 20 diff --git a/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp index 5f83545f78127..ab3695a3d9ce3 100644 --- a/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp +++ b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: webassembly-registered-target // Simple calls to known variadic functions that are completely elided when @@ -33,32 +33,32 @@ template static Y second(...) 
{ extern "C" { -// CHECK-LABEL: define {{[^@]+}}@first_pair_i32 -// CHECK-SAME: (i32 noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef i32 @first_pair_i32( +// CHECK-SAME: i32 noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 [[X]] // int first_pair_i32(int x, int y) { return first(x, y); } -// CHECK-LABEL: define {{[^@]+}}@second_pair_i32 -// CHECK-SAME: (i32 noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef i32 @second_pair_i32( +// CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 [[Y]] // int second_pair_i32(int x, int y) { return second(x, y); } -// CHECK-LABEL: define {{[^@]+}}@first_pair_f64 -// CHECK-SAME: (double noundef returned [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef double @first_pair_f64( +// CHECK-SAME: double noundef returned [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret double [[X]] // double first_pair_f64(double x, double y) { return first(x, y); } -// CHECK-LABEL: define {{[^@]+}}@second_pair_f64 -// CHECK-SAME: (double noundef [[X:%.*]], double noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef double @second_pair_f64( +// CHECK-SAME: double noundef [[X:%.*]], double noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret double [[Y]] // double second_pair_f64(double x, double y) { @@ -68,30 +68,30 @@ double second_pair_f64(double x, double y) { extern "C" { -// CHECK-LABEL: define {{[^@]+}}@first_i32_f64 
-// CHECK-SAME: (i32 noundef returned [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef i32 @first_i32_f64( +// CHECK-SAME: i32 noundef returned [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 [[X]] // int first_i32_f64(int x, double y) { return first(x, y); } -// CHECK-LABEL: define {{[^@]+}}@second_i32_f64 -// CHECK-SAME: (i32 noundef [[X:%.*]], double noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef double @second_i32_f64( +// CHECK-SAME: i32 noundef [[X:%.*]], double noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret double [[Y]] // double second_i32_f64(int x, double y) { return second(x, y); } -// CHECK-LABEL: define {{[^@]+}}@first_f64_i32 -// CHECK-SAME: (double noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef double @first_f64_i32( +// CHECK-SAME: double noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret double [[X]] // double first_f64_i32(double x, int y) { return first(x, y); } -// CHECK-LABEL: define {{[^@]+}}@second_f64_i32 -// CHECK-SAME: (double noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef i32 @second_f64_i32( +// CHECK-SAME: double noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 [[Y]] // int second_f64_i32(double x, int y) { return second(x, y); } @@ -100,38 +100,38 @@ int second_f64_i32(double x, int y) { return second(x, y); } extern "C" { typedef uint64_t ulong2 __attribute__((__vector_size__(16), __aligned__(16))); -// CHECK-LABEL: 
define {{[^@]+}}@first_i32_ulong2 -// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef i32 @first_i32_ulong2( +// CHECK-SAME: i32 noundef returned [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 [[X]] // int first_i32_ulong2(int x, ulong2 *y) { return first(x, *y); } -// CHECK-LABEL: define {{[^@]+}}@second_i32_ulong2 -// CHECK-SAME: (i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 16)) [[R:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define void @second_i32_ulong2( +// CHECK-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 16)) [[R:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[INT_TBAA2:![0-9]+]] +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[INT_TBAA2]] // CHECK-NEXT: ret void // void second_i32_ulong2(int x, ulong2 *y, ulong2 *r) { *r = second(x, *y); } -// CHECK-LABEL: define {{[^@]+}}@first_ulong2_i32 -// CHECK-SAME: (ptr noundef readonly captures(none) [[X:%.*]], i32 noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 16)) [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[TBAA2]] +// CHECK-LABEL: define void @first_ulong2_i32( +// CHECK-SAME: ptr noundef readonly 
captures(none) [[X:%.*]], i32 noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 16)) [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X]], align 16, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[INT_TBAA2]] // CHECK-NEXT: ret void // void first_ulong2_i32(ulong2 *x, int y, ulong2 *r) { *r = first(*x, y); } -// CHECK-LABEL: define {{[^@]+}}@second_ulong2_i32 -// CHECK-SAME: (ptr noundef readonly captures(none) [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef i32 @second_ulong2_i32( +// CHECK-SAME: ptr noundef readonly captures(none) [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 [[Y]] // int second_ulong2_i32(ulong2 *x, int y) { return second(*x, y); } @@ -149,33 +149,38 @@ typedef struct { extern "C" { -// CHECK-LABEL: define {{[^@]+}}@first_i32_asc -// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef i32 @first_i32_asc( +// CHECK-SAME: i32 noundef returned [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 [[X]] // int first_i32_asc(int x, asc *y) { return first(x, *y); } -// CHECK-LABEL: define {{[^@]+}}@second_i32_asc -// CHECK-SAME: (i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 24)) [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define void @second_i32_asc( +// CHECK-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 24)) [[R:%.*]]) local_unnamed_addr 
#[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: tail call void @llvm.memmove.p0.p0.i32(ptr noundef nonnull align 8 dereferenceable(24) [[R]], ptr noundef nonnull align 1 dereferenceable(24) [[Y]], i32 24, i1 false) // CHECK-NEXT: ret void // void second_i32_asc(int x, asc *y, asc *r) { *r = second(x, *y); } -// CHECK-LABEL: define {{[^@]+}}@first_asc_i32 -// CHECK-SAME: (ptr noundef readonly captures(none) [[X:%.*]], i32 noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 24)) [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define void @first_asc_i32( +// CHECK-SAME: ptr noundef readonly captures(none) [[X:%.*]], i32 noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 24)) [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: tail call void @llvm.memmove.p0.p0.i32(ptr noundef nonnull align 8 dereferenceable(24) [[R]], ptr noundef nonnull align 1 dereferenceable(24) [[X]], i32 24, i1 false) // CHECK-NEXT: ret void // void first_asc_i32(asc *x, int y, asc *r) { *r = first(*x, y); } -// CHECK-LABEL: define {{[^@]+}}@second_asc_i32 -// CHECK-SAME: (ptr noundef readonly captures(none) [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define noundef i32 @second_asc_i32( +// CHECK-SAME: ptr noundef readonly captures(none) [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 [[Y]] // int second_asc_i32(asc *x, int y) { return second(*x, y); } } +//. +// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C++ TBAA"} +//. 
diff --git a/clang/test/CodeGenCXX/load-reference-metadata.cpp b/clang/test/CodeGenCXX/load-reference-metadata.cpp index daceb752a732b..abfdd055c3ad6 100644 --- a/clang/test/CodeGenCXX/load-reference-metadata.cpp +++ b/clang/test/CodeGenCXX/load-reference-metadata.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -std=c++11 -O1 -disable-llvm-passes %s -o - | FileCheck %s struct alignas(32) F { int x; }; @@ -13,20 +13,20 @@ struct S { // CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(24) [[S:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[S_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[S]], ptr [[S_ADDR]], align 8, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[TBAA2]], !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK-NEXT: store ptr [[S]], ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA2:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA2]], !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8, !tbaa [[TBAA9:![0-9]+]], !nonnull [[META7]] -// CHECK-NEXT: store i8 0, ptr [[TMP1]], align 1, !tbaa [[TBAA14:![0-9]+]] -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[TBAA2]], !nonnull [[META7]], !align [[META8]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8, !tbaa [[CHARPTR_TBAA9:![0-9]+]], !nonnull [[META7]] +// CHECK-NEXT: store i8 0, ptr [[TMP1]], align 1, !tbaa [[CHAR_TBAA14:![0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA2]], !nonnull 
[[META7]], !align [[META8]] // CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP2]], i32 0, i32 1 -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B]], align 8, !tbaa [[TBAA15:![0-9]+]], !nonnull [[META7]], !align [[META16:![0-9]+]] -// CHECK-NEXT: store i32 0, ptr [[TMP3]], align 4, !tbaa [[TBAA17:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[TBAA2]], !nonnull [[META7]], !align [[META8]] +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B]], align 8, !tbaa [[INTPTR_TBAA15:![0-9]+]], !nonnull [[META7]], !align [[META16:![0-9]+]] +// CHECK-NEXT: store i32 0, ptr [[TMP3]], align 4, !tbaa [[INT_TBAA17:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA2]], !nonnull [[META7]], !align [[META8]] // CHECK-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i32 0, i32 2 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C]], align 8, !tbaa [[TBAA19:![0-9]+]], !nonnull [[META7]], !align [[META20:![0-9]+]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C]], align 8, !tbaa [[_ZTS1FPTR_TBAA19:![0-9]+]], !nonnull [[META7]], !align [[META20:![0-9]+]] // CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_F:%.*]], ptr [[TMP5]], i32 0, i32 0 -// CHECK-NEXT: store i32 0, ptr [[X]], align 32, !tbaa [[TBAA21:![0-9]+]] +// CHECK-NEXT: store i32 0, ptr [[X]], align 32, !tbaa [[INT_TBAA21:![0-9]+]] // CHECK-NEXT: ret void // void test(S &s) { @@ -42,13 +42,13 @@ extern B (&bb)[2]; // CHECK-LABEL: define dso_local void @_Z13test_externalv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @b, align 8, !tbaa [[TBAA23:![0-9]+]], !nonnull [[META7]], !align [[META8]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @b, align 8, !tbaa [[_ZTS1BPTR_TBAA23:![0-9]+]], !nonnull [[META7]], !align [[META8]] // CHECK-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_B:%.*]], ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: 
store i8 0, ptr [[C]], align 8, !tbaa [[TBAA25:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @bb, align 8, !tbaa [[TBAA23]], !nonnull [[META7]], !align [[META20]] +// CHECK-NEXT: store i8 0, ptr [[C]], align 8, !tbaa [[CHAR_TBAA25:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @bb, align 8, !tbaa [[_ZTS1BPTR_TBAA23]], !nonnull [[META7]], !align [[META20]] // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x %struct.B], ptr [[TMP1]], i64 0, i64 0 // CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[ARRAYIDX]], i32 0, i32 2 -// CHECK-NEXT: store i8 0, ptr [[C1]], align 16, !tbaa [[TBAA25]] +// CHECK-NEXT: store i8 0, ptr [[C1]], align 16, !tbaa [[CHAR_TBAA25]] // CHECK-NEXT: ret void // void test_external() { @@ -60,8 +60,8 @@ void test_external() { // CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(17) [[S:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[S_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[S]], ptr [[S_ADDR]], align 8, !tbaa [[TBAA23]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[TBAA23]], !nonnull [[META7]], !align [[META8]] +// CHECK-NEXT: store ptr [[S]], ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1BPTR_TBAA23]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1BPTR_TBAA23]], !nonnull [[META7]], !align [[META8]] // CHECK-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_B:%.*]], ptr [[TMP0]], i32 0, i32 2 // CHECK-NEXT: ret ptr [[C]] // @@ -69,30 +69,30 @@ char* test_deref_only(B &s) { return &s.c; } //. 
-// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[_ZTS1SPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK: [[META3]] = !{!"p1 _ZTS1S", [[META4:![0-9]+]], i64 0} // CHECK: [[META4]] = !{!"any pointer", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} // CHECK: [[META6]] = !{!"Simple C++ TBAA"} // CHECK: [[META7]] = !{} // CHECK: [[META8]] = !{i64 8} -// CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META11:![0-9]+]], i64 0} +// CHECK: [[CHARPTR_TBAA9]] = !{[[META10:![0-9]+]], [[META11:![0-9]+]], i64 0} // CHECK: [[META10]] = !{!"_ZTS1S", [[META11]], i64 0, [[META12:![0-9]+]], i64 8, [[META13:![0-9]+]], i64 16} // CHECK: [[META11]] = !{!"p1 omnipotent char", [[META4]], i64 0} // CHECK: [[META12]] = !{!"p1 int", [[META4]], i64 0} // CHECK: [[META13]] = !{!"p1 _ZTS1F", [[META4]], i64 0} -// CHECK: [[TBAA14]] = !{[[META5]], [[META5]], i64 0} -// CHECK: [[TBAA15]] = !{[[META10]], [[META12]], i64 8} +// CHECK: [[CHAR_TBAA14]] = !{[[META5]], [[META5]], i64 0} +// CHECK: [[INTPTR_TBAA15]] = !{[[META10]], [[META12]], i64 8} // CHECK: [[META16]] = !{i64 4} -// CHECK: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECK: [[INT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} // CHECK: [[META18]] = !{!"int", [[META5]], i64 0} -// CHECK: [[TBAA19]] = !{[[META10]], [[META13]], i64 16} +// CHECK: [[_ZTS1FPTR_TBAA19]] = !{[[META10]], [[META13]], i64 16} // CHECK: [[META20]] = !{i64 32} -// CHECK: [[TBAA21]] = !{[[META22:![0-9]+]], [[META18]], i64 0} +// CHECK: [[INT_TBAA21]] = !{[[META22:![0-9]+]], [[META18]], i64 0} // CHECK: [[META22]] = !{!"_ZTS1F", [[META18]], i64 0} -// CHECK: [[TBAA23]] = !{[[META24:![0-9]+]], [[META24]], i64 0} +// CHECK: [[_ZTS1BPTR_TBAA23]] = !{[[META24:![0-9]+]], [[META24]], i64 0} // CHECK: [[META24]] = !{!"p1 _ZTS1B", [[META4]], i64 0} -// CHECK: [[TBAA25]] = !{[[META26:![0-9]+]], [[META5]], i64 16} +// CHECK: [[CHAR_TBAA25]] = !{[[META26:![0-9]+]], [[META5]], 
i64 16} // CHECK: [[META26]] = !{!"_ZTS1B", [[META27:![0-9]+]], i64 8, [[META5]], i64 16} // CHECK: [[META27]] = !{!"long long", [[META5]], i64 0} //. diff --git a/clang/test/CodeGenCXX/mangle-windows.cpp b/clang/test/CodeGenCXX/mangle-windows.cpp index 3d5a1e9a868ef..737abcf6e3498 100644 --- a/clang/test/CodeGenCXX/mangle-windows.cpp +++ b/clang/test/CodeGenCXX/mangle-windows.cpp @@ -4,6 +4,9 @@ // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 | \ // RUN: FileCheck --check-prefix=ITANIUM %s +// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-cygwin | \ +// RUN: FileCheck --check-prefix=ITANIUM %s + void __stdcall f1(void) {} // WIN: define dso_local x86_stdcallcc void @"?f1@@YGXXZ" // ITANIUM: define dso_local x86_stdcallcc void @"\01__Z2f1v@0" diff --git a/clang/test/CodeGenCXX/microsoft-abi-structors.cpp b/clang/test/CodeGenCXX/microsoft-abi-structors.cpp index 07abc3d065e5e..497775840e049 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-structors.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-structors.cpp @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -no-enable-noundef-analysis -emit-llvm -fno-rtti %s -std=c++11 -o - -mconstructor-aliases -triple=i386-pc-win32 -fno-rtti > %t -// RUN: FileCheck %s < %t +// RUN: FileCheck --check-prefixes CHECK,CLANG22 %s < %t // vftables are emitted very late, so do another pass to try to keep the checks // in source order. 
// RUN: FileCheck --check-prefix DTORS %s < %t @@ -8,6 +8,7 @@ // RUN: FileCheck --check-prefix DTORS4 %s < %t // // RUN: %clang_cc1 -emit-llvm %s -o - -mconstructor-aliases -triple=x86_64-pc-win32 -fno-rtti -std=c++11 | FileCheck --check-prefix DTORS-X64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -emit-llvm -fno-rtti %s -std=c++11 -o - -mconstructor-aliases -triple=i386-pc-win32 -fclang-abi-compat=20 | FileCheck --check-prefixes CHECK,CLANG21 %s namespace basic { @@ -52,7 +53,8 @@ struct C { // DTORS: store ptr %{{.*}}, ptr %[[RETVAL:retval]] // DTORS: %[[SHOULD_DELETE_VALUE:[0-9a-z._]+]] = load i32, ptr %[[SHOULD_DELETE_VAR]] // DTORS: call x86_thiscallcc void @"??1C@basic@@UAE@XZ"(ptr {{[^,]*}} %[[THIS:[0-9a-z]+]]) -// DTORS-NEXT: %[[CONDITION:[0-9]+]] = icmp eq i32 %[[SHOULD_DELETE_VALUE]], 0 +// DTORS-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 1 +// DTORS-NEXT: %[[CONDITION:[0-9]+]] = icmp eq i32 %[[AND]], 0 // DTORS-NEXT: br i1 %[[CONDITION]], label %[[CONTINUE_LABEL:[0-9a-z._]+]], label %[[CALL_DELETE_LABEL:[0-9a-z._]+]] // // DTORS: [[CALL_DELETE_LABEL]] @@ -113,8 +115,9 @@ void call_deleting_dtor_and_global_delete(C *obj_ptr) { // CHECK-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[OBJ_PTR_VALUE]] // CHECK-NEXT: %[[PVDTOR:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i64 0 // CHECK-NEXT: %[[VDTOR:.*]] = load ptr, ptr %[[PVDTOR]] -// CHECK-NEXT: %[[CALL:.*]] = call x86_thiscallcc ptr %[[VDTOR]](ptr {{[^,]*}} %[[OBJ_PTR_VALUE]], i32 0) -// CHECK-NEXT: call void @"??3@YAXPAX@Z"(ptr %[[CALL]]) +// CLANG22-NEXT: %[[CALL:.*]] = call x86_thiscallcc ptr %[[VDTOR]](ptr {{[^,]*}} %[[OBJ_PTR_VALUE]], i32 5) +// CLANG21-NEXT: %[[CALL:.*]] = call x86_thiscallcc ptr %[[VDTOR]](ptr {{[^,]*}} %[[OBJ_PTR_VALUE]], i32 0) +// CLANG21-NEXT: call void @"??3@YAXPAX@Z"(ptr %[[CALL]]) // CHECK: ret void } @@ -458,3 +461,57 @@ class G { extern void testG() { G g; } + +namespace operator_delete { + +class H { virtual ~H(); + void operator delete(void *); +}; 
+H::~H() { } + +void checkH() { + new H(); +} +// DTORS: define linkonce_odr dso_local x86_thiscallcc ptr @"??_GH@operator_delete@@EAEPAXI@Z"(ptr {{[^,]*}} %this, i32 %should_call_delete) {{.*}} comdat {{.*}} { +// DTORS: store i32 %should_call_delete, ptr %[[SHOULD_DELETE_VAR:[0-9a-z._]+]], align 4 +// DTORS: store ptr %{{.*}}, ptr %[[RETVAL:retval]] +// DTORS: %[[SHOULD_DELETE_VALUE:[0-9a-z._]+]] = load i32, ptr %[[SHOULD_DELETE_VAR]] +// DTORS: call x86_thiscallcc void @"??1H@operator_delete@@EAE@XZ"(ptr {{[^,]*}} %[[THIS:[0-9a-z]+]]) +// DTORS-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 1 +// DTORS-NEXT: %[[CONDITION:[0-9]+]] = icmp eq i32 %[[AND]], 0 +// DTORS-NEXT: br i1 %[[CONDITION]], label %[[CONTINUE_LABEL:[0-9a-z._]+]], label %[[CALL_DELETE_LABEL:[0-9a-z._]+]] +// +// DTORS: [[CALL_DELETE_LABEL]] +// DTORS-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 4 +// DTORS-NEXT: %[[CONDITION1:[0-9]+]] = icmp eq i32 %[[AND]], 0 +// DTORS-NEXT: br i1 %[[CONDITION1]], label %[[CALL_CLASS_DELETE:[0-9a-z._]+]], label %[[CALL_GLOB_DELETE:[0-9a-z._]+]] +// +// DTORS: [[CALL_GLOB_DELETE]] +// DTORS-NEXT: call void @"??3@YAXPAX@Z"(ptr %[[THIS]]) +// DTORS-NEXT: br label %[[CONTINUE_LABEL]] +// +// DTORS: [[CALL_CLASS_DELETE]] +// DTORS-NEXT: call void @"??3H@operator_delete@@CAXPAX@Z"(ptr %[[THIS]]) +// DTORS-NEXT: br label %[[CONTINUE_LABEL]] +// +// DTORS: [[CONTINUE_LABEL]] +// DTORS-NEXT: %[[RET:.*]] = load ptr, ptr %[[RETVAL]] +// DTORS-NEXT: ret ptr %[[RET]] + +// CLANG21: define linkonce_odr dso_local x86_thiscallcc ptr @"??_GH@operator_delete@@EAEPAXI@Z"(ptr {{[^,]*}} %this, i32 %should_call_delete) {{.*}} comdat {{.*}} { +// CLANG21: store i32 %should_call_delete, ptr %[[SHOULD_DELETE_VAR:[0-9a-z._]+]], align 4 +// CLANG21: store ptr %{{.*}}, ptr %[[RETVAL:retval]] +// CLANG21: %[[SHOULD_DELETE_VALUE:[0-9a-z._]+]] = load i32, ptr %[[SHOULD_DELETE_VAR]] +// CLANG21: call x86_thiscallcc void @"??1H@operator_delete@@EAE@XZ"(ptr {{[^,]*}} 
%[[THIS:[0-9a-z]+]]) +// CLANG21-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 1 +// CLANG21-NEXT: %[[CONDITION:[0-9]+]] = icmp eq i32 %[[AND]], 0 +// CLANG21-NEXT: br i1 %[[CONDITION]], label %[[CONTINUE_LABEL:[0-9a-z._]+]], label %[[CALL_DELETE_LABEL:[0-9a-z._]+]] +// +// CLANG21: [[CALL_DELETE_LABEL]] +// CLANG21-NEXT: call void @"??3H@operator_delete@@CAXPAX@Z"(ptr %[[THIS:[0-9a-z]+]]) +// CLANG21-NEXT: br label %[[CONTINUE_LABEL]] +// +// CLANG21: [[CONTINUE_LABEL]] +// CLANG21-NEXT: %[[RET:.*]] = load ptr, ptr %[[RETVAL]] +// CLANG21-NEXT: ret ptr %[[RET]] +} diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl index afda714106fac..0ef4b432019bb 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=CHECK,INLINE +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,NOINLINE +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -O0 %s -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,INLINE // Make sure global variable for ctors exist for lib profile. // CHECK:@llvm.global_ctors @@ -13,7 +13,7 @@ void FirstEntry() {} // CHECK: define void @FirstEntry() // CHECK-NEXT: entry: // NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() -// NOINLINE-NEXT: call void @_Z10FirstEntryv() +// NOINLINE-NEXT: call void @FirstEntry() // Verify inlining leaves only calls to "llvm." 
intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void @@ -25,15 +25,24 @@ void SecondEntry() {} // CHECK: define void @SecondEntry() // CHECK-NEXT: entry: // NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() -// NOINLINE-NEXT: call void @_Z11SecondEntryv() +// NOINLINE-NEXT: call void @SecondEntry() // Verify inlining leaves only calls to "llvm." intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void -// Verify the constructor is alwaysinline +// Verify the constructors are alwaysinline // NOINLINE: ; Function Attrs: {{.*}}alwaysinline -// NOINLINE-NEXT: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC2EjijjPKc({{.*}} [[CtorAttr:\#[0-9]+]] +// NOINLINE-NEXT: define linkonce_odr hidden void @hlsl::RWBuffer::RWBuffer()({{.*}}){{.*}} [[CtorAttr:\#[0-9]+]] + +// NOINLINE: ; Function Attrs: {{.*}}alwaysinline +// NOINLINE-NEXT: define linkonce_odr hidden void @hlsl::RWBuffer::RWBuffer(hlsl::RWBuffer const&)({{.*}}){{.*}} [[CtorAttr]] + +// NOINLINE: ; Function Attrs: {{.*}}alwaysinline +// NOINLINE-NEXT: define linkonce_odr hidden void @hlsl::RWBuffer::RWBuffer()(ptr noundef nonnull align 4 dereferenceable(4) %this){{.*}} [[CtorAttr:\#[0-9]+]] + +// NOINLINE: ; Function Attrs: {{.*}}alwaysinline +// NOINLINE-NEXT: define linkonce_odr hidden void @hlsl::RWBuffer::RWBuffer(hlsl::RWBuffer const&)({{.*}}){{.*}} [[CtorAttr]] // NOINLINE: ; Function Attrs: {{.*}}alwaysinline // NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[InitAttr:\#[0-9]+]] diff --git a/clang/test/CodeGenHLSL/builtins/select.hlsl b/clang/test/CodeGenHLSL/builtins/select.hlsl index 196b8a90cd877..7590b4a881259 100644 --- a/clang/test/CodeGenHLSL/builtins/select.hlsl +++ b/clang/test/CodeGenHLSL/builtins/select.hlsl @@ -10,14 +10,26 @@ int test_select_bool_int(bool cond0, int tVal, int fVal) { } struct S { int a; }; -// CHECK-LABEL: test_select_infer -// CHECK: [[SELECT:%.*]] = select 
i1 {{%.*}}, ptr {{%.*}}, ptr {{%.*}} -// CHECK: store ptr [[SELECT]] +// CHECK-LABEL: test_select_infer_struct +// CHECK: [[TRUE_VAL:%.*]] = load %struct.S, ptr {{%.*}}, align 1 +// CHECK: [[FALSE_VAL:%.*]] = load %struct.S, ptr {{%.*}}, align 1 +// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, %struct.S [[TRUE_VAL]], %struct.S [[FALSE_VAL]] +// CHECK: store %struct.S [[SELECT]], ptr {{%.*}}, align 1 // CHECK: ret void -struct S test_select_infer(bool cond0, struct S tVal, struct S fVal) { +struct S test_select_infer_struct(bool cond0, struct S tVal, struct S fVal) { return select(cond0, tVal, fVal); } +// CHECK-LABEL: test_select_infer_array +// CHECK: [[TRUE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 +// CHECK: [[FALSE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 +// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, [3 x i32] [[TRUE_VAL]], [3 x i32] [[FALSE_VAL]] +// CHECK: store [3 x i32] [[SELECT]], ptr {{%.*}}, align 4 +// CHECK: ret void +int test_select_infer_array(bool cond, int tVal[3], int fVal[3])[3] { + return select(cond, tVal, fVal); +} + // CHECK-LABEL: test_select_bool_vector // CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] diff --git a/clang/test/CodeGenHLSL/resources/ByteAddressBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/resources/ByteAddressBuffers-constructors.hlsl index 5db156ed325da..9354ee714f86e 100644 --- a/clang/test/CodeGenHLSL/resources/ByteAddressBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/resources/ByteAddressBuffers-constructors.hlsl @@ -1,6 +1,8 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | \ +// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL // FIXME: SPIR-V codegen of llvm.spv.resource.handlefrombinding and resource types is 
not yet implemented -// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | \ +// llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV // NOTE: Itanium ABI for C++ requires Clang to generate 2 constructors types to support polymorphism: // - C1 - Complete object constructor - constructs the complete object, including virtual base classes. @@ -23,69 +25,61 @@ export void foo() { // CHECK: %"class.hlsl::RWByteAddressBuffer" = type { target("dx.RawBuffer", i8, 1, 0) } // CHECK: %"class.hlsl::RasterizerOrderedByteAddressBuffer" = type { target("dx.RawBuffer", i8, 1, 1) } -// CHECK: @_ZL4Buf1 = internal global %"class.hlsl::ByteAddressBuffer" poison, align 4 +// CHECK: @Buf1 = internal global %"class.hlsl::ByteAddressBuffer" poison, align 4 // CHECK: @[[Buf1Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf1\00", align 1 -// CHECK: @_ZL4Buf2 = internal global %"class.hlsl::RWByteAddressBuffer" poison, align 4 +// CHECK: @Buf2 = internal global %"class.hlsl::RWByteAddressBuffer" poison, align 4 // CHECK: @[[Buf2Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf2\00", align 1 -// Buf1 initialization part 1 - global init function that calls ByteAddressBuffer C1 constructor with explicit binding +// Buf1 initialization part 1 - global init function that calls ByteAddressBuffer::__createFromBinding // CHECK: define internal void @__cxx_global_var_init() // CHECK-NEXT: entry: -// CHECK-NEXT: call void @_ZN4hlsl17ByteAddressBufferC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf1, -// CHECK-SAME: i32 noundef 1, i32 noundef 2, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]]) - -// Buf1 initialization part 2 - body of ByteAddressBuffer C1 constructor with explicit binding that calls the C2 constructor -// CHECK: define linkonce_odr 
hidden void @_ZN4hlsl17ByteAddressBufferC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) -// CHECK: call void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) -// CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) - -// Buf2 initialization part 1 - global init function that calls RWByteAddressBuffer C1 constructor with implicit binding +// CHECK-NEXT: call void @hlsl::ByteAddressBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} @Buf1, i32 noundef 1, i32 noundef 2, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]]) + +// Buf1 initialization part 2 - body of ByteAddressBuffer::__createFromBinding +// CHECK: define {{.*}} void @hlsl::ByteAddressBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::ByteAddressBuffer") align 4 %[[RetValue1:.*]], i32 noundef %registerNo, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) +// CHECK: %[[Tmp1:.*]] = alloca %"class.hlsl::ByteAddressBuffer", align 4 +// CHECK: %[[Handle1:.*]] = call target("dx.RawBuffer", i8, 0, 0) +// CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t( +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::ByteAddressBuffer", ptr %[[Tmp1]], i32 0, i32 0 +// CHECK-DXIL: store target("dx.RawBuffer", i8, 0, 0) %[[Handle1]], ptr %__handle, align 4 +// CHECK: call void @hlsl::ByteAddressBuffer::ByteAddressBuffer(hlsl::ByteAddressBuffer const&)(ptr {{.*}} %[[RetValue1]], ptr {{.*}} %[[Tmp1]]) + +// Buf2 initialization part 1 - global init function that calls RWByteAddressBuffer::__createFromImplicitBinding // CHECK: define internal void 
@__cxx_global_var_init.1() // CHECK-NEXT: entry: -// CHECK-NEXT: call void @_ZN4hlsl19RWByteAddressBufferC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2, -// CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]]) - -// Buf2 initialization part 2 - body of RWByteAddressBuffer C1 constructor with implicit binding that calls the C2 constructor -// CHECK: define linkonce_odr hidden void @_ZN4hlsl19RWByteAddressBufferC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) -// CHECK: call void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this1, -// CHECK-SAME: i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) +// CHECK-NEXT: call void @hlsl::RWByteAddressBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} @Buf2, i32 noundef 0, i32 noundef 0, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf2Str]]) + +// Buf2 initialization part 2 - body of RWByteAddressBuffer::__createFromImplicitBinding +// CHECK: define hidden void @hlsl::RWByteAddressBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWByteAddressBuffer") align 4 %[[RetValue2:.*]], i32 noundef %orderId, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) +// CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::RWByteAddressBuffer", align 4 +// CHECK: %[[Handle2:.*]] = call target("dx.RawBuffer", i8, 1, 0) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_i8_1_0t( +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWByteAddressBuffer", ptr %[[Tmp2]], i32 0, i32 0 +// CHECK-DXIL: store 
target("dx.RawBuffer", i8, 1, 0) %[[Handle2]], ptr %__handle, align 4 +// CHECK: call void @hlsl::RWByteAddressBuffer::RWByteAddressBuffer(hlsl::RWByteAddressBuffer const&)(ptr {{.*}} %[[RetValue2]], ptr {{.*}} %[[Tmp2]]) // Buf3 initialization part 1 - local variable declared in function foo() is initialized by // RasterizerOrderedByteAddressBuffer C1 default constructor -// CHECK: define void @_Z3foov() +// CHECK: define void @foo() // CHECK-NEXT: entry: // CHECK-NEXT: %Buf3 = alloca %"class.hlsl::RasterizerOrderedByteAddressBuffer", align 4 -// CHECK-NEXT: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3) +// CHECK-NEXT: call void @hlsl::RasterizerOrderedByteAddressBuffer::RasterizerOrderedByteAddressBuffer()(ptr {{.*}} %Buf3) // Buf3 initialization part 2 - body of RasterizerOrderedByteAddressBuffer default C1 constructor that // calls the default C2 constructor -// CHECK: define linkonce_odr hidden void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) +// CHECK: define linkonce_odr hidden void @hlsl::RasterizerOrderedByteAddressBuffer::RasterizerOrderedByteAddressBuffer()(ptr {{.*}} %this) +// CHECK: call void @hlsl::RasterizerOrderedByteAddressBuffer::RasterizerOrderedByteAddressBuffer()(ptr {{.*}} %this1) // CHECK-NEXT: ret void -// Buf1 initialization part 3 - ByteAddressBuffer C2 constructor with explicit binding that initializes -// handle with @llvm.dx.resource.handlefrombinding -// CHECK: define linkonce_odr hidden void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) -// CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 0, 0) 
@llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t( -// CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, ptr %{{.*}}) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::ByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0 -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", i8, 0, 0) %[[HANDLE]], ptr %__handle, align 4 - -// Buf2 initialization part 3 - body of RWByteAddressBuffer C2 constructor with implicit binding that initializes -// handle with @llvm.dx.resource.handlefromimplicitbinding -// CHECK: define linkonce_odr hidden void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) -// CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_i8_1_0t -// CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, ptr %{{.*}}) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWByteAddressBuffer", ptr %this1, i32 0, i32 0 -// CHECK-NEXT: store target("dx.RawBuffer", i8, 1, 0) %[[HANDLE]], ptr %__handle, align 4 - // Buf3 initialization part 3 - body of RasterizerOrderedByteAddressBuffer default C2 constructor that // initializes handle to poison -// CHECK: define linkonce_odr hidden void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @hlsl::RasterizerOrderedByteAddressBuffer::RasterizerOrderedByteAddressBuffer()(ptr {{.*}} %this) // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0 // CHECK: store target("dx.RawBuffer", i8, 1, 1) poison, ptr %__handle, align 4 diff --git a/clang/test/CodeGenHLSL/resources/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/resources/RWBuffer-constructor.hlsl index 
127498460b039..ca33c4220dd73 100644 --- a/clang/test/CodeGenHLSL/resources/RWBuffer-constructor.hlsl +++ b/clang/test/CodeGenHLSL/resources/RWBuffer-constructor.hlsl @@ -1,6 +1,8 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | \ +// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL // FIXME: SPIR-V codegen of llvm.spv.resource.handlefrombinding and resource types is not yet implemented -// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | \ +// llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV // NOTE: Itanium ABI for C++ requires Clang to generate 2 constructors types to support polymorphism: // - C1 - Complete object constructor - constructs the complete object, including virtual base classes. 
@@ -23,65 +25,57 @@ export void foo() { // CHECK: %"class.hlsl::RWBuffer.0" = type { target("dx.TypedBuffer", double, 1, 0, 0) } // CHECK: %"class.hlsl::RWBuffer.1" = type { target("dx.TypedBuffer", i32, 1, 0, 1) } -// CHECK: @_ZL4Buf1 = internal global %"class.hlsl::RWBuffer" poison, align 4 +// CHECK: @Buf1 = internal global %"class.hlsl::RWBuffer" poison, align 4 // CHECK: @[[Buf1Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf1\00", align 1 -// CHECK: @_ZL4Buf2 = internal global %"class.hlsl::RWBuffer.0" poison, align 4 +// CHECK: @Buf2 = internal global %"class.hlsl::RWBuffer.0" poison, align 4 // CHECK: @[[Buf2Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf2\00", align 1 -// Buf1 initialization part 1 - global init function that calls RWBuffer C1 constructor with explicit binding +// Buf1 initialization part 1 - global init function that calls RWBuffer::__createFromBinding // CHECK: define internal void @__cxx_global_var_init() // CHECK-NEXT: entry: -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf1, -// CHECK-SAME: i32 noundef 5, i32 noundef 3, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]]) - -// Buf1 initialization part 2 - body of RWBuffer C1 constructor with explicit binding that calls the C2 constructor -// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) -// CHECK: call void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) -// CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) - -// Buf2 initialization part 1 - global init function that calls RWBuffer C1 constructor with implicit binding +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, 
unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 @Buf1, i32 noundef 5, i32 noundef 3, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]]) + +// Buf1 initialization part 2 - body of RWBuffer::__createFromBinding +// CHECK: define {{.*}} void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[RetValue1:.*]], i32 noundef %registerNo, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) +// CHECK: %[[Tmp1:.*]] = alloca %"class.hlsl::RWBuffer", align 4 +// CHECK: %[[Handle1:.*]] = call target("dx.TypedBuffer", float, 1, 0, 0) +// CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t( +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %[[Tmp1]], i32 0, i32 0 +// CHECK-DXIL: store target("dx.TypedBuffer", float, 1, 0, 0) %[[Handle1]], ptr %__handle, align 4 +// CHECK: call void @hlsl::RWBuffer::RWBuffer(hlsl::RWBuffer const&)(ptr {{.*}} %[[RetValue1]], ptr {{.*}} %[[Tmp1]]) + +// Buf2 initialization part 1 - global init function that RWBuffer::__createFromImplicitBinding // CHECK: define internal void @__cxx_global_var_init.1() // CHECK-NEXT: entry: -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2, -// CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]]) - -// Buf2 initialization part 2 - body of RWBuffer C1 constructor with implicit binding that calls the C2 constructor -// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) -// CHECK: call void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 
dereferenceable(4) -// CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} @Buf2, i32 noundef 0, i32 noundef 0, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf2Str]]) + +// Buf2 initialization part 2 - body of RWBuffer::__createFromImplicitBinding call +// CHECK: define linkonce_odr hidden void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align 4 %[[RetValue2:.*]], i32 noundef %orderId, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) +// CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::RWBuffer.0", align 4 +// CHECK: %[[Handle2:.*]] = call target("dx.TypedBuffer", double, 1, 0, 0) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.TypedBuffer_f64_1_0_0t( +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.0", ptr %[[Tmp2]], i32 0, i32 0 +// CHECK-DXIL: store target("dx.TypedBuffer", double, 1, 0, 0) %[[Handle2]], ptr %__handle, align 4 +// CHECK: call void @hlsl::RWBuffer::RWBuffer(hlsl::RWBuffer const&)(ptr {{.*}} %[[RetValue2]], ptr {{.*}} %[[Tmp2]]) // Buf3 initialization part 1 - local variable declared in function foo() is initialized by RWBuffer C1 default constructor -// CHECK: define void @_Z3foov() +// CHECK: define void @foo() // CHECK-NEXT: entry: // CHECK-NEXT: %Buf3 = alloca %"class.hlsl::RWBuffer.1", align 4 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3) +// CHECK-NEXT: call void @hlsl::RWBuffer::RWBuffer()(ptr {{.*}} %Buf3) // Buf3 initialization part 2 - body of RWBuffer default C1 constructor that calls the default C2 constructor -// CHECK: define linkonce_odr hidden 
void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK: call void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) - -// Buf1 initialization part 3 - body of RWBuffer C2 constructor with explicit binding that initializes -// handle with @llvm.dx.resource.handlefrombinding -// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) -// CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t( -// CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, ptr %{{.*}}) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %{{.*}}, i32 0, i32 0 -// CHECK-DXIL-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %[[HANDLE]], ptr %__handle, align 4 - -// Buf2 initialization part 3 - body of RWBuffer C2 constructor with implicit binding that initializes -// handle with @llvm.dx.resource.handlefromimplicitbinding -// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) -// CHECK: %[[HANDLE:.*]] = call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.TypedBuffer_f64_1_0_0t -// CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, ptr %{{.*}}) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.0", ptr %{{.*}}, i32 0, i32 0 -// CHECK-NEXT: store target("dx.TypedBuffer", double, 1, 0, 0) %[[HANDLE]], ptr %__handle, align 4 +// CHECK: define linkonce_odr hidden void @hlsl::RWBuffer::RWBuffer()(ptr {{.*}} %this) +// CHECK: call void 
@hlsl::RWBuffer::RWBuffer()(ptr {{.*}} %{{.*}}) // Buf3 initialization part 3 - body of RWBuffer default C2 constructor that initializes handle to poison -// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @hlsl::RWBuffer::RWBuffer()(ptr {{.*}} %this) // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.1", ptr %{{.*}}, i32 0, i32 0 // CHECK-NEXT: store target("dx.TypedBuffer", i32, 1, 0, 1) poison, ptr %__handle, align 4 diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl index 91410e600c6e0..4f005eab5c71a 100644 --- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl @@ -1,6 +1,8 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | \ +// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL // FIXME: SPIR-V codegen of llvm.spv.resource.handlefrombinding and resource types is not yet implemented -// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | \ +// llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV // NOTE: Itanium ABI for C++ requires Clang to generate 2 constructors types to support polymorphism: // - C1 - Complete object constructor - constructs the complete object, including virtual base classes. 
@@ -18,77 +20,68 @@ export void foo() { AppendStructuredBuffer Buf3; } -// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } +// CHECK-DXIL: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) } +// CHECK-DXIL: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } +// CHECK-DXIL: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK: @_ZL4Buf1 = internal global %"class.hlsl::StructuredBuffer" poison, align 4 +// CHECK: @Buf1 = internal global %"class.hlsl::StructuredBuffer" poison, align 4 // CHECK: @[[Buf1Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf1\00", align 1 -// CHECK: @_ZL4Buf2 = internal global %"class.hlsl::RWStructuredBuffer" poison, align 4 +// CHECK: @Buf2 = internal global %"class.hlsl::RWStructuredBuffer" poison, align 4 // CHECK: @[[Buf2Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf2\00", align 1 -// Buf1 initialization part 1 - global init function that calls StructuredBuffer C1 constructor +// Buf1 initialization part 1 - global init function that calls StructuredBuffer::__createFromBinding // with explicit binding // CHECK: define internal void @__cxx_global_var_init() // CHECK-NEXT: entry: -// CHECK-NEXT: call void @_ZN4hlsl16StructuredBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf1, -// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]]) - -// Buf1 initialization part 2 - body of StructuredBuffer C1 constructor with explicit binding -// that calls the C2 constructor -// CHECK: define linkonce_odr hidden void @_ZN4hlsl16StructuredBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef 
%registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) -// CHECK: call void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) -// CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) - -// Buf2 initialization part 1 - global init function that calls RWStructuredBuffer C1 constructor with -// implicit binding +// CHECK-NEXT: call void @hlsl::StructuredBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} @Buf1, i32 noundef 10, i32 noundef 2, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]]) + +// Buf1 initialization part 2 - body of StructuredBuffer::::__createFromBinding + +// CHECK: define {{.*}} void @hlsl::StructuredBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: ptr {{.*}} sret(%"class.hlsl::StructuredBuffer") align 4 %[[RetValue1:.*]], i32 noundef %registerNo, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) +// CHECK: %[[Tmp1:.*]] = alloca %"class.hlsl::StructuredBuffer", align 4 +// CHECK-DXIL: %[[Handle1:.*]] = call target("dx.RawBuffer", float, 0, 0) +// CHECK-DXIL-SAME: @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t( +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %[[Tmp1]], i32 0, i32 0 +// CHECK-DXIL: store target("dx.RawBuffer", float, 0, 0) %[[Handle1]], ptr %__handle, align 4 +// CHECK: call void @hlsl::StructuredBuffer::StructuredBuffer(hlsl::StructuredBuffer const&)(ptr {{.*}} %[[RetValue1]], ptr {{.*}} %[[Tmp1]]) + +// Buf2 initialization part 1 - global init function that calls RWStructuredBuffer::__createFromImplicitBinding // CHECK: define internal void @__cxx_global_var_init.1() // CHECK-NEXT: entry: -// CHECK-NEXT: call void @_ZN4hlsl18RWStructuredBufferIfEC1EjijjPKc(ptr 
noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2, -// CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]]) - -// Buf2 initialization part 2 - body of RWStructuredBuffer C1 constructor with implicit binding that calls the C2 constructor -// CHECK: define linkonce_odr hidden void @_ZN4hlsl18RWStructuredBufferIfEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) -// CHECK: call void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) -// CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) +// CHECK-NEXT: call void @hlsl::RWStructuredBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} @Buf2, i32 noundef 0, i32 noundef 0, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf2Str]]) + +// Buf2 initialization part 2 - body of RWStructuredBuffer::__createFromImplicitBinding +// CHECK: define linkonce_odr hidden void @hlsl::RWStructuredBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWStructuredBuffer") align 4 %[[RetValue2:.*]], i32 noundef %orderId, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) +// CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::RWStructuredBuffer", align 4 +// CHECK-DXIL: %[[Handle2:.*]] = call target("dx.RawBuffer", float, 1, 0) +// CHECK-DXIL-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_f32_1_0t( +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %[[Tmp2]], i32 0, i32 0 +// CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) %[[Handle2]], ptr %__handle, align 4 +// CHECK: call void 
@hlsl::RWStructuredBuffer::RWStructuredBuffer(hlsl::RWStructuredBuffer const&)(ptr {{.*}} %[[RetValue2]], ptr {{.*}} %[[Tmp2]]) // Buf3 initialization part 1 - local variable declared in function foo() is initialized by // AppendStructuredBuffer C1 default constructor -// CHECK: define void @_Z3foov() +// CHECK: define void @foo() // CHECK-NEXT: entry: // CHECK-NEXT: %Buf3 = alloca %"class.hlsl::AppendStructuredBuffer", align 4 -// CHECK-NEXT: call void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3) +// CHECK-NEXT: call void @hlsl::AppendStructuredBuffer::AppendStructuredBuffer()(ptr {{.*}} %Buf3) // Buf3 initialization part 2 - body of AppendStructuredBuffer default C1 constructor that calls // the default C2 constructor -// CHECK: define linkonce_odr hidden void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK: call void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) - -// Buf1 initialization part 3 - body of AppendStructuredBuffer C2 constructor with explicit binding -// that initializes handle with @llvm.dx.resource.handlefrombinding -// CHECK: define linkonce_odr hidden void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) -// CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t( -// CHECK-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, ptr %{{.*}}) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %{{.*}}, i32 0, i32 0 -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 0, 0) %[[HANDLE]], ptr %__handle, align 4 - -// Buf2 initialization part 3 - body of RWStructuredBuffer C2 constructor with implicit binding that 
initializes -// handle with @llvm.dx.resource.handlefromimplicitbinding -// CHECK: define linkonce_odr hidden void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, -// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) -// CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_f32_1_0t -// CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, ptr %{{.*}}) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 -// CHECK-NEXT: store target("dx.RawBuffer", float, 1, 0) %[[HANDLE]], ptr %__handle, align 4 +// CHECK: define linkonce_odr hidden void @hlsl::StructuredBuffer::StructuredBuffer()(ptr {{.*}} %this) +// CHECK: call void @hlsl::StructuredBuffer::StructuredBuffer()(ptr {{.*}} %this1) // Buf3 initialization part 3 - body of AppendStructuredBuffer default C2 constructor that // initializes handle to poison -// CHECK: define linkonce_odr hidden void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @hlsl::StructuredBuffer::StructuredBuffer()(ptr {{.*}} %this) // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 -// CHECK: store target("dx.RawBuffer", float, 1, 0) poison, ptr %__handle, align 4 +// CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) poison, ptr %__handle, align 4 // Module initialization // CHECK: define internal void @_GLOBAL__sub_I_StructuredBuffers_constructors.hlsl() diff --git a/clang/test/CodeGenHLSL/resources/res-array-global-dyn-index.hlsl b/clang/test/CodeGenHLSL/resources/res-array-global-dyn-index.hlsl index bbd162e3aad20..f17cf12945e4a 100644 --- a/clang/test/CodeGenHLSL/resources/res-array-global-dyn-index.hlsl +++ 
b/clang/test/CodeGenHLSL/resources/res-array-global-dyn-index.hlsl @@ -1,17 +1,15 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-compute -finclude-default-header \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s // CHECK: @[[BufA:.*]] = private unnamed_addr constant [2 x i8] c"A\00", align 1 RWBuffer A[4][3] : register(u2); RWStructuredBuffer Out; -// Make sure A[GI.x][GI.y] is translated to a RWBuffer constructor call with range 12 and dynamically calculated index +// Make sure A[GI.x][GI.y] is translated to a RWBuffer::__createFromBinding call +// with range 12 and dynamically calculated index -// NOTE: -// Constructor call for explicit binding has "jjij" in the mangled name and the arguments are (register, space, range_size, index, name). - -// CHECK: define internal void @_Z4mainDv3_j(<3 x i32> noundef %GI) +// CHECK: define internal void @main(unsigned int vector[3])(<3 x i32> noundef %GI) // CHECK: %[[GI_alloca:.*]] = alloca <3 x i32>, align 16 // CHECK: %[[Tmp0:.*]] = alloca %"class.hlsl::RWBuffer // CHECK: store <3 x i32> %GI, ptr %[[GI_alloca]] @@ -22,7 +20,9 @@ RWStructuredBuffer Out; // CHECK: %[[GI_x:.*]] = extractelement <3 x i32> %[[GI]], i32 0 // CHECK: %[[Tmp1:.*]] = mul i32 %[[GI_x]], 3 // CHECK: %[[Index:.*]] = add i32 %[[GI_y]], %[[Tmp1]] -// CHECK: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Tmp0]], i32 noundef 2, i32 noundef 0, i32 noundef 12, i32 noundef %[[Index]], ptr noundef @A.str) +// CHECK: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Tmp0]], +// CHECK-SAME: i32 noundef 2, i32 noundef 0, i32 noundef 12, i32 noundef %[[Index]], ptr noundef @A.str) [numthreads(4,1,1)] void main(uint3 GI : SV_GroupThreadID) { Out[0] = A[GI.x][GI.y][0]; diff --git 
a/clang/test/CodeGenHLSL/resources/res-array-global-multi-dim.hlsl b/clang/test/CodeGenHLSL/resources/res-array-global-multi-dim.hlsl index 36871f9a63b3f..1a05897b9e70b 100644 --- a/clang/test/CodeGenHLSL/resources/res-array-global-multi-dim.hlsl +++ b/clang/test/CodeGenHLSL/resources/res-array-global-multi-dim.hlsl @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-compute -finclude-default-header \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s // CHECK: @[[BufB:.*]] = private unnamed_addr constant [2 x i8] c"B\00", align 1 // CHECK: @[[BufC:.*]] = private unnamed_addr constant [2 x i8] c"C\00", align 1 @@ -17,30 +17,30 @@ RWStructuredBuffer Out; [numthreads(4,1,1)] void main() { - // CHECK: define internal{{.*}} void @_Z4mainv() + // CHECK: define internal{{.*}} void @main() // CHECK: %[[Tmp0:.*]] = alloca %"class.hlsl::RWBuffer // CHECK: %[[Tmp1:.*]] = alloca %"class.hlsl::RWBuffer // CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::RWBuffer // CHECK: %[[Tmp3:.*]] = alloca %"class.hlsl::RWBuffer - // NOTE: - // Constructor call for explicit binding has "jjij" in the mangled name and the arguments are (register, space, range_size, index, name). - // For implicit binding the constructor has "jijj" in the mangled name and the arguments are (space, range_size, index, order_id, name). - // The range_size can be -1 for unbounded arrays, and that is the only signed int in the signature. - // The order_id argument is a sequential number that is assigned to resources with implicit binding and corresponds to the order in which - // the resources were declared. 
It is needed because implicit bindings are assigned later on in an LLVM pass that needs to know the order - // of the resource declarations. - - // Make sure that B[3][2] is translated to a RWBuffer constructor call for explicit binding (u2, space0) with range 16 and index 14 - // CHECK: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Tmp0]], i32 noundef 2, i32 noundef 0, i32 noundef 16, i32 noundef 14, ptr noundef @[[BufB]]) - - // Make sure that C[1][0][3] is translated to a RWBuffer constructor call for explicit binding (u10, space1) with range 20 and index 13 - // CHECK: call void @_ZN4hlsl8RWBufferIiEC1EjjijPKc(ptr {{.*}} %[[Tmp1]], i32 noundef 10, i32 noundef 1, i32 noundef 20, i32 noundef 13, ptr noundef @[[BufC]]) - - // Make sure that D[9][2] is translated to a RWBuffer constructor call for implicit binding (u18, space0) with range 50 and index 47 - // CHECK: call void @_ZN4hlsl8RWBufferIjEC1EjijjPKc(ptr {{.*}} %[[Tmp2]], i32 noundef 0, i32 noundef 50, i32 noundef 47, i32 noundef 0, ptr noundef @[[BufD]]) - - // Make sure that the second B[3][2] is translated to the same a RWBuffer constructor call as the first B[3][2] subscript - // CHECK: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Tmp3]], i32 noundef 2, i32 noundef 0, i32 noundef 16, i32 noundef 14, ptr noundef @[[BufB]]) + // Make sure that B[3][2] is translated to a RWBuffer::__createFromBinding call (u2, space0) with range 16 and index 14 + // CHECK: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Tmp0]], + // CHECK-SAME: i32 noundef 2, i32 noundef 0, i32 noundef 16, i32 noundef 14, ptr noundef @[[BufB]]) + + // Make sure that C[1][0][3] is translated to a RWBuffer::__createFromBinding call (u10, space1) with range 20 and index 13 + // CHECK: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char 
const*) + // CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align {{(4|8)}} %[[Tmp1]], + // CHECK-SAME: i32 noundef 10, i32 noundef 1, i32 noundef 20, i32 noundef 13, ptr noundef @[[BufC]]) + + // Make sure that D[9][2] is translated to a RWBuffer::__createFromImplicitBinding call with range 50 and index 47 + // CHECK: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.1") align {{(4|8)}} %[[Tmp2]], + // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef 50, i32 noundef 47, ptr noundef @[[BufD]]) + + // Make sure that the second B[3][2] is translated to the same RWBuffer::__createFromBinding call as the first B[3][2] subscript + // CHECK: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // CHECK-SAME: (ptr {{.*}} writable sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Tmp3]], + // CHECK-SAME: i32 noundef 2, i32 noundef 0, i32 noundef 16, i32 noundef 14, ptr noundef @[[BufB]]) Out[0] = B[3][2][0] + (float)C[1][0][3][0] + (float)D[9][2][0] + B[3][2][1]; } diff --git a/clang/test/CodeGenHLSL/resources/res-array-global-subarray-many.hlsl b/clang/test/CodeGenHLSL/resources/res-array-global-subarray-many.hlsl index 7c52c7116f3d9..036feec39f1dd 100644 --- a/clang/test/CodeGenHLSL/resources/res-array-global-subarray-many.hlsl +++ b/clang/test/CodeGenHLSL/resources/res-array-global-subarray-many.hlsl @@ -1,28 +1,24 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-compute -finclude-default-header \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s // CHECK: @[[BufA:.*]] = private unnamed_addr constant [2 x i8] c"A\00", align 1 RWBuffer A[5][4][3][2] : register(u10, space2); RWStructuredBuffer Out; -// CHECK: define {{.*}} float @_Z3fooA3_A2_N4hlsl8RWBufferIfEE(ptr noundef byval([3 x [2 x 
%"class.hlsl::RWBuffer"]]) align 4 %Arr) +// CHECK: define {{.*}} float @foo(hlsl::RWBuffer [3][2]) +// CHECK-SAME: (ptr noundef byval([3 x [2 x %"class.hlsl::RWBuffer"]]) align 4 %Arr) // CHECK-NEXT: entry: float foo(RWBuffer Arr[3][2]) { // CHECK-NEXT: %[[Arr_1_Ptr:.*]] = getelementptr inbounds [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %Arr, i32 0, i32 1 // CHECK-NEXT: %[[Arr_1_0_Ptr:.*]] = getelementptr inbounds [2 x %"class.hlsl::RWBuffer"], ptr %[[Arr_1_Ptr]], i32 0, i32 0 -// CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr @_ZN4hlsl8RWBufferIfEixEj(ptr {{.*}} %[[Arr_1_0_Ptr]], i32 noundef 0) +// CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr @hlsl::RWBuffer::operator[](unsigned int)(ptr {{.*}} %[[Arr_1_0_Ptr]], i32 noundef 0) // CHECK-NEXT: %[[Value:.*]] = load float, ptr %[[BufPtr]], align 4 // CHECK-NEXT: ret float %[[Value]] return Arr[1][0][0]; } -// NOTE: -// - _ZN4hlsl8RWBufferIfEC1EjjijPKc is the constructor call for explicit binding -// (has "jjij" in the mangled name) and the arguments are (register, space, range_size, index, name). 
-// - _ZN4hlsl8RWBufferIfEixEj is the subscript operator for RWBuffer - -// CHECK: define internal void @_Z4mainj(i32 noundef %GI) +// CHECK: define internal void @main(unsigned int)(i32 noundef %GI) // CHECK-NEXT: entry: // CHECK-NEXT: %[[GI_alloca:.*]] = alloca i32, align 4 // CHECK-NEXT: %Sub = alloca [3 x [2 x %"class.hlsl::RWBuffer"]], align 4 @@ -35,35 +31,53 @@ float foo(RWBuffer Arr[3][2]) { [numthreads(4,1,1)] void main(uint GI : SV_GroupThreadID) { // Codegen for "A[4][1]" - create local array [[Tmp0]] of size 3 x 2 and initialize -// each element by a call to the resource constructor +// each element by a call to RWBuffer::__createFromBinding // The resource index for A[4][1][0][0] is 102 = 4 * (4 * 3 * 2) + 1 * (3 * 2) // (index in the resource array as if it was flattened) // CHECK-NEXT: %[[Ptr_Tmp0_0_0:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %tmp, i32 0, i32 0, i32 0 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp0_0_0]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 102, ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp0_0_0]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 102, ptr noundef @[[BufA]]) + // CHECK-NEXT: %[[Ptr_Tmp0_0_1:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %tmp, i32 0, i32 0, i32 1 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp0_0_1]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 103, ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp0_0_1]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 103, ptr 
noundef @[[BufA]]) + // CHECK-NEXT: %[[Ptr_Tmp0_1_0:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %tmp, i32 0, i32 1, i32 0 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp0_1_0]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 104, ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp0_1_0]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 104, ptr noundef @[[BufA]]) + // CHECK-NEXT: %[[Ptr_Tmp0_1_1:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %tmp, i32 0, i32 1, i32 1 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp0_1_1]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 105, ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp0_1_1]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 105, ptr noundef @[[BufA]]) + // CHECK-NEXT: %[[Ptr_Tmp0_2_0:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %tmp, i32 0, i32 2, i32 0 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp0_2_0]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 106, ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp0_2_0]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 106, ptr noundef @[[BufA]]) + // CHECK-NEXT: %[[Ptr_Tmp0_2_1:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %tmp, i32 0, i32 2, i32 1 -// CHECK-NEXT: call void 
@_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp0_2_1]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 107, ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp0_2_1]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef 107, ptr noundef @[[BufA]]) + // After this Tmp0 values are copied to %Sub using the standard array loop initializaion // (generated from ArrayInitLoopExpr AST node) RWBuffer Sub[3][2] = A[4][1]; // CHECK: %[[Ptr_Sub_2:.*]] = getelementptr inbounds [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %Sub, i32 0, i32 2 // CHECK: %[[Ptr_Sub_2_1:.*]] = getelementptr inbounds [2 x %"class.hlsl::RWBuffer"], ptr %[[Ptr_Sub_2]], i32 0, i32 1 -// CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr @_ZN4hlsl8RWBufferIfEixEj(ptr {{.*}} %[[Ptr_Sub_2_1]], i32 noundef 0) +// CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr @hlsl::RWBuffer::operator[](unsigned int)(ptr {{.*}} %[[Ptr_Sub_2_1]], i32 noundef 0) // CHECK-NEXT: %[[Sub_2_1_0_Value:.*]] = load float, ptr %[[BufPtr]], align 4 // CHECK-NEXT: store float %[[Sub_2_1_0_Value]], ptr %a, align 4 float a = Sub[2][1][0]; // Codegen for "foo(A[2][GI])" - create local array [[Tmp2]] of size 3 x 2 and initialize -// each element by a call to the resource constructor with dynamic index, and then -// copy-in the array as an argument of "foo" +// each element by a call to the RWBuffer::__createFromBinding with dynamic index, +// and then copy-in the array as an argument of "foo" // Calculate the resource index for A[2][GI][0][0] (index in the resource array as if it was flattened) // The index is 2 * (4 * 3 * 2) + GI * (3 * 2) = 48 + GI * 6 @@ -73,35 +87,48 @@ void main(uint GI : SV_GroupThreadID) { // A[2][GI][0][0] // CHECK-NEXT: %[[Ptr_Tmp2_0_0:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %[[Tmp2]], i32 0, i32 
0, i32 0 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp2_0_0]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_0_0]], ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp2_0_0]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_0_0]], ptr noundef @[[BufA]]) // A[2][GI][0][1] // CHECK-NEXT: %[[Index_A_2_GI_0_1:.*]] = add i32 %[[Index_A_2_GI_0_0]], 1 // CHECK-NEXT: %[[Ptr_Tmp2_0_1:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %[[Tmp2]], i32 0, i32 0, i32 1 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp2_0_1]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_0_1]], ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp2_0_1]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_0_1]], ptr noundef @[[BufA]]) // A[2][GI][1][0] // CHECK-NEXT: %[[Index_A_2_GI_1_0:.*]] = add i32 %[[Index_A_2_GI_0_1]], 1 // CHECK-NEXT: %[[Ptr_Tmp2_1_0:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %[[Tmp2]], i32 0, i32 1, i32 0 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp2_1_0]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_1_0]], ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp2_1_0]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_1_0]], ptr 
noundef @[[BufA]]) // A[2][GI][1][1] // CHECK-NEXT: %[[Index_A_2_GI_1_1:.*]] = add i32 %[[Index_A_2_GI_1_0]], 1 // CHECK-NEXT: %[[Ptr_Tmp2_1_1:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %[[Tmp2]], i32 0, i32 1, i32 1 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp2_1_1]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_1_1]], ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp2_1_1]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_1_1]], ptr noundef @[[BufA]]) // A[2][GI][2][0] // CHECK-NEXT: %[[Index_A_2_GI_2_0:.*]] = add i32 %[[Index_A_2_GI_1_1]], 1 // CHECK-NEXT: %[[Ptr_Tmp2_2_0:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %[[Tmp2]], i32 0, i32 2, i32 0 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp2_2_0]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_2_0]], ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp2_2_0]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_2_0]], ptr noundef @[[BufA]]) // A[2][GI][2][1] // CHECK-NEXT: %[[Index_A_2_GI_2_1:.*]] = add i32 %[[Index_A_2_GI_2_0]], 1 // CHECK-NEXT: %[[Ptr_Tmp2_2_1:.*]] = getelementptr [3 x [2 x %"class.hlsl::RWBuffer"]], ptr %[[Tmp2]], i32 0, i32 2, i32 1 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp2_2_1]], i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_2_1]], ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned 
int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp2_2_1]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 120, i32 noundef %[[Index_A_2_GI_2_1]], ptr noundef @[[BufA]]) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 %[[Tmp1]], ptr align 4 %[[Tmp2]], i32 24, i1 false) -// CHECK-NEXT: %[[FooReturned:.*]] = call {{.*}} float @_Z3fooA3_A2_N4hlsl8RWBufferIfEE(ptr noundef byval([3 x [2 x %"class.hlsl::RWBuffer"]]) align 4 %[[Tmp1]]) +// CHECK-NEXT: %[[FooReturned:.*]] = call {{.*}} float @foo(hlsl::RWBuffer [3][2]) +// CHECK-SAME: (ptr noundef byval([3 x [2 x %"class.hlsl::RWBuffer"]]) align 4 %[[Tmp1]]) // CHECK-NEXT: store float %[[FooReturned]], ptr %b, align 4 float b = foo(A[2][GI]); diff --git a/clang/test/CodeGenHLSL/resources/res-array-global-subarray-one.hlsl b/clang/test/CodeGenHLSL/resources/res-array-global-subarray-one.hlsl index 5caf2b6db4c8e..bbd48b7ddea52 100644 --- a/clang/test/CodeGenHLSL/resources/res-array-global-subarray-one.hlsl +++ b/clang/test/CodeGenHLSL/resources/res-array-global-subarray-one.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-compute -finclude-default-header \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s // CHECK: @[[BufA:.*]] = private unnamed_addr constant [2 x i8] c"A\00", align 1 @@ -10,12 +10,7 @@ float foo(RWBuffer Arr[2]) { return Arr[1][0]; } -// NOTE: -// - _ZN4hlsl8RWBufferIfEC1EjjijPKc is the constructor call for explicit binding -// (has "jjij" in the mangled name) and the arguments are (register, space, range_size, index, name). 
-// - _ZN4hlsl8RWBufferIfEixEj is the subscript operator for RWBuffer - -// CHECK: define internal void @_Z4mainj(i32 noundef %GI) +// CHECK: define internal void @main(unsigned int)(i32 noundef %GI) // CHECK-NEXT: entry: // CHECK-NEXT: %[[GI_alloca:.*]] = alloca i32, align 4 // CHECK-NEXT: %Sub = alloca [2 x %"class.hlsl::RWBuffer"], align 4 @@ -28,33 +23,41 @@ float foo(RWBuffer Arr[2]) { [numthreads(4,1,1)] void main(uint GI : SV_GroupThreadID) { // Codegen for "A[2]" - create local array [[Tmp0]] of size 2 and initialize -// each element by a call to the resource constructor +// each element by a call to RWBuffer::__createFromBinding method // CHECK-NEXT: %[[Ptr_Tmp0_0:.*]] = getelementptr [2 x %"class.hlsl::RWBuffer"], ptr %[[Tmp0]], i32 0, i32 0 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp0_0]], i32 noundef 10, i32 noundef 2, i32 noundef 8, i32 noundef 6, ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp0_0]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 8, i32 noundef 6, ptr noundef @[[BufA]]) // CHECK-NEXT: %[[Ptr_Tmp0_1:.*]] = getelementptr [2 x %"class.hlsl::RWBuffer"], ptr %[[Tmp0]], i32 0, i32 1 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp0_1]], i32 noundef 10, i32 noundef 2, i32 noundef 8, i32 noundef 7, ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp0_1]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 8, i32 noundef 7, ptr noundef @[[BufA]]) // After this Tmp0 values are copied to %Sub using the standard array loop initializaion // (generated from ArrayInitLoopExpr AST node) RWBuffer Sub[2] = A[3]; // CHECK: 
%[[Ptr_Sub_1:.*]] = getelementptr inbounds [2 x %"class.hlsl::RWBuffer"], ptr %Sub, i32 0, i32 1 -// CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr @_ZN4hlsl8RWBufferIfEixEj(ptr {{.*}} %[[Ptr_Sub_1]], i32 noundef 0) +// CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr @hlsl::RWBuffer::operator[](unsigned int)(ptr {{.*}} %[[Ptr_Sub_1]], i32 noundef 0) // CHECK-NEXT: %[[Sub_1_0_Value:.*]] = load float, ptr %[[BufPtr]], align 4 // CHECK-NEXT: store float %[[Sub_1_0_Value]], ptr %a, align 4 float a = Sub[1][0]; // Codegen for "foo(A[GI])" - create local array [[Tmp2]] of size 2 and initialize -// each element by a call to the resource constructor with dynamic index, and then -// copy-in the array as an argument of "foo" +// each element by a call to the RWBuffer::__createFromBinding method +// with dynamic index, and then copy-in the array as an argument of "foo" // CHECK: %[[GI:.*]] = load i32, ptr %[[GI_alloca]], align 4 // CHECK-NEXT: %[[Index_A_GI_0:.*]] = mul i32 %[[GI]], 2 // CHECK-NEXT: %[[Ptr_Tmp2_GI_0:.*]] = getelementptr [2 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 0 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp2_GI_0]], i32 noundef 10, i32 noundef 2, i32 noundef 8, i32 noundef %[[Index_A_GI_0]], ptr noundef @[[BufA]]) +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp2_GI_0]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 8, i32 noundef %[[Index_A_GI_0]], ptr noundef @[[BufA]]) // CHECK-NEXT: %[[Index_A_GI_1:.*]] = add i32 %[[Index_A_GI_0]], 1 // CHECK-NEXT: %[[Ptr_Tmp2_GI_1:.*]] = getelementptr [2 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 1 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Ptr_Tmp2_GI_1]], i32 noundef 10, i32 noundef 2, i32 noundef 8, i32 noundef %[[Index_A_GI_1]], ptr noundef @[[BufA]]) +// CHECK-NEXT: call 
void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) +// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Ptr_Tmp2_GI_1]], +// CHECK-SAME: i32 noundef 10, i32 noundef 2, i32 noundef 8, i32 noundef %[[Index_A_GI_1]], ptr noundef @[[BufA]]) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 %[[Tmp1]], ptr align 4 %[[Tmp2]], i32 8, i1 false) -// CHECK-NEXT: %[[FooReturned:.*]] = call {{.*}} float @_Z3fooA2_N4hlsl8RWBufferIfEE(ptr noundef byval([2 x %"class.hlsl::RWBuffer"]) align 4 %[[Tmp1]]) +// CHECK-NEXT: %[[FooReturned:.*]] = call {{.*}} float @foo(hlsl::RWBuffer [2])(ptr noundef byval([2 x %"class.hlsl::RWBuffer"]) align 4 %[[Tmp1]]) // CHECK-NEXT: store float %[[FooReturned]], ptr %b, align 4 float b = foo(A[GI]); diff --git a/clang/test/CodeGenHLSL/resources/res-array-global-unbounded.hlsl b/clang/test/CodeGenHLSL/resources/res-array-global-unbounded.hlsl index edf9ce01f72a6..6756a26bfc124 100644 --- a/clang/test/CodeGenHLSL/resources/res-array-global-unbounded.hlsl +++ b/clang/test/CodeGenHLSL/resources/res-array-global-unbounded.hlsl @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-compute -finclude-default-header \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -check-prefixes=CHECK,DXIL +// RUN: -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s -check-prefixes=CHECK,DXIL // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -check-prefixes=CHECK,SPV +// RUN: -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s -check-prefixes=CHECK,SPV // CHECK: @[[BufA:.*]] = private unnamed_addr constant [2 x i8] c"A\00", align 1 // CHECK: @[[BufB:.*]] = private unnamed_addr constant [2 x i8] c"B\00", align 1 @@ -15,16 +15,9 @@ float foo(RWBuffer Arr[4], uint Index) { return (float)Arr[Index][0]; } -// NOTE: -// - 
_ZN4hlsl8RWBufferIfEC1EjjijPKc is the constructor call for explicit binding for RWBuffer -// (has "jjij" in the mangled name) and the arguments are (register, space, range_size, index, name). -// - _ZN4hlsl8RWBufferIiEC1EjijjPKc is the constructor call for implicit binding for RWBuffer -// (has "jijj" in the mangled name) and the arguments are (space, range_size, index, order_id, name). -// - _ZN4hlsl8RWBufferIfEixEj is the subscript operator on RWBuffer - [numthreads(4,1,1)] void main(uint GI : SV_GroupIndex) { - // CHECK: define internal {{.*}}void @_Z4mainj(i32 noundef %GI) + // CHECK: define internal{{.*}} void @main(unsigned int)(i32 noundef %GI) // CHECK: %[[GI_alloca:.*]] = alloca i32, align 4 // CHECK-NEXT: %a = alloca float, align 4 // CHECK-NEXT: %[[Tmp0:.*]] = alloca %"class.hlsl::RWBuffer @@ -35,8 +28,10 @@ void main(uint GI : SV_GroupIndex) { // Make sure A[100] is translated to a RWBuffer constructor call with range -1 and index 100 // and explicit binding (u10, space1) - // CHECK: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Tmp0]], i32 noundef 10, i32 noundef 1, i32 noundef -1, i32 noundef 100, ptr noundef @A.str) - // CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr{{.*}} @_ZN4hlsl8RWBufferIfEixEj(ptr {{.*}} %[[Tmp0]], i32 noundef 0) + // CHECK: @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align {{(4|8)}} %[[Tmp0]], + // CHECK-SAME: i32 noundef 10, i32 noundef 1, i32 noundef -1, i32 noundef 100, ptr noundef @A.str) + // CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr{{.*}} @hlsl::RWBuffer::operator[](unsigned int)(ptr {{.*}} %[[Tmp0]], i32 noundef 0) // CHECK-NEXT: %[[Value1:.*]] = load float, ptr{{.*}} %[[BufPtr]], align 4 // CHECK-NEXT: store float %[[Value1]], ptr %a, align 4 float a = A[100][0]; @@ -46,19 +41,34 @@ void main(uint GI : SV_GroupIndex) { // (space 0, order_id 0) // The first index is calculated from the array 
dimensions (unbounded x 5 x 4) and indices (2, 3) // as 2 * 5 * 4 + 3 * 4 = 52 and the following indices are sequential. + // CHECK-NEXT: %[[Ptr_Tmp2_0:.*]] = getelementptr [4 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 0 - // CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1EjijjPKc(ptr {{.*}} %[[Ptr_Tmp2_0]], i32 noundef 0, i32 noundef -1, i32 noundef 52, i32 noundef 0, ptr noundef @B.str) + // CHECK-NEXT: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Ptr_Tmp2_0]], + // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef -1, i32 noundef 52, ptr noundef @[[BufB]]) + // CHECK-NEXT: %[[Ptr_Tmp2_1:.*]] = getelementptr [4 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 1 - // CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1EjijjPKc(ptr {{.*}} %[[Ptr_Tmp2_1]], i32 noundef 0, i32 noundef -1, i32 noundef 53, i32 noundef 0, ptr noundef @B.str) + // CHECK-NEXT: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Ptr_Tmp2_1]], + // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef -1, i32 noundef 53, ptr noundef @[[BufB]]) + // CHECK-NEXT: %[[Ptr_Tmp2_2:.*]] = getelementptr [4 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 2 - // CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1EjijjPKc(ptr {{.*}} %[[Ptr_Tmp2_2]], i32 noundef 0, i32 noundef -1, i32 noundef 54, i32 noundef 0, ptr noundef @B.str) + // CHECK-NEXT: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Ptr_Tmp2_2]], + // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef -1, i32 noundef 54, ptr noundef @[[BufB]]) + // CHECK-NEXT: %[[Ptr_Tmp2_3:.*]] = getelementptr [4 x 
%"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 3 - // CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1EjijjPKc(ptr {{.*}} %[[Ptr_Tmp2_3]], i32 noundef 0, i32 noundef -1, i32 noundef 55, i32 noundef 0, ptr noundef @B.str) + // CHECK-NEXT: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Ptr_Tmp2_3]], + // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef -1, i32 noundef 55, ptr noundef @[[BufB]]) + // DXIL-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 %[[Tmp1]], ptr align 4 %[[Tmp2]], i32 16, i1 false) // SPV-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[Tmp1]], ptr align 8 %[[Tmp2]], i64 32, i1 false) + // CHECK-NEXT: %[[GI:.*]] = load i32, ptr %[[GI_alloca]], align 4 - // DXIL-NEXT: %[[Value2:.*]] = call {{.*}} float @_Z3fooA4_N4hlsl8RWBufferIiEEj(ptr noundef byval([4 x %"class.hlsl::RWBuffer"]) align 4 %[[Tmp1]], i32 noundef %[[GI]]) - // SPV-NEXT: %[[Value2:.*]] = call {{.*}} float @_Z3fooA4_N4hlsl8RWBufferIiEEj(ptr noundef byval([4 x %"class.hlsl::RWBuffer"]) align 8 %[[Tmp1]], i32 noundef %[[GI]]) + // CHECK-NEXT: %[[Value2:.*]] = call {{.*}} float @foo(hlsl::RWBuffer [4], unsigned int) + // CHECK-SAME: (ptr noundef byval([4 x %"class.hlsl::RWBuffer"]) align {{(4|8)}} %[[Tmp1]], i32 noundef %[[GI]]) + // CHECK-NEXT: store float %[[Value2]], ptr %b, align 4 float b = foo(B[2][3], GI); diff --git a/clang/test/CodeGenHLSL/resources/res-array-global.hlsl b/clang/test/CodeGenHLSL/resources/res-array-global.hlsl index 0abdf5f88cf6e..f728c6b627b68 100644 --- a/clang/test/CodeGenHLSL/resources/res-array-global.hlsl +++ b/clang/test/CodeGenHLSL/resources/res-array-global.hlsl @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-compute -finclude-default-header \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -check-prefixes=CHECK,DXIL +// RUN: -emit-llvm -disable-llvm-passes -o - 
%s | llvm-cxxfilt | FileCheck %s -check-prefixes=CHECK,DXIL // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -check-prefixes=CHECK,SPV +// RUN: -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s -check-prefixes=CHECK,SPV // CHECK: @[[BufA:.*]] = private unnamed_addr constant [2 x i8] c"A\00", align 1 // CHECK: @[[BufB:.*]] = private unnamed_addr constant [2 x i8] c"B\00", align 1 @@ -30,46 +30,59 @@ RWStructuredBuffer Out; [numthreads(4,1,1)] void main() { - // CHECK: define internal{{.*}} void @_Z4mainv() + // CHECK: define internal{{.*}} void @main() // CHECK: %[[Tmp0:.*]] = alloca %"class.hlsl::RWBuffer // CHECK: %[[Tmp1:.*]] = alloca %"class.hlsl::RWBuffer // CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::RWBuffer // CHECK: %[[Tmp3:.*]] = alloca %"class.hlsl::RWBuffer // CHECK: %[[Tmp4:.*]] = alloca %"class.hlsl::RWBuffer - // NOTE: - // Constructor call for explicit binding has "jjij" in the mangled name and the arguments are (register, space, range_size, index, name). - // For implicit binding the constructor has "jijj" in the mangled name and the arguments are (space, range_size, index, order_id, name). - // The range_size can be -1 for unbounded arrays, and that is the only signed int in the signature. - // The order_id argument is a sequential number that is assigned to resources with implicit binding and corresponds to the order in which - // the resources were declared. It is needed because implicit bindings are assigned later on in an LLVM pass that needs to know the order - // of the resource declarations. 
- - // Make sure A[2] is translated to a RWBuffer constructor call with range 4 and index 2 + // Make sure A[2] is translated to a RWBuffer::__createFromBinding call with range 4 and index 2 // and DXIL explicit binding (u10, space1) // and SPIR-V explicit binding (binding 12, set 2) - // DXIL: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Tmp0]], i32 noundef 10, i32 noundef 1, i32 noundef 4, i32 noundef 2, ptr noundef @[[BufA]]) - // SPV: call void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr {{.*}} %[[Tmp0]], i32 noundef 12, i32 noundef 2, i32 noundef 4, i32 noundef 2, ptr noundef @[[BufA]]) + // DXIL: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // DXIL-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 4 %[[Tmp0]], + // DXIL-SAME: i32 noundef 10, i32 noundef 1, i32 noundef 4, i32 noundef 2, ptr noundef @[[BufA]]) + // SPV: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // SPV-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align 8 %[[Tmp0]], + // SPV-SAME: i32 noundef 12, i32 noundef 2, i32 noundef 4, i32 noundef 2, ptr noundef @[[BufA]]) - // Make sure B[3] is translated to a RWBuffer constructor call with range 5 and index 3 + // Make sure B[3] is translated to a RWBuffer::__createFromImplicitBinding call with range 5 and index 3 // and DXIL for implicit binding in space0, order id 0 // and SPIR-V explicit binding (binding 13, set 0) - // DXIL: call void @_ZN4hlsl8RWBufferIiEC1EjijjPKc(ptr {{.*}} %[[Tmp1]], i32 noundef 0, i32 noundef 5, i32 noundef 3, i32 noundef 0, ptr noundef @[[BufB]]) - // SPV: call void @_ZN4hlsl8RWBufferIiEC1EjjijPKc(ptr {{.*}} %[[Tmp1]], i32 noundef 13, i32 noundef 0, i32 noundef 5, i32 noundef 3, ptr noundef @[[BufB]]) + // DXIL: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // DXIL-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align 
4 %[[Tmp1]], + // DXIL-SAME: i32 noundef 0, i32 noundef 0, i32 noundef 5, i32 noundef 3, ptr noundef @[[BufB]]) + // SPV: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // SPV-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align 8 %[[Tmp1]], + // SPV-SAME: i32 noundef 13, i32 noundef 0, i32 noundef 5, i32 noundef 3, ptr noundef @[[BufB]]) - // Make sure C[1] is translated to a RWBuffer constructor call with range 3 and index 1 - // and DXIL explicit binding (u2, space0) + // Make sure C[1] is translated to a RWBuffer::__createFromBinding call with range 3 and index 1 + // and DXIL explicit binding (u2, space0) // and SPIR-V explicit binding (binding 2, set 0) - // DXIL: call void @_ZN4hlsl8RWBufferIiEC1EjjijPKc(ptr {{.*}} %[[Tmp2]], i32 noundef 2, i32 noundef 0, i32 noundef 3, i32 noundef 1, ptr noundef @[[BufC]]) - // SPV: call void @_ZN4hlsl8RWBufferIiEC1EjjijPKc(ptr {{.*}} %[[Tmp2]], i32 noundef 2, i32 noundef 0, i32 noundef 3, i32 noundef 1, ptr noundef @[[BufC]]) + // DXIL: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // DXIL-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align 4 %[[Tmp2]], + // DXIL-SAME: i32 noundef 2, i32 noundef 0, i32 noundef 3, i32 noundef 1, ptr noundef @[[BufC]]) + // SPV: call void @hlsl::RWBuffer::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // SPV-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align 8 %[[Tmp2]], + // SPV-SAME: i32 noundef 2, i32 noundef 0, i32 noundef 3, i32 noundef 1, ptr noundef @[[BufC]]) - // Make sure D[7] is translated to a RWBuffer constructor call with implicit binding + // Make sure D[7] is translated to a RWBuffer::__createFromImplicitBinding call // for both DXIL and SPIR-V - // DXIL: call void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr {{.*}} %[[Tmp3]], i32 noundef 0, i32 noundef 10, i32 noundef 7, i32 noundef 1, ptr noundef @[[BufD]]) - // 
SPV: call void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr {{.*}} %[[Tmp3]], i32 noundef 0, i32 noundef 10, i32 noundef 7, i32 noundef 0, ptr noundef @[[BufD]]) + // DXIL: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // DXIL-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.1") align 4 %[[Tmp3]], + // DXIL-SAME: i32 noundef 1, i32 noundef 0, i32 noundef 10, i32 noundef 7, ptr noundef @D.str) + // SPV: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // SPV-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.1") align 8 %[[Tmp3]], + // SPV-SAME: i32 noundef 0, i32 noundef 0, i32 noundef 10, i32 noundef 7, ptr noundef @[[BufD]]) - // Make sure E[5][0] is translated to RWBuffer constructor call with implicit binding and specified space/set 2 - // DXIL: call void @_ZN4hlsl8RWBufferIjEC1EjijjPKc(ptr {{.*}} %[[Tmp4]], i32 noundef 2, i32 noundef 15, i32 noundef 5, i32 noundef 2, ptr noundef @[[BufE]]) - // SPV: call void @_ZN4hlsl8RWBufferIjEC1EjijjPKc(ptr {{.*}} %[[Tmp4]], i32 noundef 2, i32 noundef 15, i32 noundef 5, i32 noundef 1, ptr noundef @[[BufE]]) + // Make sure E[5][0] is translated to RWBuffer::__createFromImplicitBinding call + // for both DXIL and SPIR-V with specified space/set 2 + // DXIL: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // DXIL-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.2") align 4 %[[Tmp4]], + // DXIL-SAME: i32 noundef 2, i32 noundef 2, i32 noundef 15, i32 noundef 5, ptr noundef @[[BufE]]) + // SPV: call void @hlsl::RWBuffer::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*) + // SPV-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.2") align 8 %[[Tmp4]], + // SPV-SAME: i32 noundef 1, i32 noundef 2, i32 noundef 15, i32 noundef 5, ptr noundef @[[BufE]]) Out[0] = A[2][0] + (float)B[3][0] + (float)C[1][0] + 
(float)D[7][0] + (float)E[5][0]; } diff --git a/clang/test/CodeGenHLSL/resources/resource-bindings.hlsl b/clang/test/CodeGenHLSL/resources/resource-bindings.hlsl index 27af47e9587cc..4ffa7cfc84e17 100644 --- a/clang/test/CodeGenHLSL/resources/resource-bindings.hlsl +++ b/clang/test/CodeGenHLSL/resources/resource-bindings.hlsl @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -triple dxil--shadermodel6.6-compute -x hlsl -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil--shadermodel6.6-compute -x hlsl -finclude-default-header \ +// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s // CHECK: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", <4 x float>, 1, 0, 0) } // CHECK: %"class.hlsl::RWBuffer.0" = type { target("dx.TypedBuffer", float, 1, 0, 0) } @@ -12,32 +13,20 @@ // CHECK: @_ZL4T3S0 = internal global %"class.hlsl::RWStructuredBuffer" poison, align 4 // CHECK: @_ZL5Array = internal global [10 x %"class.hlsl::RWBuffer.1"] poison, align 4 -// CHECK: %[[HANDLE:.*]] = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) +// CHECK: call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) // CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v4f32_1_0_0t( -// CHECK-SAME: i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr %{{.*}}) -// CHECK: %[[HANDLE_PTR:.*]] = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %this{{[0-9]*}}, i32 0, i32 0 -// CHECK: store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[HANDLE]], ptr %[[HANDLE_PTR]], align 4 RWBuffer U0S0 : register(u0); -// CHECK: %[[HANDLE:.*]] = call target("dx.TypedBuffer", float, 1, 0, 0) +// CHECK: call target("dx.TypedBuffer", float, 1, 0, 0) // CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t( -// CHECK-SAME: i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr %{{.*}}) -// CHECK: %[[HANDLE_PTR:.*]] = getelementptr inbounds nuw %"class.hlsl::RWBuffer.0", ptr 
%this{{[0-9]*}}, i32 0, i32 0 -// CHECK: store target("dx.TypedBuffer", float, 1, 0, 0) %[[HANDLE]], ptr %[[HANDLE_PTR]], align 4 RWBuffer U5S3 : register(u5, space3); -// CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", i32, 0, 0) +// CHECK: call target("dx.RawBuffer", i32, 0, 0) // CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i32_0_0t( -// CHECK-SAME: i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr %{{.*}}) -// CHECK: %[[HANDLE_PTR:.*]] = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %this{{[0-9]*}}, i32 0, i32 0 -// CHECK: store target("dx.RawBuffer", i32, 0, 0) %[[HANDLE]], ptr %[[HANDLE_PTR]], align 4 StructuredBuffer T2S2 : register(t2, space2); -// CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", %struct.S, 1, 0) +// CHECK: call target("dx.RawBuffer", %struct.S, 1, 0) // CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_s_struct.Ss_1_0t( -// CHECK-SAME: i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr %{{.*}}) -// CHECK: %[[HANDLE_PTR:.*]] = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %this{{[0-9]*}}, i32 0, i32 0 -// CHECK: store target("dx.RawBuffer", %struct.S, 1, 0) %[[HANDLE]], ptr %[[HANDLE_PTR]], align 4 struct S { float4 f; int i; diff --git a/clang/test/CodeGenHLSL/static-local-ctor.hlsl b/clang/test/CodeGenHLSL/static-local-ctor.hlsl index 9a4bf66f030ed..bb575d23216b0 100644 --- a/clang/test/CodeGenHLSL/static-local-ctor.hlsl +++ b/clang/test/CodeGenHLSL/static-local-ctor.hlsl @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -disable-llvm-passes %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -disable-llvm-passes %s | \ +// RUN: llvm-cxxfilt | FileCheck %s // Verify that no per variable _Init_thread instructions are emitted for non-trivial static locals // These would normally be emitted by the MicrosoftCXXABI, but the DirectX backend should 
exlude them @@ -13,17 +14,17 @@ void InitBuf(RWBuffer buf) { } // CHECK-NOT: _Init_thread_epoch -// CHECK: define internal void @_Z4mainv +// CHECK: define internal void @main // CHECK-NEXT: entry: // CHECK-NEXT: [[Tmp0:%.*]] = alloca %"class.hlsl::RWBuffer" // CHECK-NEXT: [[Tmp1:%.*]] = alloca %"class.hlsl::RWBuffer" -// CHECK-NEXT: [[Tmp2:%.*]] = load i8, ptr @_ZGVZ4mainvE5mybuf +// CHECK-NEXT: [[Tmp2:%.*]] = load i8, ptr @guard variable for main()::mybuf // CHECK-NEXT: [[Tmp3:%.*]] = icmp eq i8 [[Tmp2]], 0 // CHECK-NEXT: br i1 [[Tmp3]] // CHECK-NOT: _Init_thread_header // CHECK: init.check: -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1EjijjPKc( -// CHECK-NEXT: store i8 1, ptr @_ZGVZ4mainvE5mybuf +// CHECK-NEXT: call void @hlsl::RWBuffer::__createFromImplicitBinding +// CHECK-NEXT: store i8 1, ptr @guard variable for main()::mybuf // CHECK-NOT: _Init_thread_footer diff --git a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl index f9d7968fc5570..b55f663d6d948 100644 --- a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s @@ -11,9 +11,9 @@ AA getAA(void *p); __amdgpu_buffer_rsrc_t getBufferImpl(void *p); void consumeBuffer(__amdgpu_buffer_rsrc_t); -// CHECK-LABEL: define {{[^@]+}}@getBuffer -// CHECK-SAME: (ptr addrspace(5) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local ptr addrspace(8) @getBuffer( +// CHECK-SAME: ptr addrspace(5) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] 
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr addrspace(8) @getBufferImpl(ptr addrspace(5) noundef [[P]]) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret ptr addrspace(8) [[CALL]] // @@ -21,16 +21,16 @@ __amdgpu_buffer_rsrc_t getBuffer(void *p) { return getBufferImpl(p); } -// CHECK-LABEL: define {{[^@]+}}@consumeBufferPtr -// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @consumeBufferPtr( +// CHECK-SAME: ptr addrspace(5) noundef readonly captures(address) [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(5) [[P]], addrspacecast (ptr null to ptr addrspace(5)) -// CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[P]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] +// CHECK: [[IF_THEN]]: +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[P]], align 16, !tbaa [[__AMDGPU_BUFFER_RSRC_T_TBAA4:![0-9]+]] // CHECK-NEXT: tail call void @consumeBuffer(ptr addrspace(8) [[TMP0]]) #[[ATTR2]] -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: // CHECK-NEXT: ret void // void consumeBufferPtr(__amdgpu_buffer_rsrc_t *p) { @@ -38,20 +38,20 @@ void consumeBufferPtr(__amdgpu_buffer_rsrc_t *p) { consumeBuffer(*p); } -// CHECK-LABEL: define {{[^@]+}}@test -// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]] +// CHECK-LABEL: define dso_local void @test( +// CHECK-SAME: ptr addrspace(5) noundef readonly captures(address) [[A:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[INT_TBAA8:![0-9]+]] // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(5) [[A]], addrspacecast (ptr null to ptr addrspace(5)) // CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TOBOOL_NOT]], i1 true, i1 [[TOBOOL_NOT_I]] -// CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: +// CHECK-NEXT: br i1 [[OR_COND]], label %[[IF_END:.*]], label %[[IF_THEN_I:.*]] +// CHECK: [[IF_THEN_I]]: // CHECK-NEXT: [[R:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[A]], i32 16 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[R]], align 16, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[R]], align 16, !tbaa [[__AMDGPU_BUFFER_RSRC_T_TBAA4]] // CHECK-NEXT: tail call void @consumeBuffer(ptr addrspace(8) [[TMP1]]) #[[ATTR2]] -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: +// CHECK-NEXT: br label %[[IF_END]] +// CHECK: [[IF_END]]: // CHECK-NEXT: ret void // void test(AA *a) { @@ -59,18 +59,18 @@ void test(AA *a) { consumeBufferPtr(&(a->r)); } -// CHECK-LABEL: define {{[^@]+}}@bar -// CHECK-SAME: (ptr addrspace(5) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local %struct.AA_ty @bar( +// CHECK-SAME: ptr addrspace(5) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[CALL:%.*]] = tail call [[STRUCT_AA_TY:%.*]] @[[GETAA:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr addrspace(5) noundef [[P]]) #[[ATTR2]] // CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_AA_TY]] [[CALL]], 0 // CHECK-NEXT: [[CALL_I:%.*]] = tail call ptr addrspace(8) @getBufferImpl(ptr addrspace(5) noundef [[P]]) #[[ATTR2]] // CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[TMP0]], 0 -// 
CHECK-NEXT: br i1 [[TOBOOL_NOT_I]], label [[TEST_EXIT:%.*]], label [[IF_THEN_I_I:%.*]] -// CHECK: if.then.i.i: +// CHECK-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[TEST_EXIT:.*]], label %[[IF_THEN_I_I:.*]] +// CHECK: [[IF_THEN_I_I]]: // CHECK-NEXT: tail call void @consumeBuffer(ptr addrspace(8) [[CALL_I]]) #[[ATTR2]] -// CHECK-NEXT: br label [[TEST_EXIT]] -// CHECK: test.exit: +// CHECK-NEXT: br label %[[TEST_EXIT]] +// CHECK: [[TEST_EXIT]]: // CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [[STRUCT_AA_TY]] [[CALL]], ptr addrspace(8) [[CALL_I]], 1 // CHECK-NEXT: ret [[STRUCT_AA_TY]] [[DOTFCA_1_INSERT]] // @@ -80,3 +80,12 @@ AA bar(void *p) { test(&a); return a; } +//. +// CHECK: [[__AMDGPU_BUFFER_RSRC_T_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK: [[META5]] = !{!"__amdgpu_buffer_rsrc_t", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[INT_TBAA8]] = !{[[META9:![0-9]+]], [[META10:![0-9]+]], i64 0} +// CHECK: [[META9]] = !{!"AA_ty", [[META10]], i64 0, [[META5]], i64 16} +// CHECK: [[META10]] = !{!"int", [[META6]], i64 0} +//. 
diff --git a/clang/test/CodeGenOpenCL/amdgpu-debug-info-pointer-address-space.cl b/clang/test/CodeGenOpenCL/amdgpu-debug-info-pointer-address-space.cl index ab625f3154b20..e6a783fff4bc5 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-debug-info-pointer-address-space.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-debug-info-pointer-address-space.cl @@ -2,122 +2,123 @@ // RUN: %clang -cl-std=CL2.0 -emit-llvm -g -O0 -S -nogpulib -target amdgcn-amd-amdhsa -mcpu=fiji -o - %s | FileCheck %s // RUN: %clang -cl-std=CL2.0 -emit-llvm -g -O0 -S -nogpulib -target amdgcn-amd-amdhsa-opencl -mcpu=fiji -o - %s | FileCheck %s -// CHECK-DAG: ![[DWARF_ADDRESS_SPACE_NONE:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}}) -// CHECK-DAG: ![[DWARF_ADDRESS_SPACE_LOCAL:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}}, dwarfAddressSpace: 2) -// CHECK-DAG: ![[DWARF_ADDRESS_SPACE_PRIVATE:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}}, dwarfAddressSpace: 1) +// CHECK-DAG: ![[DWARF_ADDRESS_SPACE_GLOBAL:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}}) +// CHECK-DAG: ![[DWARF_ADDRESS_SPACE_LOCAL:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}}, dwarfAddressSpace: 3) +// CHECK-DAG: ![[DWARF_ADDRESS_SPACE_PRIVATE:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}}, dwarfAddressSpace: 5) +// CHECK-DAG: ![[DWARF_ADDRESS_SPACE_GENERIC:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}}, dwarfAddressSpace: 1) -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: 
![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: false, isDefinition: true) global int *FileVar0; -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: false, isDefinition: true) constant int *FileVar1; // CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar2", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_LOCAL]], isLocal: false, isDefinition: true) local int *FileVar2; // CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar3", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_PRIVATE]], isLocal: false, isDefinition: true) private int *FileVar3; -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GENERIC]], isLocal: false, isDefinition: true) int *FileVar4; -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar5", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar5", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: false, isDefinition: true) global int *global FileVar5; -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar6", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar6", scope: 
!{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: false, isDefinition: true) constant int *global FileVar6; // CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar7", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_LOCAL]], isLocal: false, isDefinition: true) local int *global FileVar7; // CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar8", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_PRIVATE]], isLocal: false, isDefinition: true) private int *global FileVar8; -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar9", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar9", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GENERIC]], isLocal: false, isDefinition: true) int *global FileVar9; -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar10", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar10", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: false, isDefinition: true) global int *constant FileVar10 = 0; -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar11", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar11", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: false, isDefinition: true) constant int *constant FileVar11 = 0; // CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar12", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_LOCAL]], 
isLocal: false, isDefinition: true) local int *constant FileVar12 = 0; // CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar13", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_PRIVATE]], isLocal: false, isDefinition: true) private int *constant FileVar13 = 0; -// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar14", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: false, isDefinition: true) +// CHECK-DAG: distinct !DIGlobalVariable(name: "FileVar14", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GENERIC]], isLocal: false, isDefinition: true) int *constant FileVar14 = 0; kernel void kernel1( - // CHECK-DAG: !DILocalVariable(name: "KernelArg0", arg: {{[0-9]+}}, scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]]) + // CHECK-DAG: !DILocalVariable(name: "KernelArg0", arg: {{[0-9]+}}, scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]]) global int *KernelArg0, - // CHECK-DAG: !DILocalVariable(name: "KernelArg1", arg: {{[0-9]+}}, scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]]) + // CHECK-DAG: !DILocalVariable(name: "KernelArg1", arg: {{[0-9]+}}, scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]]) constant int *KernelArg1, // CHECK-DAG: !DILocalVariable(name: "KernelArg2", arg: {{[0-9]+}}, scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_LOCAL]]) local int *KernelArg2) { private int *Tmp0; int *Tmp1; - // CHECK-DAG: !DILocalVariable(name: "FuncVar0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]]) + // CHECK-DAG: !DILocalVariable(name: "FuncVar0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]]) global int *FuncVar0 = KernelArg0; - // 
CHECK-DAG: !DILocalVariable(name: "FuncVar1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]]) + // CHECK-DAG: !DILocalVariable(name: "FuncVar1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]]) constant int *FuncVar1 = KernelArg1; // CHECK-DAG: !DILocalVariable(name: "FuncVar2", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_LOCAL]]) local int *FuncVar2 = KernelArg2; // CHECK-DAG: !DILocalVariable(name: "FuncVar3", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_PRIVATE]]) private int *FuncVar3 = Tmp0; - // CHECK-DAG: !DILocalVariable(name: "FuncVar4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]]) + // CHECK-DAG: !DILocalVariable(name: "FuncVar4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GENERIC]]) int *FuncVar4 = Tmp1; - // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar5", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: true, isDefinition: true) + // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar5", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: true, isDefinition: true) global int *constant FuncVar5 = 0; - // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar6", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: true, isDefinition: true) + // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar6", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: true, isDefinition: true) constant int *constant FuncVar6 = 0; // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar7", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_LOCAL]], isLocal: true, 
isDefinition: true) local int *constant FuncVar7 = 0; // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar8", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_PRIVATE]], isLocal: true, isDefinition: true) private int *constant FuncVar8 = 0; - // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar9", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: true, isDefinition: true) + // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar9", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GENERIC]], isLocal: true, isDefinition: true) int *constant FuncVar9 = 0; - // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar10", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: true, isDefinition: true) + // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar10", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: true, isDefinition: true) global int *local FuncVar10; FuncVar10 = KernelArg0; - // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar11", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: true, isDefinition: true) + // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar11", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]], isLocal: true, isDefinition: true) constant int *local FuncVar11; FuncVar11 = KernelArg1; // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar12", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_LOCAL]], isLocal: true, isDefinition: true) local int *local FuncVar12; FuncVar12 = KernelArg2; // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar13", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_PRIVATE]], isLocal: true, isDefinition: 
true) private int *local FuncVar13; FuncVar13 = Tmp0; - // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar14", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]], isLocal: true, isDefinition: true) + // CHECK-DAG: distinct !DIGlobalVariable(name: "FuncVar14", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GENERIC]], isLocal: true, isDefinition: true) int *local FuncVar14; FuncVar14 = Tmp1; - // CHECK-DAG: !DILocalVariable(name: "FuncVar15", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]]) + // CHECK-DAG: !DILocalVariable(name: "FuncVar15", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]]) global int *private FuncVar15 = KernelArg0; - // CHECK-DAG: !DILocalVariable(name: "FuncVar16", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]]) + // CHECK-DAG: !DILocalVariable(name: "FuncVar16", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GLOBAL]]) constant int *private FuncVar16 = KernelArg1; // CHECK-DAG: !DILocalVariable(name: "FuncVar17", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_LOCAL]]) local int *private FuncVar17 = KernelArg2; // CHECK-DAG: !DILocalVariable(name: "FuncVar18", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_PRIVATE]]) private int *private FuncVar18 = Tmp0; - // CHECK-DAG: !DILocalVariable(name: "FuncVar19", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_NONE]]) + // CHECK-DAG: !DILocalVariable(name: "FuncVar19", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: ![[DWARF_ADDRESS_SPACE_GENERIC]]) int *private FuncVar19 = Tmp1; } struct FileStruct0 { - // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "StructMem0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 
{{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_NONE]], size: {{[0-9]+}}) + // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "StructMem0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_GLOBAL]], size: {{[0-9]+}}) global int *StructMem0; - // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "StructMem1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_NONE]], size: {{[0-9]+}}, offset: {{[0-9]+}}) + // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "StructMem1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_GLOBAL]], size: {{[0-9]+}}, offset: {{[0-9]+}}) constant int *StructMem1; // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "StructMem2", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_LOCAL]], size: {{[0-9]+}}, offset: {{[0-9]+}}) local int *StructMem2; // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "StructMem3", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_PRIVATE]], size: {{[0-9]+}}, offset: {{[0-9]+}}) private int *StructMem3; - // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "StructMem4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_NONE]], size: {{[0-9]+}}, offset: {{[0-9]+}}) + // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "StructMem4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_GENERIC]], size: {{[0-9]+}}, offset: {{[0-9]+}}) int *StructMem4; }; struct FileStruct1 { union { - // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "UnionMem0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_NONE]], size: {{[0-9]+}}) + // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "UnionMem0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: 
![[DWARF_ADDRESS_SPACE_GLOBAL]], size: {{[0-9]+}}) global int *UnionMem0; - // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "UnionMem1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_NONE]], size: {{[0-9]+}}) + // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "UnionMem1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_GLOBAL]], size: {{[0-9]+}}) constant int *UnionMem1; // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "UnionMem2", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_LOCAL]], size: {{[0-9]+}}) local int *UnionMem2; // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "UnionMem3", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_PRIVATE]], size: {{[0-9]+}}) private int *UnionMem3; - // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "UnionMem4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_NONE]], size: {{[0-9]+}}) + // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "UnionMem4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: ![[DWARF_ADDRESS_SPACE_GENERIC]], size: {{[0-9]+}}) int *UnionMem4; }; long StructMem0; diff --git a/clang/test/CodeGenOpenCL/amdgpu-debug-info-variable-expression.cl b/clang/test/CodeGenOpenCL/amdgpu-debug-info-variable-expression.cl index 479e893000942..4d5f1019378af 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-debug-info-variable-expression.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-debug-info-variable-expression.cl @@ -52,31 +52,31 @@ int *constant FileVar14 = 0; kernel void kernel1( // CHECK-DAG: ![[KERNELARG0:[0-9]+]] = !DILocalVariable(name: "KernelArg0", arg: {{[0-9]+}}, scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[KERNELARG0]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, 
DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[KERNELARG0]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} global int *KernelArg0, // CHECK-DAG: ![[KERNELARG1:[0-9]+]] = !DILocalVariable(name: "KernelArg1", arg: {{[0-9]+}}, scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[KERNELARG1]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[KERNELARG1]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} constant int *KernelArg1, // CHECK-DAG: ![[KERNELARG2:[0-9]+]] = !DILocalVariable(name: "KernelArg2", arg: {{[0-9]+}}, scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[KERNELARG2]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[KERNELARG2]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} local int *KernelArg2) { private int *Tmp0; int *Tmp1; // CHECK-DAG: ![[FUNCVAR0:[0-9]+]] = !DILocalVariable(name: "FuncVar0", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR0]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR0]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} global int *FuncVar0 = KernelArg0; // CHECK-DAG: ![[FUNCVAR1:[0-9]+]] = !DILocalVariable(name: "FuncVar1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR1]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR1]], 
!DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} constant int *FuncVar1 = KernelArg1; // CHECK-DAG: ![[FUNCVAR2:[0-9]+]] = !DILocalVariable(name: "FuncVar2", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR2]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR2]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} local int *FuncVar2 = KernelArg2; // CHECK-DAG: ![[FUNCVAR3:[0-9]+]] = !DILocalVariable(name: "FuncVar3", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR3]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR3]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} private int *FuncVar3 = Tmp0; // CHECK-DAG: ![[FUNCVAR4:[0-9]+]] = !DILocalVariable(name: "FuncVar4", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR4]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR4]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} int *FuncVar4 = Tmp1; // CHECK-DAG: ![[FUNCVAR5:[0-9]+]] = distinct !DIGlobalVariable(name: "FuncVar5", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}, isLocal: true, isDefinition: true) @@ -96,34 +96,34 @@ kernel void kernel1( int *constant FuncVar9 = 0; // CHECK-DAG: ![[FUNCVAR10:[0-9]+]] = distinct !DIGlobalVariable(name: "FuncVar10", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}, isLocal: true, isDefinition: true) - // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR10]], expr: 
!DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)) + // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR10]], expr: !DIExpression(DW_OP_constu, 3, DW_OP_swap, DW_OP_xderef)) global int *local FuncVar10; FuncVar10 = KernelArg0; // CHECK-DAG: ![[FUNCVAR11:[0-9]+]] = distinct !DIGlobalVariable(name: "FuncVar11", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}, isLocal: true, isDefinition: true) - // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR11]], expr: !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)) + // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR11]], expr: !DIExpression(DW_OP_constu, 3, DW_OP_swap, DW_OP_xderef)) constant int *local FuncVar11; FuncVar11 = KernelArg1; // CHECK-DAG: ![[FUNCVAR12:[0-9]+]] = distinct !DIGlobalVariable(name: "FuncVar12", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}, isLocal: true, isDefinition: true) - // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR12]], expr: !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)) + // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR12]], expr: !DIExpression(DW_OP_constu, 3, DW_OP_swap, DW_OP_xderef)) local int *local FuncVar12; FuncVar12 = KernelArg2; // CHECK-DAG: ![[FUNCVAR13:[0-9]+]] = distinct !DIGlobalVariable(name: "FuncVar13", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}, isLocal: true, isDefinition: true) - // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR13]], expr: !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)) + // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR13]], expr: !DIExpression(DW_OP_constu, 3, DW_OP_swap, DW_OP_xderef)) private int *local FuncVar13; FuncVar13 = Tmp0; // CHECK-DAG: ![[FUNCVAR14:[0-9]+]] = distinct !DIGlobalVariable(name: "FuncVar14", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}, isLocal: true, isDefinition: true) - // CHECK-DAG: !DIGlobalVariableExpression(var: 
![[FUNCVAR14]], expr: !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)) + // CHECK-DAG: !DIGlobalVariableExpression(var: ![[FUNCVAR14]], expr: !DIExpression(DW_OP_constu, 3, DW_OP_swap, DW_OP_xderef)) int *local FuncVar14; FuncVar14 = Tmp1; // CHECK-DAG: ![[FUNCVAR15:[0-9]+]] = !DILocalVariable(name: "FuncVar15", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR15]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR15]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} global int *private FuncVar15 = KernelArg0; // CHECK-DAG: ![[FUNCVAR16:[0-9]+]] = !DILocalVariable(name: "FuncVar16", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR16]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR16]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} constant int *private FuncVar16 = KernelArg1; // CHECK-DAG: ![[FUNCVAR17:[0-9]+]] = !DILocalVariable(name: "FuncVar17", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR17]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR17]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} local int *private FuncVar17 = KernelArg2; // CHECK-DAG: ![[FUNCVAR18:[0-9]+]] = !DILocalVariable(name: "FuncVar18", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR18]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr 
addrspace(5) {{.*}}, ![[FUNCVAR18]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} private int *private FuncVar18 = Tmp0; // CHECK-DAG: ![[FUNCVAR19:[0-9]+]] = !DILocalVariable(name: "FuncVar19", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}}) - // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR19]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} + // CHECK-DAG: #dbg_declare(ptr addrspace(5) {{.*}}, ![[FUNCVAR19]], !DIExpression(DW_OP_constu, 5, DW_OP_swap, DW_OP_xderef), !{{[0-9]+}} int *private FuncVar19 = Tmp1; } diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index d71c89811f04b..6d573238440d2 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs --prefix-filecheck-ir-name VAR --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs --prefix-filecheck-ir-name VAR --version 6 // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -disable-llvm-passes -fno-ident -emit-llvm -o - -triple amdgcn-amd-amdhsa %s -fdenormal-fp-math-f32=preserve-sign | FileCheck %s --check-prefixes=CHECK,NOCPU // // Check no-optnone and target-cpu behavior @@ -451,13 +451,13 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr // GFX900-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// GFX900-NEXT: store i64 [[ID]], ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[TBAA3:![0-9]+]] -// GFX900-NEXT: store ptr 
addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[TBAA7:![0-9]+]] -// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: store i64 [[ID]], ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7:![0-9]+]] +// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]] -// GFX900-NEXT: store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: ret void // // @@ -473,14 +473,14 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr // GFX900-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr // GFX900-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14:![0-9]+]] -// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16:![0-9]+]] -// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr 
[[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] -// GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14:![0-9]+]] +// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR8:[0-9]+]] // GFX900-NEXT: ret void // @@ -519,16 +519,16 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr // GFX900-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr // GFX900-NEXT: [[TMP27_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP27]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] -// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] -// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa 
[[TBAA7]] -// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] -// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17:![0-9]+]] +// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17:![0-9]+]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] -// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19:![0-9]+]] -// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19:![0-9]+]] +// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]] // GFX900-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 25, ptr [[BLOCK_SIZE]], align 8 @@ -537,14 +537,14 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_INVOKE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 2 // 
GFX900-NEXT: store ptr @__test_block_invoke, ptr [[BLOCK_INVOKE]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 3 -// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 8, !tbaa [[CHARPTR_TBAA14]] // GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4 -// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] -// GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) -// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]] +// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] 
// GFX900-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 41, ptr [[BLOCK_SIZE4]], align 8 @@ -553,20 +553,20 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 2 // GFX900-NEXT: store ptr @__test_block_invoke_2, ptr [[BLOCK_INVOKE6]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 3 -// GFX900-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED7]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED7]], align 8, !tbaa [[CHARPTR_TBAA14]] // GFX900-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 6 -// GFX900-NEXT: [[TMP8:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] -// GFX900-NEXT: store i8 [[TMP8]], ptr [[BLOCK_CAPTURED8]], align 8, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP8:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: store i8 [[TMP8]], ptr [[BLOCK_CAPTURED8]], align 8, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 4 -// GFX900-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: store ptr 
addrspace(1) [[TMP9]], ptr [[BLOCK_CAPTURED9]], align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP9]], ptr [[BLOCK_CAPTURED9]], align 8, !tbaa [[LONGPTR_TBAA7]] // GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5 -// GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) -// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]] +// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 41, ptr [[BLOCK_SIZE13]], align 8 @@ -575,17 +575,17 @@ kernel void test_target_features_kernel(global int *i) { 
// GFX900-NEXT: [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 2 // GFX900-NEXT: store ptr @__test_block_invoke_3, ptr [[BLOCK_INVOKE15]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 3 -// GFX900-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP14]], ptr [[BLOCK_CAPTURED16]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP14]], ptr [[BLOCK_CAPTURED16]], align 8, !tbaa [[CHARPTR_TBAA14]] // GFX900-NEXT: [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 6 -// GFX900-NEXT: [[TMP15:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] -// GFX900-NEXT: store i8 [[TMP15]], ptr [[BLOCK_CAPTURED17]], align 8, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP15:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: store i8 [[TMP15]], ptr [[BLOCK_CAPTURED17]], align 8, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 4 -// GFX900-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP16]], ptr [[BLOCK_CAPTURED18]], align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP16]], ptr [[BLOCK_CAPTURED18]], align 8, !tbaa 
[[LONGPTR_TBAA7]] // GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5 -// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]] // GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0 // GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8 @@ -599,16 +599,16 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 2 // GFX900-NEXT: store ptr @__test_block_invoke_4, ptr [[BLOCK_INVOKE24]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 3 -// GFX900-NEXT: [[TMP20:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: store i64 [[TMP20]], ptr [[BLOCK_CAPTURED25]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: [[TMP20:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store i64 [[TMP20]], ptr [[BLOCK_CAPTURED25]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 4 -// GFX900-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, 
!tbaa [[TBAA7]] -// GFX900-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]] +// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] -// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]] // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] @@ -623,8 +623,8 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr -// GFX900-NEXT: store ptr 
addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26:![0-9]+]] -// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] +// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA26:![0-9]+]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA26]] // GFX900-NEXT: call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR8]] // GFX900-NEXT: ret void // @@ -640,14 +640,14 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) // GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] +// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA26]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] -// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] // GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]] +// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr 
addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] @@ -664,11 +664,11 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 -// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA14]] // GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0 -// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[TBAA16]] +// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: ret void // // @@ -691,17 +691,17 @@ 
kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA14]] // GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0 -// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[TBAA16]] +// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 -// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 -// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], 
align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[LONGPTR_TBAA7]] // GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0 -// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: ret void // // @@ -725,22 +725,22 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 -// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[ANYPTR_TBAA32:![0-9]+]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA14]] // GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0 -// GFX900-NEXT: store i8 [[TMP0]], ptr 
addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[TBAA16]] +// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 -// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 -// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[LONGPTR_TBAA7]] // GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0 -// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32]] +// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[ANYPTR_TBAA32]] // GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0 -// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[INT_TBAA17]] // GFX900-NEXT: ret void // // @@ -763,9 +763,9 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr 
[[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 -// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[LONG_TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[LONGPTR_TBAA7]] // GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8]] // GFX900-NEXT: ret void // @@ -852,36 +852,36 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} // GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // GFX900: [[META2:![0-9]+]] = !{i32 2, i32 0} -// GFX900: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// GFX900: [[LONG_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} // GFX900: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0} // GFX900: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} // GFX900: [[META6]] = !{!"Simple C/C++ TBAA"} -// GFX900: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +// GFX900: [[LONGPTR_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} // GFX900: [[META8]] = !{!"p1 long", [[META9:![0-9]+]], i64 0} // GFX900: [[META9]] = !{!"any pointer", [[META5]], i64 0} // GFX900: [[META10]] = !{i32 1, i32 0, i32 1, i32 0} // GFX900: [[META11]] = !{!"none", !"none", !"none", !"none"} // GFX900: [[META12]] = !{!"char*", !"char", !"long*", !"long"} // GFX900: [[META13]] = !{!"", !"", !"", 
!""} -// GFX900: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} +// GFX900: [[CHARPTR_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} // GFX900: [[META15]] = !{!"p1 omnipotent char", [[META9]], i64 0} -// GFX900: [[TBAA16]] = !{[[META5]], [[META5]], i64 0} -// GFX900: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// GFX900: [[CHAR_TBAA16]] = !{[[META5]], [[META5]], i64 0} +// GFX900: [[INT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} // GFX900: [[META18]] = !{!"int", [[META5]], i64 0} -// GFX900: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} +// GFX900: [[QUEUE_T_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} // GFX900: [[META20]] = !{!"queue_t", [[META5]], i64 0} -// GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[TBAA17]]} +// GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[INT_TBAA17]]} // GFX900: [[META22]] = !{i32 1} // GFX900: [[META23]] = !{!"none"} // GFX900: [[META24]] = !{!"int*"} // GFX900: [[META25]] = !{!""} -// GFX900: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} +// GFX900: [[INTPTR_TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} // GFX900: [[META27]] = !{!"p1 int", [[META9]], i64 0} // GFX900: [[META28]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} // GFX900: [[META29]] = !{i32 0} // GFX900: [[META30]] = !{!"__block_literal"} // GFX900: [[META31]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} -// GFX900: [[TBAA32]] = !{[[META9]], [[META9]], i64 0} +// GFX900: [[ANYPTR_TBAA32]] = !{[[META9]], [[META9]], i64 0} // GFX900: [[META33]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} // GFX900: [[META34]] = !{i32 0, i32 3} // GFX900: [[META35]] = !{!"none", !"none"} diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 776d89890ac6f..0fdb212553151 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -108,7 +108,7 @@ // GFX1153: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX1250: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl index b9e25172a56af..cea7ee576d822 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 4 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 6 // RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s int printf(__constant const char* st, ...) 
__attribute__((format(printf, 1, 2))); @@ -17,60 +17,60 @@ __kernel void test_printf_str_int(int i) { } // CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_noargs( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META4]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_noargs() #[[ATTR5:[0-9]+]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_noargs( // CHECK-SAME: ) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META4]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR6:[0-9]+]] +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) 
@printf(ptr addrspace(4) noundef @.str) #[[ATTR5]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_int( // CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META5:![0-9]+]] !kernel_arg_access_qual [[META6:![0-9]+]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8:![0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] // CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_int(i32 noundef [[TMP0]]) #[[ATTR5]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_int( // CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) 
@printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR6]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR5]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_str_int( // CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] // CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_str_int(i32 noundef [[TMP0]]) #[[ATTR5]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_str_int( // CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { -// CHECK-NEXT: entry: +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5) -// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] -// CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[S]]) #[[ATTR7:[0-9]+]] +// 
CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[S]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP0]]) #[[ATTR6]] -// CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[S]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP0]]) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[S]]) #[[ATTR6]] // CHECK-NEXT: ret void // //. 
@@ -79,7 +79,7 @@ __kernel void test_printf_str_int(int i) { // CHECK: [[META6]] = !{!"none"} // CHECK: [[META7]] = !{!"int"} // CHECK: [[META8]] = !{!""} -// CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK: [[INT_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} // CHECK: [[META10]] = !{!"int", [[META11:![0-9]+]], i64 0} // CHECK: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0} // CHECK: [[META12]] = !{!"Simple C/C++ TBAA"} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl index cddc323cb27a5..321835cc3d28d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 @@ -14,10 +14,11 @@ typedef int v8i __attribute__((ext_vector_type(8))); // amdgcn_wmma_f32_16x16x16_f16 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// 
CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4:![0-9]+]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) @@ -29,10 +30,11 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) // amdgcn_wmma_f32_16x16x16_bf16 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) @@ -44,10 +46,11 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c // amdgcn_wmma_f16_16x16x16_f16 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) @@ -59,10 +62,11 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) // amdgcn_wmma_bf16_16x16x16_bf16 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], 
<8 x i16> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) @@ -74,10 +78,11 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s // amdgcn_wmma_i32_16x16x16_iu8 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) @@ -89,10 +94,11 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) // amdgcn_wmma_i32_16x16x16_iu4 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x 
i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) @@ -100,10 +106,11 @@ void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa 
[[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) @@ -111,10 +118,11 @@ void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8 *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) @@ -122,10 +130,11 @@ void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8 *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa 
[[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) @@ -133,10 +142,11 @@ void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8 *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void 
test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) @@ -144,13 +154,19 @@ void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8 *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) { *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); } +//. +// CHECK-GFX1200: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK-GFX1200: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK-GFX1200: [[META6]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl index 1c1d273eda771..8b5b31537ce58 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 @@ -13,10 +13,11 @@ typedef int v4i __attribute__((ext_vector_type(4))); // amdgcn_wmma_f32_16x16x16_f16 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4:![0-9]+]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) @@ -28,10 +29,11 @@ void 
test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) // amdgcn_wmma_f32_16x16x16_bf16 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) @@ -43,10 +45,11 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c // amdgcn_wmma_f16_16x16x16_f16 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef 
[[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) @@ -58,10 +61,11 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) // amdgcn_wmma_bf16_16x16x16_bf16 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) @@ -73,10 +77,11 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s // amdgcn_wmma_i32_16x16x16_iu8 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( -// CHECK-GFX1200-NEXT: 
entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) @@ -88,10 +93,11 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) // amdgcn_wmma_i32_16x16x16_iu4 // -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 
true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) @@ -99,10 +105,11 @@ void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) @@ -110,10 +117,11 @@ void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4 *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> 
[[C:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) @@ -121,10 +129,11 @@ void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4 *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // 
CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) @@ -132,10 +141,11 @@ void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4 *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) @@ -143,13 +153,19 @@ void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4 *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void 
@test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) { *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); } +//. +// CHECK-GFX1200: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK-GFX1200: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK-GFX1200: [[META6]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl index c645d52cc7e38..e03ae66f92035 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl @@ -1,13 +1,14 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250 typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); -// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b8( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void 
@llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_cluster_load_async_to_lds_b8( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef writeonly captures(none) [[LADDR:%.*]], i32 noundef [[MASK:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0, i32 [[MASK]]) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_cluster_load_async_to_lds_b8(global char* gaddr, local char* laddr, int mask) @@ -15,9 +16,10 @@ void test_amdgcn_cluster_load_async_to_lds_b8(global char* gaddr, local char* la __builtin_amdgcn_cluster_load_async_to_lds_b8(gaddr, laddr, 16, 0, mask); } -// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b32( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_cluster_load_async_to_lds_b32( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef writeonly captures(none) [[LADDR:%.*]], i32 noundef [[MASK:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0, i32 [[MASK]]) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_cluster_load_async_to_lds_b32(global int* gaddr, local int* laddr, int mask) @@ -25,9 +27,10 @@ void test_amdgcn_cluster_load_async_to_lds_b32(global int* gaddr, local int* lad 
__builtin_amdgcn_cluster_load_async_to_lds_b32(gaddr, laddr, 16, 0, mask); } -// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b64( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_cluster_load_async_to_lds_b64( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef writeonly captures(none) [[LADDR:%.*]], i32 noundef [[MASK:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0, i32 [[MASK]]) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_cluster_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr, int mask) @@ -35,9 +38,10 @@ void test_amdgcn_cluster_load_async_to_lds_b64(global v2i* gaddr, local v2i* lad __builtin_amdgcn_cluster_load_async_to_lds_b64(gaddr, laddr, 16, 0, mask); } -// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b128( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_cluster_load_async_to_lds_b128( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef writeonly captures(none) [[LADDR:%.*]], i32 noundef [[MASK:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0, i32 [[MASK]]) // CHECK-GFX1250-NEXT: ret void // void 
test_amdgcn_cluster_load_async_to_lds_b128(global v4i* gaddr, local v4i* laddr, int mask) @@ -45,9 +49,10 @@ void test_amdgcn_cluster_load_async_to_lds_b128(global v4i* gaddr, local v4i* la __builtin_amdgcn_cluster_load_async_to_lds_b128(gaddr, laddr, 16, 0, mask); } -// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b8( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_global_load_async_to_lds_b8( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef writeonly captures(none) [[LADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_global_load_async_to_lds_b8( global char* gaddr, local char* laddr) @@ -55,9 +60,10 @@ void test_amdgcn_global_load_async_to_lds_b8( global char* gaddr, local char* la __builtin_amdgcn_global_load_async_to_lds_b8(gaddr, laddr, 16, 0); } -// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b32( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_global_load_async_to_lds_b32( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef writeonly captures(none) [[LADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 
0) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_global_load_async_to_lds_b32(global int* gaddr, local int* laddr) @@ -65,9 +71,10 @@ void test_amdgcn_global_load_async_to_lds_b32(global int* gaddr, local int* ladd __builtin_amdgcn_global_load_async_to_lds_b32(gaddr, laddr, 16, 0); } -// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b64( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_global_load_async_to_lds_b64( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef writeonly captures(none) [[LADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_global_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr) @@ -75,9 +82,10 @@ void test_amdgcn_global_load_async_to_lds_b64(global v2i* gaddr, local v2i* ladd __builtin_amdgcn_global_load_async_to_lds_b64(gaddr, laddr, 16, 0); } -// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b128( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_global_load_async_to_lds_b128( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef writeonly captures(none) [[LADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) [[GADDR]], ptr 
addrspace(3) [[LADDR]], i32 16, i32 0) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_global_load_async_to_lds_b128( global v4i* gaddr, local v4i* laddr) @@ -85,9 +93,10 @@ void test_amdgcn_global_load_async_to_lds_b128( global v4i* gaddr, local v4i* la __builtin_amdgcn_global_load_async_to_lds_b128(gaddr, laddr, 16, 0); } -// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b8( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_global_store_async_from_lds_b8( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef readonly captures(none) [[LADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_global_store_async_from_lds_b8(global char* gaddr, local char* laddr) @@ -95,9 +104,10 @@ void test_amdgcn_global_store_async_from_lds_b8(global char* gaddr, local char* __builtin_amdgcn_global_store_async_from_lds_b8(gaddr, laddr, 16, 0); } -// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b32( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_global_store_async_from_lds_b32( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef readonly captures(none) [[LADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void 
@llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_global_store_async_from_lds_b32(global int* gaddr, local int* laddr) @@ -105,9 +115,10 @@ void test_amdgcn_global_store_async_from_lds_b32(global int* gaddr, local int* l __builtin_amdgcn_global_store_async_from_lds_b32(gaddr, laddr, 16, 0); } -// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b64( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_global_store_async_from_lds_b64( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef readonly captures(none) [[LADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_global_store_async_from_lds_b64(global v2i* gaddr, local v2i* laddr) @@ -115,9 +126,10 @@ void test_amdgcn_global_store_async_from_lds_b64(global v2i* gaddr, local v2i* l __builtin_amdgcn_global_store_async_from_lds_b64(gaddr, laddr, 16, 0); } -// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b128( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_global_store_async_from_lds_b128( +// CHECK-GFX1250-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], ptr addrspace(3) noundef readonly captures(none) [[LADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) [[GADDR]], ptr addrspace(3) [[LADDR]], i32 16, i32 0) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_global_store_async_from_lds_b128(global v4i* gaddr, local v4i* laddr) @@ -125,9 +137,10 @@ void test_amdgcn_global_store_async_from_lds_b128(global v4i* gaddr, local v4i* __builtin_amdgcn_global_store_async_from_lds_b128(gaddr, laddr, 16, 0); } -// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_async_barrier_arrive_b64( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) [[ADDR:%.*]]) +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_ds_atomic_async_barrier_arrive_b64( +// CHECK-GFX1250-SAME: ptr addrspace(3) noundef [[ADDR:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-GFX1250-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) [[ADDR]]) // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_ds_atomic_async_barrier_arrive_b64(local long* addr) @@ -135,13 +148,20 @@ void test_amdgcn_ds_atomic_async_barrier_arrive_b64(local long* addr) __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64(addr); } -// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_barrier_arrive_rtn_b64( -// CHECK-GFX1250-NEXT: entry: -// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64(ptr addrspace(3) [[ADDR:%.*]], i64 [[DATA:%.*]]) -// CHECK-GFX1250-NEXT: store i64 [[TMP0]], ptr [[OUT:%.*]], align 8, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1250-LABEL: define dso_local void @test_amdgcn_ds_atomic_barrier_arrive_rtn_b64( +// CHECK-GFX1250-SAME: ptr addrspace(3) noundef captures(none) [[ADDR:%.*]], i64 noundef [[DATA:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// CHECK-GFX1250-NEXT: 
[[ENTRY:.*:]] +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64(ptr addrspace(3) [[ADDR]], i64 [[DATA]]) +// CHECK-GFX1250-NEXT: store i64 [[TMP0]], ptr [[OUT]], align 8, !tbaa [[LONG_TBAA4:![0-9]+]] // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_ds_atomic_barrier_arrive_rtn_b64(local long* addr, long data, long *out) { *out = __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64(addr, data); } +//. +// CHECK-GFX1250: [[LONG_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK-GFX1250: [[META5]] = !{!"long", [[META6:![0-9]+]], i64 0} +// CHECK-GFX1250: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK-GFX1250: [[META7]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index c35715965daeb..b6b475a7565ba 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -1478,6 +1478,16 @@ void test_prefetch(generic void *fptr, global void *gptr) { __builtin_amdgcn_global_prefetch(gptr, 8); } +// CHECK-LABEL: @test_s_cluster_barrier( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.amdgcn.s.cluster.barrier() +// CHECK-NEXT: ret void +// +void test_s_cluster_barrier() +{ + __builtin_amdgcn_s_cluster_barrier(); +} + // CHECK-LABEL: @test_global_add_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl index 9927bb334c486..214390142b6aa 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: 
amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 @@ -13,10 +13,11 @@ typedef short v16s __attribute__((ext_vector_type(16))); // Wave32 -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i32(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_f16_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i32(<8 x half> [[A]], <16 x half> [[B]], <8 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4:![0-9]+]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, int index) @@ -24,10 +25,11 @@ void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i32(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i32 
[[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_bf16_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i32(<8 x i16> [[A]], <16 x i16> [[B]], <8 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, int index) @@ -35,10 +37,11 @@ void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8 *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i32(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f16_16x16x32_f16_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i32(<8 x half> [[A]], <16 x half> [[B]], <8 x half> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, int index) @@ -46,10 +49,11 @@ void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i32(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i32(<8 x i16> [[A]], <16 x i16> [[B]], <8 x i16> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, int index) @@ -57,10 +61,11 @@ void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); } -// CHECK-GFX1200-LABEL: 
@test_amdgcn_swmmac_i32_16x16x32_iu8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 true) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_i32_16x16x32_iu8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i32(i1 true, <2 x i32> [[A]], i1 true, <4 x i32> [[B]], <8 x i32> [[C]], i32 [[INDEX]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, int index) @@ -68,10 +73,11 @@ void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i32(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 true) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly 
captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i32(i1 true, i32 [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i32 [[INDEX]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, int index) @@ -79,10 +85,11 @@ void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 true) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_i32_16x16x64_iu4_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 true, <2 x i32> [[A]], i1 true, <4 x i32> [[B]], <8 x i32> [[C]], i32 [[INDEX]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // 
CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, int index) @@ -90,10 +97,11 @@ void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A]], <4 x i32> [[B]], <8 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, int index) @@ -101,10 +109,11 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], 
<8 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A]], <4 x i32> [[B]], <8 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, int index) @@ -112,10 +121,11 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail 
call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A]], <4 x i32> [[B]], <8 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, int index) @@ -123,13 +133,19 @@ void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A]], <4 x i32> [[B]], <8 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, int index) { *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); } +//. 
+// CHECK-GFX1200: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK-GFX1200: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK-GFX1200: [[META6]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl index eaa6b14d2a792..47753afd1aa52 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 @@ -12,10 +12,11 @@ typedef short v8s __attribute__((ext_vector_type(8))); // Wave64 -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i32(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_f16_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i32(<4 x half> [[A]], <8 x half> [[B]], <4 x float> [[C]], i32 [[INDEX]]) +// 
CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4:![0-9]+]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, int index) @@ -23,10 +24,11 @@ void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i32(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_bf16_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i32(<4 x i16> [[A]], <8 x i16> [[B]], <4 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, int index) @@ -34,10 +36,11 @@ void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> 
@llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i32(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f16_16x16x32_f16_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i32(<4 x half> [[A]], <8 x half> [[B]], <4 x half> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, int index) @@ -45,10 +48,11 @@ void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i32(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i32(<4 x i16> [[A]], <8 x i16> [[B]], <4 x i16> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, int index) @@ -56,10 +60,11 @@ void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4 *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i32(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 true) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_i32_16x16x32_iu8_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i32(i1 true, i32 [[A]], i1 true, <2 x i32> [[B]], <4 x i32> [[C]], i32 [[INDEX]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, int index) @@ -67,10 +72,11 @@ void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i *out = 
__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 true) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_i32_16x16x32_iu4_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i32 [[INDEX]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, int index) @@ -78,10 +84,11 @@ void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i32(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 true) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_i32_16x16x64_iu4_w64( 
+// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i32(i1 true, i32 [[A]], i1 true, <2 x i32> [[B]], <4 x i32> [[C]], i32 [[INDEX]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, int index) @@ -89,10 +96,11 @@ void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i32(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i32(i32 [[A]], <2 x i32> [[B]], <4 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] 
// CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, int index) @@ -100,10 +108,11 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i32(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i32(i32 [[A]], <2 x i32> [[B]], <4 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, int index) @@ -111,10 +120,11 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i32(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i32 
[[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i32(i32 [[A]], <2 x i32> [[B]], <4 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, int index) @@ -122,13 +132,19 @@ void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); } -// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i32(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i32 [[INDEX:%.*]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64( +// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i32(i32 [[A]], <2 x i32> [[B]], <4 x float> [[C]], i32 [[INDEX]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, int index) { *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); } +//. +// CHECK-GFX1200: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK-GFX1200: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK-GFX1200: [[META6]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl index 2f9a367ecab8a..853cd32f8bdce 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 @@ -17,10 +17,11 @@ typedef short v16s __attribute__((ext_vector_type(16))); // amdgcn_wmma_f32_16x16x16_f16 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) -// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) 
initializes((0, 32)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A]], <16 x half> [[B]], <8 x float> [[C]]) +// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f c) @@ -32,10 +33,11 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f // amdgcn_wmma_f32_16x16x16_bf16 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) -// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <8 x float> [[C]]) +// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f c) @@ -47,10 +49,11 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f // 
amdgcn_wmma_f16_16x16x16_f16 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) -// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <16 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A]], <16 x half> [[B]], <16 x half> [[C]], i1 true) +// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16h c) @@ -62,10 +65,11 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16 // amdgcn_wmma_bf16_16x16x16_bf16 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) -// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <16 x i16> noundef 
[[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <16 x i16> [[C]], i1 true) +// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v16s c) @@ -77,10 +81,11 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v // amdgcn_wmma_f16_16x16x16_f16_tied // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) -// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <16 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A]], <16 x half> [[B]], <16 x half> [[C]], i1 true) +// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b, v16h c) @@ -92,10 +97,11 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b // amdgcn_wmma_bf16_16x16x16_bf16_tied // -// CHECK-GFX1100-LABEL: 
@test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) -// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <16 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <16 x i16> [[C]], i1 true) +// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s b, v16s c) @@ -107,10 +113,11 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s // amdgcn_wmma_i32_16x16x16_iu8 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A]], i1 true, <4 x i32> [[B]], <8 x i32> [[C]], i1 false) +// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c) @@ -122,10 +129,11 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c) // amdgcn_wmma_i32_16x16x16_iu4 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) +// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) @@ -134,3 +142,8 @@ void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) } #endif +//. 
+// CHECK-GFX1100: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK-GFX1100: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK-GFX1100: [[META6]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl index 8dfe69bb9a744..9b6872f6b1e6d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize64 -DWMMA_GFX1100_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 @@ -18,10 +18,11 @@ typedef short v16s __attribute__((ext_vector_type(16))); // amdgcn_wmma_f32_16x16x16_f16 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) -// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A]], <16 x half> [[B]], <4 x float> [[C]]) +// CHECK-GFX1100-NEXT: store <4 x float> 
[[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f c) @@ -33,10 +34,11 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f // amdgcn_wmma_f32_16x16x16_bf16 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) -// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <4 x float> [[C]]) +// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f c) @@ -48,10 +50,11 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f // amdgcn_wmma_f16_16x16x16_f16 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) -// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// 
CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A]], <16 x half> [[B]], <8 x half> [[C]], i1 true) +// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h c) @@ -63,10 +66,11 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h // amdgcn_wmma_bf16_16x16x16_bf16 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) -// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <8 x i16> [[C]], i1 true) +// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global 
v8s* out, v16s a, v16s b, v8s c) @@ -78,10 +82,11 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8 // amdgcn_wmma_f16_16x16x16_f16_tied // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) -// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A]], <16 x half> [[B]], <8 x half> [[C]], i1 true) +// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, v8h c) @@ -93,10 +98,11 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, // amdgcn_wmma_bf16_16x16x16_bf16_tied // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) -// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64( +// 
CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <8 x i16> [[C]], i1 true) +// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s b, v8s c) @@ -108,10 +114,11 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s // amdgcn_wmma_i32_16x16x16_iu8 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A]], i1 true, <4 x i32> [[B]], <4 x i32> [[C]], i1 false) +// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c) @@ -123,10 +130,11 @@ void 
test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c) // amdgcn_wmma_i32_16x16x16_iu4 // -// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( -// CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) -// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1100-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64( +// CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-GFX1100-NEXT: [[ENTRY:.*:]] +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <4 x i32> [[C]], i1 false) +// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, v2i a, v2i b, v4i c) @@ -135,3 +143,8 @@ void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, v2i a, v2i b, v4i c) } #endif +//. +// CHECK-GFX1100: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK-GFX1100: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK-GFX1100: [[META6]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl index 4e40073c7e27a..4f2a75a76abbb 100644 --- a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl +++ b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 6 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s // Check there's no assertion when passing a pointer to an address space @@ -33,7 +33,7 @@ __kernel void use_of_local_var() // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[X]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: store i32 0, ptr addrspace(5) [[X]], align 4, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[X]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] // CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[X]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ASCAST]]) #[[ATTR6]] @@ -46,7 +46,7 @@ __kernel void use_of_local_var() // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr -// CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4, !tbaa [[TBAA4]] +// CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[X_ADDR_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR_ASCAST]] to ptr addrspace(5) // CHECK-NEXT: call void @private_ptr(ptr 
addrspace(5) noundef [[X_ADDR_ASCAST_ASCAST]]) #[[ATTR6]] // CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ADDR_ASCAST]]) #[[ATTR6]] @@ -68,7 +68,7 @@ __kernel void use_of_local_var() // CHECK-NEXT: ret void // //. -// CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} // CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} // CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} // CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} diff --git a/clang/test/CodeGenOpenCL/numbered-address-space.cl b/clang/test/CodeGenOpenCL/numbered-address-space.cl index bfbc1d6873551..2a3a90751bf31 100644 --- a/clang/test/CodeGenOpenCL/numbered-address-space.cl +++ b/clang/test/CodeGenOpenCL/numbered-address-space.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -O0 -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -Wno-error=incompatible-pointer-types -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -O0 -o - %s | FileCheck %s // Make sure using numbered address spaces doesn't trigger crashes when a // builtin has an address space parameter. 
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index e73657e30d884..6e5c1c49504ec 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s typedef char char3 __attribute__((ext_vector_type(3))); @@ -12,8 +12,8 @@ typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: [[EXTRACTVEC1_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-NEXT: ret void // void kernel foo(global float3 *a, global float3 *b) { @@ -23,9 +23,9 @@ void kernel foo(global float3 *a, global float3 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 
captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: ret void // void kernel float4_to_float3(global float3 *a, global float4 *b) { @@ -36,8 +36,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) { // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ASTYPE_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: store <4 x float> [[ASTYPE_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: ret void // void kernel float3_to_float4(global 
float3 *a, global float4 *b) { @@ -49,7 +49,7 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: ret void // void kernel float3_to_double2(global float3 *a, global double2 *b) { @@ -59,9 +59,9 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> -// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 8, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: ret void // void kernel char8_to_short3(global short3 *a, global char8 *b) { @@ -69,10 +69,10 @@ void kernel char8_to_short3(global short3 *a, global char8 *b) { } // CHECK-LABEL: define dso_local spir_func void 
@from_char3( -// CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> -// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA17:![0-9]+]] +// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA17:![0-9]+]] // CHECK-NEXT: ret void // void from_char3(char3 a, global int *out) { @@ -80,10 +80,10 @@ void from_char3(char3 a, global int *out) { } // CHECK-LABEL: define dso_local spir_func void @from_short3( -// CHECK-SAME: <3 x i16> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-SAME: <3 x i16> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> -// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA19:![0-9]+]] +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA19:![0-9]+]] // CHECK-NEXT: ret void // void from_short3(short3 a, global long *out) { @@ -91,11 +91,11 @@ void from_short3(short3 a, global long *out) { } // CHECK-LABEL: define dso_local spir_func void @scalar_to_char3( -// CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly 
captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[A]] to <4 x i8> // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> -// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: ret void // void scalar_to_char3(int a, global char3 *out) { @@ -103,11 +103,11 @@ void scalar_to_char3(int a, global char3 *out) { } // CHECK-LABEL: define dso_local spir_func void @scalar_to_short3( -// CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> -// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: ret void // void scalar_to_short3(long a, global short3 *out) { @@ -120,7 +120,7 @@ void scalar_to_short3(long a, global short3 *out) { // CHECK: [[META5]] = !{!"float3*", !"float3*"} // CHECK: [[META6]] = !{!"float __attribute__((ext_vector_type(3)))*", !"float __attribute__((ext_vector_type(3)))*"} // CHECK: [[META7]] = !{!"", !""} -// CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// CHECK: [[CHAR_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} // CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0} // CHECK: [[META10]] = !{!"Simple C/C++ TBAA"} // CHECK: [[META11]] 
= !{!"float3*", !"float4*"} @@ -129,8 +129,8 @@ void scalar_to_short3(long a, global short3 *out) { // CHECK: [[META14]] = !{!"float __attribute__((ext_vector_type(3)))*", !"double __attribute__((ext_vector_type(2)))*"} // CHECK: [[META15]] = !{!"short3*", !"char8*"} // CHECK: [[META16]] = !{!"short __attribute__((ext_vector_type(3)))*", !"char __attribute__((ext_vector_type(8)))*"} -// CHECK: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECK: [[INT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} // CHECK: [[META18]] = !{!"int", [[META9]], i64 0} -// CHECK: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} +// CHECK: [[LONG_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} // CHECK: [[META20]] = !{!"long", [[META9]], i64 0} //. diff --git a/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp index 8d8f0b0b5d699..e932e75d025e0 100644 --- a/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp +++ b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp @@ -1,12 +1,12 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 //RUN: %clang_cc1 %s -triple spir -emit-llvm -O1 -o - | FileCheck %s // CHECK-LABEL: define dso_local spir_kernel void @test( // CHECK-SAME: ptr addrspace(1) noundef readonly align 8 captures(none) [[IN:%.*]], ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[IN]], i32 8 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(1) 
[[ARRAYIDX1]], align 8, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: store i64 [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[IN]], i32 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(1) [[ARRAYIDX1_I]], align 8, !tbaa [[LONG_TBAA8:![0-9]+]] +// CHECK-NEXT: store i64 [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA8]] // CHECK-NEXT: ret void // __kernel void test(__global long *In, __global long *Out) { @@ -18,7 +18,7 @@ __kernel void test(__global long *In, __global long *Out) { // CHECK: [[META5]] = !{!"none", !"none"} // CHECK: [[META6]] = !{!"long*", !"long*"} // CHECK: [[META7]] = !{!"", !""} -// CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// CHECK: [[LONG_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} // CHECK: [[META9]] = !{!"long", [[META10:![0-9]+]], i64 0} // CHECK: [[META10]] = !{!"omnipotent char", [[META11:![0-9]+]], i64 0} // CHECK: [[META11]] = !{!"Simple C++ TBAA"} diff --git a/clang/test/DebugInfo/CXX/vtable-external.cpp b/clang/test/DebugInfo/CXX/vtable-external.cpp new file mode 100644 index 0000000000000..b5b34c4123e3b --- /dev/null +++ b/clang/test/DebugInfo/CXX/vtable-external.cpp @@ -0,0 +1,108 @@ +// For the `CInlined` struct, where all member functions are inlined, we check the following cases: +// - If the definition of its destructor is visible: +// * The vtable is generated with a COMDAT specifier +// * Its '_vtable$' is generated +// - Otherwise: +// * The vtable is declared +// * Its '_vtable$' is NOT generated +// +// For the `CNoInline` struct, where member functions are defined as non-inline, we check the following: +// - Regardless of whether the definition of its destructor is visible or not: +// * The vtable is generated +// * Its '_vtable$' is generated +// +// For the `CNoFnDef` struct, where member functions are declared only, we check the following: +// - 
Regardless of whether the definition of its destructor is visible or not: +// # when non-optimized: +// * The vtable is declared +// * Its '_vtable$' is NOT generated +// # when optimized even if no LLVM passes: +// * The vtable is declared as `available_externally` (which is potentially turned into `external` by LLVM passes) +// * Its '_vtable$' is generated + +struct CInlined { + virtual void f1() noexcept {} + virtual void f2() noexcept {} + virtual ~CInlined() noexcept; +}; +#ifndef NO_DTOR_BODY +inline CInlined::~CInlined() noexcept {} +#endif + +struct CNoInline { + virtual void g1() noexcept; + virtual void g2() noexcept; + virtual ~CNoInline() noexcept; +}; + +void CNoInline::g1() noexcept {} +void CNoInline::g2() noexcept {} +#ifndef NO_DTOR_BODY +CNoInline::~CNoInline() noexcept {} +#endif + +struct CNoFnDef { + virtual void h1() noexcept; + virtual void h2() noexcept; + virtual ~CNoFnDef() noexcept; +}; + +#ifndef NO_DTOR_BODY +CNoFnDef::~CNoFnDef() noexcept {} +#endif + +int main() { + CInlined Inlined; + CNoInline NoInline; + CNoFnDef NoFnDef; + + return 0; +} + +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - | FileCheck %s -check-prefixes CHECK-HAS-DTOR,CHECK-HAS-DTOR-O0 +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - | FileCheck %s -check-prefixes CHECK-HAS-DTOR,CHECK-HAS-DTOR-O1 +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes -DNO_DTOR_BODY %s -o - | FileCheck %s -check-prefixes CHECK-NO-DTOR,CHECK-NO-DTOR-O0 +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes -DNO_DTOR_BODY %s -o - | FileCheck %s -check-prefixes CHECK-NO-DTOR,CHECK-NO-DTOR-O1 + +// CHECK-HAS-DTOR: $_ZTV8CInlined = comdat any +// CHECK-HAS-DTOR-NOT: $_ZTV9CNoInline +// CHECK-HAS-DTOR-NOT: 
$_ZTV8CNoFnDef + +// CHECK-HAS-DTOR-DAG: @_ZTV8CInlined = linkonce_odr {{.*}}constant {{{ \[[^]]*\] } { \[[^]]*\] \[[^]]*\] }}}, comdat, align 8, !dbg [[INLINED_VTABLE_VAR:![0-9]+]] +// CHECK-HAS-DTOR-DAG: @_ZTV9CNoInline = {{.*}}constant {{{ \[[^]]*\] } { \[[^]]*\] \[[^]]*\] }}}, align 8, !dbg [[NOINLINE_VTABLE_VAR:![0-9]+]] +// CHECK-HAS-DTOR-O0-DAG: @_ZTV8CNoFnDef = external {{.*}}constant {{{ \[[^]]*\] }}}, align 8{{$}} +// CHECK-HAS-DTOR-O1-DAG: @_ZTV8CNoFnDef = available_externally {{.*}}constant {{{ \[[^]]*\] } { \[[^]]*\] \[[^]]*\] }}}, align 8, !dbg [[NOFNDEF_VTABLE_VAR:![0-9]+]] + +// CHECK-HAS-DTOR: !llvm.dbg.cu + +// CHECK-HAS-DTOR-DAG: [[INLINED_VTABLE:![0-9]+]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV8CInlined" +// CHECK-HAS-DTOR-DAG: [[INLINED_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[INLINED_VTABLE]], expr: !DIExpression()) +// CHECK-HAS-DTOR-DAG: [[INLINED:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CInlined" +// CHECK-HAS-DTOR-DAG: !DIDerivedType(tag: DW_TAG_variable, name: "_vtable$", scope: [[INLINED]], file: {{.*}}, baseType: {{![0-9]+}}, flags: DIFlagPrivate | DIFlagArtificial | DIFlagStaticMember) + +// CHECK-HAS-DTOR-DAG: [[NOINLINE_VTABLE:![0-9]+]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV9CNoInline" +// CHECK-HAS-DTOR-DAG: [[NOINLINE_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[NOINLINE_VTABLE]], expr: !DIExpression()) +// CHECK-HAS-DTOR-DAG: [[NOINLINE:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CNoInline" +// CHECK-HAS-DTOR-DAG: !DIDerivedType(tag: DW_TAG_variable, name: "_vtable$", scope: [[NOINLINE]], file: {{.*}}, baseType: {{![0-9]+}}, flags: DIFlagPrivate | DIFlagArtificial | DIFlagStaticMember) + +// CHECK-HAS-DTOR-O1-DAG: [[NOFNDEF_VTABLE:![0-9]+]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV8CNoFnDef" +// CHECK-HAS-DTOR-O1-DAG: [[NOFNDEF_VTABLE_VAR]] = !DIGlobalVariableExpression(var: 
[[NOFNDEF_VTABLE]], expr: !DIExpression()) + +// CHECK-NO-DTOR-NOT: $_ZTV8CInlined +// CHECK-NO-DTOR-NOT: $_ZTV9CNoInline +// CHECK-NO-DTOR-NOT: $_ZTV8CNoFnDef + +// CHECK-NO-DTOR-DAG: @_ZTV8CInlined = external {{.*}}constant {{.*}}, align 8{{$}} +// CHECK-NO-DTOR-DAG: @_ZTV9CNoInline = {{.*}}constant {{{ \[[^]]*\] } { \[[^]]*\] \[[^]]*\] }}}, align 8, !dbg [[NOINLINE_VTABLE_VAR:![0-9]+]] +// CHECK-NO-DTOR-O0-DAG: @_ZTV8CNoFnDef = external {{.*}}constant {{{ \[[^]]*\] }}}, align 8{{$}} +// CHECK-NO-DTOR-O1-DAG: @_ZTV8CNoFnDef = available_externally {{.*}}constant {{{ \[[^]]*\] } { \[[^]]*\] \[[^]]*\] }}}, align 8, !dbg [[NOFNDEF_VTABLE_VAR:![0-9]+]] + +// CHECK-NO-DTOR: !llvm.dbg.cu + +// CHECK-NO-DTOR-DAG: [[NOINLINE_VTABLE:![0-9]+]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV9CNoInline" +// CHECK-NO-DTOR-DAG: [[NOINLINE_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[NOINLINE_VTABLE]], expr: !DIExpression()) +// CHECK-NO-DTOR-DAG: [[NOINLINE:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CNoInline" +// CHECK-NO-DTOR-DAG: !DIDerivedType(tag: DW_TAG_variable, name: "_vtable$", scope: [[NOINLINE]], file: {{.*}}, baseType: {{![0-9]+}}, flags: DIFlagPrivate | DIFlagArtificial | DIFlagStaticMember) + +// CHECK-NO-DTOR-O1-DAG: [[NOFNDEF_VTABLE:![0-9]+]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV8CNoFnDef" +// CHECK-NO-DTOR-O1-DAG: [[NOFNDEF_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[NOFNDEF_VTABLE]], expr: !DIExpression()) diff --git a/clang/test/DebugInfo/CXX/vtable-inheritance-diamond.cpp b/clang/test/DebugInfo/CXX/vtable-inheritance-diamond.cpp index 5ed1353eebb10..5bf7dc15c46d0 100644 --- a/clang/test/DebugInfo/CXX/vtable-inheritance-diamond.cpp +++ b/clang/test/DebugInfo/CXX/vtable-inheritance-diamond.cpp @@ -1,5 +1,3 @@ -// REQUIRES: target={{x86_64.*-linux.*}} - // Diamond inheritance case: // For CBase, CLeft, CRight and CDerived we check: // - Generation of their vtables (including 
attributes). @@ -44,17 +42,18 @@ int main() { return 0; } -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -S -g %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - | FileCheck %s // CHECK: $_ZTVN3NSP5CBaseE = comdat any // CHECK: $_ZTVN5NSP_15CLeftE = comdat any // CHECK: $_ZTVN5NSP_26CRightE = comdat any // CHECK: $_ZTV8CDerived = comdat any -// CHECK: @_ZTVN3NSP5CBaseE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTVN5NSP_15CLeftE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[LEFT_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTVN5NSP_26CRightE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[RIGHT_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTVN3NSP5CBaseE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTVN5NSP_15CLeftE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[LEFT_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTVN5NSP_26CRightE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[RIGHT_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTV8CDerived = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] // CHECK: [[BASE_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[BASE_VTABLE:![0-9]*]], expr: !DIExpression()) // CHECK-NEXT: [[BASE_VTABLE]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTVN3NSP5CBaseE" diff --git 
a/clang/test/DebugInfo/CXX/vtable-inheritance-multiple.cpp b/clang/test/DebugInfo/CXX/vtable-inheritance-multiple.cpp index 23973a35d0e17..3b7e3a74f8eac 100644 --- a/clang/test/DebugInfo/CXX/vtable-inheritance-multiple.cpp +++ b/clang/test/DebugInfo/CXX/vtable-inheritance-multiple.cpp @@ -1,5 +1,3 @@ -// REQUIRES: target={{x86_64.*-linux.*}} - // Multiple inheritance case: // For CBaseOne, CBaseTwo and CDerived we check: // - Generation of their vtables (including attributes). @@ -38,15 +36,16 @@ int main() { return 0; } -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -S -g %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - | FileCheck %s // CHECK: $_ZTVN5NSP_18CBaseOneE = comdat any // CHECK: $_ZTVN5NSP_28CBaseTwoE = comdat any // CHECK: $_ZTV8CDerived = comdat any -// CHECK: @_ZTVN5NSP_18CBaseOneE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_ONE_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTVN5NSP_28CBaseTwoE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_TWO_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTVN5NSP_18CBaseOneE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_ONE_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTVN5NSP_28CBaseTwoE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_TWO_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTV8CDerived = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] // CHECK: [[BASE_ONE_VTABLE_VAR]] = 
!DIGlobalVariableExpression(var: [[BASE_ONE_VTABLE:![0-9]*]], expr: !DIExpression()) // CHECK-NEXT: [[BASE_ONE_VTABLE]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTVN5NSP_18CBaseOneE" diff --git a/clang/test/DebugInfo/CXX/vtable-inheritance-simple-main.cpp b/clang/test/DebugInfo/CXX/vtable-inheritance-simple-main.cpp index d64e711dddfa0..bcf8ff73cee69 100644 --- a/clang/test/DebugInfo/CXX/vtable-inheritance-simple-main.cpp +++ b/clang/test/DebugInfo/CXX/vtable-inheritance-simple-main.cpp @@ -1,5 +1,3 @@ -// REQUIRES: target={{x86_64.*-linux.*}} - // Simple inheritance case: // For CBase and CDerived we check: // - Generation of their vtables (including attributes). @@ -86,35 +84,35 @@ int main() { } #endif -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g %s -o %t.simple-base.bc -DBASE_CODE -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g %s -o %t.simple-derived.bc -DDERIVED_CODE -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g %s -o %t.simple-main.bc -DMAIN_CODE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o %t.simple-base.bc -DBASE_CODE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o %t.simple-derived.bc -DDERIVED_CODE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o %t.simple-main.bc -DMAIN_CODE // RUN: llvm-link %t.simple-base.bc %t.simple-derived.bc %t.simple-main.bc -S -o %t.simple-combined.ll // RUN: FileCheck --input-file=%t.simple-combined.ll -check-prefix=CHECK-ONE %s -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g -flto %s -o %t.simple-base.bc 
-DBASE_CODE -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g -flto %s -o %t.simple-derived.bc -DDERIVED_CODE -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g -flto %s -o %t.simple-main.bc -DMAIN_CODE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -flto -disable-llvm-passes %s -o %t.simple-base.bc -DBASE_CODE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -flto -disable-llvm-passes %s -o %t.simple-derived.bc -DDERIVED_CODE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -flto -disable-llvm-passes %s -o %t.simple-main.bc -DMAIN_CODE // RUN: llvm-link %t.simple-base.bc %t.simple-derived.bc %t.simple-main.bc -S -o %t.simple-combined.ll // RUN: FileCheck --input-file=%t.simple-combined.ll -check-prefix=CHECK-ONE %s -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g %s -o %t.simple-base.bc -DBASE_CODE -DSYMBOL_AT_FILE_SCOPE -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g %s -o %t.simple-derived.bc -DDERIVED_CODE -DSYMBOL_AT_FILE_SCOPE -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g %s -o %t.simple-main.bc -DMAIN_CODE -DSYMBOL_AT_FILE_SCOPE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o %t.simple-base.bc -DBASE_CODE -DSYMBOL_AT_FILE_SCOPE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o %t.simple-derived.bc -DDERIVED_CODE -DSYMBOL_AT_FILE_SCOPE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 
-disable-llvm-passes %s -o %t.simple-main.bc -DMAIN_CODE -DSYMBOL_AT_FILE_SCOPE // RUN: llvm-link %t.simple-base.bc %t.simple-derived.bc %t.simple-main.bc -S -o %t.simple-combined.ll // RUN: FileCheck --input-file=%t.simple-combined.ll -check-prefix=CHECK-TWO %s -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g -flto %s -o %t.simple-base.bc -DBASE_CODE -DSYMBOL_AT_FILE_SCOPE -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g -flto %s -o %t.simple-derived.bc -DDERIVED_CODE -DSYMBOL_AT_FILE_SCOPE -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -c -g -flto %s -o %t.simple-main.bc -DMAIN_CODE -DSYMBOL_AT_FILE_SCOPE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -flto -disable-llvm-passes %s -o %t.simple-base.bc -DBASE_CODE -DSYMBOL_AT_FILE_SCOPE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -flto -disable-llvm-passes %s -o %t.simple-derived.bc -DDERIVED_CODE -DSYMBOL_AT_FILE_SCOPE +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm-bc -debug-info-kind=limited -dwarf-version=5 -O0 -flto -disable-llvm-passes %s -o %t.simple-main.bc -DMAIN_CODE -DSYMBOL_AT_FILE_SCOPE // RUN: llvm-link %t.simple-base.bc %t.simple-derived.bc %t.simple-main.bc -S -o %t.simple-combined.ll // RUN: FileCheck --input-file=%t.simple-combined.ll -check-prefix=CHECK-TWO %s // CHECK-ONE: ${{_ZN3NSP5CBaseC2Ev|_ZN8CDerivedC2Ev}} = comdat any // CHECK-ONE: ${{_ZN3NSP5CBaseC2Ev|_ZN8CDerivedC2Ev}} = comdat any -// CHECK-ONE: @_ZTV8CDerived = {{dso_local|hidden}} unnamed_addr constant {{.*}}, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] -// CHECK-ONE: @_ZTVN3NSP5CBaseE = {{dso_local|hidden}} unnamed_addr constant {{.*}}, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] +// CHECK-ONE: @_ZTV8CDerived = {{.*}}unnamed_addr constant 
{{.*}}, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] +// CHECK-ONE: @_ZTVN3NSP5CBaseE = {{.*}}unnamed_addr constant {{.*}}, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] // CHECK-ONE: [[DERIVED_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[DERIVED_VTABLE:![0-9]*]], expr: !DIExpression()) // CHECK-ONE-NEXT: [[DERIVED_VTABLE]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV8CDerived" @@ -133,8 +131,8 @@ int main() { // CHECK-TWO: ${{_ZN3NSP5CBaseC2Ev|_ZN8CDerivedC2Ev}} = comdat any // CHECK-TWO: ${{_ZN3NSP5CBaseC2Ev|_ZN8CDerivedC2Ev}} = comdat any -// CHECK-TWO: @_ZTVN3NSP5CBaseE = {{dso_local|hidden}} unnamed_addr constant {{.*}}, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] -// CHECK-TWO: @_ZTV8CDerived = {{dso_local|hidden}} unnamed_addr constant {{.*}}, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] +// CHECK-TWO: @_ZTVN3NSP5CBaseE = {{.*}}unnamed_addr constant {{.*}}, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] +// CHECK-TWO: @_ZTV8CDerived = {{.*}}unnamed_addr constant {{.*}}, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] // CHECK-TWO: [[BASE_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[BASE_VTABLE:![0-9]*]], expr: !DIExpression()) // CHECK-TWO-NEXT: [[BASE_VTABLE]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTVN3NSP5CBaseE" diff --git a/clang/test/DebugInfo/CXX/vtable-inheritance-simple.cpp b/clang/test/DebugInfo/CXX/vtable-inheritance-simple.cpp index b24ece1598327..8d8c778dbb04e 100644 --- a/clang/test/DebugInfo/CXX/vtable-inheritance-simple.cpp +++ b/clang/test/DebugInfo/CXX/vtable-inheritance-simple.cpp @@ -28,18 +28,19 @@ int main() { return 0; } -// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -mrelocation-model pic -pic-is-pie -debug-info-kind=limited -dwarf-version=5 -disable-O0-optnone -disable-llvm-passes %s -o - | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -mrelocation-model pic -pic-is-pie -debug-info-kind=limited -dwarf-version=5 -disable-O0-optnone -disable-llvm-passes %s -o - | 
FileCheck %s --check-prefix=COFF +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-mingw -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - | FileCheck %s --check-prefix=COFF // CHECK: $_ZTVN3NSP5CBaseE = comdat any // CHECK: $_ZTV8CDerived = comdat any -// CHECK: @_ZTVN3NSP5CBaseE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] -// COFF: @_ZTVN3NSP5CBaseE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8 +// CHECK: @_ZTVN3NSP5CBaseE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTV8CDerived = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] +// COFF: @_ZTVN3NSP5CBaseE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8 // COFF-NOT: !dbg // COFF-SAME: {{$}} -// COFF: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8 +// COFF: @_ZTV8CDerived = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8 // COFF-NOT: !dbg // COFF-SAME: {{$}} diff --git a/clang/test/DebugInfo/CXX/vtable-inheritance-virtual.cpp b/clang/test/DebugInfo/CXX/vtable-inheritance-virtual.cpp index b01f156b7f654..c3015f0498419 100644 --- a/clang/test/DebugInfo/CXX/vtable-inheritance-virtual.cpp +++ b/clang/test/DebugInfo/CXX/vtable-inheritance-virtual.cpp @@ -1,5 +1,3 @@ -// REQUIRES: target={{x86_64.*-linux.*}} - // Virtual inheritance case: // For CBase, CLeft, CRight 
and CDerived we check: // - Generation of their vtables (including attributes). @@ -44,17 +42,18 @@ int main() { return 0; } -// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -S -g %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - | FileCheck %s // CHECK: $_ZTVN3NSP5CBaseE = comdat any // CHECK: $_ZTVN5NSP_15CLeftE = comdat any // CHECK: $_ZTVN5NSP_26CRightE = comdat any // CHECK: $_ZTV8CDerived = comdat any -// CHECK: @_ZTVN3NSP5CBaseE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTVN5NSP_15CLeftE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[LEFT_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTVN5NSP_26CRightE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[RIGHT_VTABLE_VAR:![0-9]*]] -// CHECK: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTVN3NSP5CBaseE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTVN5NSP_15CLeftE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[LEFT_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTVN5NSP_26CRightE = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[RIGHT_VTABLE_VAR:![0-9]*]] +// CHECK: @_ZTV8CDerived = linkonce_odr {{.*}}unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]] // CHECK: [[BASE_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[BASE_VTABLE:![0-9]*]], expr: !DIExpression()) // CHECK-NEXT: [[BASE_VTABLE]] = distinct 
!DIGlobalVariable(name: "_vtable$", linkageName: "_ZTVN3NSP5CBaseE" diff --git a/clang/test/DebugInfo/CXX/vtable-template-instantiation.cpp b/clang/test/DebugInfo/CXX/vtable-template-instantiation.cpp new file mode 100644 index 0000000000000..60726d253a686 --- /dev/null +++ b/clang/test/DebugInfo/CXX/vtable-template-instantiation.cpp @@ -0,0 +1,84 @@ +// For the `CTemplate` templated class below, check the following cases: +// - Implicitly instantiated whole class by up-casting (`NOCAST` not defined) +// or implicitly instantiated member functions only (`NOCAST` defined): +// * The vtable is generated with a COMDAT specifier +// * Its '_vtable$' is generated +// - Explicit instantiation definition (`EXPLICIT` defined): +// * The vtable is generated with a COMDAT specifier +// * Its '_vtable$' is generated +// - Explicit instantiation declaration via `extern` (`EXTERN` defined): +// # when non-optimized: +// * The vtable is declared +// * Its '_vtable$' is NOT generated +// # when optimized even if no LLVM passes +// * The vtable is declared as `available_externally` (which is potentially turned into `external` by LLVM passes) +// * Its '_vtable$' is generated + +struct CBase { + virtual void f() noexcept {} +}; + +template <typename T> +struct CTemplate: CBase { + void f() noexcept override; + virtual ~CTemplate() noexcept; +}; +template <typename T> +void CTemplate<T>::f() noexcept {} +template <typename T> +CTemplate<T>::~CTemplate() noexcept {} + +#ifdef EXPLICIT +template struct CTemplate<void>; +#endif +#ifdef EXTERN +extern template struct CTemplate<void>; +#endif + +CTemplate<void> *get(CBase *) noexcept; + +int main() { + CTemplate<void> Template; +#ifdef NOCAST + get(nullptr)->f(); +#else + get(&Template)->f(); +#endif + + return 0; +} + +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - | FileCheck %s -check-prefixes IMPLICIT +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - |
FileCheck %s -check-prefixes IMPLICIT +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - -DNOCAST | FileCheck %s -check-prefixes IMPLICIT +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - -DNOCAST | FileCheck %s -check-prefixes IMPLICIT +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - -DEXPLICIT | FileCheck %s -check-prefixes EXPLICIT +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - -DEXPLICIT | FileCheck %s -check-prefixes EXPLICIT +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O0 -disable-llvm-passes %s -o - -DEXTERN | FileCheck %s -check-prefixes EXTERN,EXTERN-O0 +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -debug-info-kind=limited -dwarf-version=5 -O1 -disable-llvm-passes %s -o - -DEXTERN | FileCheck %s -check-prefixes EXTERN,EXTERN-O1 + +// IMPLICIT: $_ZTV9CTemplateIvE = comdat any +// IMPLICIT: @_ZTV9CTemplateIvE = linkonce_odr {{.*}}unnamed_addr constant {{{ \[[^]]*\] } { \[[^]]*\] \[[^]]*\] }}}, comdat, align 8, !dbg [[VTABLE_VAR:![0-9]*]] +// IMPLICIT-DAG: [[VTABLE:![0-9]+]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV9CTemplateIvE" +// IMPLICIT-DAG: !DIGlobalVariableExpression(var: [[VTABLE]], expr: !DIExpression()) +// IMPLICIT-DAG: [[TYPE:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CTemplate" +// IMPLICIT-DAG: !DIDerivedType(tag: DW_TAG_variable, name: "_vtable$", scope: [[TYPE]], file: {{.*}}, baseType: [[PVOID:![0-9]+]], flags: DIFlagPrivate | DIFlagArtificial | DIFlagStaticMember) +// IMPLICIT-DAG: [[PVOID]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) + +// EXPLICIT: $_ZTV9CTemplateIvE = comdat any +// EXPLICIT: 
@_ZTV9CTemplateIvE = weak_odr {{.*}}unnamed_addr constant {{{ \[[^]]*\] } { \[[^]]*\] \[[^]]*\] }}}, comdat, align 8, !dbg [[VTABLE_VAR:![0-9]*]] +// EXPLICIT-DAG: [[VTABLE:![0-9]+]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV9CTemplateIvE" +// EXPLICIT-DAG: [[VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[VTABLE]], expr: !DIExpression()) +// EXPLICIT-DAG: [[TYPE:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CTemplate" +// EXPLICIT-DAG: !DIDerivedType(tag: DW_TAG_variable, name: "_vtable$", scope: [[TYPE]], file: {{.*}}, baseType: [[PVOID:![0-9]+]], flags: DIFlagPrivate | DIFlagArtificial | DIFlagStaticMember) +// EXPLICIT-DAG: [[PVOID]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) + +// EXTERN-NOT: $_ZTV9CTemplateIvE +// EXTERN-O0: @_ZTV9CTemplateIvE = external {{.*}}unnamed_addr constant {{{ \[[^]]*\] }}}, align 8{{$}} +// EXTERN-O1: @_ZTV9CTemplateIvE = available_externally {{.*}}unnamed_addr constant {{{ \[[^]]*\] } { \[[^]]*\] \[[^]]*\] }}}, align 8, !dbg [[VTABLE_VAR:![0-9]*]] +// EXTERN-O0-NOT: linkageName: "_ZTV9CTemplateIvE" +// EXTERN-O1-DAG: [[VTABLE:![0-9]+]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTV9CTemplateIvE" +// EXTERN-O1-DAG: [[VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[VTABLE]], expr: !DIExpression()) +// EXTERN-O1-DAG: [[TYPE:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "CTemplate" +// EXTERN-O1-DAG: !DIDerivedType(tag: DW_TAG_variable, name: "_vtable$", scope: [[TYPE]], file: {{.*}}, baseType: [[PVOID:![0-9]+]], flags: DIFlagPrivate | DIFlagArtificial | DIFlagStaticMember) +// EXTERN-O1-DAG: [[PVOID]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) diff --git a/clang/test/DebugInfo/Generic/cc.c b/clang/test/DebugInfo/Generic/cc.c index 2bfb1c28e9353..e430e4c8ed87b 100644 --- a/clang/test/DebugInfo/Generic/cc.c +++ b/clang/test/DebugInfo/Generic/cc.c @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -triple 
x86_64-pc-linux-gnu -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=LINUX -// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=WINDOWS +// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=WINDOWS +// RUN: %clang_cc1 -triple x86_64-unknown-windows-gnu -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=WINDOWS +// RUN: %clang_cc1 -triple x86_64-unknown-windows-cygnus -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=WINDOWS // RUN: %clang_cc1 -triple i386-pc-linux-gnu -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=LINUX32 // RUN: %clang_cc1 -triple armv7--linux-gnueabihf -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=ARM @@ -77,7 +79,7 @@ __attribute__((intel_ocl_bicc)) int add_inteloclbicc(int a, int b) { } #endif -#ifdef _WIN64 +#if defined(_WIN64) || defined(__CYGWIN__) // WINDOWS: !DISubprogram({{.*}}"add_sysvabi", {{.*}}type: ![[FTY:[0-9]+]] // WINDOWS: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_X86_64SysV, __attribute__((sysv_abi)) int add_sysvabi(int a, int b) { diff --git a/clang/test/DebugInfo/Generic/unsigned-promotion-debuginfo.c b/clang/test/DebugInfo/Generic/unsigned-promotion-debuginfo.c index 88e691d65334c..6ca17e1f9f285 100644 --- a/clang/test/DebugInfo/Generic/unsigned-promotion-debuginfo.c +++ b/clang/test/DebugInfo/Generic/unsigned-promotion-debuginfo.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -O2 -triple x86_64-linux-gnu -emit-llvm -o - %s \ // RUN: -fdebug-prefix-map=%S/= -fno-ident -fdebug-compilation-dir=%S -debug-info-kind=limited \ // RUN: 
-fsanitize-annotate-debug-info=signed-integer-overflow \ @@ -14,9 +14,9 @@ unsigned short si, sj, sk; // CHECKS-LABEL: define dso_local void @testshortmul( // CHECKS-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG13:![0-9]+]] { // CHECKS-NEXT: [[ENTRY:.*:]] -// CHECKS-NEXT: [[TMP0:%.*]] = load i16, ptr @sj, align 2, !dbg [[DBG16:![0-9]+]], !tbaa [[TBAA17:![0-9]+]] +// CHECKS-NEXT: [[TMP0:%.*]] = load i16, ptr @sj, align 2, !dbg [[DBG16:![0-9]+]], !tbaa [[SHORT_TBAA17:![0-9]+]] // CHECKS-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32, !dbg [[DBG16]] -// CHECKS-NEXT: [[TMP1:%.*]] = load i16, ptr @sk, align 2, !dbg [[DBG21:![0-9]+]], !tbaa [[TBAA17]] +// CHECKS-NEXT: [[TMP1:%.*]] = load i16, ptr @sk, align 2, !dbg [[DBG21:![0-9]+]], !tbaa [[SHORT_TBAA17]] // CHECKS-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32, !dbg [[DBG21]] // CHECKS-NEXT: [[TMP2:%.*]] = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[CONV]], i32 [[CONV1]]), !dbg [[DBG22:![0-9]+]], !nosanitize [[META26:![0-9]+]] // CHECKS-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !dbg [[DBG22]], !nosanitize [[META26]] @@ -29,16 +29,16 @@ unsigned short si, sj, sk; // CHECKS: [[CONT]]: // CHECKS-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !dbg [[DBG22]], !nosanitize [[META26]] // CHECKS-NEXT: [[CONV2:%.*]] = trunc i32 [[TMP6]] to i16, !dbg [[DBG16]] -// CHECKS-NEXT: store i16 [[CONV2]], ptr @si, align 2, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA17]] +// CHECKS-NEXT: store i16 [[CONV2]], ptr @si, align 2, !dbg [[DBG28:![0-9]+]], !tbaa [[SHORT_TBAA17]] // CHECKS-NEXT: ret void, !dbg [[DBG29:![0-9]+]] // // CHECKU-LABEL: define dso_local void @testshortmul( // CHECKU-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG13:![0-9]+]] { // CHECKU-NEXT: [[ENTRY:.*:]] -// CHECKU-NEXT: [[TMP0:%.*]] = load i16, ptr @sj, align 2, !dbg [[DBG16:![0-9]+]], !tbaa [[TBAA17:![0-9]+]] -// CHECKU-NEXT: [[TMP1:%.*]] = load i16, ptr @sk, align 2, !dbg [[DBG21:![0-9]+]], !tbaa [[TBAA17]] +// 
CHECKU-NEXT: [[TMP0:%.*]] = load i16, ptr @sj, align 2, !dbg [[DBG16:![0-9]+]], !tbaa [[SHORT_TBAA17:![0-9]+]] +// CHECKU-NEXT: [[TMP1:%.*]] = load i16, ptr @sk, align 2, !dbg [[DBG21:![0-9]+]], !tbaa [[SHORT_TBAA17]] // CHECKU-NEXT: [[MUL:%.*]] = mul i16 [[TMP1]], [[TMP0]], !dbg [[DBG22:![0-9]+]] -// CHECKU-NEXT: store i16 [[MUL]], ptr @si, align 2, !dbg [[DBG23:![0-9]+]], !tbaa [[TBAA17]] +// CHECKU-NEXT: store i16 [[MUL]], ptr @si, align 2, !dbg [[DBG23:![0-9]+]], !tbaa [[SHORT_TBAA17]] // CHECKU-NEXT: ret void, !dbg [[DBG24:![0-9]+]] // void testshortmul(void) { @@ -50,7 +50,7 @@ void testshortmul(void) { // CHECKS: [[META0:![0-9]+]] = !DIGlobalVariableExpression(var: [[META1:![0-9]+]], expr: !DIExpression()) // CHECKS: [[META1]] = distinct !DIGlobalVariable(name: "sj", scope: [[META2:![0-9]+]], file: [[META7:![0-9]+]], line: 12, type: [[META8:![0-9]+]], isLocal: false, isDefinition: true) // CHECKS: [[META2]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META3:![0-9]+]], isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: [[META4:![0-9]+]], splitDebugInlining: false, nameTableKind: None) -// CHECKS: [[META3]] = !DIFile(filename: "", directory: {{.*}}) +// CHECKS: [[META3]] = !DIFile(filename: "{{.*}}", directory: {{.*}}) // CHECKS: [[META4]] = !{[[META5:![0-9]+]], [[META0]], [[META9:![0-9]+]]} // CHECKS: [[META5]] = !DIGlobalVariableExpression(var: [[META6:![0-9]+]], expr: !DIExpression()) // CHECKS: [[META6]] = distinct !DIGlobalVariable(name: "si", scope: [[META2]], file: [[META7]], line: 12, type: [[META8]], isLocal: false, isDefinition: true) @@ -62,7 +62,7 @@ void testshortmul(void) { // CHECKS: [[META14]] = !DISubroutineType(types: [[META15:![0-9]+]]) // CHECKS: [[META15]] = !{null} // CHECKS: [[DBG16]] = !DILocation(line: 47, column: 8, scope: [[DBG13]]) -// CHECKS: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECKS: [[SHORT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} // CHECKS: [[META18]] = 
!{!"short", [[META19:![0-9]+]], i64 0} // CHECKS: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0} // CHECKS: [[META20]] = !{!"Simple C/C++ TBAA"} @@ -79,7 +79,7 @@ void testshortmul(void) { // CHECKU: [[META0:![0-9]+]] = !DIGlobalVariableExpression(var: [[META1:![0-9]+]], expr: !DIExpression()) // CHECKU: [[META1]] = distinct !DIGlobalVariable(name: "sj", scope: [[META2:![0-9]+]], file: [[META7:![0-9]+]], line: 12, type: [[META8:![0-9]+]], isLocal: false, isDefinition: true) // CHECKU: [[META2]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META3:![0-9]+]], isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: [[META4:![0-9]+]], splitDebugInlining: false, nameTableKind: None) -// CHECKU: [[META3]] = !DIFile(filename: "", directory: {{.*}}) +// CHECKU: [[META3]] = !DIFile(filename: "{{.*}}", directory: {{.*}}) // CHECKU: [[META4]] = !{[[META5:![0-9]+]], [[META0]], [[META9:![0-9]+]]} // CHECKU: [[META5]] = !DIGlobalVariableExpression(var: [[META6:![0-9]+]], expr: !DIExpression()) // CHECKU: [[META6]] = distinct !DIGlobalVariable(name: "si", scope: [[META2]], file: [[META7]], line: 12, type: [[META8]], isLocal: false, isDefinition: true) @@ -91,7 +91,7 @@ void testshortmul(void) { // CHECKU: [[META14]] = !DISubroutineType(types: [[META15:![0-9]+]]) // CHECKU: [[META15]] = !{null} // CHECKU: [[DBG16]] = !DILocation(line: 47, column: 8, scope: [[DBG13]]) -// CHECKU: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECKU: [[SHORT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} // CHECKU: [[META18]] = !{!"short", [[META19:![0-9]+]], i64 0} // CHECKU: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0} // CHECKU: [[META20]] = !{!"Simple C/C++ TBAA"} diff --git a/clang/test/Driver/DTLTO/dtlto.c b/clang/test/Driver/DTLTO/dtlto.c index 96795d9a4e6a4..66299c890e4b3 100644 --- a/clang/test/Driver/DTLTO/dtlto.c +++ b/clang/test/Driver/DTLTO/dtlto.c @@ -1,16 +1,21 @@ // REQUIRES: lld +/// 
https://github.com/llvm/llvm-project/issues/159125. +// XFAIL: llvm-driver + /// Check DTLTO options are forwarded to the linker. /// Check that options are forwarded as expected with --thinlto-distributor=. +// RUN: %python %S/filename.py %clang > %t_forward.log // RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ // RUN: -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 \ -// RUN: -fthinlto-distributor=d.exe -Werror 2>&1 | \ -// RUN: FileCheck %s --check-prefix=FORWARD +// RUN: -fthinlto-distributor=d.exe -Werror >>%t_forward.log 2>&1 +// RUN: FileCheck %s --input-file=%t_forward.log --check-prefix=FORWARD +// FORWARD: filename.py:[[CLANG:.*]] // FORWARD: ld.lld // FORWARD-SAME: "--thinlto-distributor=d.exe" -// FORWARD-SAME: "--thinlto-remote-compiler={{[^"]+}}" +// FORWARD-SAME: "--thinlto-remote-compiler={{.*}}[[CLANG]]" // FORWARD-SAME: "--thinlto-distributor-arg=a1" // FORWARD-SAME: "--thinlto-distributor-arg=a2" // FORWARD-SAME: "--thinlto-distributor-arg=a3" @@ -19,8 +24,8 @@ /// that a warning is issued for unused -Xthinlto-distributor options. // RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ // RUN: -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 2>&1 | \ -// RUN: FileCheck %s --check-prefix=NODIST --implicit-check-not=distributor \ -// RUN: --implicit-check-not=remote-compiler +// RUN: FileCheck %s --check-prefix=NODIST --implicit-check-not=distributor \ +// RUN: --implicit-check-not=remote-compiler // NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a1' // NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a2,a3' @@ -28,21 +33,23 @@ /// Check the expected arguments are forwarded by default with only /// --thinlto-distributor=. 
+// RUN: %python %S/filename.py %clang > %t_default.log // RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ -// RUN: -fthinlto-distributor=d.exe -Werror 2>&1 | \ -// RUN: FileCheck %s --check-prefix=DEFAULT --implicit-check-not=distributor \ -// RUN: --implicit-check-not=remote-compiler +// RUN: -fthinlto-distributor=d.exe -Werror >>%t_default.log 2>&1 +// RUN: FileCheck %s --input-file=%t_default.log --check-prefix=DEFAULT \ +// RUN: --implicit-check-not=distributor --implicit-check-not=remote-compiler // DEFAULT: ld.lld // DEFAULT-SAME: "--thinlto-distributor=d.exe" -// DEFAULT-SAME: "--thinlto-remote-compiler={{.*}}clang{{[^\"]*}}" +// DEFAULT-SAME: "--thinlto-remote-compiler={{[^"]+}}" /// Check that nothing is forwarded when the compiler is not in LTO mode, and that /// appropriate unused option warnings are issued. +// RUN: %python %S/filename.py %clang > %t_noflto.log // RUN: %clang %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ -// RUN: -fthinlto-distributor=d.exe 2>&1 | \ -// RUN: FileCheck %s --check-prefix=NOFLTO --implicit-check-not=distributor \ -// RUN: --implicit-check-not=remote-compiler +// RUN: -fthinlto-distributor=d.exe >>%t_noflto.log 2>&1 +// RUN: FileCheck %s --input-file=%t_noflto.log --check-prefix=NOFLTO \ +// RUN: --implicit-check-not=distributor --implicit-check-not=remote-compiler // NOFLTO: warning: argument unused during compilation: '-fthinlto-distributor=d.exe' // NOFLTO: ld.lld diff --git a/clang/test/Driver/DTLTO/filename.py b/clang/test/Driver/DTLTO/filename.py new file mode 100644 index 0000000000000..df1aeb6682543 --- /dev/null +++ b/clang/test/Driver/DTLTO/filename.py @@ -0,0 +1,4 @@ +from pathlib import Path +import sys + +print(f"filename.py:{Path(sys.argv[1]).resolve().name}") diff --git a/clang/test/Driver/DTLTO/ps5-dtlto.c b/clang/test/Driver/DTLTO/ps5-dtlto.c new file mode 100644 index 0000000000000..b52765db5b1c7 --- /dev/null +++ b/clang/test/Driver/DTLTO/ps5-dtlto.c @@ -0,0 +1,53 @@ +// 
REQUIRES: lld + +/// https://github.com/llvm/llvm-project/issues/159125. +// XFAIL: llvm-driver + +/// Check DTLTO options are forwarded to the linker. + +/// Check that options are forwarded as expected with --thinlto-distributor=. +// RUN: %python %S/filename.py %clang > %t_forward.log +// RUN: %clang -flto=thin %s -### --target=x86_64-sie-ps5 \ +// RUN: -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 \ +// RUN: -fthinlto-distributor=d.exe -Werror >>%t_forward.log 2>&1 +// RUN: FileCheck %s --input-file=%t_forward.log --check-prefix=FORWARD + +// FORWARD: filename.py:[[CLANG:.*]] +// FORWARD: prospero-lld +// FORWARD-SAME: "--thinlto-distributor=d.exe" +// FORWARD-SAME: "--thinlto-remote-compiler={{.*}}[[CLANG]]" +// FORWARD-SAME: "--thinlto-distributor-arg=a1" +// FORWARD-SAME: "--thinlto-distributor-arg=a2" +// FORWARD-SAME: "--thinlto-distributor-arg=a3" + +/// Check that options are not added without --thinlto-distributor= and +/// that a warning is issued for unused -Xthinlto-distributor options. +// RUN: %clang -flto=thin %s -### --target=x86_64-sie-ps5 \ +// RUN: -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 2>&1 | \ +// RUN: FileCheck %s --check-prefix=NODIST --implicit-check-not=distributor \ +// RUN: --implicit-check-not=remote-compiler + +// NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a1' +// NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a2,a3' +// NODIST: prospero-lld + +/// Check the expected arguments are forwarded by default with only +/// --thinlto-distributor=. 
+// RUN: %python %S/filename.py %clang > %t_default.log +// RUN: %clang -flto=thin %s -### --target=x86_64-sie-ps5 \ +// RUN: -fthinlto-distributor=d.exe -Werror >>%t_default.log 2>&1 +// RUN: FileCheck %s --input-file=%t_default.log --check-prefix=DEFAULT \ +// RUN: --implicit-check-not=distributor --implicit-check-not=remote-compiler + +// DEFAULT: filename.py:[[CLANG:.*]] +// DEFAULT: prospero-lld +// DEFAULT-SAME: "--thinlto-distributor=d.exe" +// DEFAULT-SAME: "--thinlto-remote-compiler={{.*}}[[CLANG]]" + +/// Check that the arguments are forwarded unconditionally even when the +/// compiler is not in LTO mode. +// RUN: %python %S/filename.py %clang > %t_noflto.log +// RUN: %clang %s -### --target=x86_64-sie-ps5 \ +// RUN: -fthinlto-distributor=d.exe -Werror >>%t_noflto.log 2>&1 +// RUN: FileCheck %s --input-file=%t_noflto.log --check-prefix=DEFAULT \ +// RUN: --implicit-check-not=distributor --implicit-check-not=remote-compiler diff --git a/clang/test/Driver/aarch64-features.c b/clang/test/Driver/aarch64-features.c index 11c7343544345..2fbb8629997d9 100644 --- a/clang/test/Driver/aarch64-features.c +++ b/clang/test/Driver/aarch64-features.c @@ -44,6 +44,9 @@ // RUN: %clang --target=aarch64-windows-gnu -rtlib=compiler-rt \ // RUN: -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-OUTLINE-ATOMICS-OFF %s +// RUN: %clang --target=aarch64-unknown-freebsd -rtlib=compiler-rt \ +// RUN: -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-OUTLINE-ATOMICS-ON %s + // RUN: %clang --target=aarch64-unknown-openbsd -rtlib=compiler-rt \ // RUN: -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-OUTLINE-ATOMICS-ON %s diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c index 1d2993f4c60c4..5bb963a90fcbd 100644 --- a/clang/test/Driver/aarch64-ptrauth.c +++ b/clang/test/Driver/aarch64-ptrauth.c @@ -23,18 +23,23 @@ // RUN: %clang -### -c --target=aarch64-linux-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1 // PAUTHABI1: "-cc1"{{.*}} "-triple" 
"aarch64-unknown-linux-pauthtest" // PAUTHABI1-SAME: "-target-abi" "pauthtest" -// PAUTHABI1-SAME: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini" +// PAUTHABI1-SAME: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini" "-fptrauth-init-fini-address-discrimination" "-faarch64-jump-table-hardening" // RUN: %clang -### -c --target=aarch64 -mabi=pauthtest -fno-ptrauth-intrinsics \ // RUN: -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \ // RUN: -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \ -// RUN: -fno-ptrauth-indirect-gotos -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 +// RUN: -fno-ptrauth-type-info-vtable-pointer-discrimination -fno-ptrauth-indirect-gotos \ +// RUN: -fno-ptrauth-init-fini -fno-ptrauth-init-fini-address-discrimination \ +// RUN: -fno-aarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 // RUN: %clang -### -c --target=aarch64-pauthtest -fno-ptrauth-intrinsics \ // RUN: -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \ // RUN: -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \ -// RUN: -fno-ptrauth-indirect-gotos -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 +// RUN: -fno-ptrauth-type-info-vtable-pointer-discrimination -fno-ptrauth-indirect-gotos \ +// RUN: -fno-ptrauth-init-fini -fno-ptrauth-init-fini-address-discrimination \ +// RUN: -fno-aarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 // PAUTHABI2: "-cc1" // 
PAUTHABI2-NOT: "-fptrauth- +// PAUTHABI2-NOT: "-faarch64-jump-table-hardening" // RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \ // RUN: -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination \ diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c index 4945a622969c6..ccd2876ea84cb 100644 --- a/clang/test/Driver/aarch64-toolchain-extra.c +++ b/clang/test/Driver/aarch64-toolchain-extra.c @@ -2,8 +2,7 @@ // The tests here are similar to those in aarch64-toolchain.c, however // these tests need to create symlinks to test directory trees in order to -// set up the environment and therefore shell support is required. -// REQUIRES: shell +// set up the environment and therefore POSIX is required. // UNSUPPORTED: system-windows // If there is no GCC install detected then the driver searches for executables diff --git a/clang/test/Driver/aix-gsplit-dwarf.c b/clang/test/Driver/aix-gsplit-dwarf.c new file mode 100644 index 0000000000000..22f2a66961f36 --- /dev/null +++ b/clang/test/Driver/aix-gsplit-dwarf.c @@ -0,0 +1,12 @@ +// Verify error message is emitted for `-gsplit-dwarf` on AIX +// as it's unsupported at the moment. 
+ +// RUN: not %clang -target powerpc-ibm-aix -gdwarf-4 -gsplit-dwarf %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=UNSUP_OPT_AIX +// RUN: not %clang -target powerpc64-ibm-aix -gdwarf-4 -gsplit-dwarf %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=UNSUP_OPT_AIX64 + +// UNSUP_OPT_AIX: error: unsupported option '-gsplit-dwarf' for target 'powerpc-ibm-aix' +// UNSUP_OPT_AIX64: error: unsupported option '-gsplit-dwarf' for target 'powerpc64-ibm-aix' + +int main(){return 0;} diff --git a/clang/test/Driver/amdgpu-hip-system-arch.c b/clang/test/Driver/amdgpu-hip-system-arch.c index 12e298a8636b1..972105143debf 100644 --- a/clang/test/Driver/amdgpu-hip-system-arch.c +++ b/clang/test/Driver/amdgpu-hip-system-arch.c @@ -1,4 +1,5 @@ -// REQUIRES: shell +// Needs chmod +// UNSUPPORTED: system-windows // XFAIL: target={{.*}}-zos{{.*}} // RUN: mkdir -p %t diff --git a/clang/test/Driver/amdgpu-openmp-system-arch-fail.c b/clang/test/Driver/amdgpu-openmp-system-arch-fail.c index eb037183b4c3c..8973d66afbae4 100644 --- a/clang/test/Driver/amdgpu-openmp-system-arch-fail.c +++ b/clang/test/Driver/amdgpu-openmp-system-arch-fail.c @@ -1,4 +1,5 @@ -// REQUIRES: shell +// Due to chmod +// UNSUPPORTED: system-windows // RUN: mkdir -p %t // RUN: rm -f %t/amdgpu_arch_fail %t/amdgpu_arch_different diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c index 43cca6112176c..03b30d6540898 100644 --- a/clang/test/Driver/arm-toolchain-extra.c +++ b/clang/test/Driver/arm-toolchain-extra.c @@ -3,7 +3,6 @@ // The tests here are similar to those in arm-toolchain.c, however // these tests need to create symlinks to test directory trees in order to // set up the environment and therefore shell support is required. 
-// REQUIRES: shell // UNSUPPORTED: system-windows // If there is no GCC install detected then the driver searches for executables diff --git a/clang/test/Driver/baremetal-multilib-layered.yaml b/clang/test/Driver/baremetal-multilib-layered.yaml index 6671d9d672f58..61d69140fb7fd 100644 --- a/clang/test/Driver/baremetal-multilib-layered.yaml +++ b/clang/test/Driver/baremetal-multilib-layered.yaml @@ -1,4 +1,3 @@ -# REQUIRES: shell # UNSUPPORTED: system-windows # This test demonstrates "layered" multilib in which more than one diff --git a/clang/test/Driver/baremetal-multilib.yaml b/clang/test/Driver/baremetal-multilib.yaml index 1a80c3b4ccfc8..c2b37fad97dea 100644 --- a/clang/test/Driver/baremetal-multilib.yaml +++ b/clang/test/Driver/baremetal-multilib.yaml @@ -1,4 +1,3 @@ -# REQUIRES: shell # UNSUPPORTED: system-windows # RUN: %clang --multi-lib-config=%s -no-canonical-prefixes -x c++ %s -### -o %t.out 2>&1 \ diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp index 4c062e28e6bc3..717466c185763 100644 --- a/clang/test/Driver/baremetal-sysroot.cpp +++ b/clang/test/Driver/baremetal-sysroot.cpp @@ -1,4 +1,3 @@ -// REQUIRES: shell // UNSUPPORTED: system-windows // Test that when a --sysroot is not provided, driver picks the default diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index ee7ded265769b..bdeb747aa66a3 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -52,6 +52,15 @@ // CHECK-INTERCHANGE-LOOPS: "-floop-interchange" // CHECK-NO-INTERCHANGE-LOOPS: "-fno-loop-interchange" +// RUN: %clang -### -S -fexperimental-loop-fusion %s -o /dev/null 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s +// CHECK-FUSE-LOOPS: "-fexperimental-loop-fusion" +// +// RUN: %clang -c -fexperimental-loop-fusion -mllvm -print-pipeline-passes -O3 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=LOOP-FUSION-ON %s +// RUN: %clang -c -mllvm -print-pipeline-passes -O3 %s -o 
/dev/null 2>&1 | FileCheck --check-prefixes=LOOP-FUSION-OFF %s + +// LOOP-FUSION-ON: loop-fusion +// LOOP-FUSION-OFF-NOT: loop-fusion + // RUN: %clang -### -S -fprofile-sample-accurate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-SAMPLE-ACCURATE %s // CHECK-PROFILE-SAMPLE-ACCURATE: "-fprofile-sample-accurate" @@ -214,11 +223,11 @@ // RUN: %clang -S -O20 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-O %s // CHECK-INVALID-O: warning: optimization level '-O20' is not supported; using '-O3' instead -// RUN: not %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-CHARSET %s -// CHECK-INVALID-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1' +// RUN: not %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s +// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1' -// RUN: not %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s -// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1' +// RUN: not %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-EXEC-CHARSET %s +// CHECK-INVALID-EXEC-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1' // Test that we don't error on these. // RUN: not %clang -### -S -Werror \ diff --git a/clang/test/Driver/darwin-ld-demangle-lld.c b/clang/test/Driver/darwin-ld-demangle-lld.c index 12fd8502ce7a0..4edeb1058b933 100644 --- a/clang/test/Driver/darwin-ld-demangle-lld.c +++ b/clang/test/Driver/darwin-ld-demangle-lld.c @@ -1,5 +1,4 @@ // With -fuse-ld=lld, -demangle is always passed to the linker on Darwin. 
-// REQUIRES: shell // RUN: %clang --target=x86_64-apple-darwin -### -fuse-ld=lld \ // RUN: -B%S/Inputs/lld -mlinker-version=0 %s 2>&1 \ diff --git a/clang/test/Driver/darwin-ld-lto-lld.c b/clang/test/Driver/darwin-ld-lto-lld.c index 2f44cad534b1f..3e110463fc286 100644 --- a/clang/test/Driver/darwin-ld-lto-lld.c +++ b/clang/test/Driver/darwin-ld-lto-lld.c @@ -1,5 +1,3 @@ -// REQUIRES: shell - // Check that lld gets "-lto_library". // (Separate test file since darwin-ld-lto requires system-darwin but this // test doesn't require that.) diff --git a/clang/test/Driver/mingw-sysroot.cpp b/clang/test/Driver/mingw-sysroot.cpp index 0ba2f336fd2e0..8e46d23c1782d 100644 --- a/clang/test/Driver/mingw-sysroot.cpp +++ b/clang/test/Driver/mingw-sysroot.cpp @@ -1,4 +1,3 @@ -// REQUIRES: shell // UNSUPPORTED: system-windows // RUN: rm -rf %t.dir/testroot-gcc diff --git a/clang/test/Driver/no-canonical-prefixes.c b/clang/test/Driver/no-canonical-prefixes.c index 669e56639284a..f2b76db8de7e1 100644 --- a/clang/test/Driver/no-canonical-prefixes.c +++ b/clang/test/Driver/no-canonical-prefixes.c @@ -1,5 +1,5 @@ // Due to ln -sf: -// REQUIRES: shell +// UNSUPPORTED: system-windows // RUN: mkdir -p %t.real // RUN: cd %t.real // RUN: ln -sf %clang test-clang diff --git a/clang/test/Driver/nvptx-cuda-system-arch.c b/clang/test/Driver/nvptx-cuda-system-arch.c index 2d4eca8c43bc3..675d15bf22cc0 100644 --- a/clang/test/Driver/nvptx-cuda-system-arch.c +++ b/clang/test/Driver/nvptx-cuda-system-arch.c @@ -1,4 +1,4 @@ -// REQUIRES: shell +// UNSUPPORTED: system-windows // XFAIL: target={{.*}}-zos{{.*}} // RUN: mkdir -p %t diff --git a/clang/test/Driver/openmp-system-arch.c b/clang/test/Driver/openmp-system-arch.c index 167b07a23f512..1670fd30f4b59 100644 --- a/clang/test/Driver/openmp-system-arch.c +++ b/clang/test/Driver/openmp-system-arch.c @@ -1,4 +1,5 @@ -// REQUIRES: shell +// Needs chmod +// UNSUPPORTED: system-windows // XFAIL: target={{.*}}-zos{{.*}} // RUN: mkdir -p %t diff --git 
a/clang/test/Driver/parse-progname.c b/clang/test/Driver/parse-progname.c index 34040b81dc733..104ea971684aa 100644 --- a/clang/test/Driver/parse-progname.c +++ b/clang/test/Driver/parse-progname.c @@ -1,4 +1,5 @@ -// REQUIRES: shell, arm-registered-target +// REQUIRES: arm-registered-target +// UNSUPPORTED: system-windows // UNSUPPORTED: llvm-driver // RUN: mkdir -p %t diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 413275dba8438..f619d32254d15 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -212,6 +212,7 @@ // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions // CHECK-NEXT: p 0.15 'P' ('Base P' (Packed SIMD)) +// CHECK-NEXT: zibi 0.1 'Zibi' (Branch with Immediate) // CHECK-NEXT: zicfilp 1.0 'Zicfilp' (Landing pad) // CHECK-NEXT: zicfiss 1.0 'Zicfiss' (Shadow stack) // CHECK-NEXT: zalasr 0.1 'Zalasr' (Load-Acquire and Store-Release Instructions) diff --git a/clang/test/Driver/riscv32-toolchain-extra.c b/clang/test/Driver/riscv32-toolchain-extra.c index 420f7b5203609..d228a58d23d4e 100644 --- a/clang/test/Driver/riscv32-toolchain-extra.c +++ b/clang/test/Driver/riscv32-toolchain-extra.c @@ -2,8 +2,7 @@ // The tests here are similar to those in riscv32-toolchain.c, however // these tests need to create symlinks to test directory trees in order to -// set up the environment and therefore shell support is required. -// REQUIRES: shell +// set up the environment and therefore POSIX support is required. 
// UNSUPPORTED: system-windows // If there is no GCC install detected then the driver searches for executables diff --git a/clang/test/Driver/riscv64-toolchain-extra.c b/clang/test/Driver/riscv64-toolchain-extra.c index 87bcdeb651a1e..e1d3a10dd788d 100644 --- a/clang/test/Driver/riscv64-toolchain-extra.c +++ b/clang/test/Driver/riscv64-toolchain-extra.c @@ -3,7 +3,6 @@ // The tests here are similar to those in riscv64-toolchain.c, however // these tests need to create symlinks to test directory trees in order to // set up the environment and therefore shell support is required. -// REQUIRES: shell // UNSUPPORTED: system-windows // If there is no GCC install detected then the driver searches for executables diff --git a/clang/test/Driver/sigpipe-handling.c b/clang/test/Driver/sigpipe-handling.c index 852f0bfaf7981..7283800ba3c2f 100644 --- a/clang/test/Driver/sigpipe-handling.c +++ b/clang/test/Driver/sigpipe-handling.c @@ -1,4 +1,4 @@ -// REQUIRES: shell +// UNSUPPORTED: system-windows // RUN: %clang -E -fno-integrated-cc1 %s | head | FileCheck %s // Test that the parent clang driver process doesn't crash when the child cc1 diff --git a/clang/test/Driver/target-override.c b/clang/test/Driver/target-override.c index 2c605ac9a03da..5bd88e10e8576 100644 --- a/clang/test/Driver/target-override.c +++ b/clang/test/Driver/target-override.c @@ -1,5 +1,4 @@ -// REQUIRES: shell -// REQUIRES: x86-registered-target +// REQUIRES: x86-registered-target, symlinks // RUN: rm -rf %t && mkdir %t // RUN: ln -s %clang %t/i386-clang diff --git a/clang/test/Driver/verbose-output-quoting.c b/clang/test/Driver/verbose-output-quoting.c index b2781b754ecd7..ebfe3d08fb234 100644 --- a/clang/test/Driver/verbose-output-quoting.c +++ b/clang/test/Driver/verbose-output-quoting.c @@ -1,4 +1,4 @@ -// REQUIRES: shell +// UNSUPPORTED: system-windows // RUN: %clang --verbose -DSPACE="a b" -### %s 2>&1 | FileCheck -check-prefix=SPACE -strict-whitespace %s // RUN: %clang --verbose -DQUOTES=\"\" -### %s 
2>&1 | FileCheck -check-prefix=QUOTES -strict-whitespace %s // RUN: %clang --verbose -DBACKSLASH=\\ -### %s 2>&1 | FileCheck -check-prefix=BACKSLASH -strict-whitespace %s diff --git a/clang/test/FixIt/fixit-objc-arc.m b/clang/test/FixIt/fixit-objc-arc.m index 763e8b7d29c86..36d3b941b45d9 100644 --- a/clang/test/FixIt/fixit-objc-arc.m +++ b/clang/test/FixIt/fixit-objc-arc.m @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -pedantic -verify %s +// RUN: %clang_cc1 -pedantic -verify %s -Wno-error=incompatible-pointer-types // RUN: cp %s %t -// RUN: not %clang_cc1 -pedantic -fobjc-arc -fixit -x objective-c %t -// RUN: %clang_cc1 -pedantic -fobjc-arc -Werror -x objective-c %t +// RUN: not %clang_cc1 -pedantic -fobjc-arc -fixit -x objective-c %t -Wno-error=incompatible-pointer-types +// RUN: %clang_cc1 -pedantic -fobjc-arc -Werror -x objective-c %t -Wno-error=incompatible-pointer-types @class A; @class NSString; diff --git a/clang/test/Frontend/cfi-unchecked-callee-attribute.cpp b/clang/test/Frontend/cfi-unchecked-callee-attribute.cpp index f2c4e9e2f8890..072f217ff7b19 100644 --- a/clang/test/Frontend/cfi-unchecked-callee-attribute.cpp +++ b/clang/test/Frontend/cfi-unchecked-callee-attribute.cpp @@ -233,3 +233,7 @@ void lambdas() { checked_func = checked_lambda; }; } + +CFI_UNCHECKED_CALLEE +void func(void); +void func(void) {} // No warning expected. 
diff --git a/clang/test/Frontend/dependency-gen-symlink.c b/clang/test/Frontend/dependency-gen-symlink.c index 2fa339ad2abf2..b88fb7f1a6b2f 100644 --- a/clang/test/Frontend/dependency-gen-symlink.c +++ b/clang/test/Frontend/dependency-gen-symlink.c @@ -1,4 +1,4 @@ -// REQUIRES: shell +// REQUIRES: symlinks // Basic test // RUN: rm -rf %t.dir diff --git a/clang/test/Frontend/fixed_point_unknown_conversions.c b/clang/test/Frontend/fixed_point_unknown_conversions.c index b80c571307ccf..3d3e684da41dc 100644 --- a/clang/test/Frontend/fixed_point_unknown_conversions.c +++ b/clang/test/Frontend/fixed_point_unknown_conversions.c @@ -26,11 +26,11 @@ void func(void) { accum = ic; // expected-error{{conversion between fixed point and '_Complex int' is not yet supported}} accum = s; // expected-error{{assigning to '_Accum' from incompatible type 'struct S'}} accum = ptr; // expected-error{{assigning to '_Accum' from incompatible type 'int *'}} - accum_ptr = ptr; // expected-warning{{incompatible pointer types assigning to '_Accum *' from 'int *'}} + accum_ptr = ptr; // expected-error{{incompatible pointer types assigning to '_Accum *' from 'int *'}} dc = accum; // expected-error{{conversion between fixed point and '_Complex double' is not yet supported}} ic = accum; // expected-error{{conversion between fixed point and '_Complex int' is not yet supported}} s = accum; // expected-error{{assigning to 'struct S' from incompatible type '_Accum'}} ptr = accum; // expected-error{{assigning to 'int *' from incompatible type '_Accum'}} - ptr = accum_ptr; // expected-warning{{incompatible pointer types assigning to 'int *' from '_Accum *'}} + ptr = accum_ptr; // expected-error{{incompatible pointer types assigning to 'int *' from '_Accum *'}} } diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 15bdb7589bf45..b88aa3cc18207 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -1,4 +1,4 @@ 
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target // REQUIRES: spirv-registered-target @@ -47,41 +47,43 @@ #define BOOL_TYPE int typedef unsigned long long uint64_t; -// CHECK-LABEL: @test___make_mantissa_base8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P:%.*]], align 1, !tbaa [[TBAA4:![0-9]+]] +// CHECK-LABEL: define dso_local i64 @test___make_mantissa_base8( +// CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4:![0-9]+]] // CHECK-NEXT: [[CMP_NOT_I1:%.*]] = icmp eq i8 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I1]], label [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// CHECK: while.body.i: -// CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[TMP3:%.*]], [[IF_THEN_I:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] -// CHECK-NEXT: [[__R_0_I3:%.*]] = phi i64 [ [[SUB_I:%.*]], [[IF_THEN_I]] ], [ 0, [[ENTRY]] ] -// CHECK-NEXT: [[__TAGP_ADDR_0_I2:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[IF_THEN_I]] ], [ [[P]], [[ENTRY]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I1]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// CHECK: [[WHILE_BODY_I]]: +// CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[TMP3:%.*]], %[[IF_THEN_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] +// CHECK-NEXT: [[__R_0_I3:%.*]] = phi i64 [ [[SUB_I:%.*]], %[[IF_THEN_I]] ], [ 0, %[[ENTRY]] ] +// CHECK-NEXT: [[__TAGP_ADDR_0_I2:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[IF_THEN_I]] ], [ [[P]], %[[ENTRY]] ] // CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], -8 // CHECK-NEXT: [[OR_COND_I:%.*]] = icmp eq i8 [[TMP2]], 48 -// CHECK-NEXT: br i1 [[OR_COND_I]], label [[IF_THEN_I]], label [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]] -// CHECK: if.then.i: +// CHECK-NEXT: br i1 
[[OR_COND_I]], label %[[IF_THEN_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]] +// CHECK: [[IF_THEN_I]]: // CHECK-NEXT: [[MUL_I:%.*]] = shl i64 [[__R_0_I3]], 3 // CHECK-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP1]] to i64 // CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 // CHECK-NEXT: [[SUB_I]] = add i64 [[ADD_I]], [[CONV5_I]] // CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I2]], i64 1 -// CHECK-NEXT: [[TMP3]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP3]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP3]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP7:![0-9]+]] -// CHECK: _ZL21__make_mantissa_base8PKc.exit: -// CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 0, [[WHILE_BODY_I]] ], [ [[SUB_I]], [[IF_THEN_I]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP7:![0-9]+]] +// CHECK: [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]]: +// CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[WHILE_BODY_I]] ], [ [[SUB_I]], %[[IF_THEN_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] // -// AMDGCNSPIRV-LABEL: @test___make_mantissa_base8( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: br label [[WHILE_COND_I:%.*]] -// AMDGCNSPIRV: while.cond.i: -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P:%.*]], [[ENTRY:%.*]] ], [ [[__TAGP_ADDR_1_I:%.*]], [[WHILE_BODY_I:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[__R_1_I:%.*]], [[WHILE_BODY_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[TBAA5:![0-9]+]] +// AMDGCNSPIRV-LABEL: define spir_func i64 @test___make_mantissa_base8( +// AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) 
[[P:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: br label %[[WHILE_COND_I:.*]] +// AMDGCNSPIRV: [[WHILE_COND_I]]: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P]], %[[ENTRY]] ], [ [[__TAGP_ADDR_1_I:%.*]], %[[WHILE_BODY_I:.*]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[__R_1_I:%.*]], %[[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[CHAR_TBAA5:![0-9]+]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT:%.*]], label [[WHILE_BODY_I]] -// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT:.*]], label %[[WHILE_BODY_I]] +// AMDGCNSPIRV: [[WHILE_BODY_I]]: // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], -8 // AMDGCNSPIRV-NEXT: [[OR_COND_I:%.*]] = icmp eq i8 [[TMP1]], 48 // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl i64 [[__R_0_I]], 3 @@ -91,50 +93,52 @@ typedef unsigned long long uint64_t; // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_IDX:%.*]] = zext i1 [[OR_COND_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], i64 [[__TAGP_ADDR_1_I_IDX]] // AMDGCNSPIRV-NEXT: [[__R_1_I]] = select i1 [[OR_COND_I]], i64 [[SUB_I]], i64 [[__R_0_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label [[WHILE_COND_I]], label [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], !llvm.loop [[LOOP8:![0-9]+]] -// AMDGCNSPIRV: _ZL21__make_mantissa_base8PKc.exit: -// AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I]] ], [ [[__R_0_I]], [[WHILE_COND_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label %[[WHILE_COND_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], !llvm.loop [[LOOP8:![0-9]+]] +// AMDGCNSPIRV: [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]]: +// AMDGCNSPIRV-NEXT: 
[[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[WHILE_BODY_I]] ], [ [[__R_0_I]], %[[WHILE_COND_I]] ] // AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] // extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { return __make_mantissa_base8(p); } -// CHECK-LABEL: @test___make_mantissa_base10( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P:%.*]], align 1, !tbaa [[TBAA4]] +// CHECK-LABEL: define dso_local i64 @test___make_mantissa_base10( +// CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I1:%.*]] = icmp eq i8 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I1]], label [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// CHECK: while.body.i: -// CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[TMP3:%.*]], [[IF_THEN_I:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] -// CHECK-NEXT: [[__R_0_I3:%.*]] = phi i64 [ [[SUB_I:%.*]], [[IF_THEN_I]] ], [ 0, [[ENTRY]] ] -// CHECK-NEXT: [[__TAGP_ADDR_0_I2:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[IF_THEN_I]] ], [ [[P]], [[ENTRY]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I1]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// CHECK: [[WHILE_BODY_I]]: +// CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[TMP3:%.*]], %[[IF_THEN_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] +// CHECK-NEXT: [[__R_0_I3:%.*]] = phi i64 [ [[SUB_I:%.*]], %[[IF_THEN_I]] ], [ 0, %[[ENTRY]] ] +// CHECK-NEXT: [[__TAGP_ADDR_0_I2:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[IF_THEN_I]] ], [ [[P]], %[[ENTRY]] ] // CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -48 // CHECK-NEXT: [[OR_COND_I:%.*]] = icmp ult i8 [[TMP2]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I]], label [[IF_THEN_I]], label [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]] -// CHECK: if.then.i: +// CHECK-NEXT: br i1 [[OR_COND_I]], label %[[IF_THEN_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]] +// CHECK: 
[[IF_THEN_I]]: // CHECK-NEXT: [[MUL_I:%.*]] = mul i64 [[__R_0_I3]], 10 // CHECK-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP1]] to i64 // CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 // CHECK-NEXT: [[SUB_I]] = add i64 [[ADD_I]], [[CONV5_I]] // CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I2]], i64 1 -// CHECK-NEXT: [[TMP3]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP3]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP3]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK: _ZL22__make_mantissa_base10PKc.exit: -// CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 0, [[WHILE_BODY_I]] ], [ [[SUB_I]], [[IF_THEN_I]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK: [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]]: +// CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[WHILE_BODY_I]] ], [ [[SUB_I]], %[[IF_THEN_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] // -// AMDGCNSPIRV-LABEL: @test___make_mantissa_base10( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: br label [[WHILE_COND_I:%.*]] -// AMDGCNSPIRV: while.cond.i: -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P:%.*]], [[ENTRY:%.*]] ], [ [[__TAGP_ADDR_1_I:%.*]], [[WHILE_BODY_I:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[__R_1_I:%.*]], [[WHILE_BODY_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-LABEL: define spir_func i64 @test___make_mantissa_base10( +// AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// 
AMDGCNSPIRV-NEXT: br label %[[WHILE_COND_I:.*]] +// AMDGCNSPIRV: [[WHILE_COND_I]]: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P]], %[[ENTRY]] ], [ [[__TAGP_ADDR_1_I:%.*]], %[[WHILE_BODY_I:.*]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[__R_1_I:%.*]], %[[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT:%.*]], label [[WHILE_BODY_I]] -// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT:.*]], label %[[WHILE_BODY_I]] +// AMDGCNSPIRV: [[WHILE_BODY_I]]: // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = add i8 [[TMP0]], -48 // AMDGCNSPIRV-NEXT: [[OR_COND_I:%.*]] = icmp ult i8 [[TMP1]], 10 // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = mul i64 [[__R_0_I]], 10 @@ -144,220 +148,224 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_IDX:%.*]] = zext i1 [[OR_COND_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], i64 [[__TAGP_ADDR_1_I_IDX]] // AMDGCNSPIRV-NEXT: [[__R_1_I]] = select i1 [[OR_COND_I]], i64 [[SUB_I]], i64 [[__R_0_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label [[WHILE_COND_I]], label [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], !llvm.loop [[LOOP11:![0-9]+]] -// AMDGCNSPIRV: _ZL22__make_mantissa_base10PKc.exit: -// AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I]] ], [ [[__R_0_I]], [[WHILE_COND_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label %[[WHILE_COND_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +// AMDGCNSPIRV: [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[WHILE_BODY_I]] ], [ 
[[__R_0_I]], %[[WHILE_COND_I]] ] // AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] // extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { return __make_mantissa_base10(p); } -// CHECK-LABEL: @test___make_mantissa_base16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P:%.*]], align 1, !tbaa [[TBAA4]] +// CHECK-LABEL: define dso_local i64 @test___make_mantissa_base16( +// CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I1:%.*]] = icmp eq i8 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I1]], label [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// CHECK: while.body.i: -// CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[TMP5:%.*]], [[IF_END31_I:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] -// CHECK-NEXT: [[__R_0_I3:%.*]] = phi i64 [ [[ADD28_I:%.*]], [[IF_END31_I]] ], [ 0, [[ENTRY]] ] -// CHECK-NEXT: [[__TAGP_ADDR_0_I2:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[IF_END31_I]] ], [ [[P]], [[ENTRY]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I1]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// CHECK: [[WHILE_BODY_I]]: +// CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[TMP5:%.*]], %[[IF_END31_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] +// CHECK-NEXT: [[__R_0_I3:%.*]] = phi i64 [ [[ADD28_I:%.*]], %[[IF_END31_I]] ], [ 0, %[[ENTRY]] ] +// CHECK-NEXT: [[__TAGP_ADDR_0_I2:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[IF_END31_I]] ], [ [[P]], %[[ENTRY]] ] // CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -48 // CHECK-NEXT: [[OR_COND_I:%.*]] = icmp ult i8 [[TMP2]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I]], label [[IF_END31_I]], label [[IF_ELSE_I:%.*]] -// CHECK: if.else.i: +// CHECK-NEXT: br i1 [[OR_COND_I]], label %[[IF_END31_I]], label %[[IF_ELSE_I:.*]] +// CHECK: [[IF_ELSE_I]]: // CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], -97 // CHECK-NEXT: 
[[OR_COND33_I:%.*]] = icmp ult i8 [[TMP3]], 6 -// CHECK-NEXT: br i1 [[OR_COND33_I]], label [[IF_END31_I]], label [[IF_ELSE17_I:%.*]] -// CHECK: if.else17.i: +// CHECK-NEXT: br i1 [[OR_COND33_I]], label %[[IF_END31_I]], label %[[IF_ELSE17_I:.*]] +// CHECK: [[IF_ELSE17_I]]: // CHECK-NEXT: [[TMP4:%.*]] = add i8 [[TMP1]], -65 // CHECK-NEXT: [[OR_COND34_I:%.*]] = icmp ult i8 [[TMP4]], 6 -// CHECK-NEXT: br i1 [[OR_COND34_I]], label [[IF_END31_I]], label [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]] -// CHECK: if.end31.i: -// CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I]] ], [ -87, [[IF_ELSE_I]] ], [ -55, [[IF_ELSE17_I]] ] +// CHECK-NEXT: br i1 [[OR_COND34_I]], label %[[IF_END31_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]] +// CHECK: [[IF_END31_I]]: +// CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I]] ], [ -87, %[[IF_ELSE_I]] ], [ -55, %[[IF_ELSE17_I]] ] // CHECK-NEXT: [[MUL24_I:%.*]] = shl i64 [[__R_0_I3]], 4 // CHECK-NEXT: [[CONV25_I:%.*]] = zext nneg i8 [[TMP1]] to i64 // CHECK-NEXT: [[ADD26_I:%.*]] = add i64 [[MUL24_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I]] = add i64 [[ADD26_I]], [[CONV25_I]] // CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I2]], i64 1 -// CHECK-NEXT: [[TMP5]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP5]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP5]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP11:![0-9]+]] -// CHECK: _ZL22__make_mantissa_base16PKc.exit: -// CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 0, [[IF_ELSE17_I]] ], [ [[ADD28_I]], [[IF_END31_I]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK: [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]]: +// CHECK-NEXT: [[RETVAL_2_I:%.*]] = 
phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[IF_ELSE17_I]] ], [ [[ADD28_I]], %[[IF_END31_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] // -// AMDGCNSPIRV-LABEL: @test___make_mantissa_base16( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[P:%.*]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-LABEL: define spir_func i64 @test___make_mantissa_base16( +// AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[P]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I1:%.*]] = icmp eq i8 [[TMP0]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I1]], label [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// AMDGCNSPIRV: while.body.i: -// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = phi i8 [ [[TMP5:%.*]], [[IF_END31_I:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I3:%.*]] = phi i64 [ [[ADD28_I:%.*]], [[IF_END31_I]] ], [ 0, [[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I2:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[IF_END31_I]] ], [ [[P]], [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I1]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// AMDGCNSPIRV: [[WHILE_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = phi i8 [ [[TMP5:%.*]], %[[IF_END31_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I3:%.*]] = phi i64 [ [[ADD28_I:%.*]], %[[IF_END31_I]] ], [ 0, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I2:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], %[[IF_END31_I]] ], [ [[P]], %[[ENTRY]] ] // AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -48 // AMDGCNSPIRV-NEXT: [[OR_COND_I:%.*]] = icmp ult i8 [[TMP2]], 10 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label [[IF_END31_I]], label [[IF_ELSE_I:%.*]] -// AMDGCNSPIRV: if.else.i: +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], 
label %[[IF_END31_I]], label %[[IF_ELSE_I:.*]] +// AMDGCNSPIRV: [[IF_ELSE_I]]: // AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], -97 // AMDGCNSPIRV-NEXT: [[OR_COND33_I:%.*]] = icmp ult i8 [[TMP3]], 6 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I]], label [[IF_END31_I]], label [[IF_ELSE17_I:%.*]] -// AMDGCNSPIRV: if.else17.i: +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I]], label %[[IF_END31_I]], label %[[IF_ELSE17_I:.*]] +// AMDGCNSPIRV: [[IF_ELSE17_I]]: // AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = add i8 [[TMP1]], -65 // AMDGCNSPIRV-NEXT: [[OR_COND34_I:%.*]] = icmp ult i8 [[TMP4]], 6 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I]], label [[IF_END31_I]], label [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]] -// AMDGCNSPIRV: if.end31.i: -// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I]] ], [ -87, [[IF_ELSE_I]] ], [ -55, [[IF_ELSE17_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I]], label %[[IF_END31_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]] +// AMDGCNSPIRV: [[IF_END31_I]]: +// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I]] ], [ -87, %[[IF_ELSE_I]] ], [ -55, %[[IF_ELSE17_I]] ] // AMDGCNSPIRV-NEXT: [[MUL24_I:%.*]] = shl i64 [[__R_0_I3]], 4 // AMDGCNSPIRV-NEXT: [[CONV25_I:%.*]] = zext nneg i8 [[TMP1]] to i64 // AMDGCNSPIRV-NEXT: [[ADD26_I:%.*]] = add i64 [[MUL24_I]], [[DOTSINK]] // AMDGCNSPIRV-NEXT: [[ADD28_I]] = add i64 [[ADD26_I]], [[CONV25_I]] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I2]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP5]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP5]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP5]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP12:![0-9]+]] -// AMDGCNSPIRV: _ZL22__make_mantissa_base16PKc.exit: -// 
AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 0, [[IF_ELSE17_I]] ], [ [[ADD28_I]], [[IF_END31_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP12:![0-9]+]] +// AMDGCNSPIRV: [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[IF_ELSE17_I]] ], [ [[ADD28_I]], %[[IF_END31_I]] ] // AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] // extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { return __make_mantissa_base16(p); } -// CHECK-LABEL: @test___make_mantissa( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P:%.*]], align 1, !tbaa [[TBAA4]] +// CHECK-LABEL: define dso_local i64 @test___make_mantissa( +// CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// CHECK-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[WHILE_COND_I14_I_PREHEADER:%.*]] -// CHECK: while.cond.i14.i.preheader: -// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[TBAA4]] +// CHECK-NEXT: br i1 [[CMP_I]], label %[[IF_THEN_I:.*]], label %[[WHILE_COND_I14_I_PREHEADER:.*]] +// CHECK: [[WHILE_COND_I14_I_PREHEADER]]: +// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I17_I5:%.*]] = icmp eq i8 [[TMP1]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I17_I5]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT:%.*]], label [[WHILE_BODY_I18_I:%.*]] -// CHECK: if.then.i: +// CHECK-NEXT: br i1 [[CMP_NOT_I17_I5]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT:.*]], label %[[WHILE_BODY_I18_I:.*]] +// CHECK: [[IF_THEN_I]]: // CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1 -// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr 
[[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: switch i8 [[TMP2]], label [[WHILE_COND_I_I_PREHEADER:%.*]] [ -// CHECK-NEXT: i8 120, label [[IF_THEN5_I:%.*]] -// CHECK-NEXT: i8 88, label [[IF_THEN5_I]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: switch i8 [[TMP2]], label %[[WHILE_COND_I_I_PREHEADER:.*]] [ +// CHECK-NEXT: i8 120, label %[[IF_THEN5_I:.*]] +// CHECK-NEXT: i8 88, label %[[IF_THEN5_I]] // CHECK-NEXT: ] -// CHECK: while.cond.i.i.preheader: -// CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] +// CHECK: [[WHILE_COND_I_I_PREHEADER]]: +// CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I_I14:%.*]] = icmp eq i8 [[TMP3]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I_I14]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I_I:%.*]] -// CHECK: if.then5.i: -// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] +// CHECK-NEXT: br i1 [[CMP_NOT_I_I14]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I_I:.*]] +// CHECK: [[IF_THEN5_I]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I30_I9:%.*]] = icmp eq i8 [[TMP4]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I30_I9]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I31_I:%.*]] -// CHECK: while.body.i31.i: -// CHECK-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], [[IF_END31_I_I:%.*]] ], [ [[TMP4]], [[IF_THEN5_I]] ] -// CHECK-NEXT: [[__R_0_I29_I11:%.*]] = phi i64 [ [[ADD28_I_I:%.*]], [[IF_END31_I_I]] ], [ 0, [[IF_THEN5_I]] ] -// CHECK-NEXT: [[__TAGP_ADDR_0_I28_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I:%.*]], [[IF_END31_I_I]] ], [ [[INCDEC_PTR_I]], [[IF_THEN5_I]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I30_I9]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I31_I:.*]] +// CHECK: [[WHILE_BODY_I31_I]]: +// CHECK-NEXT: 
[[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_END31_I_I:.*]] ], [ [[TMP4]], %[[IF_THEN5_I]] ] +// CHECK-NEXT: [[__R_0_I29_I11:%.*]] = phi i64 [ [[ADD28_I_I:%.*]], %[[IF_END31_I_I]] ], [ 0, %[[IF_THEN5_I]] ] +// CHECK-NEXT: [[__TAGP_ADDR_0_I28_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I:%.*]], %[[IF_END31_I_I]] ], [ [[INCDEC_PTR_I]], %[[IF_THEN5_I]] ] // CHECK-NEXT: [[TMP6:%.*]] = add i8 [[TMP5]], -48 // CHECK-NEXT: [[OR_COND_I32_I:%.*]] = icmp ult i8 [[TMP6]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I32_I]], label [[IF_END31_I_I]], label [[IF_ELSE_I_I:%.*]] -// CHECK: if.else.i.i: +// CHECK-NEXT: br i1 [[OR_COND_I32_I]], label %[[IF_END31_I_I]], label %[[IF_ELSE_I_I:.*]] +// CHECK: [[IF_ELSE_I_I]]: // CHECK-NEXT: [[TMP7:%.*]] = add i8 [[TMP5]], -97 // CHECK-NEXT: [[OR_COND33_I_I:%.*]] = icmp ult i8 [[TMP7]], 6 -// CHECK-NEXT: br i1 [[OR_COND33_I_I]], label [[IF_END31_I_I]], label [[IF_ELSE17_I_I:%.*]] -// CHECK: if.else17.i.i: +// CHECK-NEXT: br i1 [[OR_COND33_I_I]], label %[[IF_END31_I_I]], label %[[IF_ELSE17_I_I:.*]] +// CHECK: [[IF_ELSE17_I_I]]: // CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP5]], -65 // CHECK-NEXT: [[OR_COND34_I_I:%.*]] = icmp ult i8 [[TMP8]], 6 -// CHECK-NEXT: br i1 [[OR_COND34_I_I]], label [[IF_END31_I_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]] -// CHECK: if.end31.i.i: -// CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I31_I]] ], [ -87, [[IF_ELSE_I_I]] ], [ -55, [[IF_ELSE17_I_I]] ] +// CHECK-NEXT: br i1 [[OR_COND34_I_I]], label %[[IF_END31_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]] +// CHECK: [[IF_END31_I_I]]: +// CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I31_I]] ], [ -87, %[[IF_ELSE_I_I]] ], [ -55, %[[IF_ELSE17_I_I]] ] // CHECK-NEXT: [[MUL24_I_I:%.*]] = shl i64 [[__R_0_I29_I11]], 4 // CHECK-NEXT: [[CONV25_I_I:%.*]] = zext nneg i8 [[TMP5]] to i64 // CHECK-NEXT: [[ADD26_I_I:%.*]] = add i64 [[MUL24_I_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I_I]] = add i64 [[ADD26_I_I]], [[CONV25_I_I]] // CHECK-NEXT: [[INCDEC_PTR_I34_I]] = 
getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I28_I10]], i64 1 -// CHECK-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I]], align 1, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I30_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I30_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I31_I]], !llvm.loop [[LOOP11]] -// CHECK: while.body.i.i: -// CHECK-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], [[IF_THEN_I_I:%.*]] ], [ [[TMP3]], [[WHILE_COND_I_I_PREHEADER]] ] -// CHECK-NEXT: [[__R_0_I_I16:%.*]] = phi i64 [ [[SUB_I_I:%.*]], [[IF_THEN_I_I]] ], [ 0, [[WHILE_COND_I_I_PREHEADER]] ] -// CHECK-NEXT: [[__TAGP_ADDR_0_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I:%.*]], [[IF_THEN_I_I]] ], [ [[INCDEC_PTR_I]], [[WHILE_COND_I_I_PREHEADER]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I30_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I31_I]], !llvm.loop [[LOOP11]] +// CHECK: [[WHILE_BODY_I_I]]: +// CHECK-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I_I:.*]] ], [ [[TMP3]], %[[WHILE_COND_I_I_PREHEADER]] ] +// CHECK-NEXT: [[__R_0_I_I16:%.*]] = phi i64 [ [[SUB_I_I:%.*]], %[[IF_THEN_I_I]] ], [ 0, %[[WHILE_COND_I_I_PREHEADER]] ] +// CHECK-NEXT: [[__TAGP_ADDR_0_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I:%.*]], %[[IF_THEN_I_I]] ], [ [[INCDEC_PTR_I]], %[[WHILE_COND_I_I_PREHEADER]] ] // CHECK-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], -8 // CHECK-NEXT: [[OR_COND_I_I:%.*]] = icmp eq i8 [[TMP11]], 48 -// CHECK-NEXT: br i1 [[OR_COND_I_I]], label [[IF_THEN_I_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]] -// CHECK: if.then.i.i: +// CHECK-NEXT: br i1 [[OR_COND_I_I]], label %[[IF_THEN_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]] +// CHECK: [[IF_THEN_I_I]]: // CHECK-NEXT: [[MUL_I_I:%.*]] = shl i64 [[__R_0_I_I16]], 3 // CHECK-NEXT: [[CONV5_I_I:%.*]] = zext nneg i8 [[TMP10]] to i64 // CHECK-NEXT: [[ADD_I_I:%.*]] = add i64 [[MUL_I_I]], -48 // CHECK-NEXT: [[SUB_I_I]] 
= add i64 [[ADD_I_I]], [[CONV5_I_I]] // CHECK-NEXT: [[INCDEC_PTR_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I15]], i64 1 -// CHECK-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I_I]], !llvm.loop [[LOOP7]] -// CHECK: while.body.i18.i: -// CHECK-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], [[IF_THEN_I21_I:%.*]] ], [ [[TMP1]], [[WHILE_COND_I14_I_PREHEADER]] ] -// CHECK-NEXT: [[__R_0_I16_I7:%.*]] = phi i64 [ [[SUB_I25_I:%.*]], [[IF_THEN_I21_I]] ], [ 0, [[WHILE_COND_I14_I_PREHEADER]] ] -// CHECK-NEXT: [[__TAGP_ADDR_0_I15_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I:%.*]], [[IF_THEN_I21_I]] ], [ [[P]], [[WHILE_COND_I14_I_PREHEADER]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I_I]], !llvm.loop [[LOOP7]] +// CHECK: [[WHILE_BODY_I18_I]]: +// CHECK-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], %[[IF_THEN_I21_I:.*]] ], [ [[TMP1]], %[[WHILE_COND_I14_I_PREHEADER]] ] +// CHECK-NEXT: [[__R_0_I16_I7:%.*]] = phi i64 [ [[SUB_I25_I:%.*]], %[[IF_THEN_I21_I]] ], [ 0, %[[WHILE_COND_I14_I_PREHEADER]] ] +// CHECK-NEXT: [[__TAGP_ADDR_0_I15_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I:%.*]], %[[IF_THEN_I21_I]] ], [ [[P]], %[[WHILE_COND_I14_I_PREHEADER]] ] // CHECK-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], -48 // CHECK-NEXT: [[OR_COND_I19_I:%.*]] = icmp ult i8 [[TMP14]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I19_I]], label [[IF_THEN_I21_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]] -// CHECK: if.then.i21.i: +// CHECK-NEXT: br i1 [[OR_COND_I19_I]], label %[[IF_THEN_I21_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]] +// CHECK: [[IF_THEN_I21_I]]: // CHECK-NEXT: [[MUL_I22_I:%.*]] = mul i64 [[__R_0_I16_I7]], 10 // CHECK-NEXT: [[CONV5_I23_I:%.*]] = zext nneg i8 
[[TMP13]] to i64 // CHECK-NEXT: [[ADD_I24_I:%.*]] = add i64 [[MUL_I22_I]], -48 // CHECK-NEXT: [[SUB_I25_I]] = add i64 [[ADD_I24_I]], [[CONV5_I23_I]] // CHECK-NEXT: [[INCDEC_PTR_I26_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I6]], i64 1 -// CHECK-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I]], align 1, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I]], align 1, !tbaa [[CHAR_TBAA4]] // CHECK-NEXT: [[CMP_NOT_I17_I:%.*]] = icmp eq i8 [[TMP15]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I17_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I18_I]], !llvm.loop [[LOOP10]] -// CHECK: _ZL15__make_mantissaPKc.exit: -// CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 0, [[WHILE_COND_I_I_PREHEADER]] ], [ 0, [[IF_THEN5_I]] ], [ 0, [[WHILE_COND_I14_I_PREHEADER]] ], [ [[SUB_I_I]], [[IF_THEN_I_I]] ], [ 0, [[WHILE_BODY_I_I]] ], [ [[ADD28_I_I]], [[IF_END31_I_I]] ], [ 0, [[IF_ELSE17_I_I]] ], [ [[SUB_I25_I]], [[IF_THEN_I21_I]] ], [ 0, [[WHILE_BODY_I18_I]] ] +// CHECK-NEXT: br i1 [[CMP_NOT_I17_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I18_I]], !llvm.loop [[LOOP10]] +// CHECK: [[_ZL15__MAKE_MANTISSAPKC_EXIT]]: +// CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 0, %[[WHILE_COND_I_I_PREHEADER]] ], [ 0, %[[IF_THEN5_I]] ], [ 0, %[[WHILE_COND_I14_I_PREHEADER]] ], [ [[SUB_I_I]], %[[IF_THEN_I_I]] ], [ 0, %[[WHILE_BODY_I_I]] ], [ [[ADD28_I_I]], %[[IF_END31_I_I]] ], [ 0, %[[IF_ELSE17_I_I]] ], [ [[SUB_I25_I]], %[[IF_THEN_I21_I]] ], [ 0, %[[WHILE_BODY_I18_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_0_I]] // -// AMDGCNSPIRV-LABEL: @test___make_mantissa( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[P:%.*]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-LABEL: define spir_func i64 @test___make_mantissa( +// AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: 
[[TMP0:%.*]] = load i8, ptr addrspace(4) [[P]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[WHILE_COND_I14_I:%.*]] -// AMDGCNSPIRV: if.then.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_I]], label %[[IF_THEN_I:.*]], label %[[WHILE_COND_I14_I:.*]] +// AMDGCNSPIRV: [[IF_THEN_I]]: // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[P]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA5]] -// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I:%.*]] [ -// AMDGCNSPIRV-NEXT: i8 120, label [[IF_THEN5_I:%.*]] -// AMDGCNSPIRV-NEXT: i8 88, label [[IF_THEN5_I]] +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label %[[WHILE_COND_I_I:.*]] [ +// AMDGCNSPIRV-NEXT: i8 120, label %[[IF_THEN5_I:.*]] +// AMDGCNSPIRV-NEXT: i8 88, label %[[IF_THEN5_I]] // AMDGCNSPIRV-NEXT: ] -// AMDGCNSPIRV: if.then5.i: -// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV: [[IF_THEN5_I]]: +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I31_I5:%.*]] = icmp eq i8 [[TMP2]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I5]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT:%.*]], label [[WHILE_BODY_I32_I:%.*]] -// AMDGCNSPIRV: while.body.i32.i: -// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP7:%.*]], [[IF_END31_I_I:%.*]] ], [ [[TMP2]], [[IF_THEN5_I]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I30_I7:%.*]] = phi i64 [ [[ADD28_I_I:%.*]], [[IF_END31_I_I]] ], [ 0, [[IF_THEN5_I]] ] -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I29_I6:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I36_I:%.*]], [[IF_END31_I_I]] ], [ [[INCDEC_PTR_I]], [[IF_THEN5_I]] ] +// 
AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I5]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT:.*]], label %[[WHILE_BODY_I32_I:.*]] +// AMDGCNSPIRV: [[WHILE_BODY_I32_I]]: +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP7:%.*]], %[[IF_END31_I_I:.*]] ], [ [[TMP2]], %[[IF_THEN5_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I30_I7:%.*]] = phi i64 [ [[ADD28_I_I:%.*]], %[[IF_END31_I_I]] ], [ 0, %[[IF_THEN5_I]] ] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I29_I6:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I36_I:%.*]], %[[IF_END31_I_I]] ], [ [[INCDEC_PTR_I]], %[[IF_THEN5_I]] ] // AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = add i8 [[TMP3]], -48 // AMDGCNSPIRV-NEXT: [[OR_COND_I33_I:%.*]] = icmp ult i8 [[TMP4]], 10 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I33_I]], label [[IF_END31_I_I]], label [[IF_ELSE_I_I:%.*]] -// AMDGCNSPIRV: if.else.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I33_I]], label %[[IF_END31_I_I]], label %[[IF_ELSE_I_I:.*]] +// AMDGCNSPIRV: [[IF_ELSE_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], -97 // AMDGCNSPIRV-NEXT: [[OR_COND33_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I]], label [[IF_END31_I_I]], label [[IF_ELSE17_I_I:%.*]] -// AMDGCNSPIRV: if.else17.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I]], label %[[IF_END31_I_I]], label %[[IF_ELSE17_I_I:.*]] +// AMDGCNSPIRV: [[IF_ELSE17_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = add i8 [[TMP3]], -65 // AMDGCNSPIRV-NEXT: [[OR_COND34_I_I:%.*]] = icmp ult i8 [[TMP6]], 6 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I]], label [[IF_END31_I_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]] -// AMDGCNSPIRV: if.end31.i.i: -// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I32_I]] ], [ -87, [[IF_ELSE_I_I]] ], [ -55, [[IF_ELSE17_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I]], label %[[IF_END31_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]] +// AMDGCNSPIRV: [[IF_END31_I_I]]: +// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I32_I]] ], [ -87, %[[IF_ELSE_I_I]] ], [ -55, 
%[[IF_ELSE17_I_I]] ] // AMDGCNSPIRV-NEXT: [[MUL24_I_I:%.*]] = shl i64 [[__R_0_I30_I7]], 4 // AMDGCNSPIRV-NEXT: [[CONV25_I_I:%.*]] = zext nneg i8 [[TMP3]] to i64 // AMDGCNSPIRV-NEXT: [[ADD26_I_I:%.*]] = add i64 [[MUL24_I_I]], [[DOTSINK]] // AMDGCNSPIRV-NEXT: [[ADD28_I_I]] = add i64 [[ADD26_I_I]], [[CONV25_I_I]] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I36_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I29_I6]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP7]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I36_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP7]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I36_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I31_I:%.*]] = icmp eq i8 [[TMP7]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I32_I]], !llvm.loop [[LOOP12]] -// AMDGCNSPIRV: while.cond.i.i: -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I:%.*]], [[WHILE_BODY_I_I:%.*]] ], [ [[INCDEC_PTR_I]], [[IF_THEN_I]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I_I:%.*]] = phi i64 [ [[__R_1_I_I:%.*]], [[WHILE_BODY_I_I]] ], [ 0, [[IF_THEN_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I32_I]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV: [[WHILE_COND_I_I]]: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I:%.*]], %[[WHILE_BODY_I_I:.*]] ], [ [[INCDEC_PTR_I]], %[[IF_THEN_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I_I:%.*]] = phi i64 [ [[__R_1_I_I:%.*]], %[[WHILE_BODY_I_I]] ], [ 0, %[[IF_THEN_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], 
label [[WHILE_BODY_I_I]] -// AMDGCNSPIRV: while.body.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I_I]] +// AMDGCNSPIRV: [[WHILE_BODY_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = and i8 [[TMP8]], -8 // AMDGCNSPIRV-NEXT: [[OR_COND_I_I:%.*]] = icmp eq i8 [[TMP9]], 48 // AMDGCNSPIRV-NEXT: [[MUL_I_I:%.*]] = shl i64 [[__R_0_I_I]], 3 @@ -367,14 +375,14 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_IDX:%.*]] = zext i1 [[OR_COND_I_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I]], i64 [[__TAGP_ADDR_1_I_I_IDX]] // AMDGCNSPIRV-NEXT: [[__R_1_I_I]] = select i1 [[OR_COND_I_I]], i64 [[SUB_I_I]], i64 [[__R_0_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I]], label [[WHILE_COND_I_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP8]] -// AMDGCNSPIRV: while.cond.i14.i: -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I25_I:%.*]], [[WHILE_BODY_I18_I:%.*]] ], [ [[P]], [[ENTRY:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I16_I:%.*]] = phi i64 [ [[__R_1_I26_I:%.*]], [[WHILE_BODY_I18_I]] ], [ 0, [[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I]], label %[[WHILE_COND_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV: [[WHILE_COND_I14_I]]: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I25_I:%.*]], %[[WHILE_BODY_I18_I:.*]] ], [ [[P]], %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I16_I:%.*]] = phi i64 [ [[__R_1_I26_I:%.*]], %[[WHILE_BODY_I18_I]] ], [ 0, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I17_I:%.*]] = icmp eq i8 
[[TMP10]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I18_I]] -// AMDGCNSPIRV: while.body.i18.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I18_I]] +// AMDGCNSPIRV: [[WHILE_BODY_I18_I]]: // AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = add i8 [[TMP10]], -48 // AMDGCNSPIRV-NEXT: [[OR_COND_I19_I:%.*]] = icmp ult i8 [[TMP11]], 10 // AMDGCNSPIRV-NEXT: [[MUL_I20_I:%.*]] = mul i64 [[__R_0_I16_I]], 10 @@ -384,225 +392,261 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I25_I_IDX:%.*]] = zext i1 [[OR_COND_I19_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I25_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I]], i64 [[__TAGP_ADDR_1_I25_I_IDX]] // AMDGCNSPIRV-NEXT: [[__R_1_I26_I]] = select i1 [[OR_COND_I19_I]], i64 [[SUB_I23_I]], i64 [[__R_0_I16_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I]], label [[WHILE_COND_I14_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP11]] -// AMDGCNSPIRV: _ZL15__make_mantissaPKc.exit: -// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 0, [[IF_THEN5_I]] ], [ 0, [[WHILE_BODY_I_I]] ], [ [[__R_0_I_I]], [[WHILE_COND_I_I]] ], [ [[ADD28_I_I]], [[IF_END31_I_I]] ], [ 0, [[IF_ELSE17_I_I]] ], [ 0, [[WHILE_BODY_I18_I]] ], [ [[__R_0_I16_I]], [[WHILE_COND_I14_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I]], label %[[WHILE_COND_I14_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV: [[_ZL15__MAKE_MANTISSAPKC_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 0, %[[IF_THEN5_I]] ], [ 0, %[[WHILE_BODY_I_I]] ], [ [[__R_0_I_I]], %[[WHILE_COND_I_I]] ], [ [[ADD28_I_I]], %[[IF_END31_I_I]] ], [ 0, %[[IF_ELSE17_I_I]] ], [ 0, %[[WHILE_BODY_I18_I]] ], [ [[__R_0_I16_I]], %[[WHILE_COND_I14_I]] ] // AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_0_I]] // extern "C" __device__ uint64_t test___make_mantissa(const 
char *p) { return __make_mantissa(p); } -// CHECK-LABEL: @test_abs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +// CHECK-LABEL: define dso_local noundef range(i32 0, -2147483648) i32 @test_abs( +// CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) i32 @llvm.abs.i32(i32 [[X]], i1 true) // CHECK-NEXT: ret i32 [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_abs( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) addrspace(4) i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +// AMDGCNSPIRV-LABEL: define spir_func noundef range(i32 0, -2147483648) i32 @test_abs( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) addrspace(4) i32 @llvm.abs.i32(i32 [[X]], i1 true) // AMDGCNSPIRV-NEXT: ret i32 [[TMP0]] // extern "C" __device__ int test_abs(int x) { return abs(x); } -// CHECK-LABEL: @test_labs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +// CHECK-LABEL: define dso_local noundef range(i64 0, -9223372036854775808) i64 @test_labs( +// CHECK-SAME: i64 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X]], i1 true) // CHECK-NEXT: ret i64 [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_labs( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) addrspace(4) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +// AMDGCNSPIRV-LABEL: define spir_func noundef range(i64 
0, -9223372036854775808) i64 @test_labs( +// AMDGCNSPIRV-SAME: i64 noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) addrspace(4) i64 @llvm.abs.i64(i64 [[X]], i1 true) // AMDGCNSPIRV-NEXT: ret i64 [[TMP0]] // extern "C" __device__ long test_labs(long x) { return labs(x); } -// CHECK-LABEL: @test_llabs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +// CHECK-LABEL: define dso_local noundef range(i64 0, -9223372036854775808) i64 @test_llabs( +// CHECK-SAME: i64 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X]], i1 true) // CHECK-NEXT: ret i64 [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_llabs( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) addrspace(4) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +// AMDGCNSPIRV-LABEL: define spir_func noundef range(i64 0, -9223372036854775808) i64 @test_llabs( +// AMDGCNSPIRV-SAME: i64 noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) addrspace(4) i64 @llvm.abs.i64(i64 [[X]], i1 true) // AMDGCNSPIRV-NEXT: ret i64 [[TMP0]] // extern "C" __device__ long long test_llabs(long x) { return llabs(x); } -// DEFAULT-LABEL: @test_acosf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// DEFAULT-LABEL: define dso_local noundef float @test_acosf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] 
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X]]) #[[ATTR14:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_acosf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_acosf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_acosf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// APPROX-LABEL: define dso_local noundef float @test_acosf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X]]) #[[ATTR14:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_acosf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// NCRDIV-LABEL: define dso_local noundef float @test_acosf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X]]) #[[ATTR14:[0-9]+]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_acosf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = 
tail call contract spir_func noundef addrspace(4) float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR12:[0-9]+]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_acosf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_acos_f32(float noundef [[X]]) #[[ATTR12:[0-9]+]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_acosf(float x) { return acosf(x); } -// DEFAULT-LABEL: @test_acos( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_acos( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_acos( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_acos( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_acos( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double 
@test_acos( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_acos( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_acos( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_acos( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_acos( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_acos_f64(double noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_acos(double x) { return acos(x); } -// DEFAULT-LABEL: @test_acoshf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]] +// DEFAULT-LABEL: define dso_local noundef float @test_acoshf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X]]) #[[ATTR15:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// 
FINITEONLY-LABEL: @test_acoshf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15:[0-9]+]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_acoshf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_acoshf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]] +// APPROX-LABEL: define dso_local noundef float @test_acoshf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X]]) #[[ATTR15:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_acoshf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]] +// NCRDIV-LABEL: define dso_local noundef float @test_acoshf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X]]) #[[ATTR15:[0-9]+]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_acoshf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]] +// AMDGCNSPIRV-LABEL: define spir_func 
noundef float @test_acoshf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_acosh_f32(float noundef [[X]]) #[[ATTR13:[0-9]+]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_acoshf(float x) { return acoshf(x); } -// DEFAULT-LABEL: @test_acosh( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_acosh( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_acosh( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_acosh( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_acosh( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_acosh( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_acosh( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_acosh( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_acosh( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_acosh( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_acosh_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_acosh(double x) { return acosh(x); } -// DEFAULT-LABEL: @test_asinf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_asinf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_asinf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) 
float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_asinf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_asinf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_asinf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_asinf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_asinf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_asinf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_asinf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef 
addrspace(4) float @__ocml_asin_f32(float noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_asinf(float x) { return asinf(x); } -// DEFAULT-LABEL: @test_asin( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_asin( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_asin( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_asin( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_asin( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_asin( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_asin( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double 
@__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_asin( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_asin( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_asin( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_asin_f64(double noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_asin(double x) { @@ -610,1551 +654,1816 @@ extern "C" __device__ double test_asin(double x) { return asin(x); } -// DEFAULT-LABEL: @test_asinhf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_asinhf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_asinhf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_asinhf( +// FINITEONLY-SAME: float 
noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_asinhf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_asinhf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_asinhf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_asinhf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_asinhf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_asinhf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_asinh_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_asinhf(float x) { return asinhf(x); 
} -// DEFAULT-LABEL: @test_asinh( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_asinh( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_asinh( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_asinh( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_asinh( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_asinh( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_asinh( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_asinh( +// NCRDIV-SAME: double noundef [[X:%.*]]) 
local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_asinh( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_asinh( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_asinh_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_asinh(double x) { return asinh(x); } -// DEFAULT-LABEL: @test_atan2f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_atan2f( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_atan2f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_atan2f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// 
FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_atan2f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_atan2f( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_atan2f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_atan2f( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_atan2f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_atan2f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float 
@__ocml_atan2_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atan2f(float x, float y) { return atan2f(x, y); } -// DEFAULT-LABEL: @test_atan2( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_atan2( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_atan2( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_atan2( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_atan2( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_atan2( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] 
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_atan2( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_atan2( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_atan2( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_atan2( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atan2_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atan2(double x, double y) { return atan2(x, y); } -// DEFAULT-LABEL: @test_atanf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_atanf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X]]) #[[ATTR14]] 
// DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_atanf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_atanf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_atanf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_atanf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_atanf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_atanf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_atanf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_atanf( +// 
AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_atan_f32(float noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atanf(float x) { return atanf(x); } -// DEFAULT-LABEL: @test_atan( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_atan( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_atan( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_atan( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_atan( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_atan( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double 
@__ocml_atan_f64(double noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_atan( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_atan( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_atan( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_atan( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atan_f64(double noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atan(double x) { return atan(x); } -// DEFAULT-LABEL: @test_atanhf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_atanhf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_atanhf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) 
[[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_atanhf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_atanhf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_atanhf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_atanhf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_atanhf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_atanhf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_atanhf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_atanh_f32(float 
noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_atanhf(float x) { return atanhf(x); } -// DEFAULT-LABEL: @test_atanh( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_atanh( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_atanh( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_atanh( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_atanh( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_atanh( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_atanh( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef 
[[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_atanh( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_atanh( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_atanh( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atanh_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_atanh(double x) { return atanh(x); } -// DEFAULT-LABEL: @test_cbrtf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_cbrtf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_cbrtf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_cbrtf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// 
FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_cbrtf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_cbrtf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_cbrtf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_cbrtf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cbrtf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_cbrtf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cbrt_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cbrtf(float x) { return cbrtf(x); } -// DEFAULT-LABEL: @test_cbrt( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef 
double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_cbrt( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_cbrt( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_cbrt( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_cbrt( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_cbrt( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_cbrt( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_cbrt( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double 
@__ocml_cbrt_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cbrt( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_cbrt( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cbrt_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cbrt(double x) { return cbrt(x); } -// DEFAULT-LABEL: @test_ceilf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_ceilf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_ceilf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ceil.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_ceilf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ceil.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_ceilf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef 
float @test_ceilf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_ceilf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_ceilf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_ceilf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ceil.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_ceilf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ceil.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_ceilf(float x) { return ceilf(x); } -// DEFAULT-LABEL: @test_ceil( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_ceil( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_ceil( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ceil.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local 
noundef nofpclass(nan inf) double @test_ceil( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ceil.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_ceil( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_ceil( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_ceil( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_ceil( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_ceil( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ceil.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_ceil( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ceil.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_ceil(double x) { return ceil(x); } -// DEFAULT-LABEL: @test_copysignf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float 
@llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_copysignf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.copysign.f32(float [[X]], float [[Y]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_copysignf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.copysign.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_copysignf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.copysign.f32(float nofpclass(nan inf) [[X]], float nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_copysignf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_copysignf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.copysign.f32(float [[X]], float [[Y]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_copysignf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_copysignf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: 
[[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.copysign.f32(float [[X]], float [[Y]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_copysignf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_copysignf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.copysign.f32(float [[X]], float [[Y]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_copysignf(float x, float y) { return copysignf(x, y); } -// DEFAULT-LABEL: @test_copysign( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_copysign( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X]], double [[Y]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_copysign( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.copysign.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_copysign( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.copysign.f64(double 
nofpclass(nan inf) [[X]], double nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_copysign( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_copysign( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X]], double [[Y]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_copysign( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_copysign( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X]], double [[Y]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_copysign( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_copysign( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.copysign.f64(double [[X]], double [[Y]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_copysign(double x, double y) { return copysign(x, y); } -// DEFAULT-LABEL: @test_cosf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float 
@__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]] +// DEFAULT-LABEL: define dso_local noundef float @test_cosf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X]]) #[[ATTR16:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_cosf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16:[0-9]+]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_cosf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_cosf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]] +// APPROX-LABEL: define dso_local noundef float @test_cosf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I1]] // -// NCRDIV-LABEL: @test_cosf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]] +// NCRDIV-LABEL: define dso_local noundef float @test_cosf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X]]) #[[ATTR16:[0-9]+]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cosf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_cosf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cos_f32(float noundef [[X]]) #[[ATTR14:[0-9]+]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cosf(float x) { return cosf(x); } -// DEFAULT-LABEL: @test_cos( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_cos( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_cos( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_cos( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double 
[[CALL_I]] // -// APPROX-LABEL: @test_cos( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_cos( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_cos( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_cos( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cos( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_cos( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cos_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cos(double x) { return cos(x); } -// DEFAULT-LABEL: @test_coshf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_coshf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { 
+// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_coshf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_coshf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_coshf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_coshf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_coshf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_coshf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_coshf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef 
addrspace(4) float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_coshf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cosh_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_coshf(float x) { return coshf(x); } -// DEFAULT-LABEL: @test_cosh( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_cosh( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_cosh( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_cosh( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_cosh( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_cosh( +// APPROX-SAME: double noundef [[X:%.*]]) 
local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_cosh( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_cosh( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cosh( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_cosh( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cosh_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cosh(double x) { return cosh(x); } -// DEFAULT-LABEL: @test_cospif( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_cospif( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_cospif( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: 
[[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_cospif( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_cospif( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_cospif( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_cospif( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_cospif( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cospif( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_cospif( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: 
[[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cospi_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cospif(float x) { return cospif(x); } -// DEFAULT-LABEL: @test_cospi( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_cospi( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_cospi( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_cospi( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_cospi( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_cospi( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: 
@test_cospi( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_cospi( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cospi( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_cospi( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cospi_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cospi(double x) { return cospi(x); } -// DEFAULT-LABEL: @test_cyl_bessel_i0f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_cyl_bessel_i0f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_cyl_bessel_i0f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan 
inf) float @test_cyl_bessel_i0f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_cyl_bessel_i0f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_cyl_bessel_i0f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_cyl_bessel_i0f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_cyl_bessel_i0f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cyl_bessel_i0f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_cyl_bessel_i0f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_i0_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret 
float [[CALL_I]] // extern "C" __device__ float test_cyl_bessel_i0f(float x) { return cyl_bessel_i0f(x); } -// DEFAULT-LABEL: @test_cyl_bessel_i0( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_cyl_bessel_i0( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_cyl_bessel_i0( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_cyl_bessel_i0( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_cyl_bessel_i0( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_cyl_bessel_i0( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_cyl_bessel_i0( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef 
[[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_cyl_bessel_i0( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cyl_bessel_i0( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_cyl_bessel_i0( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_i0_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cyl_bessel_i0(double x) { return cyl_bessel_i0(x); } -// DEFAULT-LABEL: @test_cyl_bessel_i1f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_cyl_bessel_i1f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_cyl_bessel_i1f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_cyl_bessel_i1f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) 
local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_cyl_bessel_i1f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_cyl_bessel_i1f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_cyl_bessel_i1f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_cyl_bessel_i1f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cyl_bessel_i1f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_cyl_bessel_i1f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_i1_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_cyl_bessel_i1f(float x) { return 
cyl_bessel_i1f(x); } -// DEFAULT-LABEL: @test_cyl_bessel_i1( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_cyl_bessel_i1( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_cyl_bessel_i1( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_cyl_bessel_i1( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_cyl_bessel_i1( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_cyl_bessel_i1( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_cyl_bessel_i1( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double 
@test_cyl_bessel_i1( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_cyl_bessel_i1( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_cyl_bessel_i1( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_i1_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_cyl_bessel_i1(double x) { return cyl_bessel_i1(x); } -// DEFAULT-LABEL: @test_erfcf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_erfcf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_erfcf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_erfcf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call 
nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_erfcf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_erfcf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_erfcf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_erfcf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_erfcf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_erfcf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_erfc_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_erfcf(float x) { return erfcf(x); } -// DEFAULT-LABEL: @test_erfc( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef 
[[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_erfc( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_erfc( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_erfc( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_erfc( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_erfc( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_erfc( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_erfc( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X]]) #[[ATTR15]] // 
NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_erfc( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_erfc( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_erfc_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_erfc(double x) { return erfc(x); } -// DEFAULT-LABEL: @test_erfinvf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_erfinvf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_erfinvf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_erfinvf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_erfinvf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail 
call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_erfinvf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_erfinvf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_erfinvf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_erfinvf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_erfinvf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_erfinv_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_erfinvf(float x) { return erfinvf(x); } -// DEFAULT-LABEL: @test_erfinv( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_erfinv( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail 
call contract noundef double @__ocml_erfinv_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_erfinv( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_erfinv( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_erfinv( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_erfinv( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_erfinv( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_erfinv( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_erfinv( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double 
@__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_erfinv( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_erfinv_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_erfinv(double x) { return erfinv(x); } -// DEFAULT-LABEL: @test_exp10f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp10.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_exp10f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp10.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_exp10f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp10.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_exp10f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp10.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_exp10f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp10.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_exp10f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp10.f32(float [[X]]) // APPROX-NEXT: 
ret float [[TMP0]] // -// NCRDIV-LABEL: @test_exp10f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp10.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_exp10f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp10.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_exp10f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.exp10.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_exp10f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.exp10.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_exp10f(float x) { return exp10f(x); } -// DEFAULT-LABEL: @test_exp10( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_exp10( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_exp10( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_exp10( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) 
local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_exp10( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_exp10( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_exp10( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_exp10( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_exp10( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_exp10( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp10_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp10(double x) { return exp10(x); } -// DEFAULT-LABEL: 
@test_exp2f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_exp2f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_exp2f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp2.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_exp2f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp2.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_exp2f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_exp2f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_exp2f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_exp2f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_exp2f( -// AMDGCNSPIRV-NEXT: entry: -// 
AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.exp2.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_exp2f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.exp2.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_exp2f(float x) { return exp2f(x); } -// DEFAULT-LABEL: @test_exp2( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_exp2( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_exp2( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_exp2( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_exp2( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_exp2( +// APPROX-SAME: double noundef [[X:%.*]]) 
local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_exp2( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_exp2( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_exp2( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_exp2( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp2_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp2(double x) { return exp2(x); } -// DEFAULT-LABEL: @test_expf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_expf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_expf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float 
@llvm.exp.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_expf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_expf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_expf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_expf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_expf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_expf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.exp.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_expf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.exp.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_expf(float x) { return expf(x); } -// DEFAULT-LABEL: @test_exp( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_exp( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_exp( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_exp( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_exp( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_exp( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_exp( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_exp( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract 
noundef double @__ocml_exp_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_exp( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_exp( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_exp(double x) { return exp(x); } -// DEFAULT-LABEL: @test_expm1f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_expm1f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_expm1f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_expm1f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_expm1f( -// 
APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_expm1f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_expm1f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_expm1f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_expm1f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_expm1f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_expm1_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_expm1f(float x) { return expm1f(x); } -// DEFAULT-LABEL: @test_expm1( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_expm1( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] 
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_expm1( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_expm1( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_expm1( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_expm1( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_expm1( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_expm1( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_expm1( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef 
addrspace(4) double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_expm1( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_expm1_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_expm1(double x) { return expm1(x); } -// DEFAULT-LABEL: @test_fabsf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_fabsf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_fabsf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_fabsf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_fabsf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_fabsf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X]]) // 
APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_fabsf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_fabsf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fabsf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fabs.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_fabsf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fabs.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_fabsf(float x) { return fabsf(x); } -// DEFAULT-LABEL: @test_fabs( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_fabs( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_fabs( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fabs.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_fabs( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail 
call nnan ninf contract noundef double @llvm.fabs.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_fabs( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_fabs( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_fabs( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_fabs( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fabs( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fabs.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_fabs( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fabs.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_fabs(double x) { return fabs(x); } -// DEFAULT-LABEL: @test_fdimf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_fdimf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr 
#[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_fdimf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_fdimf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_fdimf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_fdimf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_fdimf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_fdimf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_fdimf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_fdimf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_fdim_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_fdimf(float x, float y) { return fdimf(x, y); } -// DEFAULT-LABEL: @test_fdim( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_fdim( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_fdim( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_fdim( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr 
#[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_fdim( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_fdim( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_fdim( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_fdim( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_fdim( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_fdim( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func 
noundef addrspace(4) double @__ocml_fdim_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_fdim(double x, double y) { return fdim(x, y); } -// DEFAULT-LABEL: @test_fdividef( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] +// DEFAULT-LABEL: define dso_local noundef float @test_fdividef( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]] // DEFAULT-NEXT: ret float [[DIV_I]] // -// FINITEONLY-LABEL: @test_fdividef( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[X:%.*]], [[Y:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test_fdividef( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[X]], [[Y]] // FINITEONLY-NEXT: ret float [[DIV_I]] // -// APPROX-LABEL: @test_fdividef( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] +// APPROX-LABEL: define dso_local noundef float @test_fdividef( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]] // APPROX-NEXT: ret float [[DIV_I]] // -// NCRDIV-LABEL: @test_fdividef( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]], !fpmath [[META12:![0-9]+]] +// NCRDIV-LABEL: define dso_local noundef float @test_fdividef( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// 
NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]], !fpmath [[META12:![0-9]+]] // NCRDIV-NEXT: ret float [[DIV_I]] // -// AMDGCNSPIRV-LABEL: @test_fdividef( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_fdividef( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]] // AMDGCNSPIRV-NEXT: ret float [[DIV_I]] // extern "C" __device__ float test_fdividef(float x, float y) { return fdividef(x, y); } -// DEFAULT-LABEL: @test_floorf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_floorf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_floorf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.floor.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_floorf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.floor.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_floorf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_floorf( +// APPROX-SAME: float noundef 
[[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_floorf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_floorf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_floorf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.floor.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_floorf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.floor.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_floorf(float x) { return floorf(x); } -// DEFAULT-LABEL: @test_floor( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_floor( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_floor( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.floor.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double 
@test_floor( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.floor.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_floor( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_floor( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_floor( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_floor( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_floor( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.floor.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_floor( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.floor.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_floor(double x) { return floor(x); } -// DEFAULT-LABEL: @test_fmaf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], 
float [[Y:%.*]], float [[Z:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_fmaf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_fmaf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]], float nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_fmaf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float nofpclass(nan inf) [[X]], float nofpclass(nan inf) [[Y]], float nofpclass(nan inf) [[Z]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_fmaf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_fmaf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_fmaf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float 
@test_fmaf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fmaf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_fmaf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_fmaf(float x, float y, float z) { return fmaf(x, y, z); } -// DEFAULT-LABEL: @test_fma( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_fma( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_fma( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_fma( +// FINITEONLY-SAME: 
double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X]], double nofpclass(nan inf) [[Y]], double nofpclass(nan inf) [[Z]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_fma( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_fma( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_fma( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_fma( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fma( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_fma( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { 
+// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_fma(double x, double y, double z) { return fma(x, y, z); } -// DEFAULT-LABEL: @test_fma_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_fma_rn( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_fma_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_fma_rn( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X]], double nofpclass(nan inf) [[Y]], double nofpclass(nan inf) [[Z]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_fma_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_fma_rn( +// APPROX-SAME: double noundef 
[[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_fma_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_fma_rn( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fma_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_fma_rn( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_fma_rn(double x, double y, double z) { return __fma_rn(x, y, z); } -// DEFAULT-LABEL: @test_fmaxf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_fmaxf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: 
[[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X]], float [[Y]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_fmaxf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_fmaxf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float nofpclass(nan inf) [[X]], float nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_fmaxf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_fmaxf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X]], float [[Y]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_fmaxf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_fmaxf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X]], float [[Y]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fmaxf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float 
@llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_fmaxf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.maxnum.f32(float [[X]], float [[Y]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_fmaxf(float x, float y) { return fmaxf(x, y); } -// DEFAULT-LABEL: @test_fmax( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_fmax( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X]], double [[Y]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_fmax( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_fmax( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double nofpclass(nan inf) [[X]], double nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_fmax( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_fmax( +// APPROX-SAME: 
double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X]], double [[Y]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_fmax( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_fmax( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X]], double [[Y]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fmax( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_fmax( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.maxnum.f64(double [[X]], double [[Y]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_fmax(double x, double y) { return fmax(x, y); } -// DEFAULT-LABEL: @test_fminf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_fminf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X]], float [[Y]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: 
@test_fminf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_fminf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float nofpclass(nan inf) [[X]], float nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_fminf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_fminf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X]], float [[Y]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_fminf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_fminf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X]], float [[Y]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fminf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_fminf( +// AMDGCNSPIRV-SAME: float noundef 
[[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.minnum.f32(float [[X]], float [[Y]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_fminf(float x, float y) { return fminf(x, y); } -// DEFAULT-LABEL: @test_fmin( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_fmin( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X]], double [[Y]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_fmin( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_fmin( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double nofpclass(nan inf) [[X]], double nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_fmin( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_fmin( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail 
call contract noundef double @llvm.minnum.f64(double [[X]], double [[Y]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_fmin( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_fmin( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X]], double [[Y]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_fmin( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_fmin( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.minnum.f64(double [[X]], double [[Y]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_fmin(double x, double y) { return fmin(x, y); } -// DEFAULT-LABEL: @test_fmodf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_fmodf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_fmodf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call 
nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_fmodf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_fmodf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_fmodf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_fmodf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_fmodf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_fmodf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float 
@__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_fmodf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_fmod_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_fmodf(float x, float y) { return fmodf(x, y); } -// DEFAULT-LABEL: @test_fmod( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_fmod( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_fmod( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_fmod( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: 
@test_fmod( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_fmod( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_fmod( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_fmod( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_fmod( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_fmod( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_fmod_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_fmod(double x, double y) { return fmod(x, y); } -// DEFAULT-LABEL: @test_frexpf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail 
call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_frexpf( +// DEFAULT-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) // DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] +// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] // DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // DEFAULT-NEXT: ret float [[TMP2]] // -// FINITEONLY-LABEL: @test_frexpf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_frexpf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] +// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] // FINITEONLY-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // FINITEONLY-NEXT: ret float [[TMP2]] // -// APPROX-LABEL: @test_frexpf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_frexpf( +// APPROX-SAME: float noundef [[X:%.*]], ptr noundef writeonly 
captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) // APPROX-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] +// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] // APPROX-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // APPROX-NEXT: ret float [[TMP2]] // -// NCRDIV-LABEL: @test_frexpf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_frexpf( +// NCRDIV-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) // NCRDIV-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// NCRDIV-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA13:![0-9]+]] +// NCRDIV-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA13:![0-9]+]] // NCRDIV-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // NCRDIV-NEXT: ret float [[TMP2]] // -// AMDGCNSPIRV-LABEL: @test_frexpf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_frexpf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR7:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float [[X]]) // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = 
extractvalue { float, i32 } [[TMP0]], 1 -// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA13:![0-9]+]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y]], align 4, !tbaa [[INT_TBAA13:![0-9]+]] // AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // AMDGCNSPIRV-NEXT: ret float [[TMP2]] // @@ -2162,43 +2471,48 @@ extern "C" __device__ float test_frexpf(float x, int* y) { return frexpf(x, y); } -// DEFAULT-LABEL: @test_frexp( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_frexp( +// DEFAULT-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR7]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X]]) // DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] +// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] // DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // DEFAULT-NEXT: ret double [[TMP2]] // -// FINITEONLY-LABEL: @test_frexp( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_frexp( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR7]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// FINITEONLY-NEXT: store i32 
[[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] +// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] // FINITEONLY-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // FINITEONLY-NEXT: ret double [[TMP2]] // -// APPROX-LABEL: @test_frexp( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_frexp( +// APPROX-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR7]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X]]) // APPROX-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] +// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] // APPROX-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // APPROX-NEXT: ret double [[TMP2]] // -// NCRDIV-LABEL: @test_frexp( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_frexp( +// NCRDIV-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR7]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X]]) // NCRDIV-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// NCRDIV-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA13]] +// NCRDIV-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA13]] // NCRDIV-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // NCRDIV-NEXT: ret double [[TMP2]] // -// AMDGCNSPIRV-LABEL: @test_frexp( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail 
call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_frexp( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR7]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double [[X]]) // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA13]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y]], align 4, !tbaa [[INT_TBAA13]] // AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // AMDGCNSPIRV-NEXT: ret double [[TMP2]] // @@ -2206,150 +2520,175 @@ extern "C" __device__ double test_frexp(double x, int* y) { return frexp(x, y); } -// DEFAULT-LABEL: @test_hypotf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_hypotf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_hypotf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_hypotf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef 
nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_hypotf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_hypotf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_hypotf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_hypotf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_hypotf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_hypotf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] 
= tail call contract spir_func noundef addrspace(4) float @__ocml_hypot_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_hypotf(float x, float y) { return hypotf(x, y); } -// DEFAULT-LABEL: @test_hypot( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_hypot( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_hypot( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_hypot( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_hypot( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_hypot( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) 
local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_hypot( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_hypot( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_hypot( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_hypot( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_hypot_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_hypot(double x, double y) { return hypot(x, y); } -// DEFAULT-LABEL: @test_ilogbf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef i32 @test_ilogbf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 
@__ocml_ilogb_f32(float noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret i32 [[CALL_I]] // -// FINITEONLY-LABEL: @test_ilogbf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef i32 @test_ilogbf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret i32 [[CALL_I]] // -// APPROX-LABEL: @test_ilogbf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef i32 @test_ilogbf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret i32 [[CALL_I]] // -// NCRDIV-LABEL: @test_ilogbf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef i32 @test_ilogbf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret i32 [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_ilogbf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call spir_func noundef addrspace(4) i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef i32 @test_ilogbf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: 
[[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call spir_func noundef addrspace(4) i32 @__ocml_ilogb_f32(float noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret i32 [[CALL_I]] // extern "C" __device__ int test_ilogbf(float x) { return ilogbf(x); } -// DEFAULT-LABEL: @test_ilogb( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef i32 @test_ilogb( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret i32 [[CALL_I]] // -// FINITEONLY-LABEL: @test_ilogb( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef i32 @test_ilogb( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret i32 [[CALL_I]] // -// APPROX-LABEL: @test_ilogb( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef i32 @test_ilogb( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret i32 [[CALL_I]] // -// NCRDIV-LABEL: @test_ilogb( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local 
noundef i32 @test_ilogb( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret i32 [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_ilogb( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call spir_func noundef addrspace(4) i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef i32 @test_ilogb( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call spir_func noundef addrspace(4) i32 @__ocml_ilogb_f64(double noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret i32 [[CALL_I]] // extern "C" __device__ int test_ilogb(double x) { return ilogb(x); } -// DEFAULT-LABEL: @test___finitef( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local range(i32 0, 2) i32 @test___finitef( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X]]) // DEFAULT-NEXT: [[TMP1:%.*]] = fcmp one float [[TMP0]], 0x7FF0000000000000 // DEFAULT-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // DEFAULT-NEXT: ret i32 [[CONV]] // -// FINITEONLY-LABEL: @test___finitef( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___finitef( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret i32 1 // -// APPROX-LABEL: @test___finitef( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local range(i32 0, 2) 
i32 @test___finitef( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X]]) // APPROX-NEXT: [[TMP1:%.*]] = fcmp one float [[TMP0]], 0x7FF0000000000000 // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // -// NCRDIV-LABEL: @test___finitef( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local range(i32 0, 2) i32 @test___finitef( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X]]) // NCRDIV-NEXT: [[TMP1:%.*]] = fcmp one float [[TMP0]], 0x7FF0000000000000 // NCRDIV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // NCRDIV-NEXT: ret i32 [[CONV]] // -// AMDGCNSPIRV-LABEL: @test___finitef( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) float @llvm.fabs.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func range(i32 0, 2) i32 @test___finitef( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) float @llvm.fabs.f32(float [[X]]) // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = fcmp one float [[TMP0]], 0x7FF0000000000000 // AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // AMDGCNSPIRV-NEXT: ret i32 [[CONV]] @@ -2358,34 +2697,39 @@ extern "C" __device__ BOOL_TYPE test___finitef(float x) { return __finitef(x); } -// DEFAULT-LABEL: @test___finite( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local range(i32 0, 2) i32 @test___finite( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] 
+// DEFAULT-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X]]) // DEFAULT-NEXT: [[TMP1:%.*]] = fcmp one double [[TMP0]], 0x7FF0000000000000 // DEFAULT-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // DEFAULT-NEXT: ret i32 [[CONV]] // -// FINITEONLY-LABEL: @test___finite( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___finite( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret i32 1 // -// APPROX-LABEL: @test___finite( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local range(i32 0, 2) i32 @test___finite( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X]]) // APPROX-NEXT: [[TMP1:%.*]] = fcmp one double [[TMP0]], 0x7FF0000000000000 // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // -// NCRDIV-LABEL: @test___finite( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local range(i32 0, 2) i32 @test___finite( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X]]) // NCRDIV-NEXT: [[TMP1:%.*]] = fcmp one double [[TMP0]], 0x7FF0000000000000 // NCRDIV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // NCRDIV-NEXT: ret i32 [[CONV]] // -// AMDGCNSPIRV-LABEL: @test___finite( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) double @llvm.fabs.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func range(i32 0, 2) i32 @test___finite( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr 
addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) double @llvm.fabs.f64(double [[X]]) // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = fcmp one double [[TMP0]], 0x7FF0000000000000 // AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // AMDGCNSPIRV-NEXT: ret i32 [[CONV]] @@ -2394,34 +2738,39 @@ extern "C" __device__ BOOL_TYPE test___finite(double x) { return __finite(x); } -// DEFAULT-LABEL: @test___isinff( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local range(i32 0, 2) i32 @test___isinff( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X]]) // DEFAULT-NEXT: [[TMP1:%.*]] = fcmp oeq float [[TMP0]], 0x7FF0000000000000 // DEFAULT-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // DEFAULT-NEXT: ret i32 [[CONV]] // -// FINITEONLY-LABEL: @test___isinff( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___isinff( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret i32 0 // -// APPROX-LABEL: @test___isinff( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local range(i32 0, 2) i32 @test___isinff( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X]]) // APPROX-NEXT: [[TMP1:%.*]] = fcmp oeq float [[TMP0]], 0x7FF0000000000000 // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // -// NCRDIV-LABEL: @test___isinff( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call float 
@llvm.fabs.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local range(i32 0, 2) i32 @test___isinff( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X]]) // NCRDIV-NEXT: [[TMP1:%.*]] = fcmp oeq float [[TMP0]], 0x7FF0000000000000 // NCRDIV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // NCRDIV-NEXT: ret i32 [[CONV]] // -// AMDGCNSPIRV-LABEL: @test___isinff( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) float @llvm.fabs.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func range(i32 0, 2) i32 @test___isinff( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) float @llvm.fabs.f32(float [[X]]) // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = fcmp oeq float [[TMP0]], 0x7FF0000000000000 // AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // AMDGCNSPIRV-NEXT: ret i32 [[CONV]] @@ -2430,34 +2779,39 @@ extern "C" __device__ BOOL_TYPE test___isinff(float x) { return __isinff(x); } -// DEFAULT-LABEL: @test___isinf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local range(i32 0, 2) i32 @test___isinf( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X]]) // DEFAULT-NEXT: [[TMP1:%.*]] = fcmp oeq double [[TMP0]], 0x7FF0000000000000 // DEFAULT-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // DEFAULT-NEXT: ret i32 [[CONV]] // -// FINITEONLY-LABEL: @test___isinf( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___isinf( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr 
#[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret i32 0 // -// APPROX-LABEL: @test___isinf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local range(i32 0, 2) i32 @test___isinf( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X]]) // APPROX-NEXT: [[TMP1:%.*]] = fcmp oeq double [[TMP0]], 0x7FF0000000000000 // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // -// NCRDIV-LABEL: @test___isinf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local range(i32 0, 2) i32 @test___isinf( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call double @llvm.fabs.f64(double [[X]]) // NCRDIV-NEXT: [[TMP1:%.*]] = fcmp oeq double [[TMP0]], 0x7FF0000000000000 // NCRDIV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // NCRDIV-NEXT: ret i32 [[CONV]] // -// AMDGCNSPIRV-LABEL: @test___isinf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) double @llvm.fabs.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func range(i32 0, 2) i32 @test___isinf( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) double @llvm.fabs.f64(double [[X]]) // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = fcmp oeq double [[TMP0]], 0x7FF0000000000000 // AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // AMDGCNSPIRV-NEXT: ret i32 [[CONV]] @@ -2466,31 +2820,36 @@ extern "C" __device__ BOOL_TYPE test___isinf(double x) { return __isinf(x); } -// DEFAULT-LABEL: @test___isnanf( -// 
DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +// DEFAULT-LABEL: define dso_local range(i32 0, 2) i32 @test___isnanf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = fcmp uno float [[X]], 0.000000e+00 // DEFAULT-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // DEFAULT-NEXT: ret i32 [[CONV]] // -// FINITEONLY-LABEL: @test___isnanf( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___isnanf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret i32 0 // -// APPROX-LABEL: @test___isnanf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +// APPROX-LABEL: define dso_local range(i32 0, 2) i32 @test___isnanf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = fcmp uno float [[X]], 0.000000e+00 // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // -// NCRDIV-LABEL: @test___isnanf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +// NCRDIV-LABEL: define dso_local range(i32 0, 2) i32 @test___isnanf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = fcmp uno float [[X]], 0.000000e+00 // NCRDIV-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // NCRDIV-NEXT: ret i32 [[CONV]] // -// AMDGCNSPIRV-LABEL: @test___isnanf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +// AMDGCNSPIRV-LABEL: define spir_func range(i32 0, 2) i32 @test___isnanf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// 
AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = fcmp uno float [[X]], 0.000000e+00 // AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // AMDGCNSPIRV-NEXT: ret i32 [[CONV]] // @@ -2498,31 +2857,36 @@ extern "C" __device__ BOOL_TYPE test___isnanf(float x) { return __isnanf(x); } -// DEFAULT-LABEL: @test___isnan( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = fcmp uno double [[X:%.*]], 0.000000e+00 +// DEFAULT-LABEL: define dso_local range(i32 0, 2) i32 @test___isnan( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = fcmp uno double [[X]], 0.000000e+00 // DEFAULT-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // DEFAULT-NEXT: ret i32 [[CONV]] // -// FINITEONLY-LABEL: @test___isnan( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___isnan( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret i32 0 // -// APPROX-LABEL: @test___isnan( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = fcmp uno double [[X:%.*]], 0.000000e+00 +// APPROX-LABEL: define dso_local range(i32 0, 2) i32 @test___isnan( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = fcmp uno double [[X]], 0.000000e+00 // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // -// NCRDIV-LABEL: @test___isnan( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = fcmp uno double [[X:%.*]], 0.000000e+00 +// NCRDIV-LABEL: define dso_local range(i32 0, 2) i32 @test___isnan( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = fcmp uno double [[X]], 0.000000e+00 // NCRDIV-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // 
NCRDIV-NEXT: ret i32 [[CONV]] // -// AMDGCNSPIRV-LABEL: @test___isnan( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = fcmp uno double [[X:%.*]], 0.000000e+00 +// AMDGCNSPIRV-LABEL: define spir_func range(i32 0, 2) i32 @test___isnan( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = fcmp uno double [[X]], 0.000000e+00 // AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // AMDGCNSPIRV-NEXT: ret i32 [[CONV]] // @@ -2530,143 +2894,164 @@ extern "C" __device__ BOOL_TYPE test___isnan(double x) { return __isnan(x); } -// DEFAULT-LABEL: @test_j0f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_j0f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_j0f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_j0f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_j0f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// 
APPROX-LABEL: define dso_local noundef float @test_j0f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_j0f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_j0f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_j0f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_j0f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j0_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_j0f(float x) { return j0f(x); } -// DEFAULT-LABEL: @test_j0( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_j0( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: 
@test_j0( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_j0( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_j0( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_j0( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_j0( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_j0( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_j0( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_j0( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) 
#[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j0_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_j0(double x) { return j0(x); } -// DEFAULT-LABEL: @test_j1f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_j1f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_j1f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_j1f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_j1f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_j1f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_j1f( -// 
NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_j1f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_j1f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_j1f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j1_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_j1f(float x) { return j1f(x); } -// DEFAULT-LABEL: @test_j1( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_j1( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_j1( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_j1( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) 
[[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_j1( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_j1( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_j1( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_j1( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_j1( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_j1( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j1_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_j1(double x) { return j1(x); } -// DEFAULT-LABEL: @test_jnf( -// DEFAULT-NEXT: 
entry: -// DEFAULT-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// DEFAULT-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// DEFAULT-LABEL: define dso_local float @test_jnf( +// DEFAULT-SAME: i32 noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// DEFAULT-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// DEFAULT-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // DEFAULT-NEXT: ] -// DEFAULT: if.then.i: -// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] -// DEFAULT-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]] -// DEFAULT: if.then2.i: +// DEFAULT: [[IF_THEN_I]]: +// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: br label %[[_ZL3JNFIF_EXIT:.*]] +// DEFAULT: [[IF_THEN2_I]]: // DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]] -// DEFAULT-NEXT: br label [[_ZL3JNFIF_EXIT]] -// DEFAULT: if.end4.i: +// DEFAULT-NEXT: br label %[[_ZL3JNFIF_EXIT]] +// DEFAULT: [[IF_END4_I]]: // DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]] // DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]] // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]] -// DEFAULT: for.body.i: -// DEFAULT-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// 
DEFAULT-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3JNFIF_EXIT]] +// DEFAULT: [[FOR_BODY_I]]: +// DEFAULT-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] @@ -2674,32 +3059,33 @@ extern "C" __device__ double test_j1(double x) { // DEFAULT-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] // DEFAULT-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3JNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP14:![0-9]+]] -// DEFAULT: _ZL3jnfif.exit: -// DEFAULT-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3JNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP14:![0-9]+]] +// DEFAULT: [[_ZL3JNFIF_EXIT]]: +// DEFAULT-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // DEFAULT-NEXT: ret float [[RETVAL_0_I]] // -// FINITEONLY-LABEL: @test_jnf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// FINITEONLY-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test_jnf( +// FINITEONLY-SAME: i32 
noundef [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// FINITEONLY-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// FINITEONLY-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // FINITEONLY-NEXT: ] -// FINITEONLY: if.then.i: -// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]] -// FINITEONLY-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]] -// FINITEONLY: if.then2.i: +// FINITEONLY: [[IF_THEN_I]]: +// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: br label %[[_ZL3JNFIF_EXIT:.*]] +// FINITEONLY: [[IF_THEN2_I]]: // FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] -// FINITEONLY-NEXT: br label [[_ZL3JNFIF_EXIT]] -// FINITEONLY: if.end4.i: +// FINITEONLY-NEXT: br label %[[_ZL3JNFIF_EXIT]] +// FINITEONLY: [[IF_END4_I]]: // FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]] -// FINITEONLY: for.body.i: -// FINITEONLY-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// FINITEONLY-NEXT: [[__X0_0_I2:%.*]] 
= phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3JNFIF_EXIT]] +// FINITEONLY: [[FOR_BODY_I]]: +// FINITEONLY-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[CONV_I]], [[Y]] @@ -2707,32 +3093,33 @@ extern "C" __device__ double test_j1(double x) { // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract float [[MUL8_I]], [[__X0_0_I2]] // FINITEONLY-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // FINITEONLY-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// FINITEONLY-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3JNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP14:![0-9]+]] -// FINITEONLY: _ZL3jnfif.exit: -// FINITEONLY-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// FINITEONLY-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3JNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP14:![0-9]+]] +// FINITEONLY: [[_ZL3JNFIF_EXIT]]: +// FINITEONLY-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // FINITEONLY-NEXT: ret float [[RETVAL_0_I]] // -// APPROX-LABEL: @test_jnf( -// APPROX-NEXT: entry: -// APPROX-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// APPROX-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// APPROX-NEXT: 
i32 1, label [[IF_THEN2_I:%.*]] +// APPROX-LABEL: define dso_local float @test_jnf( +// APPROX-SAME: i32 noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// APPROX-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// APPROX-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // APPROX-NEXT: ] -// APPROX: if.then.i: -// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] -// APPROX-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]] -// APPROX: if.then2.i: +// APPROX: [[IF_THEN_I]]: +// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: br label %[[_ZL3JNFIF_EXIT:.*]] +// APPROX: [[IF_THEN2_I]]: // APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]] -// APPROX-NEXT: br label [[_ZL3JNFIF_EXIT]] -// APPROX: if.end4.i: +// APPROX-NEXT: br label %[[_ZL3JNFIF_EXIT]] +// APPROX: [[IF_END4_I]]: // APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]] // APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]] // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]] -// APPROX: for.body.i: -// APPROX-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// APPROX-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// APPROX-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// APPROX-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3JNFIF_EXIT]] +// APPROX: [[FOR_BODY_I]]: +// APPROX-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], 
%[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// APPROX-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// APPROX-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] @@ -2740,32 +3127,33 @@ extern "C" __device__ double test_j1(double x) { // APPROX-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] // APPROX-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // APPROX-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// APPROX-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3JNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP14:![0-9]+]] -// APPROX: _ZL3jnfif.exit: -// APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// APPROX-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3JNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP14:![0-9]+]] +// APPROX: [[_ZL3JNFIF_EXIT]]: +// APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // APPROX-NEXT: ret float [[RETVAL_0_I]] // -// NCRDIV-LABEL: @test_jnf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// NCRDIV-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// NCRDIV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// NCRDIV-LABEL: define dso_local float @test_jnf( +// NCRDIV-SAME: i32 noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// NCRDIV-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// NCRDIV-NEXT: i32 1, 
label %[[IF_THEN2_I:.*]] // NCRDIV-NEXT: ] -// NCRDIV: if.then.i: -// NCRDIV-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] -// NCRDIV-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]] -// NCRDIV: if.then2.i: +// NCRDIV: [[IF_THEN_I]]: +// NCRDIV-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]] +// NCRDIV-NEXT: br label %[[_ZL3JNFIF_EXIT:.*]] +// NCRDIV: [[IF_THEN2_I]]: // NCRDIV-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]] -// NCRDIV-NEXT: br label [[_ZL3JNFIF_EXIT]] -// NCRDIV: if.end4.i: +// NCRDIV-NEXT: br label %[[_ZL3JNFIF_EXIT]] +// NCRDIV: [[IF_END4_I]]: // NCRDIV-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]] // NCRDIV-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]] // NCRDIV-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// NCRDIV-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]] -// NCRDIV: for.body.i: -// NCRDIV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// NCRDIV-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// NCRDIV-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// NCRDIV-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3JNFIF_EXIT]] +// NCRDIV: [[FOR_BODY_I]]: +// NCRDIV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// NCRDIV-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// NCRDIV-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // NCRDIV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // 
NCRDIV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]], !fpmath [[META12]] @@ -2773,32 +3161,33 @@ extern "C" __device__ double test_j1(double x) { // NCRDIV-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] // NCRDIV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // NCRDIV-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// NCRDIV-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3JNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] -// NCRDIV: _ZL3jnfif.exit: -// NCRDIV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// NCRDIV-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3JNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] +// NCRDIV: [[_ZL3JNFIF_EXIT]]: +// NCRDIV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // NCRDIV-NEXT: ret float [[RETVAL_0_I]] // -// AMDGCNSPIRV-LABEL: @test_jnf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// AMDGCNSPIRV-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// AMDGCNSPIRV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func float @test_jnf( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// AMDGCNSPIRV-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// AMDGCNSPIRV-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // AMDGCNSPIRV-NEXT: ] -// AMDGCNSPIRV: if.then.i: -// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: br label 
[[_ZL3JNFIF_EXIT:%.*]] -// AMDGCNSPIRV: if.then2.i: +// AMDGCNSPIRV: [[IF_THEN_I]]: +// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label %[[_ZL3JNFIF_EXIT:.*]] +// AMDGCNSPIRV: [[IF_THEN2_I]]: // AMDGCNSPIRV-NEXT: [[CALL_I22_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: br label [[_ZL3JNFIF_EXIT]] -// AMDGCNSPIRV: if.end4.i: +// AMDGCNSPIRV-NEXT: br label %[[_ZL3JNFIF_EXIT]] +// AMDGCNSPIRV: [[IF_END4_I]]: // AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CALL_I21_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// AMDGCNSPIRV-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]] -// AMDGCNSPIRV: for.body.i: -// AMDGCNSPIRV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// AMDGCNSPIRV-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// AMDGCNSPIRV-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3JNFIF_EXIT]] +// AMDGCNSPIRV: [[FOR_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // 
AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] @@ -2806,36 +3195,37 @@ extern "C" __device__ double test_j1(double x) { // AMDGCNSPIRV-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] // AMDGCNSPIRV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // AMDGCNSPIRV-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3JNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] -// AMDGCNSPIRV: _ZL3jnfif.exit: -// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3JNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] +// AMDGCNSPIRV: [[_ZL3JNFIF_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // AMDGCNSPIRV-NEXT: ret float [[RETVAL_0_I]] // extern "C" __device__ float test_jnf(int x, float y) { return jnf(x, y); } -// DEFAULT-LABEL: @test_jn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// DEFAULT-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// DEFAULT-LABEL: define dso_local double @test_jn( +// DEFAULT-SAME: i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// DEFAULT-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// DEFAULT-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // DEFAULT-NEXT: ] -// DEFAULT: if.then.i: -// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] -// 
DEFAULT-NEXT: br label [[_ZL2JNID_EXIT:%.*]] -// DEFAULT: if.then2.i: +// DEFAULT: [[IF_THEN_I]]: +// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: br label %[[_ZL2JNID_EXIT:.*]] +// DEFAULT: [[IF_THEN2_I]]: // DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]] -// DEFAULT-NEXT: br label [[_ZL2JNID_EXIT]] -// DEFAULT: if.end4.i: +// DEFAULT-NEXT: br label %[[_ZL2JNID_EXIT]] +// DEFAULT: [[IF_END4_I]]: // DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]] // DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]] // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]] -// DEFAULT: for.body.i: -// DEFAULT-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// DEFAULT-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2JNID_EXIT]] +// DEFAULT: [[FOR_BODY_I]]: +// DEFAULT-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] @@ -2843,32 +3233,33 @@ 
extern "C" __device__ float test_jnf(int x, float y) { // DEFAULT-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] // DEFAULT-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2JNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] -// DEFAULT: _ZL2jnid.exit: -// DEFAULT-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2JNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] +// DEFAULT: [[_ZL2JNID_EXIT]]: +// DEFAULT-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // DEFAULT-NEXT: ret double [[RETVAL_0_I]] // -// FINITEONLY-LABEL: @test_jn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// FINITEONLY-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) double @test_jn( +// FINITEONLY-SAME: i32 noundef [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// FINITEONLY-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// FINITEONLY-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // FINITEONLY-NEXT: ] -// FINITEONLY: if.then.i: -// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]] -// FINITEONLY-NEXT: br label [[_ZL2JNID_EXIT:%.*]] -// FINITEONLY: if.then2.i: +// FINITEONLY: [[IF_THEN_I]]: +// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = 
tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: br label %[[_ZL2JNID_EXIT:.*]] +// FINITEONLY: [[IF_THEN2_I]]: // FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] -// FINITEONLY-NEXT: br label [[_ZL2JNID_EXIT]] -// FINITEONLY: if.end4.i: +// FINITEONLY-NEXT: br label %[[_ZL2JNID_EXIT]] +// FINITEONLY: [[IF_END4_I]]: // FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]] -// FINITEONLY: for.body.i: -// FINITEONLY-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2JNID_EXIT]] +// FINITEONLY: [[FOR_BODY_I]]: +// FINITEONLY-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 
[[MUL_I]] to double // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double [[CONV_I]], [[Y]] @@ -2876,32 +3267,33 @@ extern "C" __device__ float test_jnf(int x, float y) { // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract double [[MUL8_I]], [[__X0_0_I2]] // FINITEONLY-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // FINITEONLY-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// FINITEONLY-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2JNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] -// FINITEONLY: _ZL2jnid.exit: -// FINITEONLY-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// FINITEONLY-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2JNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] +// FINITEONLY: [[_ZL2JNID_EXIT]]: +// FINITEONLY-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // FINITEONLY-NEXT: ret double [[RETVAL_0_I]] // -// APPROX-LABEL: @test_jn( -// APPROX-NEXT: entry: -// APPROX-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// APPROX-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// APPROX-LABEL: define dso_local double @test_jn( +// APPROX-SAME: i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// APPROX-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// APPROX-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // APPROX-NEXT: ] -// APPROX: if.then.i: -// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] -// APPROX-NEXT: br label [[_ZL2JNID_EXIT:%.*]] -// APPROX: if.then2.i: +// APPROX: [[IF_THEN_I]]: +// APPROX-NEXT: 
[[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: br label %[[_ZL2JNID_EXIT:.*]] +// APPROX: [[IF_THEN2_I]]: // APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]] -// APPROX-NEXT: br label [[_ZL2JNID_EXIT]] -// APPROX: if.end4.i: +// APPROX-NEXT: br label %[[_ZL2JNID_EXIT]] +// APPROX: [[IF_END4_I]]: // APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]] // APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]] // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]] -// APPROX: for.body.i: -// APPROX-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// APPROX-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// APPROX-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// APPROX-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2JNID_EXIT]] +// APPROX: [[FOR_BODY_I]]: +// APPROX-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// APPROX-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// APPROX-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] @@ -2909,32 +3301,33 @@ extern "C" __device__ float test_jnf(int x, float y) { // APPROX-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] // 
APPROX-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // APPROX-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// APPROX-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2JNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] -// APPROX: _ZL2jnid.exit: -// APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// APPROX-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2JNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] +// APPROX: [[_ZL2JNID_EXIT]]: +// APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // APPROX-NEXT: ret double [[RETVAL_0_I]] // -// NCRDIV-LABEL: @test_jn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// NCRDIV-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// NCRDIV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// NCRDIV-LABEL: define dso_local double @test_jn( +// NCRDIV-SAME: i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// NCRDIV-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// NCRDIV-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // NCRDIV-NEXT: ] -// NCRDIV: if.then.i: -// NCRDIV-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] -// NCRDIV-NEXT: br label [[_ZL2JNID_EXIT:%.*]] -// NCRDIV: if.then2.i: +// NCRDIV: [[IF_THEN_I]]: +// NCRDIV-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]] +// NCRDIV-NEXT: br label %[[_ZL2JNID_EXIT:.*]] +// NCRDIV: [[IF_THEN2_I]]: // NCRDIV-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]] -// NCRDIV-NEXT: br 
label [[_ZL2JNID_EXIT]] -// NCRDIV: if.end4.i: +// NCRDIV-NEXT: br label %[[_ZL2JNID_EXIT]] +// NCRDIV: [[IF_END4_I]]: // NCRDIV-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]] // NCRDIV-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]] // NCRDIV-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// NCRDIV-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]] -// NCRDIV: for.body.i: -// NCRDIV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// NCRDIV-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// NCRDIV-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// NCRDIV-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2JNID_EXIT]] +// NCRDIV: [[FOR_BODY_I]]: +// NCRDIV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// NCRDIV-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// NCRDIV-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // NCRDIV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // NCRDIV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] @@ -2942,32 +3335,33 @@ extern "C" __device__ float test_jnf(int x, float y) { // NCRDIV-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] // NCRDIV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // NCRDIV-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// NCRDIV-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2JNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP16:![0-9]+]] -// NCRDIV: _ZL2jnid.exit: -// NCRDIV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ 
[[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// NCRDIV-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2JNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP16:![0-9]+]] +// NCRDIV: [[_ZL2JNID_EXIT]]: +// NCRDIV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // NCRDIV-NEXT: ret double [[RETVAL_0_I]] // -// AMDGCNSPIRV-LABEL: @test_jn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// AMDGCNSPIRV-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// AMDGCNSPIRV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func double @test_jn( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// AMDGCNSPIRV-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// AMDGCNSPIRV-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // AMDGCNSPIRV-NEXT: ] -// AMDGCNSPIRV: if.then.i: -// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: br label [[_ZL2JNID_EXIT:%.*]] -// AMDGCNSPIRV: if.then2.i: +// AMDGCNSPIRV: [[IF_THEN_I]]: +// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label %[[_ZL2JNID_EXIT:.*]] +// AMDGCNSPIRV: [[IF_THEN2_I]]: // AMDGCNSPIRV-NEXT: [[CALL_I22_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: br label [[_ZL2JNID_EXIT]] -// AMDGCNSPIRV: if.end4.i: +// AMDGCNSPIRV-NEXT: br label %[[_ZL2JNID_EXIT]] +// AMDGCNSPIRV: [[IF_END4_I]]: 
// AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CALL_I21_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// AMDGCNSPIRV-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]] -// AMDGCNSPIRV: for.body.i: -// AMDGCNSPIRV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// AMDGCNSPIRV-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// AMDGCNSPIRV-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2JNID_EXIT]] +// AMDGCNSPIRV: [[FOR_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] @@ -2975,158 +3369,183 @@ extern "C" __device__ float test_jnf(int x, float y) { // AMDGCNSPIRV-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] // AMDGCNSPIRV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // AMDGCNSPIRV-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2JNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP16:![0-9]+]] -// AMDGCNSPIRV: _ZL2jnid.exit: -// AMDGCNSPIRV-NEXT: 
[[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2JNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP16:![0-9]+]] +// AMDGCNSPIRV: [[_ZL2JNID_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // AMDGCNSPIRV-NEXT: ret double [[RETVAL_0_I]] // extern "C" __device__ double test_jn(int x, double y) { return jn(x, y); } -// DEFAULT-LABEL: @test_ldexpf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_ldexpf( +// DEFAULT-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X]], i32 [[Y]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_ldexpf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_ldexpf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X]], i32 [[Y]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_ldexpf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// APPROX-LABEL: 
define dso_local noundef float @test_ldexpf( +// APPROX-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X]], i32 [[Y]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_ldexpf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_ldexpf( +// NCRDIV-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X]], i32 [[Y]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_ldexpf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_ldexpf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ldexp.f32.i32(float [[X]], i32 [[Y]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_ldexpf(float x, int y) { return ldexpf(x, y); } -// DEFAULT-LABEL: @test_ldexp( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_ldexp( +// DEFAULT-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[Y]]) // 
DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_ldexp( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_ldexp( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X]], i32 [[Y]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_ldexp( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_ldexp( +// APPROX-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[Y]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_ldexp( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_ldexp( +// NCRDIV-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[Y]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_ldexp( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_ldexp( +// 
AMDGCNSPIRV-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ldexp.f64.i32(double [[X]], i32 [[Y]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_ldexp(double x, int y) { return ldexp(x, y); } -// DEFAULT-LABEL: @test_lgammaf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_lgammaf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_lgammaf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_lgammaf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_lgammaf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_lgammaf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract 
noundef float @__ocml_lgamma_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_lgammaf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_lgammaf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_lgammaf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_lgammaf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_lgamma_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_lgammaf(float x) { return lgammaf(x); } -// DEFAULT-LABEL: @test_lgamma( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_lgamma( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_lgamma( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double 
@__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_lgamma( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_lgamma( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_lgamma( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_lgamma( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_lgamma( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_lgamma( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_lgamma( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = 
tail call contract spir_func noundef addrspace(4) double @__ocml_lgamma_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_lgamma(double x) { return lgamma(x); } -// DEFAULT-LABEL: @test_llrintf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local i64 @test_llrintf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X]]) // DEFAULT-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // DEFAULT-NEXT: ret i64 [[CONV_I]] // -// FINITEONLY-LABEL: @test_llrintf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local i64 @test_llrintf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // -// APPROX-LABEL: @test_llrintf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local i64 @test_llrintf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X]]) // APPROX-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // -// NCRDIV-LABEL: @test_llrintf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define 
dso_local i64 @test_llrintf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X]]) // NCRDIV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // NCRDIV-NEXT: ret i64 [[CONV_I]] // -// AMDGCNSPIRV-LABEL: @test_llrintf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.rint.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func i64 @test_llrintf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.rint.f32(float [[X]]) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] // @@ -3134,33 +3553,38 @@ extern "C" __device__ long long int test_llrintf(float x) { return llrintf(x); } -// DEFAULT-LABEL: @test_llrint( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local i64 @test_llrint( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X]]) // DEFAULT-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // DEFAULT-NEXT: ret i64 [[CONV_I]] // -// FINITEONLY-LABEL: @test_llrint( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local i64 @test_llrint( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double nofpclass(nan inf) [[X]]) 
// FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // -// APPROX-LABEL: @test_llrint( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local i64 @test_llrint( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X]]) // APPROX-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // -// NCRDIV-LABEL: @test_llrint( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local i64 @test_llrint( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X]]) // NCRDIV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // NCRDIV-NEXT: ret i64 [[CONV_I]] // -// AMDGCNSPIRV-LABEL: @test_llrint( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.rint.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func i64 @test_llrint( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.rint.f64(double [[X]]) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] // @@ -3168,33 +3592,38 @@ extern "C" __device__ long long int test_llrint(double x) { return llrint(x); } -// DEFAULT-LABEL: @test_llroundf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local i64 @test_llroundf( +// 
DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X]]) // DEFAULT-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // DEFAULT-NEXT: ret i64 [[CONV_I]] // -// FINITEONLY-LABEL: @test_llroundf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local i64 @test_llroundf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // -// APPROX-LABEL: @test_llroundf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local i64 @test_llroundf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X]]) // APPROX-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // -// NCRDIV-LABEL: @test_llroundf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local i64 @test_llroundf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X]]) // NCRDIV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // NCRDIV-NEXT: ret i64 [[CONV_I]] // -// AMDGCNSPIRV-LABEL: @test_llroundf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail 
call contract addrspace(4) float @llvm.round.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func i64 @test_llroundf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.round.f32(float [[X]]) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] // @@ -3202,33 +3631,38 @@ extern "C" __device__ long long int test_llroundf(float x) { return llroundf(x); } -// DEFAULT-LABEL: @test_llround( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local i64 @test_llround( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X]]) // DEFAULT-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // DEFAULT-NEXT: ret i64 [[CONV_I]] // -// FINITEONLY-LABEL: @test_llround( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local i64 @test_llround( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // -// APPROX-LABEL: @test_llround( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local i64 @test_llround( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: 
[[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X]]) // APPROX-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // -// NCRDIV-LABEL: @test_llround( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local i64 @test_llround( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X]]) // NCRDIV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // NCRDIV-NEXT: ret i64 [[CONV_I]] // -// AMDGCNSPIRV-LABEL: @test_llround( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.round.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func i64 @test_llround( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.round.f64(double [[X]]) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] // @@ -3236,294 +3670,344 @@ extern "C" __device__ long long int test_llround(double x) { return llround(x); } -// DEFAULT-LABEL: @test_log10f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_log10f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_log10f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float 
@llvm.log10.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_log10f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_log10f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_log10f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_log10f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_log10f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_log10f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log10.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_log10f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log10.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_log10f(float x) { return log10f(x); } -// DEFAULT-LABEL: @test_log10( -// 
DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_log10( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_log10( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_log10( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_log10( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_log10( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_log10( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_log10( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: 
[[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_log10( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_log10( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log10_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_log10(double x) { return log10(x); } -// DEFAULT-LABEL: @test_log1pf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_log1pf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_log1pf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_log1pf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // 
FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_log1pf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_log1pf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_log1pf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_log1pf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_log1pf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_log1pf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_log1p_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_log1pf(float x) { return log1pf(x); } -// DEFAULT-LABEL: @test_log1p( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_log1p( +// DEFAULT-SAME: double 
noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_log1p( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_log1p( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_log1p( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_log1p( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_log1p( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_log1p( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_log1p( -// AMDGCNSPIRV-NEXT: 
entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_log1p( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log1p_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_log1p(double x) { return log1p(x); } -// DEFAULT-LABEL: @test_log2f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log2.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_log2f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log2.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_log2f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log2.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_log2f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log2.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_log2f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_log2f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: 
[[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_log2f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log2.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_log2f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log2.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_log2f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log2.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_log2f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log2.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_log2f(float x) { return log2f(x); } -// DEFAULT-LABEL: @test_log2( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_log2( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_log2( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double 
@test_log2( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_log2( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_log2( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_log2( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_log2( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_log2( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_log2( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log2_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ 
double test_log2(double x) { return log2(x); } -// DEFAULT-LABEL: @test_logbf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_logbf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_logbf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_logbf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_logbf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_logbf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_logbf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_logbf( +// NCRDIV-SAME: float noundef 
[[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_logbf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_logbf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_logb_f32(float noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_logbf(float x) { return logbf(x); } -// DEFAULT-LABEL: @test_logb( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_logb( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_logb( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_logb( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef 
nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_logb( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_logb( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_logb( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_logb( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_logb( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_logb( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_logb_f64(double noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_logb(double x) { return logb(x); } -// DEFAULT-LABEL: @test_logf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_logf( +// DEFAULT-SAME: 
float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_logf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_logf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_logf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_logf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_logf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_logf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_logf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_logf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr 
addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_logf(float x) { return logf(x); } -// DEFAULT-LABEL: @test_lrintf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local i64 @test_lrintf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X]]) // DEFAULT-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // DEFAULT-NEXT: ret i64 [[CONV_I]] // -// FINITEONLY-LABEL: @test_lrintf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local i64 @test_lrintf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // -// APPROX-LABEL: @test_lrintf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local i64 @test_lrintf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X]]) // APPROX-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // -// NCRDIV-LABEL: @test_lrintf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float 
@llvm.rint.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local i64 @test_lrintf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X]]) // NCRDIV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // NCRDIV-NEXT: ret i64 [[CONV_I]] // -// AMDGCNSPIRV-LABEL: @test_lrintf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.rint.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func i64 @test_lrintf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.rint.f32(float [[X]]) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] // @@ -3531,33 +4015,38 @@ extern "C" __device__ long int test_lrintf(float x) { return lrintf(x); } -// DEFAULT-LABEL: @test_lrint( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local i64 @test_lrint( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X]]) // DEFAULT-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // DEFAULT-NEXT: ret i64 [[CONV_I]] // -// FINITEONLY-LABEL: @test_lrint( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local i64 @test_lrint( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double 
@llvm.rint.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // -// APPROX-LABEL: @test_lrint( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local i64 @test_lrint( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X]]) // APPROX-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // -// NCRDIV-LABEL: @test_lrint( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local i64 @test_lrint( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X]]) // NCRDIV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // NCRDIV-NEXT: ret i64 [[CONV_I]] // -// AMDGCNSPIRV-LABEL: @test_lrint( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.rint.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func i64 @test_lrint( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.rint.f64(double [[X]]) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] // @@ -3565,33 +4054,38 @@ extern "C" __device__ long int test_lrint(double x) { return lrint(x); } -// DEFAULT-LABEL: @test_lroundf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local 
i64 @test_lroundf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X]]) // DEFAULT-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // DEFAULT-NEXT: ret i64 [[CONV_I]] // -// FINITEONLY-LABEL: @test_lroundf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local i64 @test_lroundf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // -// APPROX-LABEL: @test_lroundf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local i64 @test_lroundf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X]]) // APPROX-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // -// NCRDIV-LABEL: @test_lroundf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local i64 @test_lroundf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X]]) // NCRDIV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // NCRDIV-NEXT: ret i64 [[CONV_I]] // -// AMDGCNSPIRV-LABEL: @test_lroundf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: 
[[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.round.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func i64 @test_lroundf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.round.f32(float [[X]]) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] // @@ -3599,33 +4093,38 @@ extern "C" __device__ long int test_lroundf(float x) { return lroundf(x); } -// DEFAULT-LABEL: @test_lround( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local i64 @test_lround( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X]]) // DEFAULT-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // DEFAULT-NEXT: ret i64 [[CONV_I]] // -// FINITEONLY-LABEL: @test_lround( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local i64 @test_lround( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // -// APPROX-LABEL: @test_lround( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local i64 @test_lround( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: 
[[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X]]) // APPROX-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // -// NCRDIV-LABEL: @test_lround( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local i64 @test_lround( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X]]) // NCRDIV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // NCRDIV-NEXT: ret i64 [[CONV_I]] // -// AMDGCNSPIRV-LABEL: @test_lround( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.round.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func i64 @test_lround( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.round.f64(double [[X]]) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] // @@ -3633,54 +4132,59 @@ extern "C" __device__ long int test_lround(double x) { return lround(x); } -// DEFAULT-LABEL: @test_modff( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local noundef float @test_modff( +// DEFAULT-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] 
-// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]] -// DEFAULT-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16:![0-9]+]] +// DEFAULT-NEXT: store float [[TMP0]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_modff( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_modff( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]] -// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16:![0-9]+]] +// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Y]], align 4, !tbaa 
[[FLOAT_TBAA16]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_modff( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local noundef float @test_modff( +// APPROX-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]] -// APPROX-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16:![0-9]+]] +// APPROX-NEXT: store float [[TMP0]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_modff( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local noundef float @test_modff( +// NCRDIV-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]] -// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr 
addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA17:![0-9]+]] -// NCRDIV-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA17]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA17:![0-9]+]] +// NCRDIV-NEXT: store float [[TMP0]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA17]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_modff( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_modff( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4 // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15:[0-9]+]] -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) float @__ocml_modf_f32(float noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA17:![0-9]+]] -// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) float @__ocml_modf_f32(float noundef [[X]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa 
[[FLOAT_TBAA17:![0-9]+]] +// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Y]], align 4, !tbaa [[FLOAT_TBAA17]] // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // @@ -3688,54 +4192,59 @@ extern "C" __device__ float test_modff(float x, float* y) { return modff(x, y); } -// DEFAULT-LABEL: @test_modf( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local noundef double @test_modf( +// DEFAULT-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]] -// DEFAULT-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18:![0-9]+]] +// DEFAULT-NEXT: store double [[TMP0]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_modf( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_modf( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// 
FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]] -// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18:![0-9]+]] +// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_modf( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local noundef double @test_modf( +// APPROX-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]] -// APPROX-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] +// APPROX-NEXT: [[CALL_I:%.*]] = call 
contract noundef double @__ocml_modf_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18:![0-9]+]] +// APPROX-NEXT: store double [[TMP0]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_modf( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local noundef double @test_modf( +// NCRDIV-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA19:![0-9]+]] -// NCRDIV-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA19]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA19:![0-9]+]] +// NCRDIV-NEXT: store double [[TMP0]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA19]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_modf( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_modf( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) 
local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8 // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) double @__ocml_modf_f64(double noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[TBAA19:![0-9]+]] -// AMDGCNSPIRV-NEXT: store double [[TMP0]], ptr addrspace(4) [[Y:%.*]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) double @__ocml_modf_f64(double noundef [[X]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[DOUBLE_TBAA19:![0-9]+]] +// AMDGCNSPIRV-NEXT: store double [[TMP0]], ptr addrspace(4) [[Y]], align 8, !tbaa [[DOUBLE_TBAA19]] // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // @@ -3743,325 +4252,330 @@ extern "C" __device__ double test_modf(double x, double* y) { return modf(x, y); } -// DEFAULT-LABEL: @test_nanf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// DEFAULT-LABEL: define dso_local float @test_nanf( +// DEFAULT-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I_PREHEADER:%.*]] -// DEFAULT: 
while.cond.i14.i.i.preheader: -// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I14_I_I_PREHEADER:.*]] +// DEFAULT: [[WHILE_COND_I14_I_I_PREHEADER]]: +// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I17_I_I5:%.*]] = icmp eq i8 [[TMP1]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I18_I_I:%.*]] -// DEFAULT: if.then.i.i: +// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label %[[_ZL4NANFPKC_EXIT:.*]], label %[[WHILE_BODY_I18_I_I:.*]] +// DEFAULT: [[IF_THEN_I_I]]: // DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// DEFAULT-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// DEFAULT-NEXT: switch i8 [[TMP2]], label [[WHILE_COND_I_I_I_PREHEADER:%.*]] [ -// DEFAULT-NEXT: i8 120, label [[IF_THEN5_I_I:%.*]] -// DEFAULT-NEXT: i8 88, label [[IF_THEN5_I_I]] +// DEFAULT-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: switch i8 [[TMP2]], label %[[WHILE_COND_I_I_I_PREHEADER:.*]] [ +// DEFAULT-NEXT: i8 120, label %[[IF_THEN5_I_I:.*]] +// DEFAULT-NEXT: i8 88, label %[[IF_THEN5_I_I]] // DEFAULT-NEXT: ] -// DEFAULT: while.cond.i.i.i.preheader: -// DEFAULT-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT: [[WHILE_COND_I_I_I_PREHEADER]]: +// DEFAULT-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I_I_I14:%.*]] = icmp eq i8 [[TMP3]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I14]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// DEFAULT: if.then5.i.i: -// DEFAULT-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I14]], label %[[_ZL4NANFPKC_EXIT]], label 
%[[WHILE_BODY_I_I_I:.*]] +// DEFAULT: [[IF_THEN5_I_I]]: +// DEFAULT-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I30_I_I9:%.*]] = icmp eq i8 [[TMP4]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I31_I_I:%.*]] -// DEFAULT: while.body.i31.i.i: -// DEFAULT-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], [[IF_END31_I_I_I:%.*]] ], [ [[TMP4]], [[IF_THEN5_I_I]] ] -// DEFAULT-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], [[IF_END31_I_I_I]] ], [ 0, [[IF_THEN5_I_I]] ] -// DEFAULT-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], [[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN5_I_I]] ] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I31_I_I:.*]] +// DEFAULT: [[WHILE_BODY_I31_I_I]]: +// DEFAULT-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_END31_I_I_I:.*]] ], [ [[TMP4]], %[[IF_THEN5_I_I]] ] +// DEFAULT-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_THEN5_I_I]] ] +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN5_I_I]] ] // DEFAULT-NEXT: [[TMP6:%.*]] = add i8 [[TMP5]], -48 // DEFAULT-NEXT: [[OR_COND_I32_I_I:%.*]] = icmp ult i8 [[TMP6]], 10 -// DEFAULT-NEXT: br i1 [[OR_COND_I32_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE_I_I_I:%.*]] -// DEFAULT: if.else.i.i.i: +// DEFAULT-NEXT: br i1 [[OR_COND_I32_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE_I_I_I:.*]] +// DEFAULT: [[IF_ELSE_I_I_I]]: // DEFAULT-NEXT: [[TMP7:%.*]] = add i8 [[TMP5]], -97 // DEFAULT-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP7]], 6 -// DEFAULT-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// DEFAULT: if.else17.i.i.i: +// DEFAULT-NEXT: br i1 [[OR_COND33_I_I_I]], label %[[IF_END31_I_I_I]], label 
%[[IF_ELSE17_I_I_I:.*]] +// DEFAULT: [[IF_ELSE17_I_I_I]]: // DEFAULT-NEXT: [[TMP8:%.*]] = add i8 [[TMP5]], -65 // DEFAULT-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP8]], 6 -// DEFAULT-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[_ZL4NANFPKC_EXIT]] -// DEFAULT: if.end31.i.i.i: -// DEFAULT-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I31_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND34_I_I_I]], label %[[IF_END31_I_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// DEFAULT: [[IF_END31_I_I_I]]: +// DEFAULT-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I31_I_I]] ], [ -87, %[[IF_ELSE_I_I_I]] ], [ -55, %[[IF_ELSE17_I_I_I]] ] // DEFAULT-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I29_I_I11]], 4 // DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP5]] to i64 // DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // DEFAULT-NEXT: [[ADD28_I_I_I]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I34_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I28_I_I10]], i64 1 -// DEFAULT-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I30_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] -// DEFAULT: while.body.i.i.i: -// DEFAULT-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], [[IF_THEN_I_I_I:%.*]] ], [ [[TMP3]], [[WHILE_COND_I_I_I_PREHEADER]] ] -// DEFAULT-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ] -// DEFAULT-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I_I_I_PREHEADER]] ] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I30_I_I]], label 
%[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] +// DEFAULT: [[WHILE_BODY_I_I_I]]: +// DEFAULT-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I_I_I:.*]] ], [ [[TMP3]], %[[WHILE_COND_I_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[WHILE_COND_I_I_I_PREHEADER]] ] // DEFAULT-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], -8 // DEFAULT-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP11]], 48 -// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I]], label [[_ZL4NANFPKC_EXIT]] -// DEFAULT: if.then.i.i.i: +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label %[[IF_THEN_I_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// DEFAULT: [[IF_THEN_I_I_I]]: // DEFAULT-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I16]], 3 // DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP10]] to i64 // DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // DEFAULT-NEXT: [[SUB_I_I_I]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I15]], i64 1 -// DEFAULT-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] -// DEFAULT: while.body.i18.i.i: -// DEFAULT-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], [[IF_THEN_I21_I_I:%.*]] ], [ [[TMP1]], [[WHILE_COND_I14_I_I_PREHEADER]] ] -// DEFAULT-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ] -// DEFAULT-NEXT: 
[[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ [[TAG]], [[WHILE_COND_I14_I_I_PREHEADER]] ] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] +// DEFAULT: [[WHILE_BODY_I18_I_I]]: +// DEFAULT-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], %[[IF_THEN_I21_I_I:.*]] ], [ [[TMP1]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ [[TAG]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] // DEFAULT-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], -48 // DEFAULT-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP14]], 10 -// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I21_I_I]], label [[_ZL4NANFPKC_EXIT]] -// DEFAULT: if.then.i21.i.i: +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label %[[IF_THEN_I21_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// DEFAULT: [[IF_THEN_I21_I_I]]: // DEFAULT-NEXT: [[MUL_I22_I_I:%.*]] = mul i64 [[__R_0_I16_I_I7]], 10 // DEFAULT-NEXT: [[CONV5_I23_I_I:%.*]] = zext nneg i8 [[TMP13]] to i64 // DEFAULT-NEXT: [[ADD_I24_I_I:%.*]] = add i64 [[MUL_I22_I_I]], -48 // DEFAULT-NEXT: [[SUB_I25_I_I]] = add i64 [[ADD_I24_I_I]], [[CONV5_I23_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I26_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I6]], i64 1 -// DEFAULT-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP15]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] -// DEFAULT: _ZL4nanfPKc.exit: -// DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, 
[[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, [[IF_THEN5_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ 0, [[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] +// DEFAULT: [[_ZL4NANFPKC_EXIT]]: +// DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, %[[IF_THEN5_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_BODY_I18_I_I]] ] // DEFAULT-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 // DEFAULT-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 // DEFAULT-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 // DEFAULT-NEXT: [[TMP16:%.*]] = bitcast i32 [[BF_SET9_I]] to float // DEFAULT-NEXT: ret float [[TMP16]] // -// FINITEONLY-LABEL: @test_nanf( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test_nanf( +// FINITEONLY-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret float poison // -// APPROX-LABEL: @test_nanf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// APPROX-LABEL: define dso_local float @test_nanf( +// APPROX-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// APPROX-NEXT: br i1 [[CMP_I_I]], label 
[[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I_PREHEADER:%.*]] -// APPROX: while.cond.i14.i.i.preheader: -// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I14_I_I_PREHEADER:.*]] +// APPROX: [[WHILE_COND_I14_I_I_PREHEADER]]: +// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I17_I_I5:%.*]] = icmp eq i8 [[TMP1]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I18_I_I:%.*]] -// APPROX: if.then.i.i: +// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label %[[_ZL4NANFPKC_EXIT:.*]], label %[[WHILE_BODY_I18_I_I:.*]] +// APPROX: [[IF_THEN_I_I]]: // APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// APPROX-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// APPROX-NEXT: switch i8 [[TMP2]], label [[WHILE_COND_I_I_I_PREHEADER:%.*]] [ -// APPROX-NEXT: i8 120, label [[IF_THEN5_I_I:%.*]] -// APPROX-NEXT: i8 88, label [[IF_THEN5_I_I]] +// APPROX-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: switch i8 [[TMP2]], label %[[WHILE_COND_I_I_I_PREHEADER:.*]] [ +// APPROX-NEXT: i8 120, label %[[IF_THEN5_I_I:.*]] +// APPROX-NEXT: i8 88, label %[[IF_THEN5_I_I]] // APPROX-NEXT: ] -// APPROX: while.cond.i.i.i.preheader: -// APPROX-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX: [[WHILE_COND_I_I_I_PREHEADER]]: +// APPROX-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I_I_I14:%.*]] = icmp eq i8 [[TMP3]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I14]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// APPROX: if.then5.i.i: -// APPROX-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: br i1 
[[CMP_NOT_I_I_I14]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I:.*]] +// APPROX: [[IF_THEN5_I_I]]: +// APPROX-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I30_I_I9:%.*]] = icmp eq i8 [[TMP4]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I31_I_I:%.*]] -// APPROX: while.body.i31.i.i: -// APPROX-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], [[IF_END31_I_I_I:%.*]] ], [ [[TMP4]], [[IF_THEN5_I_I]] ] -// APPROX-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], [[IF_END31_I_I_I]] ], [ 0, [[IF_THEN5_I_I]] ] -// APPROX-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], [[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN5_I_I]] ] +// APPROX-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I31_I_I:.*]] +// APPROX: [[WHILE_BODY_I31_I_I]]: +// APPROX-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_END31_I_I_I:.*]] ], [ [[TMP4]], %[[IF_THEN5_I_I]] ] +// APPROX-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_THEN5_I_I]] ] +// APPROX-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN5_I_I]] ] // APPROX-NEXT: [[TMP6:%.*]] = add i8 [[TMP5]], -48 // APPROX-NEXT: [[OR_COND_I32_I_I:%.*]] = icmp ult i8 [[TMP6]], 10 -// APPROX-NEXT: br i1 [[OR_COND_I32_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE_I_I_I:%.*]] -// APPROX: if.else.i.i.i: +// APPROX-NEXT: br i1 [[OR_COND_I32_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE_I_I_I:.*]] +// APPROX: [[IF_ELSE_I_I_I]]: // APPROX-NEXT: [[TMP7:%.*]] = add i8 [[TMP5]], -97 // APPROX-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP7]], 6 -// APPROX-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// APPROX: if.else17.i.i.i: +// APPROX-NEXT: br i1 [[OR_COND33_I_I_I]], label 
%[[IF_END31_I_I_I]], label %[[IF_ELSE17_I_I_I:.*]] +// APPROX: [[IF_ELSE17_I_I_I]]: // APPROX-NEXT: [[TMP8:%.*]] = add i8 [[TMP5]], -65 // APPROX-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP8]], 6 -// APPROX-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[_ZL4NANFPKC_EXIT]] -// APPROX: if.end31.i.i.i: -// APPROX-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I31_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND34_I_I_I]], label %[[IF_END31_I_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// APPROX: [[IF_END31_I_I_I]]: +// APPROX-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I31_I_I]] ], [ -87, %[[IF_ELSE_I_I_I]] ], [ -55, %[[IF_ELSE17_I_I_I]] ] // APPROX-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I29_I_I11]], 4 // APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP5]] to i64 // APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // APPROX-NEXT: [[ADD28_I_I_I]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I34_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I28_I_I10]], i64 1 -// APPROX-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I30_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] -// APPROX: while.body.i.i.i: -// APPROX-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], [[IF_THEN_I_I_I:%.*]] ], [ [[TMP3]], [[WHILE_COND_I_I_I_PREHEADER]] ] -// APPROX-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ] -// APPROX-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I_I_I_PREHEADER]] ] +// APPROX-NEXT: br i1 [[CMP_NOT_I30_I_I]], label 
%[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] +// APPROX: [[WHILE_BODY_I_I_I]]: +// APPROX-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I_I_I:.*]] ], [ [[TMP3]], %[[WHILE_COND_I_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[WHILE_COND_I_I_I_PREHEADER]] ] // APPROX-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], -8 // APPROX-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP11]], 48 -// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I]], label [[_ZL4NANFPKC_EXIT]] -// APPROX: if.then.i.i.i: +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label %[[IF_THEN_I_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// APPROX: [[IF_THEN_I_I_I]]: // APPROX-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I16]], 3 // APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP10]] to i64 // APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // APPROX-NEXT: [[SUB_I_I_I]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I15]], i64 1 -// APPROX-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] -// APPROX: while.body.i18.i.i: -// APPROX-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], [[IF_THEN_I21_I_I:%.*]] ], [ [[TMP1]], [[WHILE_COND_I14_I_I_PREHEADER]] ] -// APPROX-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ] -// APPROX-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ 
[[INCDEC_PTR_I26_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ [[TAG]], [[WHILE_COND_I14_I_I_PREHEADER]] ] +// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] +// APPROX: [[WHILE_BODY_I18_I_I]]: +// APPROX-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], %[[IF_THEN_I21_I_I:.*]] ], [ [[TMP1]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ [[TAG]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] // APPROX-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], -48 // APPROX-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP14]], 10 -// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I21_I_I]], label [[_ZL4NANFPKC_EXIT]] -// APPROX: if.then.i21.i.i: +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label %[[IF_THEN_I21_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// APPROX: [[IF_THEN_I21_I_I]]: // APPROX-NEXT: [[MUL_I22_I_I:%.*]] = mul i64 [[__R_0_I16_I_I7]], 10 // APPROX-NEXT: [[CONV5_I23_I_I:%.*]] = zext nneg i8 [[TMP13]] to i64 // APPROX-NEXT: [[ADD_I24_I_I:%.*]] = add i64 [[MUL_I22_I_I]], -48 // APPROX-NEXT: [[SUB_I25_I_I]] = add i64 [[ADD_I24_I_I]], [[CONV5_I23_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I26_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I6]], i64 1 -// APPROX-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP15]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] -// APPROX: _ZL4nanfPKc.exit: -// APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, [[IF_THEN5_I_I]] ], [ 0, 
[[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ 0, [[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] +// APPROX: [[_ZL4NANFPKC_EXIT]]: +// APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, %[[IF_THEN5_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_BODY_I18_I_I]] ] // APPROX-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 // APPROX-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 // APPROX-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 // APPROX-NEXT: [[TMP16:%.*]] = bitcast i32 [[BF_SET9_I]] to float // APPROX-NEXT: ret float [[TMP16]] // -// NCRDIV-LABEL: @test_nanf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// NCRDIV-LABEL: define dso_local float @test_nanf( +// NCRDIV-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// NCRDIV-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I_PREHEADER:%.*]] -// NCRDIV: while.cond.i14.i.i.preheader: -// NCRDIV-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I14_I_I_PREHEADER:.*]] +// NCRDIV: [[WHILE_COND_I14_I_I_PREHEADER]]: +// NCRDIV-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa 
[[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I17_I_I5:%.*]] = icmp eq i8 [[TMP1]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I18_I_I:%.*]] -// NCRDIV: if.then.i.i: +// NCRDIV-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label %[[_ZL4NANFPKC_EXIT:.*]], label %[[WHILE_BODY_I18_I_I:.*]] +// NCRDIV: [[IF_THEN_I_I]]: // NCRDIV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// NCRDIV-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// NCRDIV-NEXT: switch i8 [[TMP2]], label [[WHILE_COND_I_I_I_PREHEADER:%.*]] [ -// NCRDIV-NEXT: i8 120, label [[IF_THEN5_I_I:%.*]] -// NCRDIV-NEXT: i8 88, label [[IF_THEN5_I_I]] +// NCRDIV-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: switch i8 [[TMP2]], label %[[WHILE_COND_I_I_I_PREHEADER:.*]] [ +// NCRDIV-NEXT: i8 120, label %[[IF_THEN5_I_I:.*]] +// NCRDIV-NEXT: i8 88, label %[[IF_THEN5_I_I]] // NCRDIV-NEXT: ] -// NCRDIV: while.cond.i.i.i.preheader: -// NCRDIV-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV: [[WHILE_COND_I_I_I_PREHEADER]]: +// NCRDIV-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I_I_I14:%.*]] = icmp eq i8 [[TMP3]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I14]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// NCRDIV: if.then5.i.i: -// NCRDIV-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I14]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I:.*]] +// NCRDIV: [[IF_THEN5_I_I]]: +// NCRDIV-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I30_I_I9:%.*]] = icmp eq i8 [[TMP4]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I31_I_I:%.*]] -// NCRDIV: while.body.i31.i.i: -// 
NCRDIV-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], [[IF_END31_I_I_I:%.*]] ], [ [[TMP4]], [[IF_THEN5_I_I]] ] -// NCRDIV-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], [[IF_END31_I_I_I]] ], [ 0, [[IF_THEN5_I_I]] ] -// NCRDIV-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], [[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN5_I_I]] ] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I31_I_I:.*]] +// NCRDIV: [[WHILE_BODY_I31_I_I]]: +// NCRDIV-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_END31_I_I_I:.*]] ], [ [[TMP4]], %[[IF_THEN5_I_I]] ] +// NCRDIV-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_THEN5_I_I]] ] +// NCRDIV-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN5_I_I]] ] // NCRDIV-NEXT: [[TMP6:%.*]] = add i8 [[TMP5]], -48 // NCRDIV-NEXT: [[OR_COND_I32_I_I:%.*]] = icmp ult i8 [[TMP6]], 10 -// NCRDIV-NEXT: br i1 [[OR_COND_I32_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE_I_I_I:%.*]] -// NCRDIV: if.else.i.i.i: +// NCRDIV-NEXT: br i1 [[OR_COND_I32_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE_I_I_I:.*]] +// NCRDIV: [[IF_ELSE_I_I_I]]: // NCRDIV-NEXT: [[TMP7:%.*]] = add i8 [[TMP5]], -97 // NCRDIV-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP7]], 6 -// NCRDIV-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// NCRDIV: if.else17.i.i.i: +// NCRDIV-NEXT: br i1 [[OR_COND33_I_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE17_I_I_I:.*]] +// NCRDIV: [[IF_ELSE17_I_I_I]]: // NCRDIV-NEXT: [[TMP8:%.*]] = add i8 [[TMP5]], -65 // NCRDIV-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP8]], 6 -// NCRDIV-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[_ZL4NANFPKC_EXIT]] -// NCRDIV: if.end31.i.i.i: -// NCRDIV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I31_I_I]] ], [ -87, 
[[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// NCRDIV-NEXT: br i1 [[OR_COND34_I_I_I]], label %[[IF_END31_I_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// NCRDIV: [[IF_END31_I_I_I]]: +// NCRDIV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I31_I_I]] ], [ -87, %[[IF_ELSE_I_I_I]] ], [ -55, %[[IF_ELSE17_I_I_I]] ] // NCRDIV-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I29_I_I11]], 4 // NCRDIV-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP5]] to i64 // NCRDIV-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // NCRDIV-NEXT: [[ADD28_I_I_I]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I34_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I28_I_I10]], i64 1 -// NCRDIV-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I30_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] -// NCRDIV: while.body.i.i.i: -// NCRDIV-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], [[IF_THEN_I_I_I:%.*]] ], [ [[TMP3]], [[WHILE_COND_I_I_I_PREHEADER]] ] -// NCRDIV-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ] -// NCRDIV-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I_I_I_PREHEADER]] ] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I30_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] +// NCRDIV: [[WHILE_BODY_I_I_I]]: +// NCRDIV-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I_I_I:.*]] ], [ [[TMP3]], %[[WHILE_COND_I_I_I_PREHEADER]] ] +// NCRDIV-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ] +// NCRDIV-NEXT: 
[[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[WHILE_COND_I_I_I_PREHEADER]] ] // NCRDIV-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], -8 // NCRDIV-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP11]], 48 -// NCRDIV-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I]], label [[_ZL4NANFPKC_EXIT]] -// NCRDIV: if.then.i.i.i: +// NCRDIV-NEXT: br i1 [[OR_COND_I_I_I]], label %[[IF_THEN_I_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// NCRDIV: [[IF_THEN_I_I_I]]: // NCRDIV-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I16]], 3 // NCRDIV-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP10]] to i64 // NCRDIV-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // NCRDIV-NEXT: [[SUB_I_I_I]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I15]], i64 1 -// NCRDIV-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] -// NCRDIV: while.body.i18.i.i: -// NCRDIV-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], [[IF_THEN_I21_I_I:%.*]] ], [ [[TMP1]], [[WHILE_COND_I14_I_I_PREHEADER]] ] -// NCRDIV-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ] -// NCRDIV-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ [[TAG]], [[WHILE_COND_I14_I_I_PREHEADER]] ] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] +// NCRDIV: [[WHILE_BODY_I18_I_I]]: +// NCRDIV-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], %[[IF_THEN_I21_I_I:.*]] ], [ [[TMP1]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] 
+// NCRDIV-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// NCRDIV-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ [[TAG]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] // NCRDIV-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], -48 // NCRDIV-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP14]], 10 -// NCRDIV-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I21_I_I]], label [[_ZL4NANFPKC_EXIT]] -// NCRDIV: if.then.i21.i.i: +// NCRDIV-NEXT: br i1 [[OR_COND_I19_I_I]], label %[[IF_THEN_I21_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// NCRDIV: [[IF_THEN_I21_I_I]]: // NCRDIV-NEXT: [[MUL_I22_I_I:%.*]] = mul i64 [[__R_0_I16_I_I7]], 10 // NCRDIV-NEXT: [[CONV5_I23_I_I:%.*]] = zext nneg i8 [[TMP13]] to i64 // NCRDIV-NEXT: [[ADD_I24_I_I:%.*]] = add i64 [[MUL_I22_I_I]], -48 // NCRDIV-NEXT: [[SUB_I25_I_I]] = add i64 [[ADD_I24_I_I]], [[CONV5_I23_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I26_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I6]], i64 1 -// NCRDIV-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP15]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] -// NCRDIV: _ZL4nanfPKc.exit: -// NCRDIV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, [[IF_THEN5_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ 0, [[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_BODY_I18_I_I]] ] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] +// NCRDIV: [[_ZL4NANFPKC_EXIT]]: 
+// NCRDIV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, %[[IF_THEN5_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_BODY_I18_I_I]] ] // NCRDIV-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 // NCRDIV-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 // NCRDIV-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 // NCRDIV-NEXT: [[TMP16:%.*]] = bitcast i32 [[BF_SET9_I]] to float // NCRDIV-NEXT: ret float [[TMP16]] // -// AMDGCNSPIRV-LABEL: @test_nanf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG:%.*]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-LABEL: define spir_func float @test_nanf( +// AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] -// AMDGCNSPIRV: if.then.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I14_I_I:.*]] +// AMDGCNSPIRV: [[IF_THEN_I_I]]: // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TAG]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA5]] -// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ -// AMDGCNSPIRV-NEXT: i8 120, label [[IF_THEN5_I_I:%.*]] -// AMDGCNSPIRV-NEXT: i8 88, label [[IF_THEN5_I_I]] +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, 
!tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label %[[WHILE_COND_I_I_I:.*]] [ +// AMDGCNSPIRV-NEXT: i8 120, label %[[IF_THEN5_I_I:.*]] +// AMDGCNSPIRV-NEXT: i8 88, label %[[IF_THEN5_I_I]] // AMDGCNSPIRV-NEXT: ] -// AMDGCNSPIRV: if.then5.i.i: -// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV: [[IF_THEN5_I_I]]: +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I31_I_I5:%.*]] = icmp eq i8 [[TMP2]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I_I5]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I32_I_I:%.*]] -// AMDGCNSPIRV: while.body.i32.i.i: -// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP7:%.*]], [[IF_END31_I_I_I:%.*]] ], [ [[TMP2]], [[IF_THEN5_I_I]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I30_I_I7:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], [[IF_END31_I_I_I]] ], [ 0, [[IF_THEN5_I_I]] ] -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I29_I_I6:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I36_I_I:%.*]], [[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN5_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I_I5]], label %[[_ZL4NANFPKC_EXIT:.*]], label %[[WHILE_BODY_I32_I_I:.*]] +// AMDGCNSPIRV: [[WHILE_BODY_I32_I_I]]: +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP7:%.*]], %[[IF_END31_I_I_I:.*]] ], [ [[TMP2]], %[[IF_THEN5_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I30_I_I7:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_THEN5_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I29_I_I6:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I36_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN5_I_I]] ] // AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = add i8 [[TMP3]], -48 // AMDGCNSPIRV-NEXT: [[OR_COND_I33_I_I:%.*]] = icmp ult i8 [[TMP4]], 10 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I33_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE_I_I_I:%.*]] -// AMDGCNSPIRV: if.else.i.i.i: +// 
AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I33_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE_I_I_I:.*]] +// AMDGCNSPIRV: [[IF_ELSE_I_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], -97 // AMDGCNSPIRV-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// AMDGCNSPIRV: if.else17.i.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE17_I_I_I:.*]] +// AMDGCNSPIRV: [[IF_ELSE17_I_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = add i8 [[TMP3]], -65 // AMDGCNSPIRV-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP6]], 6 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[_ZL4NANFPKC_EXIT]] -// AMDGCNSPIRV: if.end31.i.i.i: -// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I32_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I_I]], label %[[IF_END31_I_I_I]], label %[[_ZL4NANFPKC_EXIT]] +// AMDGCNSPIRV: [[IF_END31_I_I_I]]: +// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I32_I_I]] ], [ -87, %[[IF_ELSE_I_I_I]] ], [ -55, %[[IF_ELSE17_I_I_I]] ] // AMDGCNSPIRV-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I30_I_I7]], 4 // AMDGCNSPIRV-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP3]] to i64 // AMDGCNSPIRV-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // AMDGCNSPIRV-NEXT: [[ADD28_I_I_I]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I36_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I29_I_I6]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP7]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I36_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP7]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I36_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I31_I_I:%.*]] = icmp eq i8 [[TMP7]], 0 -// AMDGCNSPIRV-NEXT: br i1 
[[CMP_NOT_I31_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I32_I_I]], !llvm.loop [[LOOP12]] -// AMDGCNSPIRV: while.cond.i.i.i: -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[WHILE_BODY_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[WHILE_BODY_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I32_I_I]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV: [[WHILE_COND_I_I_I]]: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I_I:%.*]], %[[WHILE_BODY_I_I_I:.*]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], %[[WHILE_BODY_I_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I]] -// AMDGCNSPIRV: while.body.i.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]] +// AMDGCNSPIRV: [[WHILE_BODY_I_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = and i8 [[TMP8]], -8 // AMDGCNSPIRV-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP9]], 48 // AMDGCNSPIRV-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 @@ -4071,14 +4585,14 @@ extern "C" __device__ double test_modf(double x, double* y) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_I_IDX:%.*]] = zext i1 [[OR_COND_I_I_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], i64 [[__TAGP_ADDR_1_I_I_I_IDX]] // 
AMDGCNSPIRV-NEXT: [[__R_1_I_I_I]] = select i1 [[OR_COND_I_I_I]], i64 [[SUB_I_I_I]], i64 [[__R_0_I_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP8]] -// AMDGCNSPIRV: while.cond.i14.i.i: -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I25_I_I:%.*]], [[WHILE_BODY_I18_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I26_I_I:%.*]], [[WHILE_BODY_I18_I_I]] ], [ 0, [[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I_I]], label %[[WHILE_COND_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV: [[WHILE_COND_I14_I_I]]: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I25_I_I:%.*]], %[[WHILE_BODY_I18_I_I:.*]] ], [ [[TAG]], %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I26_I_I:%.*]], %[[WHILE_BODY_I18_I_I]] ], [ 0, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP10]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I]] -// AMDGCNSPIRV: while.body.i18.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I18_I_I]] +// AMDGCNSPIRV: [[WHILE_BODY_I18_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = add i8 [[TMP10]], -48 // AMDGCNSPIRV-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP11]], 10 // AMDGCNSPIRV-NEXT: [[MUL_I20_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 @@ -4088,9 +4602,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I25_I_I_IDX:%.*]] = zext i1 [[OR_COND_I19_I_I]] to i64 // 
AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I25_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], i64 [[__TAGP_ADDR_1_I25_I_I_IDX]] // AMDGCNSPIRV-NEXT: [[__R_1_I26_I_I]] = select i1 [[OR_COND_I19_I_I]], i64 [[SUB_I23_I_I]], i64 [[__R_0_I16_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] -// AMDGCNSPIRV: _ZL4nanfPKc.exit: -// AMDGCNSPIRV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[IF_THEN5_I_I]] ], [ 0, [[WHILE_BODY_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ 0, [[IF_ELSE17_I_I_I]] ], [ 0, [[WHILE_BODY_I18_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I_I]], label %[[WHILE_COND_I14_I_I]], label %[[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV: [[_ZL4NANFPKC_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[IF_THEN5_I_I]] ], [ 0, %[[WHILE_BODY_I_I_I]] ], [ [[__R_0_I_I_I]], %[[WHILE_COND_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ 0, %[[WHILE_BODY_I18_I_I]] ], [ [[__R_0_I16_I_I]], %[[WHILE_COND_I14_I_I]] ] // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 // AMDGCNSPIRV-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 // AMDGCNSPIRV-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 @@ -4101,322 +4615,327 @@ extern "C" __device__ float test_nanf(const char *tag) { return nanf(tag); } -// DEFAULT-LABEL: @test_nan( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// DEFAULT-LABEL: define dso_local double @test_nan( +// DEFAULT-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 
-// DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I_PREHEADER:%.*]] -// DEFAULT: while.cond.i14.i.i.preheader: -// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I14_I_I_PREHEADER:.*]] +// DEFAULT: [[WHILE_COND_I14_I_I_PREHEADER]]: +// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I17_I_I5:%.*]] = icmp eq i8 [[TMP1]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I18_I_I:%.*]] -// DEFAULT: if.then.i.i: +// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label %[[_ZL3NANPKC_EXIT:.*]], label %[[WHILE_BODY_I18_I_I:.*]] +// DEFAULT: [[IF_THEN_I_I]]: // DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// DEFAULT-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// DEFAULT-NEXT: switch i8 [[TMP2]], label [[WHILE_COND_I_I_I_PREHEADER:%.*]] [ -// DEFAULT-NEXT: i8 120, label [[IF_THEN5_I_I:%.*]] -// DEFAULT-NEXT: i8 88, label [[IF_THEN5_I_I]] +// DEFAULT-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: switch i8 [[TMP2]], label %[[WHILE_COND_I_I_I_PREHEADER:.*]] [ +// DEFAULT-NEXT: i8 120, label %[[IF_THEN5_I_I:.*]] +// DEFAULT-NEXT: i8 88, label %[[IF_THEN5_I_I]] // DEFAULT-NEXT: ] -// DEFAULT: while.cond.i.i.i.preheader: -// DEFAULT-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT: [[WHILE_COND_I_I_I_PREHEADER]]: +// DEFAULT-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I_I_I14:%.*]] = icmp eq i8 [[TMP3]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I14]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// DEFAULT: if.then5.i.i: -// DEFAULT-NEXT: [[TMP4:%.*]] = load i8, ptr 
[[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I14]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I:.*]] +// DEFAULT: [[IF_THEN5_I_I]]: +// DEFAULT-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I30_I_I9:%.*]] = icmp eq i8 [[TMP4]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I31_I_I:%.*]] -// DEFAULT: while.body.i31.i.i: -// DEFAULT-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], [[IF_END31_I_I_I:%.*]] ], [ [[TMP4]], [[IF_THEN5_I_I]] ] -// DEFAULT-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], [[IF_END31_I_I_I]] ], [ 0, [[IF_THEN5_I_I]] ] -// DEFAULT-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], [[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN5_I_I]] ] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I31_I_I:.*]] +// DEFAULT: [[WHILE_BODY_I31_I_I]]: +// DEFAULT-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_END31_I_I_I:.*]] ], [ [[TMP4]], %[[IF_THEN5_I_I]] ] +// DEFAULT-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_THEN5_I_I]] ] +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN5_I_I]] ] // DEFAULT-NEXT: [[TMP6:%.*]] = add i8 [[TMP5]], -48 // DEFAULT-NEXT: [[OR_COND_I32_I_I:%.*]] = icmp ult i8 [[TMP6]], 10 -// DEFAULT-NEXT: br i1 [[OR_COND_I32_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE_I_I_I:%.*]] -// DEFAULT: if.else.i.i.i: +// DEFAULT-NEXT: br i1 [[OR_COND_I32_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE_I_I_I:.*]] +// DEFAULT: [[IF_ELSE_I_I_I]]: // DEFAULT-NEXT: [[TMP7:%.*]] = add i8 [[TMP5]], -97 // DEFAULT-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP7]], 6 -// DEFAULT-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label 
[[IF_ELSE17_I_I_I:%.*]] -// DEFAULT: if.else17.i.i.i: +// DEFAULT-NEXT: br i1 [[OR_COND33_I_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE17_I_I_I:.*]] +// DEFAULT: [[IF_ELSE17_I_I_I]]: // DEFAULT-NEXT: [[TMP8:%.*]] = add i8 [[TMP5]], -65 // DEFAULT-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP8]], 6 -// DEFAULT-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[_ZL3NANPKC_EXIT]] -// DEFAULT: if.end31.i.i.i: -// DEFAULT-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I31_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND34_I_I_I]], label %[[IF_END31_I_I_I]], label %[[_ZL3NANPKC_EXIT]] +// DEFAULT: [[IF_END31_I_I_I]]: +// DEFAULT-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I31_I_I]] ], [ -87, %[[IF_ELSE_I_I_I]] ], [ -55, %[[IF_ELSE17_I_I_I]] ] // DEFAULT-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I29_I_I11]], 4 // DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP5]] to i64 // DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // DEFAULT-NEXT: [[ADD28_I_I_I]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I34_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I28_I_I10]], i64 1 -// DEFAULT-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I30_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I30_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] -// DEFAULT: while.body.i.i.i: -// DEFAULT-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], [[IF_THEN_I_I_I:%.*]] ], [ [[TMP3]], [[WHILE_COND_I_I_I_PREHEADER]] ] -// DEFAULT-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ] -// DEFAULT-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], 
[[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I_I_I_PREHEADER]] ] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I30_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] +// DEFAULT: [[WHILE_BODY_I_I_I]]: +// DEFAULT-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I_I_I:.*]] ], [ [[TMP3]], %[[WHILE_COND_I_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[WHILE_COND_I_I_I_PREHEADER]] ] // DEFAULT-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], -8 // DEFAULT-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP11]], 48 -// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I]], label [[_ZL3NANPKC_EXIT]] -// DEFAULT: if.then.i.i.i: +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label %[[IF_THEN_I_I_I]], label %[[_ZL3NANPKC_EXIT]] +// DEFAULT: [[IF_THEN_I_I_I]]: // DEFAULT-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I16]], 3 // DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP10]] to i64 // DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // DEFAULT-NEXT: [[SUB_I_I_I]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I15]], i64 1 -// DEFAULT-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] -// DEFAULT: while.body.i18.i.i: -// DEFAULT-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], [[IF_THEN_I21_I_I:%.*]] ], [ [[TMP1]], [[WHILE_COND_I14_I_I_PREHEADER]] ] -// DEFAULT-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ 
[[SUB_I25_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ] -// DEFAULT-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ [[TAG]], [[WHILE_COND_I14_I_I_PREHEADER]] ] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] +// DEFAULT: [[WHILE_BODY_I18_I_I]]: +// DEFAULT-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], %[[IF_THEN_I21_I_I:.*]] ], [ [[TMP1]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ [[TAG]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] // DEFAULT-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], -48 // DEFAULT-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP14]], 10 -// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I21_I_I]], label [[_ZL3NANPKC_EXIT]] -// DEFAULT: if.then.i21.i.i: +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label %[[IF_THEN_I21_I_I]], label %[[_ZL3NANPKC_EXIT]] +// DEFAULT: [[IF_THEN_I21_I_I]]: // DEFAULT-NEXT: [[MUL_I22_I_I:%.*]] = mul i64 [[__R_0_I16_I_I7]], 10 // DEFAULT-NEXT: [[CONV5_I23_I_I:%.*]] = zext nneg i8 [[TMP13]] to i64 // DEFAULT-NEXT: [[ADD_I24_I_I:%.*]] = add i64 [[MUL_I22_I_I]], -48 // DEFAULT-NEXT: [[SUB_I25_I_I]] = add i64 [[ADD_I24_I_I]], [[CONV5_I23_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I26_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I6]], i64 1 -// DEFAULT-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // DEFAULT-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP15]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] -// 
DEFAULT: _ZL3nanPKc.exit: -// DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, [[IF_THEN5_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ 0, [[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] +// DEFAULT: [[_ZL3NANPKC_EXIT]]: +// DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, %[[IF_THEN5_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_BODY_I18_I_I]] ] // DEFAULT-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 // DEFAULT-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 // DEFAULT-NEXT: [[TMP16:%.*]] = bitcast i64 [[BF_SET9_I]] to double // DEFAULT-NEXT: ret double [[TMP16]] // -// FINITEONLY-LABEL: @test_nan( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) double @test_nan( +// FINITEONLY-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret double poison // -// APPROX-LABEL: @test_nan( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// APPROX-LABEL: define dso_local double @test_nan( +// APPROX-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// APPROX-NEXT: br i1 
[[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I_PREHEADER:%.*]] -// APPROX: while.cond.i14.i.i.preheader: -// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I14_I_I_PREHEADER:.*]] +// APPROX: [[WHILE_COND_I14_I_I_PREHEADER]]: +// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I17_I_I5:%.*]] = icmp eq i8 [[TMP1]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I18_I_I:%.*]] -// APPROX: if.then.i.i: +// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label %[[_ZL3NANPKC_EXIT:.*]], label %[[WHILE_BODY_I18_I_I:.*]] +// APPROX: [[IF_THEN_I_I]]: // APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// APPROX-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// APPROX-NEXT: switch i8 [[TMP2]], label [[WHILE_COND_I_I_I_PREHEADER:%.*]] [ -// APPROX-NEXT: i8 120, label [[IF_THEN5_I_I:%.*]] -// APPROX-NEXT: i8 88, label [[IF_THEN5_I_I]] +// APPROX-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: switch i8 [[TMP2]], label %[[WHILE_COND_I_I_I_PREHEADER:.*]] [ +// APPROX-NEXT: i8 120, label %[[IF_THEN5_I_I:.*]] +// APPROX-NEXT: i8 88, label %[[IF_THEN5_I_I]] // APPROX-NEXT: ] -// APPROX: while.cond.i.i.i.preheader: -// APPROX-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX: [[WHILE_COND_I_I_I_PREHEADER]]: +// APPROX-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I_I_I14:%.*]] = icmp eq i8 [[TMP3]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I14]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// APPROX: if.then5.i.i: -// APPROX-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: 
br i1 [[CMP_NOT_I_I_I14]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I:.*]] +// APPROX: [[IF_THEN5_I_I]]: +// APPROX-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I30_I_I9:%.*]] = icmp eq i8 [[TMP4]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I31_I_I:%.*]] -// APPROX: while.body.i31.i.i: -// APPROX-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], [[IF_END31_I_I_I:%.*]] ], [ [[TMP4]], [[IF_THEN5_I_I]] ] -// APPROX-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], [[IF_END31_I_I_I]] ], [ 0, [[IF_THEN5_I_I]] ] -// APPROX-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], [[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN5_I_I]] ] +// APPROX-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I31_I_I:.*]] +// APPROX: [[WHILE_BODY_I31_I_I]]: +// APPROX-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_END31_I_I_I:.*]] ], [ [[TMP4]], %[[IF_THEN5_I_I]] ] +// APPROX-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_THEN5_I_I]] ] +// APPROX-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN5_I_I]] ] // APPROX-NEXT: [[TMP6:%.*]] = add i8 [[TMP5]], -48 // APPROX-NEXT: [[OR_COND_I32_I_I:%.*]] = icmp ult i8 [[TMP6]], 10 -// APPROX-NEXT: br i1 [[OR_COND_I32_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE_I_I_I:%.*]] -// APPROX: if.else.i.i.i: +// APPROX-NEXT: br i1 [[OR_COND_I32_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE_I_I_I:.*]] +// APPROX: [[IF_ELSE_I_I_I]]: // APPROX-NEXT: [[TMP7:%.*]] = add i8 [[TMP5]], -97 // APPROX-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP7]], 6 -// APPROX-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// APPROX: if.else17.i.i.i: +// APPROX-NEXT: br i1 [[OR_COND33_I_I_I]], label 
%[[IF_END31_I_I_I]], label %[[IF_ELSE17_I_I_I:.*]] +// APPROX: [[IF_ELSE17_I_I_I]]: // APPROX-NEXT: [[TMP8:%.*]] = add i8 [[TMP5]], -65 // APPROX-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP8]], 6 -// APPROX-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[_ZL3NANPKC_EXIT]] -// APPROX: if.end31.i.i.i: -// APPROX-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I31_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND34_I_I_I]], label %[[IF_END31_I_I_I]], label %[[_ZL3NANPKC_EXIT]] +// APPROX: [[IF_END31_I_I_I]]: +// APPROX-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I31_I_I]] ], [ -87, %[[IF_ELSE_I_I_I]] ], [ -55, %[[IF_ELSE17_I_I_I]] ] // APPROX-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I29_I_I11]], 4 // APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP5]] to i64 // APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // APPROX-NEXT: [[ADD28_I_I_I]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I34_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I28_I_I10]], i64 1 -// APPROX-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I30_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I30_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] -// APPROX: while.body.i.i.i: -// APPROX-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], [[IF_THEN_I_I_I:%.*]] ], [ [[TMP3]], [[WHILE_COND_I_I_I_PREHEADER]] ] -// APPROX-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ] -// APPROX-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I_I_I_PREHEADER]] ] +// APPROX-NEXT: br i1 [[CMP_NOT_I30_I_I]], label 
%[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] +// APPROX: [[WHILE_BODY_I_I_I]]: +// APPROX-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I_I_I:.*]] ], [ [[TMP3]], %[[WHILE_COND_I_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[WHILE_COND_I_I_I_PREHEADER]] ] // APPROX-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], -8 // APPROX-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP11]], 48 -// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I]], label [[_ZL3NANPKC_EXIT]] -// APPROX: if.then.i.i.i: +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label %[[IF_THEN_I_I_I]], label %[[_ZL3NANPKC_EXIT]] +// APPROX: [[IF_THEN_I_I_I]]: // APPROX-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I16]], 3 // APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP10]] to i64 // APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // APPROX-NEXT: [[SUB_I_I_I]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I15]], i64 1 -// APPROX-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] -// APPROX: while.body.i18.i.i: -// APPROX-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], [[IF_THEN_I21_I_I:%.*]] ], [ [[TMP1]], [[WHILE_COND_I14_I_I_PREHEADER]] ] -// APPROX-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ] -// APPROX-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ 
[[INCDEC_PTR_I26_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ [[TAG]], [[WHILE_COND_I14_I_I_PREHEADER]] ] +// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] +// APPROX: [[WHILE_BODY_I18_I_I]]: +// APPROX-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], %[[IF_THEN_I21_I_I:.*]] ], [ [[TMP1]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ [[TAG]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] // APPROX-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], -48 // APPROX-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP14]], 10 -// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I21_I_I]], label [[_ZL3NANPKC_EXIT]] -// APPROX: if.then.i21.i.i: +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label %[[IF_THEN_I21_I_I]], label %[[_ZL3NANPKC_EXIT]] +// APPROX: [[IF_THEN_I21_I_I]]: // APPROX-NEXT: [[MUL_I22_I_I:%.*]] = mul i64 [[__R_0_I16_I_I7]], 10 // APPROX-NEXT: [[CONV5_I23_I_I:%.*]] = zext nneg i8 [[TMP13]] to i64 // APPROX-NEXT: [[ADD_I24_I_I:%.*]] = add i64 [[MUL_I22_I_I]], -48 // APPROX-NEXT: [[SUB_I25_I_I]] = add i64 [[ADD_I24_I_I]], [[CONV5_I23_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I26_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I6]], i64 1 -// APPROX-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // APPROX-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP15]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] -// APPROX: _ZL3nanPKc.exit: -// APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, [[IF_THEN5_I_I]] ], [ 0, 
[[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ 0, [[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] +// APPROX: [[_ZL3NANPKC_EXIT]]: +// APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, %[[IF_THEN5_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_BODY_I18_I_I]] ] // APPROX-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 // APPROX-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 // APPROX-NEXT: [[TMP16:%.*]] = bitcast i64 [[BF_SET9_I]] to double // APPROX-NEXT: ret double [[TMP16]] // -// NCRDIV-LABEL: @test_nan( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// NCRDIV-LABEL: define dso_local double @test_nan( +// NCRDIV-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// NCRDIV-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I_PREHEADER:%.*]] -// NCRDIV: while.cond.i14.i.i.preheader: -// NCRDIV-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I14_I_I_PREHEADER:.*]] +// NCRDIV: [[WHILE_COND_I14_I_I_PREHEADER]]: +// NCRDIV-NEXT: [[TMP1:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: 
[[CMP_NOT_I17_I_I5:%.*]] = icmp eq i8 [[TMP1]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I18_I_I:%.*]] -// NCRDIV: if.then.i.i: +// NCRDIV-NEXT: br i1 [[CMP_NOT_I17_I_I5]], label %[[_ZL3NANPKC_EXIT:.*]], label %[[WHILE_BODY_I18_I_I:.*]] +// NCRDIV: [[IF_THEN_I_I]]: // NCRDIV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// NCRDIV-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// NCRDIV-NEXT: switch i8 [[TMP2]], label [[WHILE_COND_I_I_I_PREHEADER:%.*]] [ -// NCRDIV-NEXT: i8 120, label [[IF_THEN5_I_I:%.*]] -// NCRDIV-NEXT: i8 88, label [[IF_THEN5_I_I]] +// NCRDIV-NEXT: [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: switch i8 [[TMP2]], label %[[WHILE_COND_I_I_I_PREHEADER:.*]] [ +// NCRDIV-NEXT: i8 120, label %[[IF_THEN5_I_I:.*]] +// NCRDIV-NEXT: i8 88, label %[[IF_THEN5_I_I]] // NCRDIV-NEXT: ] -// NCRDIV: while.cond.i.i.i.preheader: -// NCRDIV-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV: [[WHILE_COND_I_I_I_PREHEADER]]: +// NCRDIV-NEXT: [[TMP3:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I_I_I14:%.*]] = icmp eq i8 [[TMP3]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I14]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// NCRDIV: if.then5.i.i: -// NCRDIV-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I14]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I:.*]] +// NCRDIV: [[IF_THEN5_I_I]]: +// NCRDIV-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I30_I_I9:%.*]] = icmp eq i8 [[TMP4]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I31_I_I:%.*]] -// NCRDIV: while.body.i31.i.i: -// NCRDIV-NEXT: [[TMP5:%.*]] = phi i8 [ 
[[TMP9:%.*]], [[IF_END31_I_I_I:%.*]] ], [ [[TMP4]], [[IF_THEN5_I_I]] ] -// NCRDIV-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], [[IF_END31_I_I_I]] ], [ 0, [[IF_THEN5_I_I]] ] -// NCRDIV-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], [[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN5_I_I]] ] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I30_I_I9]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I31_I_I:.*]] +// NCRDIV: [[WHILE_BODY_I31_I_I]]: +// NCRDIV-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_END31_I_I_I:.*]] ], [ [[TMP4]], %[[IF_THEN5_I_I]] ] +// NCRDIV-NEXT: [[__R_0_I29_I_I11:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_THEN5_I_I]] ] +// NCRDIV-NEXT: [[__TAGP_ADDR_0_I28_I_I10:%.*]] = phi ptr [ [[INCDEC_PTR_I34_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN5_I_I]] ] // NCRDIV-NEXT: [[TMP6:%.*]] = add i8 [[TMP5]], -48 // NCRDIV-NEXT: [[OR_COND_I32_I_I:%.*]] = icmp ult i8 [[TMP6]], 10 -// NCRDIV-NEXT: br i1 [[OR_COND_I32_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE_I_I_I:%.*]] -// NCRDIV: if.else.i.i.i: +// NCRDIV-NEXT: br i1 [[OR_COND_I32_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE_I_I_I:.*]] +// NCRDIV: [[IF_ELSE_I_I_I]]: // NCRDIV-NEXT: [[TMP7:%.*]] = add i8 [[TMP5]], -97 // NCRDIV-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP7]], 6 -// NCRDIV-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// NCRDIV: if.else17.i.i.i: +// NCRDIV-NEXT: br i1 [[OR_COND33_I_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE17_I_I_I:.*]] +// NCRDIV: [[IF_ELSE17_I_I_I]]: // NCRDIV-NEXT: [[TMP8:%.*]] = add i8 [[TMP5]], -65 // NCRDIV-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP8]], 6 -// NCRDIV-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[_ZL3NANPKC_EXIT]] -// NCRDIV: if.end31.i.i.i: -// NCRDIV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I31_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, 
[[IF_ELSE17_I_I_I]] ] +// NCRDIV-NEXT: br i1 [[OR_COND34_I_I_I]], label %[[IF_END31_I_I_I]], label %[[_ZL3NANPKC_EXIT]] +// NCRDIV: [[IF_END31_I_I_I]]: +// NCRDIV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I31_I_I]] ], [ -87, %[[IF_ELSE_I_I_I]] ], [ -55, %[[IF_ELSE17_I_I_I]] ] // NCRDIV-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I29_I_I11]], 4 // NCRDIV-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP5]] to i64 // NCRDIV-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // NCRDIV-NEXT: [[ADD28_I_I_I]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I34_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I28_I_I10]], i64 1 -// NCRDIV-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I34_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I30_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I30_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] -// NCRDIV: while.body.i.i.i: -// NCRDIV-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], [[IF_THEN_I_I_I:%.*]] ], [ [[TMP3]], [[WHILE_COND_I_I_I_PREHEADER]] ] -// NCRDIV-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ] -// NCRDIV-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ [[INCDEC_PTR_I_I_I:%.*]], [[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I_I_I_PREHEADER]] ] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I30_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I31_I_I]], !llvm.loop [[LOOP11]] +// NCRDIV: [[WHILE_BODY_I_I_I]]: +// NCRDIV-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I_I_I:.*]] ], [ [[TMP3]], %[[WHILE_COND_I_I_I_PREHEADER]] ] +// NCRDIV-NEXT: [[__R_0_I_I_I16:%.*]] = phi i64 [ [[SUB_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_COND_I_I_I_PREHEADER]] ] +// NCRDIV-NEXT: [[__TAGP_ADDR_0_I_I_I15:%.*]] = phi ptr [ 
[[INCDEC_PTR_I_I_I:%.*]], %[[IF_THEN_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[WHILE_COND_I_I_I_PREHEADER]] ] // NCRDIV-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], -8 // NCRDIV-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP11]], 48 -// NCRDIV-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I]], label [[_ZL3NANPKC_EXIT]] -// NCRDIV: if.then.i.i.i: +// NCRDIV-NEXT: br i1 [[OR_COND_I_I_I]], label %[[IF_THEN_I_I_I]], label %[[_ZL3NANPKC_EXIT]] +// NCRDIV: [[IF_THEN_I_I_I]]: // NCRDIV-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I16]], 3 // NCRDIV-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP10]] to i64 // NCRDIV-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // NCRDIV-NEXT: [[SUB_I_I_I]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I15]], i64 1 -// NCRDIV-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] -// NCRDIV: while.body.i18.i.i: -// NCRDIV-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], [[IF_THEN_I21_I_I:%.*]] ], [ [[TMP1]], [[WHILE_COND_I14_I_I_PREHEADER]] ] -// NCRDIV-NEXT: [[__R_0_I16_I_I7:%.*]] = phi i64 [ [[SUB_I25_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ] -// NCRDIV-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], [[IF_THEN_I21_I_I]] ], [ [[TAG]], [[WHILE_COND_I14_I_I_PREHEADER]] ] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP7]] +// NCRDIV: [[WHILE_BODY_I18_I_I]]: +// NCRDIV-NEXT: [[TMP13:%.*]] = phi i8 [ [[TMP15:%.*]], %[[IF_THEN_I21_I_I:.*]] ], [ [[TMP1]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// NCRDIV-NEXT: [[__R_0_I16_I_I7:%.*]] = phi 
i64 [ [[SUB_I25_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ] +// NCRDIV-NEXT: [[__TAGP_ADDR_0_I15_I_I6:%.*]] = phi ptr [ [[INCDEC_PTR_I26_I_I:%.*]], %[[IF_THEN_I21_I_I]] ], [ [[TAG]], %[[WHILE_COND_I14_I_I_PREHEADER]] ] // NCRDIV-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], -48 // NCRDIV-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP14]], 10 -// NCRDIV-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I21_I_I]], label [[_ZL3NANPKC_EXIT]] -// NCRDIV: if.then.i21.i.i: +// NCRDIV-NEXT: br i1 [[OR_COND_I19_I_I]], label %[[IF_THEN_I21_I_I]], label %[[_ZL3NANPKC_EXIT]] +// NCRDIV: [[IF_THEN_I21_I_I]]: // NCRDIV-NEXT: [[MUL_I22_I_I:%.*]] = mul i64 [[__R_0_I16_I_I7]], 10 // NCRDIV-NEXT: [[CONV5_I23_I_I:%.*]] = zext nneg i8 [[TMP13]] to i64 // NCRDIV-NEXT: [[ADD_I24_I_I:%.*]] = add i64 [[MUL_I22_I_I]], -48 // NCRDIV-NEXT: [[SUB_I25_I_I]] = add i64 [[ADD_I24_I_I]], [[CONV5_I23_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I26_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I6]], i64 1 -// NCRDIV-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[TBAA4]] +// NCRDIV-NEXT: [[TMP15]] = load i8, ptr [[INCDEC_PTR_I26_I_I]], align 1, !tbaa [[CHAR_TBAA4]] // NCRDIV-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP15]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] -// NCRDIV: _ZL3nanPKc.exit: -// NCRDIV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, [[IF_THEN5_I_I]] ], [ 0, [[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ 0, [[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ 0, [[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], [[IF_THEN_I21_I_I]] ], [ 0, [[WHILE_BODY_I18_I_I]] ] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I18_I_I]], !llvm.loop [[LOOP10]] +// NCRDIV: [[_ZL3NANPKC_EXIT]]: +// NCRDIV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 
0, %[[WHILE_COND_I_I_I_PREHEADER]] ], [ 0, %[[IF_THEN5_I_I]] ], [ 0, %[[WHILE_COND_I14_I_I_PREHEADER]] ], [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ 0, %[[WHILE_BODY_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ [[SUB_I25_I_I]], %[[IF_THEN_I21_I_I]] ], [ 0, %[[WHILE_BODY_I18_I_I]] ] // NCRDIV-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 // NCRDIV-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 // NCRDIV-NEXT: [[TMP16:%.*]] = bitcast i64 [[BF_SET9_I]] to double // NCRDIV-NEXT: ret double [[TMP16]] // -// AMDGCNSPIRV-LABEL: @test_nan( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG:%.*]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-LABEL: define spir_func double @test_nan( +// AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] -// AMDGCNSPIRV: if.then.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I14_I_I:.*]] +// AMDGCNSPIRV: [[IF_THEN_I_I]]: // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TAG]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA5]] -// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ -// AMDGCNSPIRV-NEXT: i8 120, label [[IF_THEN5_I_I:%.*]] -// AMDGCNSPIRV-NEXT: i8 88, label [[IF_THEN5_I_I]] +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label 
%[[WHILE_COND_I_I_I:.*]] [ +// AMDGCNSPIRV-NEXT: i8 120, label %[[IF_THEN5_I_I:.*]] +// AMDGCNSPIRV-NEXT: i8 88, label %[[IF_THEN5_I_I]] // AMDGCNSPIRV-NEXT: ] -// AMDGCNSPIRV: if.then5.i.i: -// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV: [[IF_THEN5_I_I]]: +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I31_I_I5:%.*]] = icmp eq i8 [[TMP2]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I_I5]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I32_I_I:%.*]] -// AMDGCNSPIRV: while.body.i32.i.i: -// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP7:%.*]], [[IF_END31_I_I_I:%.*]] ], [ [[TMP2]], [[IF_THEN5_I_I]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I30_I_I7:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], [[IF_END31_I_I_I]] ], [ 0, [[IF_THEN5_I_I]] ] -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I29_I_I6:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I36_I_I:%.*]], [[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN5_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I_I5]], label %[[_ZL3NANPKC_EXIT:.*]], label %[[WHILE_BODY_I32_I_I:.*]] +// AMDGCNSPIRV: [[WHILE_BODY_I32_I_I]]: +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP7:%.*]], %[[IF_END31_I_I_I:.*]] ], [ [[TMP2]], %[[IF_THEN5_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I30_I_I7:%.*]] = phi i64 [ [[ADD28_I_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_THEN5_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I29_I_I6:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I36_I_I:%.*]], %[[IF_END31_I_I_I]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN5_I_I]] ] // AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = add i8 [[TMP3]], -48 // AMDGCNSPIRV-NEXT: [[OR_COND_I33_I_I:%.*]] = icmp ult i8 [[TMP4]], 10 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I33_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE_I_I_I:%.*]] -// AMDGCNSPIRV: if.else.i.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I33_I_I]], label %[[IF_END31_I_I_I]], label 
%[[IF_ELSE_I_I_I:.*]] +// AMDGCNSPIRV: [[IF_ELSE_I_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], -97 // AMDGCNSPIRV-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// AMDGCNSPIRV: if.else17.i.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I_I]], label %[[IF_END31_I_I_I]], label %[[IF_ELSE17_I_I_I:.*]] +// AMDGCNSPIRV: [[IF_ELSE17_I_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = add i8 [[TMP3]], -65 // AMDGCNSPIRV-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP6]], 6 -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[_ZL3NANPKC_EXIT]] -// AMDGCNSPIRV: if.end31.i.i.i: -// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I32_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I_I]], label %[[IF_END31_I_I_I]], label %[[_ZL3NANPKC_EXIT]] +// AMDGCNSPIRV: [[IF_END31_I_I_I]]: +// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, %[[WHILE_BODY_I32_I_I]] ], [ -87, %[[IF_ELSE_I_I_I]] ], [ -55, %[[IF_ELSE17_I_I_I]] ] // AMDGCNSPIRV-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I30_I_I7]], 4 // AMDGCNSPIRV-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP3]] to i64 // AMDGCNSPIRV-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // AMDGCNSPIRV-NEXT: [[ADD28_I_I_I]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I36_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I29_I_I6]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP7]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I36_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP7]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I36_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I31_I_I:%.*]] = icmp eq i8 [[TMP7]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I32_I_I]], !llvm.loop 
[[LOOP12]] -// AMDGCNSPIRV: while.cond.i.i.i: -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[WHILE_BODY_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[WHILE_BODY_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I31_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I32_I_I]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV: [[WHILE_COND_I_I_I]]: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I_I:%.*]], %[[WHILE_BODY_I_I_I:.*]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], %[[WHILE_BODY_I_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I]] -// AMDGCNSPIRV: while.body.i.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]] +// AMDGCNSPIRV: [[WHILE_BODY_I_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = and i8 [[TMP8]], -8 // AMDGCNSPIRV-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP9]], 48 // AMDGCNSPIRV-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 @@ -4426,14 +4945,14 @@ extern "C" __device__ float test_nanf(const char *tag) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_I_IDX:%.*]] = zext i1 [[OR_COND_I_I_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], i64 [[__TAGP_ADDR_1_I_I_I_IDX]] // AMDGCNSPIRV-NEXT: [[__R_1_I_I_I]] = select i1 [[OR_COND_I_I_I]], i64 [[SUB_I_I_I]], i64 
[[__R_0_I_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP8]] -// AMDGCNSPIRV: while.cond.i14.i.i: -// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I25_I_I:%.*]], [[WHILE_BODY_I18_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I26_I_I:%.*]], [[WHILE_BODY_I18_I_I]] ], [ 0, [[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I_I]], label %[[WHILE_COND_I_I_I]], label %[[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV: [[WHILE_COND_I14_I_I]]: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I25_I_I:%.*]], %[[WHILE_BODY_I18_I_I:.*]] ], [ [[TAG]], %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I26_I_I:%.*]], %[[WHILE_BODY_I18_I_I]] ], [ 0, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[CHAR_TBAA5]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP10]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I]] -// AMDGCNSPIRV: while.body.i18.i.i: +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I18_I_I]] +// AMDGCNSPIRV: [[WHILE_BODY_I18_I_I]]: // AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = add i8 [[TMP10]], -48 // AMDGCNSPIRV-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP11]], 10 // AMDGCNSPIRV-NEXT: [[MUL_I20_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 @@ -4443,9 +4962,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I25_I_I_IDX:%.*]] = zext i1 [[OR_COND_I19_I_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I25_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) 
[[__TAGP_ADDR_0_I15_I_I]], i64 [[__TAGP_ADDR_1_I25_I_I_IDX]] // AMDGCNSPIRV-NEXT: [[__R_1_I26_I_I]] = select i1 [[OR_COND_I19_I_I]], i64 [[SUB_I23_I_I]], i64 [[__R_0_I16_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] -// AMDGCNSPIRV: _ZL3nanPKc.exit: -// AMDGCNSPIRV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[IF_THEN5_I_I]] ], [ 0, [[WHILE_BODY_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ 0, [[IF_ELSE17_I_I_I]] ], [ 0, [[WHILE_BODY_I18_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I_I]], label %[[WHILE_COND_I14_I_I]], label %[[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV: [[_ZL3NANPKC_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[IF_THEN5_I_I]] ], [ 0, %[[WHILE_BODY_I_I_I]] ], [ [[__R_0_I_I_I]], %[[WHILE_COND_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ 0, %[[WHILE_BODY_I18_I_I]] ], [ [[__R_0_I16_I_I]], %[[WHILE_COND_I14_I_I]] ] // AMDGCNSPIRV-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 // AMDGCNSPIRV-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 // AMDGCNSPIRV-NEXT: [[TMP12:%.*]] = bitcast i64 [[BF_SET9_I]] to double @@ -4455,958 +4974,1093 @@ extern "C" __device__ double test_nan(const char *tag) { return nan(tag); } -// DEFAULT-LABEL: @test_nanf_emptystr( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local noundef float @test_nanf_emptystr( +// DEFAULT-SAME: ) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: ret float 0x7FF8000000000000 // -// FINITEONLY-LABEL: @test_nanf_emptystr( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test_nanf_emptystr( +// FINITEONLY-SAME: ) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret 
float poison // -// APPROX-LABEL: @test_nanf_emptystr( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local noundef float @test_nanf_emptystr( +// APPROX-SAME: ) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: ret float 0x7FF8000000000000 // -// NCRDIV-LABEL: @test_nanf_emptystr( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local noundef float @test_nanf_emptystr( +// NCRDIV-SAME: ) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: ret float 0x7FF8000000000000 // -// AMDGCNSPIRV-LABEL: @test_nanf_emptystr( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_nanf_emptystr( +// AMDGCNSPIRV-SAME: ) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: ret float 0x7FF8000000000000 // extern "C" __device__ float test_nanf_emptystr() { return nanf(""); } -// DEFAULT-LABEL: @test_nan_emptystr( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local noundef double @test_nan_emptystr( +// DEFAULT-SAME: ) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: ret double 0x7FF8000000000000 // -// FINITEONLY-LABEL: @test_nan_emptystr( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) double @test_nan_emptystr( +// FINITEONLY-SAME: ) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret double poison // -// APPROX-LABEL: @test_nan_emptystr( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local noundef double @test_nan_emptystr( +// APPROX-SAME: ) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: ret double 0x7FF8000000000000 // -// NCRDIV-LABEL: @test_nan_emptystr( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local noundef double @test_nan_emptystr( +// NCRDIV-SAME: ) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: ret double 0x7FF8000000000000 // 
-// AMDGCNSPIRV-LABEL: @test_nan_emptystr( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_nan_emptystr( +// AMDGCNSPIRV-SAME: ) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: ret double 0x7FF8000000000000 // extern "C" __device__ double test_nan_emptystr() { return nan(""); } -// DEFAULT-LABEL: @test_nanf_fill( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local noundef float @test_nanf_fill( +// DEFAULT-SAME: ) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: ret float 0x7FF8000000000000 // -// FINITEONLY-LABEL: @test_nanf_fill( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test_nanf_fill( +// FINITEONLY-SAME: ) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret float poison // -// APPROX-LABEL: @test_nanf_fill( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local noundef float @test_nanf_fill( +// APPROX-SAME: ) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: ret float 0x7FF8000000000000 // -// NCRDIV-LABEL: @test_nanf_fill( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local noundef float @test_nanf_fill( +// NCRDIV-SAME: ) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: ret float 0x7FF8000000000000 // -// AMDGCNSPIRV-LABEL: @test_nanf_fill( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_nanf_fill( +// AMDGCNSPIRV-SAME: ) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: ret float 0x7FF8000000000000 // extern "C" __device__ float test_nanf_fill() { return nanf("0x456"); } -// DEFAULT-LABEL: @test_nan_fill( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local noundef double @test_nan_fill( +// DEFAULT-SAME: ) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: 
[[ENTRY:.*:]] // DEFAULT-NEXT: ret double 0x7FF8000000000000 // -// FINITEONLY-LABEL: @test_nan_fill( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) double @test_nan_fill( +// FINITEONLY-SAME: ) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: ret double poison // -// APPROX-LABEL: @test_nan_fill( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local noundef double @test_nan_fill( +// APPROX-SAME: ) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: ret double 0x7FF8000000000000 // -// NCRDIV-LABEL: @test_nan_fill( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local noundef double @test_nan_fill( +// NCRDIV-SAME: ) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: ret double 0x7FF8000000000000 // -// AMDGCNSPIRV-LABEL: @test_nan_fill( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_nan_fill( +// AMDGCNSPIRV-SAME: ) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: ret double 0x7FF8000000000000 // extern "C" __device__ double test_nan_fill() { return nan("0x123"); } -// DEFAULT-LABEL: @test_nearbyintf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_nearbyintf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_nearbyintf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.nearbyint.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_nearbyintf( +// 
FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.nearbyint.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_nearbyintf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_nearbyintf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_nearbyintf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_nearbyintf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_nearbyintf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.nearbyint.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_nearbyintf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.nearbyint.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_nearbyintf(float x) { return nearbyintf(x); } -// DEFAULT-LABEL: @test_nearbyint( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double 
@llvm.nearbyint.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_nearbyint( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.nearbyint.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_nearbyint( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.nearbyint.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_nearbyint( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.nearbyint.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_nearbyint( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.nearbyint.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_nearbyint( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.nearbyint.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_nearbyint( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.nearbyint.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_nearbyint( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.nearbyint.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_nearbyint( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: 
[[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.nearbyint.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_nearbyint( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.nearbyint.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_nearbyint(double x) { return nearbyint(x); } -// DEFAULT-LABEL: @test_nextafterf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_nextafterf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_nextafterf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_nextafterf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_nextafterf( -// 
APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_nextafterf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_nextafterf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_nextafterf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_nextafterf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_nextafterf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_nextafter_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_nextafterf(float x, float y) { return nextafterf(x, y); } -// DEFAULT-LABEL: @test_nextafter( -// DEFAULT-NEXT: entry: -// 
DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_nextafter( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_nextafter( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_nextafter( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_nextafter( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_nextafter( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_nextafter( 
-// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_nextafter( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_nextafter( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_nextafter( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_nextafter_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_nextafter(double x, double y) { return nextafter(x, y); } -// DEFAULT-LABEL: @test_norm3df( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_norm3df( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]]) #[[ATTR14]] // DEFAULT-NEXT: ret float 
[[CALL_I]] // -// FINITEONLY-LABEL: @test_norm3df( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_norm3df( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]], float noundef nofpclass(nan inf) [[Z]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_norm3df( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_norm3df( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_norm3df( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_norm3df( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] 
{ +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_norm3df( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_norm3df( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_len3_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_norm3df(float x, float y, float z) { return norm3df(x, y, z); } -// DEFAULT-LABEL: @test_norm3d( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_norm3d( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_norm3d( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) 
[[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_norm3d( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]], double noundef nofpclass(nan inf) [[Z]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_norm3d( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_norm3d( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_norm3d( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_norm3d( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// 
AMDGCNSPIRV-LABEL: @test_norm3d( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_norm3d( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_len3_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_norm3d(double x, double y, double z) { return norm3d(x, y, z); } -// DEFAULT-LABEL: @test_norm4df( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_norm4df( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]], float noundef [[W]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_norm4df( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef 
nofpclass(nan inf) float @test_norm4df( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]], float noundef nofpclass(nan inf) [[Z]], float noundef nofpclass(nan inf) [[W]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_norm4df( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_norm4df( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]], float noundef [[W]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_norm4df( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_norm4df( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]], 
float noundef [[W]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_norm4df( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_norm4df( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_len4_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]], float noundef [[W]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_norm4df(float x, float y, float z, float w) { return norm4df(x, y, z, w); } -// DEFAULT-LABEL: @test_norm4d( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_norm4d( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]], double noundef [[W]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_norm4d( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) 
[[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_norm4d( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]], double noundef nofpclass(nan inf) [[Z]], double noundef nofpclass(nan inf) [[W]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_norm4d( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_norm4d( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]], double noundef [[W]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_norm4d( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_norm4d( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) local_unnamed_addr 
#[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]], double noundef [[W]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_norm4d( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_norm4d( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_len4_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]], double noundef [[W]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_norm4d(double x, double y, double z, double w) { return norm4d(x, y, z, w); } -// DEFAULT-LABEL: @test_normcdff( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_normcdff( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_normcdff( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define 
dso_local noundef nofpclass(nan inf) float @test_normcdff( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_normcdff( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_normcdff( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_normcdff( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_normcdff( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_normcdff( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_normcdff( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_ncdf_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: 
ret float [[CALL_I]] // extern "C" __device__ float test_normcdff(float x) { return normcdff(x); } -// DEFAULT-LABEL: @test_normcdf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_normcdf( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_normcdf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_normcdf( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_normcdf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_normcdf( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_normcdf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// 
NCRDIV-LABEL: define dso_local noundef double @test_normcdf( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_normcdf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_normcdf( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_ncdf_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_normcdf(double x) { return normcdf(x); } -// DEFAULT-LABEL: @test_normcdfinvf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_normcdfinvf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_normcdfinvf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_normcdfinvf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: 
[[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_normcdfinvf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_normcdfinvf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_normcdfinvf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_normcdfinvf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_normcdfinvf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_normcdfinvf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_ncdfinv_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_normcdfinvf(float x) { return normcdfinvf(x); } -// DEFAULT-LABEL: 
@test_normcdfinv( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_normcdfinv( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_normcdfinv( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_normcdfinv( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_normcdfinv( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_normcdfinv( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_normcdfinv( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_normcdfinv( +// NCRDIV-SAME: 
double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_normcdfinv( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_normcdfinv( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_ncdfinv_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_normcdfinv(double x) { return normcdfinv(x); } -// DEFAULT-LABEL: @test_normf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5NORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// DEFAULT: while.body.i: -// DEFAULT-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// DEFAULT-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// DEFAULT-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// DEFAULT-LABEL: define dso_local float @test_normf( +// DEFAULT-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +// DEFAULT-NEXT: [[ENTRY:.*]]: +// DEFAULT-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5NORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// DEFAULT: [[WHILE_BODY_I]]: +// DEFAULT-NEXT: [[__R_0_I4:%.*]] = 
phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// DEFAULT-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// DEFAULT-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // DEFAULT-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] -// DEFAULT: _ZL5normfiPKf.exit.loopexit: +// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] +// DEFAULT: [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]]: // DEFAULT-NEXT: [[TMP1:%.*]] = tail call contract float @llvm.sqrt.f32(float [[ADD_I]]) -// DEFAULT-NEXT: br label [[_ZL5NORMFIPKF_EXIT]] -// DEFAULT: _ZL5normfiPKf.exit: -// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] +// DEFAULT-NEXT: br label %[[_ZL5NORMFIPKF_EXIT]] +// DEFAULT: [[_ZL5NORMFIPKF_EXIT]]: +// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] // DEFAULT-NEXT: ret float [[__R_0_I_LCSSA]] // -// FINITEONLY-LABEL: @test_normf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I1]], label 
[[_ZL5NORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// FINITEONLY: while.body.i: -// FINITEONLY-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// FINITEONLY-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// FINITEONLY-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test_normf( +// FINITEONLY-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +// FINITEONLY-NEXT: [[ENTRY:.*]]: +// FINITEONLY-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5NORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// FINITEONLY: [[WHILE_BODY_I]]: +// FINITEONLY-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// FINITEONLY-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// FINITEONLY-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // FINITEONLY-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] // FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] -// FINITEONLY: 
_ZL5normfiPKf.exit.loopexit: +// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] +// FINITEONLY: [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]]: // FINITEONLY-NEXT: [[TMP1:%.*]] = tail call nnan ninf contract float @llvm.sqrt.f32(float [[ADD_I]]) -// FINITEONLY-NEXT: br label [[_ZL5NORMFIPKF_EXIT]] -// FINITEONLY: _ZL5normfiPKf.exit: -// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] +// FINITEONLY-NEXT: br label %[[_ZL5NORMFIPKF_EXIT]] +// FINITEONLY: [[_ZL5NORMFIPKF_EXIT]]: +// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] // FINITEONLY-NEXT: ret float [[__R_0_I_LCSSA]] // -// APPROX-LABEL: @test_normf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// APPROX-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5NORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// APPROX: while.body.i: -// APPROX-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// APPROX-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// APPROX-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// APPROX-LABEL: define dso_local float @test_normf( +// APPROX-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +// APPROX-NEXT: [[ENTRY:.*]]: +// APPROX-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// APPROX-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5NORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// APPROX: [[WHILE_BODY_I]]: +// APPROX-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// APPROX-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], 
%[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// APPROX-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // APPROX-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] +// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] // APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] -// APPROX: _ZL5normfiPKf.exit.loopexit: +// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] +// APPROX: [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]]: // APPROX-NEXT: [[TMP1:%.*]] = tail call contract float @llvm.sqrt.f32(float [[ADD_I]]) -// APPROX-NEXT: br label [[_ZL5NORMFIPKF_EXIT]] -// APPROX: _ZL5normfiPKf.exit: -// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] +// APPROX-NEXT: br label %[[_ZL5NORMFIPKF_EXIT]] +// APPROX: [[_ZL5NORMFIPKF_EXIT]]: +// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] // APPROX-NEXT: ret float [[__R_0_I_LCSSA]] // -// NCRDIV-LABEL: @test_normf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5NORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// NCRDIV: while.body.i: -// NCRDIV-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// 
NCRDIV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// NCRDIV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// NCRDIV-LABEL: define dso_local float @test_normf( +// NCRDIV-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NCRDIV-NEXT: [[ENTRY:.*]]: +// NCRDIV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5NORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// NCRDIV: [[WHILE_BODY_I]]: +// NCRDIV-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// NCRDIV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// NCRDIV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // NCRDIV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA17]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA17]] // NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // NCRDIV-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // NCRDIV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] -// NCRDIV: _ZL5normfiPKf.exit.loopexit: +// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] +// NCRDIV: [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]]: // NCRDIV-NEXT: [[TMP1:%.*]] = tail call contract float @llvm.sqrt.f32(float [[ADD_I]]) -// NCRDIV-NEXT: br label 
[[_ZL5NORMFIPKF_EXIT]] -// NCRDIV: _ZL5normfiPKf.exit: -// NCRDIV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] +// NCRDIV-NEXT: br label %[[_ZL5NORMFIPKF_EXIT]] +// NCRDIV: [[_ZL5NORMFIPKF_EXIT]]: +// NCRDIV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] // NCRDIV-NEXT: ret float [[__R_0_I_LCSSA]] // -// AMDGCNSPIRV-LABEL: @test_normf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5NORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// AMDGCNSPIRV: while.body.i: -// AMDGCNSPIRV-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// AMDGCNSPIRV-LABEL: define spir_func float @test_normf( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], ptr addrspace(4) noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5NORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// AMDGCNSPIRV: [[WHILE_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // AMDGCNSPIRV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// 
AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA17]] // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // AMDGCNSPIRV-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__A_ADDR_0_I3]], i64 4 // AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] -// AMDGCNSPIRV: _ZL5normfiPKf.exit.loopexit: +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] +// AMDGCNSPIRV: [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]]: // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = tail call contract addrspace(4) float @llvm.sqrt.f32(float [[ADD_I]]) -// AMDGCNSPIRV-NEXT: br label [[_ZL5NORMFIPKF_EXIT]] -// AMDGCNSPIRV: _ZL5normfiPKf.exit: -// AMDGCNSPIRV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] +// AMDGCNSPIRV-NEXT: br label %[[_ZL5NORMFIPKF_EXIT]] +// AMDGCNSPIRV: [[_ZL5NORMFIPKF_EXIT]]: +// AMDGCNSPIRV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL5NORMFIPKF_EXIT_LOOPEXIT]] ] // AMDGCNSPIRV-NEXT: ret float [[__R_0_I_LCSSA]] // extern "C" __device__ float test_normf(int x, const float *y) { return normf(x, y); } -// DEFAULT-LABEL: @test_norm( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL4NORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// DEFAULT: while.body.i: -// DEFAULT-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, 
[[ENTRY:%.*]] ] -// DEFAULT-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// DEFAULT-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// DEFAULT-LABEL: define dso_local double @test_norm( +// DEFAULT-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +// DEFAULT-NEXT: [[ENTRY:.*]]: +// DEFAULT-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL4NORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// DEFAULT: [[WHILE_BODY_I]]: +// DEFAULT-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// DEFAULT-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// DEFAULT-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // DEFAULT-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] -// DEFAULT: _ZL4normiPKd.exit.loopexit: +// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL4NORMIPKD_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] +// DEFAULT: [[_ZL4NORMIPKD_EXIT_LOOPEXIT]]: // DEFAULT-NEXT: [[TMP1:%.*]] = tail call contract double 
@llvm.sqrt.f64(double [[ADD_I]]) -// DEFAULT-NEXT: br label [[_ZL4NORMIPKD_EXIT]] -// DEFAULT: _ZL4normiPKd.exit: -// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] +// DEFAULT-NEXT: br label %[[_ZL4NORMIPKD_EXIT]] +// DEFAULT: [[_ZL4NORMIPKD_EXIT]]: +// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] // DEFAULT-NEXT: ret double [[__R_0_I_LCSSA]] // -// FINITEONLY-LABEL: @test_norm( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL4NORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// FINITEONLY: while.body.i: -// FINITEONLY-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// FINITEONLY-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// FINITEONLY-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) double @test_norm( +// FINITEONLY-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +// FINITEONLY-NEXT: [[ENTRY:.*]]: +// FINITEONLY-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL4NORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// FINITEONLY: [[WHILE_BODY_I]]: +// FINITEONLY-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// FINITEONLY-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// FINITEONLY-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // FINITEONLY-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// 
FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] // FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] -// FINITEONLY: _ZL4normiPKd.exit.loopexit: +// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL4NORMIPKD_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] +// FINITEONLY: [[_ZL4NORMIPKD_EXIT_LOOPEXIT]]: // FINITEONLY-NEXT: [[TMP1:%.*]] = tail call nnan ninf contract double @llvm.sqrt.f64(double [[ADD_I]]) -// FINITEONLY-NEXT: br label [[_ZL4NORMIPKD_EXIT]] -// FINITEONLY: _ZL4normiPKd.exit: -// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] +// FINITEONLY-NEXT: br label %[[_ZL4NORMIPKD_EXIT]] +// FINITEONLY: [[_ZL4NORMIPKD_EXIT]]: +// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] // FINITEONLY-NEXT: ret double [[__R_0_I_LCSSA]] // -// APPROX-LABEL: @test_norm( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// APPROX-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL4NORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// APPROX: while.body.i: -// APPROX-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// APPROX-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// 
APPROX-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// APPROX-LABEL: define dso_local double @test_norm( +// APPROX-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +// APPROX-NEXT: [[ENTRY:.*]]: +// APPROX-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// APPROX-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL4NORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// APPROX: [[WHILE_BODY_I]]: +// APPROX-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// APPROX-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// APPROX-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // APPROX-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] +// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] // APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] -// APPROX: _ZL4normiPKd.exit.loopexit: +// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL4NORMIPKD_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] +// APPROX: [[_ZL4NORMIPKD_EXIT_LOOPEXIT]]: // APPROX-NEXT: [[TMP1:%.*]] = tail call contract double @llvm.sqrt.f64(double [[ADD_I]]) -// APPROX-NEXT: br label [[_ZL4NORMIPKD_EXIT]] -// APPROX: _ZL4normiPKd.exit: -// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 
0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] +// APPROX-NEXT: br label %[[_ZL4NORMIPKD_EXIT]] +// APPROX: [[_ZL4NORMIPKD_EXIT]]: +// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] // APPROX-NEXT: ret double [[__R_0_I_LCSSA]] // -// NCRDIV-LABEL: @test_norm( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL4NORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// NCRDIV: while.body.i: -// NCRDIV-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// NCRDIV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// NCRDIV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// NCRDIV-LABEL: define dso_local double @test_norm( +// NCRDIV-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NCRDIV-NEXT: [[ENTRY:.*]]: +// NCRDIV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL4NORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// NCRDIV: [[WHILE_BODY_I]]: +// NCRDIV-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// NCRDIV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// NCRDIV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // NCRDIV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA19]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA19]] // NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // 
NCRDIV-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // NCRDIV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] -// NCRDIV: _ZL4normiPKd.exit.loopexit: +// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL4NORMIPKD_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] +// NCRDIV: [[_ZL4NORMIPKD_EXIT_LOOPEXIT]]: // NCRDIV-NEXT: [[TMP1:%.*]] = tail call contract double @llvm.sqrt.f64(double [[ADD_I]]) -// NCRDIV-NEXT: br label [[_ZL4NORMIPKD_EXIT]] -// NCRDIV: _ZL4normiPKd.exit: -// NCRDIV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] +// NCRDIV-NEXT: br label %[[_ZL4NORMIPKD_EXIT]] +// NCRDIV: [[_ZL4NORMIPKD_EXIT]]: +// NCRDIV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] // NCRDIV-NEXT: ret double [[__R_0_I_LCSSA]] // -// AMDGCNSPIRV-LABEL: @test_norm( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL4NORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// AMDGCNSPIRV: while.body.i: -// AMDGCNSPIRV-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// AMDGCNSPIRV-LABEL: define spir_func double @test_norm( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], ptr addrspace(4) noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { +// 
AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL4NORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// AMDGCNSPIRV: [[WHILE_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // AMDGCNSPIRV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA19]] // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // AMDGCNSPIRV-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__A_ADDR_0_I3]], i64 8 // AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] -// AMDGCNSPIRV: _ZL4normiPKd.exit.loopexit: +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL4NORMIPKD_EXIT_LOOPEXIT:.*]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] +// AMDGCNSPIRV: [[_ZL4NORMIPKD_EXIT_LOOPEXIT]]: // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = tail call contract addrspace(4) double @llvm.sqrt.f64(double [[ADD_I]]) -// AMDGCNSPIRV-NEXT: br label [[_ZL4NORMIPKD_EXIT]] -// AMDGCNSPIRV: _ZL4normiPKd.exit: -// AMDGCNSPIRV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] +// AMDGCNSPIRV-NEXT: br label 
%[[_ZL4NORMIPKD_EXIT]] +// AMDGCNSPIRV: [[_ZL4NORMIPKD_EXIT]]: +// AMDGCNSPIRV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[_ZL4NORMIPKD_EXIT_LOOPEXIT]] ] // AMDGCNSPIRV-NEXT: ret double [[__R_0_I_LCSSA]] // extern "C" __device__ double test_norm(int x, const double *y) { return norm(x, y); } -// DEFAULT-LABEL: @test_powf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_powf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_powf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_powf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_powf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_powf( +// APPROX-SAME: float noundef 
[[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_powf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_powf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_powf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_powf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_pow_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_powf(float x, float y) { return powf(x, y); } -// DEFAULT-LABEL: @test_pow( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_pow( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// 
DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_pow( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_pow( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_pow( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_pow( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_pow( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_pow( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract 
noundef double @__ocml_pow_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_pow( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_pow( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_pow_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_pow(double x, double y) { return pow(x, y); } -// DEFAULT-LABEL: @test_powif( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_powif( +// DEFAULT-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X]], i32 noundef [[Y]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_powif( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_powif( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail 
call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X]], i32 noundef [[Y]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_powif( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_powif( +// APPROX-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X]], i32 noundef [[Y]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_powif( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_powif( +// NCRDIV-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X]], i32 noundef [[Y]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_powif( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_powif( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_pown_f32(float noundef [[X]], i32 noundef [[Y]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" 
__device__ float test_powif(float x, int y) { return powif(x, y); } -// DEFAULT-LABEL: @test_powi( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_powi( +// DEFAULT-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X]], i32 noundef [[Y]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_powi( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_powi( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X]], i32 noundef [[Y]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_powi( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_powi( +// APPROX-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X]], i32 noundef [[Y]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_powi( -// NCRDIV-NEXT: 
entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_powi( +// NCRDIV-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X]], i32 noundef [[Y]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_powi( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_powi( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_pown_f64(double noundef [[X]], i32 noundef [[Y]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_powi(double x, int y) { return powi(x, y); } -// DEFAULT-LABEL: @test_rcbrtf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_rcbrtf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_rcbrtf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) 
[[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_rcbrtf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_rcbrtf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_rcbrtf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_rcbrtf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_rcbrtf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rcbrtf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_rcbrtf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rcbrt_f32(float 
noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rcbrtf(float x) { return rcbrtf(x); } -// DEFAULT-LABEL: @test_rcbrt( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_rcbrt( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_rcbrt( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_rcbrt( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_rcbrt( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_rcbrt( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_rcbrt( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef 
[[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_rcbrt( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rcbrt( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_rcbrt( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rcbrt_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rcbrt(double x) { return rcbrt(x); } -// DEFAULT-LABEL: @test_remainderf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_remainderf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_remainderf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float 
@test_remainderf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_remainderf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_remainderf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_remainderf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_remainderf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_remainderf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_remainderf( +// AMDGCNSPIRV-SAME: float noundef 
[[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_remainder_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_remainderf(float x, float y) { return remainderf(x, y); } -// DEFAULT-LABEL: @test_remainder( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_remainder( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_remainder( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_remainder( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_remainder( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double 
@__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_remainder( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_remainder( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_remainder( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_remainder( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_remainder( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_remainder_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_remainder(double x, double y) { return remainder(x, y); } -// DEFAULT-LABEL: @test_remquof( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local noundef float @test_remquof( +// 
DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]] -// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X]], float noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] +// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_remquof( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_remquof( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef 
[[__TMP_I]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]] -// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] +// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_remquof( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local noundef float @test_remquof( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]] -// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X]], float noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] +// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] // APPROX-NEXT: call void 
@llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_remquof( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local noundef float @test_remquof( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA13]] -// NCRDIV-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA13]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X]], float noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA13]] +// NCRDIV-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA13]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_remquof( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_remquof( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4 // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void 
@llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA13]] -// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA13]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) float @__ocml_remquo_f32(float noundef [[X]], float noundef [[Y]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[INT_TBAA13]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z]], align 4, !tbaa [[INT_TBAA13]] // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // @@ -5414,54 +6068,59 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) { return remquof(x, y, z); } -// DEFAULT-LABEL: @test_remquo( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local noundef double @test_remquo( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]] -// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]] +// 
DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X]], double noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] +// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_remquo( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_remquo( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]] -// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] +// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) 
[[__TMP_I]]) #[[ATTR17]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_remquo( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local noundef double @test_remquo( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]] -// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X]], double noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] +// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_remquo( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local noundef double @test_remquo( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef 
[[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA13]] -// NCRDIV-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA13]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X]], double noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA13]] +// NCRDIV-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA13]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_remquo( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_remquo( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4 // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA13]] -// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA13]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) double @__ocml_remquo_f64(double noundef [[X]], double noundef [[Y]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: 
[[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[INT_TBAA13]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z]], align 4, !tbaa [[INT_TBAA13]] // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // @@ -5469,219 +6128,244 @@ extern "C" __device__ double test_remquo(double x, double y, int* z) { return remquo(x, y, z); } -// DEFAULT-LABEL: @test_rhypotf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_rhypotf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_rhypotf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_rhypotf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_rhypotf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float 
noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_rhypotf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_rhypotf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_rhypotf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rhypotf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_rhypotf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rhypot_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rhypotf(float x, float y) { return rhypotf(x, y); } -// DEFAULT-LABEL: @test_rhypot( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// 
DEFAULT-LABEL: define dso_local noundef double @test_rhypot( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_rhypot( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_rhypot( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_rhypot( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_rhypot( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_rhypot( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]] +// 
NCRDIV-LABEL: define dso_local noundef double @test_rhypot( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rhypot( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_rhypot( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rhypot_f64(double noundef [[X]], double noundef [[Y]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rhypot(double x, double y) { return rhypot(x, y); } -// DEFAULT-LABEL: @test_rintf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_rintf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_rintf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.rint.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_rintf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// 
FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.rint.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_rintf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_rintf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_rintf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_rintf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_rintf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.rint.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_rintf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.rint.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_rintf(float x) { return rintf(x); } -// DEFAULT-LABEL: @test_rint( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_rint( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// 
DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_rint( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_rint( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.rint.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_rint( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_rint( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_rint( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_rint( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_rint( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.rint.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_rint( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { 
+// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.rint.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_rint(double x) { return rint(x); } -// DEFAULT-LABEL: @test_rnormf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL6RNORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// DEFAULT: while.body.i: -// DEFAULT-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// DEFAULT-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// DEFAULT-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// DEFAULT-LABEL: define dso_local noundef float @test_rnormf( +// DEFAULT-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*]]: +// DEFAULT-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL6RNORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// DEFAULT: [[WHILE_BODY_I]]: +// DEFAULT-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// DEFAULT-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// DEFAULT-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // DEFAULT-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract 
float [[__R_0_I4]], [[MUL_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] -// DEFAULT: _ZL6rnormfiPKf.exit: -// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL6RNORMFIPKF_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] +// DEFAULT: [[_ZL6RNORMFIPKF_EXIT]]: +// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_rnormf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL6RNORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// FINITEONLY: while.body.i: -// FINITEONLY-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// FINITEONLY-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// FINITEONLY-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_rnormf( +// FINITEONLY-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*]]: +// FINITEONLY-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL6RNORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// FINITEONLY: [[WHILE_BODY_I]]: +// 
FINITEONLY-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// FINITEONLY-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// FINITEONLY-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // FINITEONLY-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] // FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] -// FINITEONLY: _ZL6rnormfiPKf.exit: -// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL6RNORMFIPKF_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] +// FINITEONLY: [[_ZL6RNORMFIPKF_EXIT]]: +// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_rnormf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// APPROX-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL6RNORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// APPROX: 
while.body.i: -// APPROX-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// APPROX-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// APPROX-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// APPROX-LABEL: define dso_local noundef float @test_rnormf( +// APPROX-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*]]: +// APPROX-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// APPROX-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL6RNORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// APPROX: [[WHILE_BODY_I]]: +// APPROX-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// APPROX-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// APPROX-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // APPROX-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] +// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] // APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] -// APPROX: _ZL6rnormfiPKf.exit: -// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label 
%[[_ZL6RNORMFIPKF_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] +// APPROX: [[_ZL6RNORMFIPKF_EXIT]]: +// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_rnormf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL6RNORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// NCRDIV: while.body.i: -// NCRDIV-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// NCRDIV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// NCRDIV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// NCRDIV-LABEL: define dso_local noundef float @test_rnormf( +// NCRDIV-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*]]: +// NCRDIV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL6RNORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// NCRDIV: [[WHILE_BODY_I]]: +// NCRDIV-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// NCRDIV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// NCRDIV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // NCRDIV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA17]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA17]] 
// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // NCRDIV-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // NCRDIV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] -// NCRDIV: _ZL6rnormfiPKf.exit: -// NCRDIV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL6RNORMFIPKF_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] +// NCRDIV: [[_ZL6RNORMFIPKF_EXIT]]: +// NCRDIV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rnormf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL6RNORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// AMDGCNSPIRV: while.body.i: -// AMDGCNSPIRV-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_rnormf( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], ptr addrspace(4) noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// 
AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL6RNORMFIPKF_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// AMDGCNSPIRV: [[WHILE_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[__R_0_I4:%.*]] = phi float [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // AMDGCNSPIRV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__A_ADDR_0_I3]], align 4, !tbaa [[FLOAT_TBAA17]] // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // AMDGCNSPIRV-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__A_ADDR_0_I3]], i64 4 // AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] -// AMDGCNSPIRV: _ZL6rnormfiPKf.exit: -// AMDGCNSPIRV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL6RNORMFIPKF_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] +// AMDGCNSPIRV: [[_ZL6RNORMFIPKF_EXIT]]: +// AMDGCNSPIRV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // @@ -5689,103 +6373,108 @@ extern "C" __device__ float test_rnormf(int x, 
const float* y) { return rnormf(x, y); } -// DEFAULT-LABEL: @test_rnorm( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5RNORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// DEFAULT: while.body.i: -// DEFAULT-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// DEFAULT-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// DEFAULT-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// DEFAULT-LABEL: define dso_local noundef double @test_rnorm( +// DEFAULT-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*]]: +// DEFAULT-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5RNORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// DEFAULT: [[WHILE_BODY_I]]: +// DEFAULT-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// DEFAULT-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// DEFAULT-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // DEFAULT-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// DEFAULT-NEXT: br i1 
[[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] -// DEFAULT: _ZL5rnormiPKd.exit: -// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5RNORMIPKD_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] +// DEFAULT: [[_ZL5RNORMIPKD_EXIT]]: +// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_rnorm( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5RNORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// FINITEONLY: while.body.i: -// FINITEONLY-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// FINITEONLY-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// FINITEONLY-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_rnorm( +// FINITEONLY-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*]]: +// FINITEONLY-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5RNORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// FINITEONLY: [[WHILE_BODY_I]]: +// FINITEONLY-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// FINITEONLY-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ 
[[Y]], %[[ENTRY]] ] +// FINITEONLY-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // FINITEONLY-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] // FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] -// FINITEONLY: _ZL5rnormiPKd.exit: -// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5RNORMIPKD_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] +// FINITEONLY: [[_ZL5RNORMIPKD_EXIT]]: +// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_rnorm( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// APPROX-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5RNORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// APPROX: while.body.i: -// APPROX-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// APPROX-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], 
[[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// APPROX-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// APPROX-LABEL: define dso_local noundef double @test_rnorm( +// APPROX-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*]]: +// APPROX-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// APPROX-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5RNORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// APPROX: [[WHILE_BODY_I]]: +// APPROX-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// APPROX-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// APPROX-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // APPROX-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] +// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] // APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] -// APPROX: _ZL5rnormiPKd.exit: -// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5RNORMIPKD_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] +// APPROX: [[_ZL5RNORMIPKD_EXIT]]: +// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], 
%[[WHILE_BODY_I]] ] // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_rnorm( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5RNORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// NCRDIV: while.body.i: -// NCRDIV-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// NCRDIV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// NCRDIV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// NCRDIV-LABEL: define dso_local noundef double @test_rnorm( +// NCRDIV-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*]]: +// NCRDIV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5RNORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// NCRDIV: [[WHILE_BODY_I]]: +// NCRDIV-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], %[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// NCRDIV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// NCRDIV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // NCRDIV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA19]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA19]] // NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // NCRDIV-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw 
i8, ptr [[__A_ADDR_0_I3]], i64 8 // NCRDIV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] -// NCRDIV: _ZL5rnormiPKd.exit: -// NCRDIV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// NCRDIV-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5RNORMIPKD_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] +// NCRDIV: [[_ZL5RNORMIPKD_EXIT]]: +// NCRDIV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rnorm( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X:%.*]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I1]], label [[_ZL5RNORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] -// AMDGCNSPIRV: while.body.i: -// AMDGCNSPIRV-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -// AMDGCNSPIRV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_rnorm( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], ptr addrspace(4) noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I1:%.*]] = icmp eq i32 [[X]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I1]], label %[[_ZL5RNORMIPKD_EXIT:.*]], label %[[WHILE_BODY_I:.*]] +// AMDGCNSPIRV: [[WHILE_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[__R_0_I4:%.*]] = phi double [ [[ADD_I:%.*]], 
%[[WHILE_BODY_I]] ], [ 0.000000e+00, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__A_ADDR_0_I3:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], %[[WHILE_BODY_I]] ], [ [[Y]], %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_0_I2:%.*]] = phi i32 [ [[DEC_I:%.*]], %[[WHILE_BODY_I]] ], [ [[X]], %[[ENTRY]] ] // AMDGCNSPIRV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_0_I2]], -1 -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__A_ADDR_0_I3]], align 8, !tbaa [[DOUBLE_TBAA19]] // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // AMDGCNSPIRV-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__A_ADDR_0_I3]], i64 8 // AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] -// AMDGCNSPIRV: _ZL5rnormiPKd.exit: -// AMDGCNSPIRV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label %[[_ZL5RNORMIPKD_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] +// AMDGCNSPIRV: [[_ZL5RNORMIPKD_EXIT]]: +// AMDGCNSPIRV-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_I]], %[[WHILE_BODY_I]] ] // AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // @@ -5793,383 +6482,466 @@ extern "C" __device__ double test_rnorm(int x, const double* y) { return rnorm(x, y); } -// DEFAULT-LABEL: @test_rnorm3df( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef 
[[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_rnorm3df( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_rnorm3df( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_rnorm3df( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]], float noundef nofpclass(nan inf) [[Z]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_rnorm3df( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_rnorm3df( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X]], float noundef 
[[Y]], float noundef [[Z]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_rnorm3df( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_rnorm3df( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rnorm3df( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_rnorm3df( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rlen3_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rnorm3df(float x, float y, float z) { return rnorm3df(x, y, z); } -// DEFAULT-LABEL: @test_rnorm3d( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_rnorm3d( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr 
#[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_rnorm3d( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_rnorm3d( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]], double noundef nofpclass(nan inf) [[Z]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_rnorm3d( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_rnorm3d( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_rnorm3d( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double 
@__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_rnorm3d( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rnorm3d( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_rnorm3d( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rlen3_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rnorm3d(double x, double y, double z) { return rnorm3d(x, y, z); } -// DEFAULT-LABEL: @test_rnorm4df( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef float @test_rnorm4df( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float 
noundef [[X]], float noundef [[Y]], float noundef [[Z]], float noundef [[W]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_rnorm4df( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_rnorm4df( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]], float noundef nofpclass(nan inf) [[Z]], float noundef nofpclass(nan inf) [[W]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_rnorm4df( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test_rnorm4df( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]], float noundef [[W]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_rnorm4df( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] 
= tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test_rnorm4df( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]], float noundef [[W]]) #[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rnorm4df( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_rnorm4df( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rlen4_f32(float noundef [[X]], float noundef [[Y]], float noundef [[Z]], float noundef [[W]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) { return rnorm4df(x, y, z, w); } -// DEFAULT-LABEL: @test_rnorm4d( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local noundef double @test_rnorm4d( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double 
noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]], double noundef [[W]]) #[[ATTR14]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_rnorm4d( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_rnorm4d( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]], double noundef nofpclass(nan inf) [[Z]], double noundef nofpclass(nan inf) [[W]]) #[[ATTR14]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_rnorm4d( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef double @test_rnorm4d( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X]], 
double noundef [[Y]], double noundef [[Z]], double noundef [[W]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_rnorm4d( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef double @test_rnorm4d( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]], double noundef [[W]]) #[[ATTR14]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rnorm4d( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_rnorm4d( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rlen4_f64(double noundef [[X]], double noundef [[Y]], double noundef [[Z]], double noundef [[W]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w) { return rnorm4d(x, y, z, w); } -// DEFAULT-LABEL: @test_roundf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local 
noundef float @test_roundf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_roundf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_roundf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.round.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_roundf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_roundf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_roundf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_roundf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_roundf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.round.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float 
@test_roundf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.round.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_roundf(float x) { return roundf(x); } -// DEFAULT-LABEL: @test_round( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.round.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_round( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.round.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_round( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_round( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.round.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_round( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.round.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_round( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.round.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_round( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef 
double @llvm.round.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_round( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.round.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_round( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.round.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_round( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.round.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_round(double x) { return round(x); } -// DEFAULT-LABEL: @test_rsqrtf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_rsqrtf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_rsqrtf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_rsqrtf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef 
nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_rsqrtf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_rsqrtf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_rsqrtf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_rsqrtf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rsqrtf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_rsqrtf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rsqrt_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_rsqrtf(float x) { return rsqrtf(x); } -// DEFAULT-LABEL: @test_rsqrt( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) 
#[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_rsqrt( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_rsqrt( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_rsqrt( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_rsqrt( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_rsqrt( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_rsqrt( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_rsqrt( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X]]) #[[ATTR15]] 
// NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_rsqrt( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_rsqrt( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rsqrt_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_rsqrt(double x) { return rsqrt(x); } -// DEFAULT-LABEL: @test_scalblnf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// DEFAULT-LABEL: define dso_local noundef float @test_scalblnf( +// DEFAULT-SAME: float noundef [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // DEFAULT-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X]], i32 [[CONV_I]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_scalblnf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_scalblnf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: 
[[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // FINITEONLY-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[CONV_I]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X]], i32 [[CONV_I]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_scalblnf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// APPROX-LABEL: define dso_local noundef float @test_scalblnf( +// APPROX-SAME: float noundef [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // APPROX-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X]], i32 [[CONV_I]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_scalblnf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// NCRDIV-LABEL: define dso_local noundef float @test_scalblnf( +// NCRDIV-SAME: float noundef [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // NCRDIV-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) +// NCRDIV-NEXT: [[TMP0:%.*]] = 
tail call contract noundef float @llvm.ldexp.f32.i32(float [[X]], i32 [[CONV_I]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_scalblnf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call addrspace(4) i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_scalblnf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call addrspace(4) i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ldexp.f32.i32(float [[X]], i32 [[CONV_I]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_scalblnf(float x, long int y) { return scalblnf(x, y); } -// DEFAULT-LABEL: @test_scalbln( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// DEFAULT-LABEL: define dso_local noundef double @test_scalbln( +// DEFAULT-SAME: double noundef [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // DEFAULT-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[CONV_I]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_scalbln( 
-// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_scalbln( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // FINITEONLY-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X:%.*]], i32 [[CONV_I]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X]], i32 [[CONV_I]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_scalbln( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// APPROX-LABEL: define dso_local noundef double @test_scalbln( +// APPROX-SAME: double noundef [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // APPROX-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[CONV_I]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_scalbln( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// NCRDIV-LABEL: define dso_local noundef double @test_scalbln( +// NCRDIV-SAME: double noundef [[X:%.*]], 
i64 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // NCRDIV-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[CONV_I]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_scalbln( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call addrspace(4) i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_scalbln( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], i64 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call addrspace(4) i64 @llvm.smax.i64(i64 [[Y]], i64 -2147483648) // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ldexp.f64.i32(double [[X]], i32 [[CONV_I]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_scalbln(double x, long int y) { return scalbln(x, y); } -// DEFAULT-LABEL: @test_scalbnf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_scalbnf( +// DEFAULT-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float 
@llvm.ldexp.f32.i32(float [[X]], i32 [[Y]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_scalbnf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_scalbnf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X]], i32 [[Y]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_scalbnf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_scalbnf( +// APPROX-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X]], i32 [[Y]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_scalbnf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_scalbnf( +// NCRDIV-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X]], i32 [[Y]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_scalbnf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef 
float @test_scalbnf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ldexp.f32.i32(float [[X]], i32 [[Y]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_scalbnf(float x, int y) { return scalbnf(x, y); } -// DEFAULT-LABEL: @test_scalbn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_scalbn( +// DEFAULT-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[Y]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_scalbn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_scalbn( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X]], i32 [[Y]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_scalbn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_scalbn( +// APPROX-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = 
tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[Y]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_scalbn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_scalbn( +// NCRDIV-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X]], i32 [[Y]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_scalbn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_scalbn( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ldexp.f64.i32(double [[X]], i32 [[Y]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_scalbn(double x, int y) { return scalbn(x, y); } -// CHECK-LABEL: @test___signbitf( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X:%.*]] to i32 -// CHECK-NEXT: [[DOTLOBIT:%.*]] = lshr i32 [[TMP0]], 31 -// CHECK-NEXT: ret i32 [[DOTLOBIT]] -// -// AMDGCNSPIRV-LABEL: @test___signbitf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = bitcast float [[X:%.*]] to i32 +// DEFAULT-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___signbitf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 +// DEFAULT-NEXT: [[DOTLOBIT:%.*]] = lshr i32 [[TMP0]], 31 +// DEFAULT-NEXT: 
ret i32 [[DOTLOBIT]] +// +// FINITEONLY-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___signbitf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 +// FINITEONLY-NEXT: [[DOTLOBIT:%.*]] = lshr i32 [[TMP0]], 31 +// FINITEONLY-NEXT: ret i32 [[DOTLOBIT]] +// +// APPROX-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___signbitf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 +// APPROX-NEXT: [[DOTLOBIT:%.*]] = lshr i32 [[TMP0]], 31 +// APPROX-NEXT: ret i32 [[DOTLOBIT]] +// +// NCRDIV-LABEL: define dso_local noundef range(i32 0, 2) i32 @test___signbitf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 +// NCRDIV-NEXT: [[DOTLOBIT:%.*]] = lshr i32 [[TMP0]], 31 +// NCRDIV-NEXT: ret i32 [[DOTLOBIT]] +// +// AMDGCNSPIRV-LABEL: define spir_func noundef range(i32 0, 2) i32 @test___signbitf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 // AMDGCNSPIRV-NEXT: [[DOTLOBIT:%.*]] = lshr i32 [[TMP0]], 31 // AMDGCNSPIRV-NEXT: ret i32 [[DOTLOBIT]] // @@ -6177,16 +6949,42 @@ extern "C" __device__ BOOL_TYPE test___signbitf(float x) { return __signbitf(x); } -// CHECK-LABEL: @test___signbit( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast double [[X:%.*]] to i64 -// CHECK-NEXT: [[DOTLOBIT:%.*]] = lshr i64 [[TMP0]], 63 -// CHECK-NEXT: [[CONV:%.*]] = trunc nuw nsw i64 [[DOTLOBIT]] to i32 -// CHECK-NEXT: ret i32 [[CONV]] +// DEFAULT-LABEL: define dso_local range(i32 0, 2) i32 @test___signbit( +// DEFAULT-SAME: double noundef [[X:%.*]]) 
local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = bitcast double [[X]] to i64 +// DEFAULT-NEXT: [[DOTLOBIT:%.*]] = lshr i64 [[TMP0]], 63 +// DEFAULT-NEXT: [[CONV:%.*]] = trunc nuw nsw i64 [[DOTLOBIT]] to i32 +// DEFAULT-NEXT: ret i32 [[CONV]] +// +// FINITEONLY-LABEL: define dso_local range(i32 0, 2) i32 @test___signbit( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = bitcast double [[X]] to i64 +// FINITEONLY-NEXT: [[DOTLOBIT:%.*]] = lshr i64 [[TMP0]], 63 +// FINITEONLY-NEXT: [[CONV:%.*]] = trunc nuw nsw i64 [[DOTLOBIT]] to i32 +// FINITEONLY-NEXT: ret i32 [[CONV]] +// +// APPROX-LABEL: define dso_local range(i32 0, 2) i32 @test___signbit( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = bitcast double [[X]] to i64 +// APPROX-NEXT: [[DOTLOBIT:%.*]] = lshr i64 [[TMP0]], 63 +// APPROX-NEXT: [[CONV:%.*]] = trunc nuw nsw i64 [[DOTLOBIT]] to i32 +// APPROX-NEXT: ret i32 [[CONV]] +// +// NCRDIV-LABEL: define dso_local range(i32 0, 2) i32 @test___signbit( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = bitcast double [[X]] to i64 +// NCRDIV-NEXT: [[DOTLOBIT:%.*]] = lshr i64 [[TMP0]], 63 +// NCRDIV-NEXT: [[CONV:%.*]] = trunc nuw nsw i64 [[DOTLOBIT]] to i32 +// NCRDIV-NEXT: ret i32 [[CONV]] // -// AMDGCNSPIRV-LABEL: @test___signbit( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = bitcast double [[X:%.*]] to i64 +// AMDGCNSPIRV-LABEL: define spir_func range(i32 0, 2) i32 @test___signbit( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = bitcast double [[X]] to i64 // AMDGCNSPIRV-NEXT: [[DOTLOBIT:%.*]] 
= lshr i64 [[TMP0]], 63 // AMDGCNSPIRV-NEXT: [[CONV:%.*]] = trunc nuw nsw i64 [[DOTLOBIT]] to i32 // AMDGCNSPIRV-NEXT: ret i32 [[CONV]] @@ -6195,59 +6993,64 @@ extern "C" __device__ BOOL_TYPE test___signbit(double x) { return __signbit(x); } -// DEFAULT-LABEL: @test_sincosf( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local void @test_sincosf( +// DEFAULT-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16]] +// DEFAULT-NEXT: store float [[TMP0]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // DEFAULT-NEXT: ret void // -// FINITEONLY-LABEL: @test_sincosf( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local void @test_sincosf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly 
captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincos_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] -// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincos_f32(float noundef nofpclass(nan inf) [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16]] +// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // FINITEONLY-NEXT: ret void // -// APPROX-LABEL: @test_sincosf( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local void @test_sincosf( +// APPROX-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) 
noundef [[__TMP_I]]) #[[ATTR16]] -// APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] +// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16]] +// APPROX-NEXT: store float [[TMP0]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // APPROX-NEXT: ret void // -// NCRDIV-LABEL: @test_sincosf( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local void @test_sincosf( +// NCRDIV-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// NCRDIV-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA17]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA17]] -// NCRDIV-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA17]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// NCRDIV-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA17]] +// NCRDIV-NEXT: 
[[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA17]] +// NCRDIV-NEXT: store float [[TMP0]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA17]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // NCRDIV-NEXT: ret void // -// AMDGCNSPIRV-LABEL: @test_sincosf( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func void @test_sincosf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4 // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA17]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA17]] -// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) float @__ocml_sincos_f32(float noundef [[X]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y]], align 4, !tbaa [[FLOAT_TBAA17]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[FLOAT_TBAA17]] +// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Z]], align 4, !tbaa [[FLOAT_TBAA17]] // AMDGCNSPIRV-NEXT: call addrspace(4) void 
@llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret void // @@ -6255,59 +7058,64 @@ extern "C" __device__ void test_sincosf(float x, float *y, float *z) { sincosf(x, y, z); } -// DEFAULT-LABEL: @test_sincos( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local void @test_sincos( +// DEFAULT-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] -// DEFAULT-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18]] +// DEFAULT-NEXT: store double [[TMP0]], ptr [[Z]], align 8, !tbaa [[DOUBLE_TBAA18]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // DEFAULT-NEXT: ret void // -// FINITEONLY-LABEL: @test_sincos( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local void @test_sincos( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr noundef writeonly captures(none) 
initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincos_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] -// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincos_f64(double noundef nofpclass(nan inf) [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18]] +// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z]], align 8, !tbaa [[DOUBLE_TBAA18]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // FINITEONLY-NEXT: ret void // -// APPROX-LABEL: @test_sincos( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local void @test_sincos( +// APPROX-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) 
noundef [[__TMP_I]]) #[[ATTR16]] -// APPROX-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] -// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] -// APPROX-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: store double [[CALL_I]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] +// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18]] +// APPROX-NEXT: store double [[TMP0]], ptr [[Z]], align 8, !tbaa [[DOUBLE_TBAA18]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // APPROX-NEXT: ret void // -// NCRDIV-LABEL: @test_sincos( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local void @test_sincos( +// NCRDIV-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// NCRDIV-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA19]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA19]] -// NCRDIV-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA19]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// NCRDIV-NEXT: store double [[CALL_I]], ptr [[Y]], align 8, !tbaa 
[[DOUBLE_TBAA19]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA19]] +// NCRDIV-NEXT: store double [[TMP0]], ptr [[Z]], align 8, !tbaa [[DOUBLE_TBAA19]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // NCRDIV-NEXT: ret void // -// AMDGCNSPIRV-LABEL: @test_sincos( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func void @test_sincos( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8 // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: store double [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 8, !tbaa [[TBAA19]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[TBAA19]] -// AMDGCNSPIRV-NEXT: store double [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) double @__ocml_sincos_f64(double noundef [[X]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store double [[CALL_I]], ptr addrspace(4) [[Y]], align 8, !tbaa [[DOUBLE_TBAA19]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[DOUBLE_TBAA19]] +// AMDGCNSPIRV-NEXT: store double [[TMP0]], ptr addrspace(4) [[Z]], align 8, !tbaa [[DOUBLE_TBAA19]] // 
AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret void // @@ -6315,59 +7123,64 @@ extern "C" __device__ void test_sincos(double x, double *y, double *z) { sincos(x, y, z); } -// DEFAULT-LABEL: @test_sincospif( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local void @test_sincospif( +// DEFAULT-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] -// DEFAULT-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16]] +// DEFAULT-NEXT: store float [[TMP0]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // DEFAULT-NEXT: ret void // -// FINITEONLY-LABEL: @test_sincospif( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local void @test_sincospif( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr 
noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincospi_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] -// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincospi_f32(float noundef nofpclass(nan inf) [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16]] +// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // FINITEONLY-NEXT: ret void // -// APPROX-LABEL: @test_sincospif( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local void @test_sincospif( +// APPROX-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef 
[[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]] -// APPROX-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] +// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA16]] +// APPROX-NEXT: store float [[TMP0]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // APPROX-NEXT: ret void // -// NCRDIV-LABEL: @test_sincospif( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local void @test_sincospif( +// NCRDIV-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// NCRDIV-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA17]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA17]] -// NCRDIV-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA17]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// NCRDIV-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, 
!tbaa [[FLOAT_TBAA17]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[FLOAT_TBAA17]] +// NCRDIV-NEXT: store float [[TMP0]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA17]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // NCRDIV-NEXT: ret void // -// AMDGCNSPIRV-LABEL: @test_sincospif( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func void @test_sincospif( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4 // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA17]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA17]] -// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) float @__ocml_sincospi_f32(float noundef [[X]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y]], align 4, !tbaa [[FLOAT_TBAA17]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[FLOAT_TBAA17]] +// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Z]], align 4, !tbaa [[FLOAT_TBAA17]] // 
AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret void // @@ -6375,59 +7188,64 @@ extern "C" __device__ void test_sincospif(float x, float *y, float *z) { sincospif(x, y, z); } -// DEFAULT-LABEL: @test_sincospi( -// DEFAULT-NEXT: entry: +// DEFAULT-LABEL: define dso_local void @test_sincospi( +// DEFAULT-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] -// DEFAULT-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18]] +// DEFAULT-NEXT: store double [[TMP0]], ptr [[Z]], align 8, !tbaa [[DOUBLE_TBAA18]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // DEFAULT-NEXT: ret void // -// FINITEONLY-LABEL: @test_sincospi( -// FINITEONLY-NEXT: entry: +// FINITEONLY-LABEL: define dso_local void @test_sincospi( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) 
[[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincospi_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] -// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincospi_f64(double noundef nofpclass(nan inf) [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18]] +// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z]], align 8, !tbaa [[DOUBLE_TBAA18]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // FINITEONLY-NEXT: ret void // -// APPROX-LABEL: @test_sincospi( -// APPROX-NEXT: entry: +// APPROX-LABEL: define dso_local void @test_sincospi( +// APPROX-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// APPROX-NEXT: [[CALL_I:%.*]] = call contract double 
@__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// APPROX-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]] -// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]] -// APPROX-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]] +// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// APPROX-NEXT: store double [[CALL_I]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA18]] +// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA18]] +// APPROX-NEXT: store double [[TMP0]], ptr [[Z]], align 8, !tbaa [[DOUBLE_TBAA18]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // APPROX-NEXT: ret void // -// NCRDIV-LABEL: @test_sincospi( -// NCRDIV-NEXT: entry: +// NCRDIV-LABEL: define dso_local void @test_sincospi( +// NCRDIV-SAME: double noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] -// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] -// NCRDIV-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA19]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA19]] -// NCRDIV-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA19]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]] +// 
NCRDIV-NEXT: store double [[CALL_I]], ptr [[Y]], align 8, !tbaa [[DOUBLE_TBAA19]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[DOUBLE_TBAA19]] +// NCRDIV-NEXT: store double [[TMP0]], ptr [[Z]], align 8, !tbaa [[DOUBLE_TBAA19]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]] // NCRDIV-NEXT: ret void // -// AMDGCNSPIRV-LABEL: @test_sincospi( -// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-LABEL: define spir_func void @test_sincospi( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 8)) [[Y:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 8)) [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8 // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: store double [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 8, !tbaa [[TBAA19]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[TBAA19]] -// AMDGCNSPIRV-NEXT: store double [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) double @__ocml_sincospi_f64(double noundef [[X]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store double [[CALL_I]], ptr addrspace(4) [[Y]], align 8, !tbaa [[DOUBLE_TBAA19]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[DOUBLE_TBAA19]] +// AMDGCNSPIRV-NEXT: store double 
[[TMP0]], ptr addrspace(4) [[Z]], align 8, !tbaa [[DOUBLE_TBAA19]] // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret void // @@ -6435,549 +7253,640 @@ extern "C" __device__ void test_sincospi(double x, double *y, double *z) { sincospi(x, y, z); } -// DEFAULT-LABEL: @test_sinf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_sinf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_sinf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_sinf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_sinf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_sinf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float 
[[CALL_I1]] // -// NCRDIV-LABEL: @test_sinf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_sinf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_sinf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_sinf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_sin_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_sinf(float x) { return sinf(x); } -// DEFAULT-LABEL: @test_sin( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_sin( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_sin( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double 
@test_sin( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_sin( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_sin( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_sin( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_sin( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_sin( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_sin( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_sin_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double 
test_sin(double x) { return sin(x); } -// DEFAULT-LABEL: @test_sinpif( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_sinpif( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_sinpif( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_sinpif( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_sinpif( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_sinpif( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_sinpif( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_sinpif( +// NCRDIV-SAME: float noundef 
[[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_sinpif( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_sinpif( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_sinpi_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_sinpif(float x) { return sinpif(x); } -// DEFAULT-LABEL: @test_sinpi( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_sinpi( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_sinpi( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_sinpi( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double 
@__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_sinpi( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_sinpi( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_sinpi( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_sinpi( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_sinpi( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_sinpi( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_sinpi_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_sinpi(double x) { return sinpi(x); } -// DEFAULT-LABEL: @test_sqrtf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define 
dso_local noundef float @test_sqrtf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_sqrtf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.sqrt.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_sqrtf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.sqrt.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_sqrtf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_sqrtf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_sqrtf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X:%.*]]), !fpmath [[META25:![0-9]+]] +// NCRDIV-LABEL: define dso_local noundef float @test_sqrtf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X]]), !fpmath [[META25:![0-9]+]] // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_sqrtf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.sqrt.f32(float [[X:%.*]]) +// 
AMDGCNSPIRV-LABEL: define spir_func noundef float @test_sqrtf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.sqrt.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_sqrtf(float x) { return sqrtf(x); } -// DEFAULT-LABEL: @test_sqrt( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_sqrt( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_sqrt( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_sqrt( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_sqrt( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_sqrt( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_sqrt( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] 
= tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_sqrt( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_sqrt( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.sqrt.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_sqrt( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.sqrt.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_sqrt(double x) { return sqrt(x); } -// DEFAULT-LABEL: @test_tanf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_tanf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_tanf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_tanf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract 
noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_tanf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_tanf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_tanf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_tanf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_tanf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_tanf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tan_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tanf(float x) { return tanf(x); } -// DEFAULT-LABEL: @test_tan( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: 
define dso_local noundef double @test_tan( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_tan( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_tan( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_tan( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_tan( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_tan( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_tan( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// 
AMDGCNSPIRV-LABEL: @test_tan( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_tan( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_tan_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tan(double x) { return tan(x); } -// DEFAULT-LABEL: @test_tanhf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test_tanhf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_tanhf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_tanhf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_tanhf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef 
[[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test_tanhf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_tanhf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test_tanhf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_tanhf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_tanhf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tanh_f32(float noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tanhf(float x) { return tanhf(x); } -// DEFAULT-LABEL: @test_tanh( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef double @test_tanh( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X]]) #[[ATTR15]] // 
DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_tanh( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_tanh( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_tanh( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef double @test_tanh( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_tanh( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef double @test_tanh( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_tanh( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double 
@test_tanh( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_tanh_f64(double noundef [[X]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tanh(double x) { return tanh(x); } -// DEFAULT-LABEL: @test_tgammaf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_tgammaf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_tgammaf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_tgammaf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_tgammaf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_tgammaf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract 
noundef float @__ocml_tgamma_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_tgammaf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_tgammaf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_tgammaf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_tgammaf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tgamma_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_tgammaf(float x) { return tgammaf(x); } -// DEFAULT-LABEL: @test_tgamma( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_tgamma( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_tgamma( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double 
@__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_tgamma( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_tgamma( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_tgamma( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_tgamma( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_tgamma( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_tgamma( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_tgamma( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = 
tail call contract spir_func noundef addrspace(4) double @__ocml_tgamma_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_tgamma(double x) { return tgamma(x); } -// DEFAULT-LABEL: @test_truncf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_truncf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_truncf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.trunc.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_truncf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.trunc.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_truncf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_truncf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_truncf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_truncf( +// NCRDIV-SAME: float noundef [[X:%.*]]) 
local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_truncf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.trunc.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_truncf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.trunc.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_truncf(float x) { return truncf(x); } -// DEFAULT-LABEL: @test_trunc( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_trunc( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_trunc( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.trunc.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_trunc( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.trunc.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_trunc( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double 
[[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_trunc( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_trunc( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_trunc( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_trunc( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.trunc.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_trunc( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.trunc.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_trunc(double x) { return trunc(x); } -// DEFAULT-LABEL: @test_y0f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_y0f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_y0f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan 
ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_y0f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_y0f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_y0f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_y0f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_y0f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_y0f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_y0f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func 
noundef addrspace(4) float @__ocml_y0_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_y0f(float x) { return y0f(x); } -// DEFAULT-LABEL: @test_y0( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef double @test_y0( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_y0( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_y0( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_y0( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_y0( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_y0( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef 
[[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_y0( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_y0( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_y0( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y0_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_y0(double x) { return y0(x); } -// DEFAULT-LABEL: @test_y1f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test_y1f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test_y1f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_y1f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail 
call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test_y1f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test_y1f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test_y1f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test_y1f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test_y1f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_y1f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y1_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test_y1f(float x) { return y1f(x); } -// DEFAULT-LABEL: @test_y1( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// 
DEFAULT-LABEL: define dso_local noundef double @test_y1( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret double [[CALL_I]] // -// FINITEONLY-LABEL: @test_y1( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_y1( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret double [[CALL_I]] // -// APPROX-LABEL: @test_y1( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef double @test_y1( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret double [[CALL_I]] // -// NCRDIV-LABEL: @test_y1( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef double @test_y1( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret double [[CALL_I]] // -// 
AMDGCNSPIRV-LABEL: @test_y1( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_y1( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y1_f64(double noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // extern "C" __device__ double test_y1(double x) { return y1(x); } -// DEFAULT-LABEL: @test_ynf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// DEFAULT-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// DEFAULT-LABEL: define dso_local float @test_ynf( +// DEFAULT-SAME: i32 noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// DEFAULT-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// DEFAULT-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // DEFAULT-NEXT: ] -// DEFAULT: if.then.i: -// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] -// DEFAULT-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] -// DEFAULT: if.then2.i: +// DEFAULT: [[IF_THEN_I]]: +// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: br label %[[_ZL3YNFIF_EXIT:.*]] +// DEFAULT: [[IF_THEN2_I]]: // DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] -// DEFAULT-NEXT: br label [[_ZL3YNFIF_EXIT]] -// DEFAULT: if.end4.i: +// DEFAULT-NEXT: br label %[[_ZL3YNFIF_EXIT]] +// DEFAULT: [[IF_END4_I]]: // DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call 
contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] // DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] -// DEFAULT: for.body.i: -// DEFAULT-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// DEFAULT-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3YNFIF_EXIT]] +// DEFAULT: [[FOR_BODY_I]]: +// DEFAULT-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] @@ -6985,32 +7894,33 @@ extern "C" __device__ double test_y1(double x) { // DEFAULT-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] // DEFAULT-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3YNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] -// DEFAULT: _ZL3ynfif.exit: -// DEFAULT-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// DEFAULT-NEXT: br i1 
[[EXITCOND_NOT]], label %[[_ZL3YNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] +// DEFAULT: [[_ZL3YNFIF_EXIT]]: +// DEFAULT-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // DEFAULT-NEXT: ret float [[RETVAL_0_I]] // -// FINITEONLY-LABEL: @test_ynf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// FINITEONLY-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test_ynf( +// FINITEONLY-SAME: i32 noundef [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// FINITEONLY-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// FINITEONLY-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // FINITEONLY-NEXT: ] -// FINITEONLY: if.then.i: -// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]] -// FINITEONLY-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] -// FINITEONLY: if.then2.i: +// FINITEONLY: [[IF_THEN_I]]: +// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: br label %[[_ZL3YNFIF_EXIT:.*]] +// FINITEONLY: [[IF_THEN2_I]]: // FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] -// FINITEONLY-NEXT: br label [[_ZL3YNFIF_EXIT]] -// FINITEONLY: if.end4.i: +// FINITEONLY-NEXT: br label %[[_ZL3YNFIF_EXIT]] +// FINITEONLY: [[IF_END4_I]]: // FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef 
nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] -// FINITEONLY: for.body.i: -// FINITEONLY-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3YNFIF_EXIT]] +// FINITEONLY: [[FOR_BODY_I]]: +// FINITEONLY-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[CONV_I]], [[Y]] @@ -7018,32 +7928,33 @@ extern "C" __device__ double test_y1(double x) { // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract float [[MUL8_I]], [[__X0_0_I2]] // FINITEONLY-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // FINITEONLY-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// FINITEONLY-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3YNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] -// FINITEONLY: _ZL3ynfif.exit: -// FINITEONLY-NEXT: [[RETVAL_0_I:%.*]] = phi float [ 
[[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// FINITEONLY-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3YNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] +// FINITEONLY: [[_ZL3YNFIF_EXIT]]: +// FINITEONLY-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // FINITEONLY-NEXT: ret float [[RETVAL_0_I]] // -// APPROX-LABEL: @test_ynf( -// APPROX-NEXT: entry: -// APPROX-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// APPROX-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// APPROX-LABEL: define dso_local float @test_ynf( +// APPROX-SAME: i32 noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// APPROX-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// APPROX-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // APPROX-NEXT: ] -// APPROX: if.then.i: -// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] -// APPROX-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] -// APPROX: if.then2.i: +// APPROX: [[IF_THEN_I]]: +// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: br label %[[_ZL3YNFIF_EXIT:.*]] +// APPROX: [[IF_THEN2_I]]: // APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] -// APPROX-NEXT: br label [[_ZL3YNFIF_EXIT]] -// APPROX: if.end4.i: +// APPROX-NEXT: br label %[[_ZL3YNFIF_EXIT]] +// APPROX: [[IF_END4_I]]: // APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] // APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float 
@__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] -// APPROX: for.body.i: -// APPROX-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// APPROX-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// APPROX-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// APPROX-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3YNFIF_EXIT]] +// APPROX: [[FOR_BODY_I]]: +// APPROX-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// APPROX-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// APPROX-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] @@ -7051,32 +7962,33 @@ extern "C" __device__ double test_y1(double x) { // APPROX-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] // APPROX-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // APPROX-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// APPROX-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3YNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] -// APPROX: _ZL3ynfif.exit: -// APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// APPROX-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3YNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] +// APPROX: [[_ZL3YNFIF_EXIT]]: +// APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi 
float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // APPROX-NEXT: ret float [[RETVAL_0_I]] // -// NCRDIV-LABEL: @test_ynf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// NCRDIV-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// NCRDIV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// NCRDIV-LABEL: define dso_local float @test_ynf( +// NCRDIV-SAME: i32 noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// NCRDIV-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// NCRDIV-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // NCRDIV-NEXT: ] -// NCRDIV: if.then.i: -// NCRDIV-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]] -// NCRDIV-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] -// NCRDIV: if.then2.i: +// NCRDIV: [[IF_THEN_I]]: +// NCRDIV-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] +// NCRDIV-NEXT: br label %[[_ZL3YNFIF_EXIT:.*]] +// NCRDIV: [[IF_THEN2_I]]: // NCRDIV-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] -// NCRDIV-NEXT: br label [[_ZL3YNFIF_EXIT]] -// NCRDIV: if.end4.i: +// NCRDIV-NEXT: br label %[[_ZL3YNFIF_EXIT]] +// NCRDIV: [[IF_END4_I]]: // NCRDIV-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]] // NCRDIV-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]] // NCRDIV-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// NCRDIV-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] -// NCRDIV: for.body.i: -// NCRDIV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// NCRDIV-NEXT: [[__X1_0_I3:%.*]] = 
phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// NCRDIV-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// NCRDIV-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3YNFIF_EXIT]] +// NCRDIV: [[FOR_BODY_I]]: +// NCRDIV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// NCRDIV-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// NCRDIV-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // NCRDIV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // NCRDIV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]], !fpmath [[META12]] @@ -7084,32 +7996,33 @@ extern "C" __device__ double test_y1(double x) { // NCRDIV-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] // NCRDIV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // NCRDIV-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// NCRDIV-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3YNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP26:![0-9]+]] -// NCRDIV: _ZL3ynfif.exit: -// NCRDIV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// NCRDIV-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3YNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP26:![0-9]+]] +// NCRDIV: [[_ZL3YNFIF_EXIT]]: +// NCRDIV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // NCRDIV-NEXT: ret float [[RETVAL_0_I]] // -// AMDGCNSPIRV-LABEL: @test_ynf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// 
AMDGCNSPIRV-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// AMDGCNSPIRV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func float @test_ynf( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// AMDGCNSPIRV-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// AMDGCNSPIRV-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // AMDGCNSPIRV-NEXT: ] -// AMDGCNSPIRV: if.then.i: -// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] -// AMDGCNSPIRV: if.then2.i: +// AMDGCNSPIRV: [[IF_THEN_I]]: +// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label %[[_ZL3YNFIF_EXIT:.*]] +// AMDGCNSPIRV: [[IF_THEN2_I]]: // AMDGCNSPIRV-NEXT: [[CALL_I22_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: br label [[_ZL3YNFIF_EXIT]] -// AMDGCNSPIRV: if.end4.i: +// AMDGCNSPIRV-NEXT: br label %[[_ZL3YNFIF_EXIT]] +// AMDGCNSPIRV: [[IF_END4_I]]: // AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CALL_I21_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// AMDGCNSPIRV-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] -// AMDGCNSPIRV: for.body.i: -// AMDGCNSPIRV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// AMDGCNSPIRV-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], 
[[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// AMDGCNSPIRV-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL3YNFIF_EXIT]] +// AMDGCNSPIRV: [[FOR_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] @@ -7117,36 +8030,37 @@ extern "C" __device__ double test_y1(double x) { // AMDGCNSPIRV-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] // AMDGCNSPIRV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // AMDGCNSPIRV-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL3YNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] -// AMDGCNSPIRV: _ZL3ynfif.exit: -// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL3YNFIF_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] +// AMDGCNSPIRV: [[_ZL3YNFIF_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // AMDGCNSPIRV-NEXT: ret float [[RETVAL_0_I]] // extern "C" __device__ float test_ynf(int x, float y) { return ynf(x, y); } -// DEFAULT-LABEL: 
@test_yn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// DEFAULT-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// DEFAULT-LABEL: define dso_local double @test_yn( +// DEFAULT-SAME: i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// DEFAULT-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// DEFAULT-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // DEFAULT-NEXT: ] -// DEFAULT: if.then.i: -// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] -// DEFAULT-NEXT: br label [[_ZL2YNID_EXIT:%.*]] -// DEFAULT: if.then2.i: +// DEFAULT: [[IF_THEN_I]]: +// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] +// DEFAULT-NEXT: br label %[[_ZL2YNID_EXIT:.*]] +// DEFAULT: [[IF_THEN2_I]]: // DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] -// DEFAULT-NEXT: br label [[_ZL2YNID_EXIT]] -// DEFAULT: if.end4.i: +// DEFAULT-NEXT: br label %[[_ZL2YNID_EXIT]] +// DEFAULT: [[IF_END4_I]]: // DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] // DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] // DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] -// DEFAULT: for.body.i: -// DEFAULT-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ 
[[CALL_I_I]], [[IF_END4_I]] ] +// DEFAULT-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2YNID_EXIT]] +// DEFAULT: [[FOR_BODY_I]]: +// DEFAULT-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] @@ -7154,32 +8068,33 @@ extern "C" __device__ float test_ynf(int x, float y) { // DEFAULT-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] // DEFAULT-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2YNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] -// DEFAULT: _ZL2ynid.exit: -// DEFAULT-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2YNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] +// DEFAULT: [[_ZL2YNID_EXIT]]: +// DEFAULT-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // DEFAULT-NEXT: ret double [[RETVAL_0_I]] // -// FINITEONLY-LABEL: @test_yn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// FINITEONLY-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) 
double @test_yn( +// FINITEONLY-SAME: i32 noundef [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// FINITEONLY-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// FINITEONLY-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // FINITEONLY-NEXT: ] -// FINITEONLY: if.then.i: -// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]] -// FINITEONLY-NEXT: br label [[_ZL2YNID_EXIT:%.*]] -// FINITEONLY: if.then2.i: +// FINITEONLY: [[IF_THEN_I]]: +// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] +// FINITEONLY-NEXT: br label %[[_ZL2YNID_EXIT:.*]] +// FINITEONLY: [[IF_THEN2_I]]: // FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] -// FINITEONLY-NEXT: br label [[_ZL2YNID_EXIT]] -// FINITEONLY: if.end4.i: +// FINITEONLY-NEXT: br label %[[_ZL2YNID_EXIT]] +// FINITEONLY: [[IF_END4_I]]: // FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] -// FINITEONLY: for.body.i: -// FINITEONLY-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], 
[[IF_END4_I]] ] -// FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2YNID_EXIT]] +// FINITEONLY: [[FOR_BODY_I]]: +// FINITEONLY-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double [[CONV_I]], [[Y]] @@ -7187,32 +8102,33 @@ extern "C" __device__ float test_ynf(int x, float y) { // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract double [[MUL8_I]], [[__X0_0_I2]] // FINITEONLY-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // FINITEONLY-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// FINITEONLY-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2YNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] -// FINITEONLY: _ZL2ynid.exit: -// FINITEONLY-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// FINITEONLY-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2YNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] +// FINITEONLY: [[_ZL2YNID_EXIT]]: +// FINITEONLY-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // FINITEONLY-NEXT: ret double [[RETVAL_0_I]] // -// APPROX-LABEL: @test_yn( -// APPROX-NEXT: entry: -// APPROX-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ 
-// APPROX-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// APPROX-LABEL: define dso_local double @test_yn( +// APPROX-SAME: i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// APPROX-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// APPROX-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // APPROX-NEXT: ] -// APPROX: if.then.i: -// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] -// APPROX-NEXT: br label [[_ZL2YNID_EXIT:%.*]] -// APPROX: if.then2.i: +// APPROX: [[IF_THEN_I]]: +// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] +// APPROX-NEXT: br label %[[_ZL2YNID_EXIT:.*]] +// APPROX: [[IF_THEN2_I]]: // APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] -// APPROX-NEXT: br label [[_ZL2YNID_EXIT]] -// APPROX: if.end4.i: +// APPROX-NEXT: br label %[[_ZL2YNID_EXIT]] +// APPROX: [[IF_END4_I]]: // APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] // APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] // APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] -// APPROX: for.body.i: -// APPROX-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// APPROX-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// APPROX-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// APPROX-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2YNID_EXIT]] +// APPROX: 
[[FOR_BODY_I]]: +// APPROX-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// APPROX-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// APPROX-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], %[[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] @@ -7220,32 +8136,33 @@ extern "C" __device__ float test_ynf(int x, float y) { // APPROX-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] // APPROX-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // APPROX-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// APPROX-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2YNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] -// APPROX: _ZL2ynid.exit: -// APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// APPROX-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2YNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] +// APPROX: [[_ZL2YNID_EXIT]]: +// APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // APPROX-NEXT: ret double [[RETVAL_0_I]] // -// NCRDIV-LABEL: @test_yn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// NCRDIV-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// NCRDIV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// NCRDIV-LABEL: define dso_local double @test_yn( +// NCRDIV-SAME: i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: switch i32 [[X]], label 
%[[IF_END4_I:.*]] [ +// NCRDIV-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// NCRDIV-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // NCRDIV-NEXT: ] -// NCRDIV: if.then.i: -// NCRDIV-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]] -// NCRDIV-NEXT: br label [[_ZL2YNID_EXIT:%.*]] -// NCRDIV: if.then2.i: +// NCRDIV: [[IF_THEN_I]]: +// NCRDIV-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] +// NCRDIV-NEXT: br label %[[_ZL2YNID_EXIT:.*]] +// NCRDIV: [[IF_THEN2_I]]: // NCRDIV-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] -// NCRDIV-NEXT: br label [[_ZL2YNID_EXIT]] -// NCRDIV: if.end4.i: +// NCRDIV-NEXT: br label %[[_ZL2YNID_EXIT]] +// NCRDIV: [[IF_END4_I]]: // NCRDIV-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]] // NCRDIV-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]] // NCRDIV-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// NCRDIV-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] -// NCRDIV: for.body.i: -// NCRDIV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// NCRDIV-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// NCRDIV-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// NCRDIV-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2YNID_EXIT]] +// NCRDIV: [[FOR_BODY_I]]: +// NCRDIV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// NCRDIV-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// NCRDIV-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ 
[[CALL_I_I]], %[[IF_END4_I]] ] // NCRDIV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // NCRDIV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] @@ -7253,32 +8170,33 @@ extern "C" __device__ float test_ynf(int x, float y) { // NCRDIV-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] // NCRDIV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // NCRDIV-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// NCRDIV-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2YNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP27:![0-9]+]] -// NCRDIV: _ZL2ynid.exit: -// NCRDIV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// NCRDIV-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2YNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP27:![0-9]+]] +// NCRDIV: [[_ZL2YNID_EXIT]]: +// NCRDIV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // NCRDIV-NEXT: ret double [[RETVAL_0_I]] // -// AMDGCNSPIRV-LABEL: @test_yn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ -// AMDGCNSPIRV-NEXT: i32 0, label [[IF_THEN_I:%.*]] -// AMDGCNSPIRV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func double @test_yn( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: switch i32 [[X]], label %[[IF_END4_I:.*]] [ +// AMDGCNSPIRV-NEXT: i32 0, label %[[IF_THEN_I:.*]] +// AMDGCNSPIRV-NEXT: i32 1, label %[[IF_THEN2_I:.*]] // AMDGCNSPIRV-NEXT: ] -// AMDGCNSPIRV: if.then.i: -// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) 
double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: br label [[_ZL2YNID_EXIT:%.*]] -// AMDGCNSPIRV: if.then2.i: +// AMDGCNSPIRV: [[IF_THEN_I]]: +// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label %[[_ZL2YNID_EXIT:.*]] +// AMDGCNSPIRV: [[IF_THEN2_I]]: // AMDGCNSPIRV-NEXT: [[CALL_I22_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: br label [[_ZL2YNID_EXIT]] -// AMDGCNSPIRV: if.end4.i: +// AMDGCNSPIRV-NEXT: br label %[[_ZL2YNID_EXIT]] +// AMDGCNSPIRV: [[IF_END4_I]]: // AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CALL_I21_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1 -// AMDGCNSPIRV-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] -// AMDGCNSPIRV: for.body.i: -// AMDGCNSPIRV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] -// AMDGCNSPIRV-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] -// AMDGCNSPIRV-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[CMP7_I1]], label %[[FOR_BODY_I:.*]], label %[[_ZL2YNID_EXIT]] +// AMDGCNSPIRV: [[FOR_BODY_I]]: +// AMDGCNSPIRV-NEXT: [[__I_0_I4:%.*]] = phi i32 [ [[INC_I:%.*]], %[[FOR_BODY_I]] ], [ 1, %[[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], %[[FOR_BODY_I]] ], [ [[CALL_I_I]], 
%[[IF_END4_I]] ] // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] @@ -7286,71 +8204,81 @@ extern "C" __device__ float test_ynf(int x, float y) { // AMDGCNSPIRV-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] // AMDGCNSPIRV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_0_I4]], 1 // AMDGCNSPIRV-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_I]], [[X]] -// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT]], label [[_ZL2YNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP26:![0-9]+]] -// AMDGCNSPIRV: _ZL2ynid.exit: -// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], [[IF_THEN_I]] ], [ [[CALL_I22_I]], [[IF_THEN2_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT]], label %[[_ZL2YNID_EXIT]], label %[[FOR_BODY_I]], !llvm.loop [[LOOP26:![0-9]+]] +// AMDGCNSPIRV: [[_ZL2YNID_EXIT]]: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I20_I]], %[[IF_THEN_I]] ], [ [[CALL_I22_I]], %[[IF_THEN2_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ], [ [[SUB_I]], %[[FOR_BODY_I]] ] // AMDGCNSPIRV-NEXT: ret double [[RETVAL_0_I]] // extern "C" __device__ double test_yn(int x, double y) { return yn(x, y); } -// DEFAULT-LABEL: @test___cosf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test___cosf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test___cosf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan 
ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___cosf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test___cosf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test___cosf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test___cosf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test___cosf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test___cosf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___cosf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: 
[[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___cosf(float x) { return __cosf(x); } -// DEFAULT-LABEL: @test___exp10f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x400A934F00000000 +// DEFAULT-LABEL: define dso_local noundef float @test___exp10f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], 0x400A934F00000000 // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test___exp10f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[X:%.*]], 0x400A934F00000000 +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___exp10f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[X]], 0x400A934F00000000 // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test___exp10f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x400A934F00000000 +// APPROX-LABEL: define dso_local noundef float @test___exp10f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], 0x400A934F00000000 // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // APPROX-NEXT: ret float [[TMP0]] // -// 
NCRDIV-LABEL: @test___exp10f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x400A934F00000000 +// NCRDIV-LABEL: define dso_local noundef float @test___exp10f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], 0x400A934F00000000 // NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test___exp10f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x400A934F00000000 +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___exp10f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], 0x400A934F00000000 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // @@ -7358,33 +8286,38 @@ extern "C" __device__ float test___exp10f(float x) { return __exp10f(x); } -// DEFAULT-LABEL: @test___expf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x3FF7154760000000 +// DEFAULT-LABEL: define dso_local noundef float @test___expf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], 0x3FF7154760000000 // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test___expf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[X:%.*]], 0x3FF7154760000000 +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float 
@test___expf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[X]], 0x3FF7154760000000 // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test___expf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x3FF7154760000000 +// APPROX-LABEL: define dso_local noundef float @test___expf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], 0x3FF7154760000000 // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test___expf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x3FF7154760000000 +// NCRDIV-LABEL: define dso_local noundef float @test___expf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], 0x3FF7154760000000 // NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test___expf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x3FF7154760000000 +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___expf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], 0x3FF7154760000000 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.exp2.f32(float 
[[MUL_I]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // @@ -7392,389 +8325,454 @@ extern "C" __device__ float test___expf(float x) { return __expf(x); } -// DEFAULT-LABEL: @test___fadd_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[ADD_I:%.*]] = fadd contract float [[X:%.*]], [[Y:%.*]] +// DEFAULT-LABEL: define dso_local noundef float @test___fadd_rn( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[ADD_I:%.*]] = fadd contract float [[X]], [[Y]] // DEFAULT-NEXT: ret float [[ADD_I]] // -// FINITEONLY-LABEL: @test___fadd_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[ADD_I:%.*]] = fadd nnan ninf contract float [[X:%.*]], [[Y:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test___fadd_rn( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[ADD_I:%.*]] = fadd nnan ninf contract float [[X]], [[Y]] // FINITEONLY-NEXT: ret float [[ADD_I]] // -// APPROX-LABEL: @test___fadd_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[ADD_I:%.*]] = fadd contract float [[X:%.*]], [[Y:%.*]] +// APPROX-LABEL: define dso_local noundef float @test___fadd_rn( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[ADD_I:%.*]] = fadd contract float [[X]], [[Y]] // APPROX-NEXT: ret float [[ADD_I]] // -// NCRDIV-LABEL: @test___fadd_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[ADD_I:%.*]] = fadd contract float [[X:%.*]], [[Y:%.*]] +// NCRDIV-LABEL: define dso_local noundef float @test___fadd_rn( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[ADD_I:%.*]] = fadd contract float [[X]], [[Y]] // NCRDIV-NEXT: ret float [[ADD_I]] // -// 
AMDGCNSPIRV-LABEL: @test___fadd_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[ADD_I:%.*]] = fadd contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___fadd_rn( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[ADD_I:%.*]] = fadd contract float [[X]], [[Y]] // AMDGCNSPIRV-NEXT: ret float [[ADD_I]] // extern "C" __device__ float test___fadd_rn(float x, float y) { return __fadd_rn(x, y); } -// DEFAULT-LABEL: @test___fdividef( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] +// DEFAULT-LABEL: define dso_local noundef float @test___fdividef( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]] // DEFAULT-NEXT: ret float [[DIV_I]] // -// FINITEONLY-LABEL: @test___fdividef( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[X:%.*]], [[Y:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test___fdividef( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[X]], [[Y]] // FINITEONLY-NEXT: ret float [[DIV_I]] // -// APPROX-LABEL: @test___fdividef( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] +// APPROX-LABEL: define dso_local noundef float @test___fdividef( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]] // APPROX-NEXT: ret float [[DIV_I]] // -// 
NCRDIV-LABEL: @test___fdividef( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]], !fpmath [[META12]] +// NCRDIV-LABEL: define dso_local noundef float @test___fdividef( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]], !fpmath [[META12]] // NCRDIV-NEXT: ret float [[DIV_I]] // -// AMDGCNSPIRV-LABEL: @test___fdividef( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___fdividef( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]] // AMDGCNSPIRV-NEXT: ret float [[DIV_I]] // extern "C" __device__ float test___fdividef(float x, float y) { return __fdividef(x, y); } -// DEFAULT-LABEL: @test__fmaf_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test__fmaf_rn( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test__fmaf_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]], float nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test__fmaf_rn( +// FINITEONLY-SAME: 
float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float nofpclass(nan inf) [[X]], float nofpclass(nan inf) [[Y]], float nofpclass(nan inf) [[Z]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test__fmaf_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test__fmaf_rn( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test__fmaf_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test__fmaf_rn( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test__fmaf_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test__fmaf_rn( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// 
AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test__fmaf_rn(float x, float y, float z) { return __fmaf_rn(x, y, z); } -// DEFAULT-LABEL: @test___fmul_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], [[Y:%.*]] +// DEFAULT-LABEL: define dso_local noundef float @test___fmul_rn( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], [[Y]] // DEFAULT-NEXT: ret float [[MUL_I]] // -// FINITEONLY-LABEL: @test___fmul_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[X:%.*]], [[Y:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test___fmul_rn( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[X]], [[Y]] // FINITEONLY-NEXT: ret float [[MUL_I]] // -// APPROX-LABEL: @test___fmul_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], [[Y:%.*]] +// APPROX-LABEL: define dso_local noundef float @test___fmul_rn( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], [[Y]] // APPROX-NEXT: ret float [[MUL_I]] // -// NCRDIV-LABEL: @test___fmul_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], [[Y:%.*]] +// NCRDIV-LABEL: define dso_local noundef float @test___fmul_rn( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) 
local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], [[Y]] // NCRDIV-NEXT: ret float [[MUL_I]] // -// AMDGCNSPIRV-LABEL: @test___fmul_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___fmul_rn( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X]], [[Y]] // AMDGCNSPIRV-NEXT: ret float [[MUL_I]] // extern "C" __device__ float test___fmul_rn(float x, float y) { return __fmul_rn(x, y); } -// DEFAULT-LABEL: @test___frcp_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X:%.*]] +// DEFAULT-LABEL: define dso_local noundef float @test___frcp_rn( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X]] // DEFAULT-NEXT: ret float [[DIV_I]] // -// FINITEONLY-LABEL: @test___frcp_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float 1.000000e+00, [[X:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test___frcp_rn( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float 1.000000e+00, [[X]] // FINITEONLY-NEXT: ret float [[DIV_I]] // -// APPROX-LABEL: @test___frcp_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X:%.*]] +// APPROX-LABEL: define dso_local noundef float @test___frcp_rn( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: 
[[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X]] // APPROX-NEXT: ret float [[DIV_I]] // -// NCRDIV-LABEL: @test___frcp_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X:%.*]], !fpmath [[META12]] +// NCRDIV-LABEL: define dso_local noundef float @test___frcp_rn( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X]], !fpmath [[META12]] // NCRDIV-NEXT: ret float [[DIV_I]] // -// AMDGCNSPIRV-LABEL: @test___frcp_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___frcp_rn( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X]] // AMDGCNSPIRV-NEXT: ret float [[DIV_I]] // extern "C" __device__ float test___frcp_rn(float x) { return __frcp_rn(x); } -// DEFAULT-LABEL: @test___frsqrt_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test___frsqrt_rn( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test___frsqrt_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.rsq.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___frsqrt_rn( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: 
[[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.rsq.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test___frsqrt_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test___frsqrt_rn( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test___frsqrt_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test___frsqrt_rn( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test___frsqrt_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.rsq.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___frsqrt_rn( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.rsq.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test___frsqrt_rn(float x) { return __frsqrt_rn(x); } -// DEFAULT-LABEL: @test___fsqrt_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// DEFAULT-LABEL: define dso_local 
noundef float @test___fsqrt_rn( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X]]) #[[ATTR14]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test___fsqrt_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___fsqrt_rn( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test___fsqrt_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// APPROX-LABEL: define dso_local noundef float @test___fsqrt_rn( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test___fsqrt_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// NCRDIV-LABEL: define dso_local noundef float @test___fsqrt_rn( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X]]) 
#[[ATTR14]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test___fsqrt_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___fsqrt_rn( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR4]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sqrt_f32(float noundef [[X]]) #[[ATTR12]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___fsqrt_rn(float x) { return __fsqrt_rn(x); } -// DEFAULT-LABEL: @test___fsub_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[SUB_I:%.*]] = fsub contract float [[X:%.*]], [[Y:%.*]] +// DEFAULT-LABEL: define dso_local noundef float @test___fsub_rn( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[SUB_I:%.*]] = fsub contract float [[X]], [[Y]] // DEFAULT-NEXT: ret float [[SUB_I]] // -// FINITEONLY-LABEL: @test___fsub_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[SUB_I:%.*]] = fsub nnan ninf contract float [[X:%.*]], [[Y:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test___fsub_rn( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[SUB_I:%.*]] = fsub nnan ninf contract float [[X]], [[Y]] // FINITEONLY-NEXT: ret float [[SUB_I]] // -// APPROX-LABEL: @test___fsub_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[SUB_I:%.*]] = fsub contract float [[X:%.*]], [[Y:%.*]] +// APPROX-LABEL: define dso_local noundef float @test___fsub_rn( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) 
local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[SUB_I:%.*]] = fsub contract float [[X]], [[Y]] // APPROX-NEXT: ret float [[SUB_I]] // -// NCRDIV-LABEL: @test___fsub_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[SUB_I:%.*]] = fsub contract float [[X:%.*]], [[Y:%.*]] +// NCRDIV-LABEL: define dso_local noundef float @test___fsub_rn( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[SUB_I:%.*]] = fsub contract float [[X]], [[Y]] // NCRDIV-NEXT: ret float [[SUB_I]] // -// AMDGCNSPIRV-LABEL: @test___fsub_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[SUB_I:%.*]] = fsub contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___fsub_rn( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[SUB_I:%.*]] = fsub contract float [[X]], [[Y]] // AMDGCNSPIRV-NEXT: ret float [[SUB_I]] // extern "C" __device__ float test___fsub_rn(float x, float y) { return __fsub_rn(x, y); } -// DEFAULT-LABEL: @test___log10f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test___log10f( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test___log10f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___log10f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) 
local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test___log10f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test___log10f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test___log10f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test___log10f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test___log10f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log10.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___log10f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log10.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test___log10f(float x) { return __log10f(x); } -// DEFAULT-LABEL: @test___log2f( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test___log2f( +// 
DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test___log2f( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.log.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___log2f( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.log.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test___log2f( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test___log2f( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test___log2f( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test___log2f( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test___log2f( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.log.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func 
noundef float @test___log2f( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.log.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test___log2f(float x) { return __log2f(x); } -// DEFAULT-LABEL: @test___logf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test___logf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test___logf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___logf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test___logf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test___logf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test___logf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float 
@llvm.log.f32(float [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test___logf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test___logf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log.f32(float [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___logf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log.f32(float [[X]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test___logf(float x) { return __logf(x); } -// DEFAULT-LABEL: @test___powf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// DEFAULT-LABEL: define dso_local noundef float @test___powf( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test___powf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___powf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) 
local_unnamed_addr #[[ATTR5]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test___powf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// APPROX-LABEL: define dso_local noundef float @test___powf( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test___powf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]] +// NCRDIV-LABEL: define dso_local noundef float @test___powf( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR5]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR15]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test___powf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___powf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR5]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef 
addrspace(4) float @__ocml_pow_f32(float noundef [[X]], float noundef [[Y]]) #[[ATTR13]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___powf(float x, float y) { return __powf(x, y); } -// DEFAULT-LABEL: @test___saturatef( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X:%.*]], 0.000000e+00 +// DEFAULT-LABEL: define dso_local noundef float @test___saturatef( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X]], 0.000000e+00 // DEFAULT-NEXT: [[CMP1_I:%.*]] = fcmp contract ogt float [[X]], 1.000000e+00 // DEFAULT-NEXT: [[COND_I:%.*]] = select contract i1 [[CMP1_I]], float 1.000000e+00, float [[X]] // DEFAULT-NEXT: [[COND5_I:%.*]] = select contract i1 [[CMP_I]], float 0.000000e+00, float [[COND_I]] // DEFAULT-NEXT: ret float [[COND5_I]] // -// FINITEONLY-LABEL: @test___saturatef( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CMP_I:%.*]] = fcmp nnan ninf contract olt float [[X:%.*]], 0.000000e+00 +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float @test___saturatef( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CMP_I:%.*]] = fcmp nnan ninf contract olt float [[X]], 0.000000e+00 // FINITEONLY-NEXT: [[CMP1_I:%.*]] = fcmp nnan ninf contract ogt float [[X]], 1.000000e+00 // FINITEONLY-NEXT: [[COND_I:%.*]] = select nnan ninf contract i1 [[CMP1_I]], float 1.000000e+00, float [[X]] // FINITEONLY-NEXT: [[COND5_I:%.*]] = select nnan ninf contract i1 [[CMP_I]], float 0.000000e+00, float [[COND_I]] // FINITEONLY-NEXT: ret float [[COND5_I]] // -// APPROX-LABEL: @test___saturatef( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X:%.*]], 0.000000e+00 +// APPROX-LABEL: define dso_local noundef float @test___saturatef( +// APPROX-SAME: float 
noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X]], 0.000000e+00 // APPROX-NEXT: [[CMP1_I:%.*]] = fcmp contract ogt float [[X]], 1.000000e+00 // APPROX-NEXT: [[COND_I:%.*]] = select contract i1 [[CMP1_I]], float 1.000000e+00, float [[X]] // APPROX-NEXT: [[COND5_I:%.*]] = select contract i1 [[CMP_I]], float 0.000000e+00, float [[COND_I]] // APPROX-NEXT: ret float [[COND5_I]] // -// NCRDIV-LABEL: @test___saturatef( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X:%.*]], 0.000000e+00 +// NCRDIV-LABEL: define dso_local noundef float @test___saturatef( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X]], 0.000000e+00 // NCRDIV-NEXT: [[CMP1_I:%.*]] = fcmp contract ogt float [[X]], 1.000000e+00 // NCRDIV-NEXT: [[COND_I:%.*]] = select contract i1 [[CMP1_I]], float 1.000000e+00, float [[X]] // NCRDIV-NEXT: [[COND5_I:%.*]] = select contract i1 [[CMP_I]], float 0.000000e+00, float [[COND_I]] // NCRDIV-NEXT: ret float [[COND5_I]] // -// AMDGCNSPIRV-LABEL: @test___saturatef( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X:%.*]], 0.000000e+00 +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___saturatef( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X]], 0.000000e+00 // AMDGCNSPIRV-NEXT: [[CMP1_I:%.*]] = fcmp contract ogt float [[X]], 1.000000e+00 // AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = select contract i1 [[CMP1_I]], float 1.000000e+00, float [[X]] // AMDGCNSPIRV-NEXT: [[COND5_I:%.*]] = select contract i1 [[CMP_I]], float 0.000000e+00, float [[COND_I]] @@ -7784,114 +8782,129 @@ extern "C" __device__ float test___saturatef(float x) { return 
__saturatef(x); } -// DEFAULT-LABEL: @test___sincosf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] -// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] +// DEFAULT-LABEL: define dso_local void @test___sincosf( +// DEFAULT-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] +// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] // DEFAULT-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] -// DEFAULT-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// DEFAULT-NEXT: store float [[CALL1_I]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // DEFAULT-NEXT: ret void // -// FINITEONLY-LABEL: @test___sincosf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] -// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] +// FINITEONLY-LABEL: define dso_local void @test___sincosf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] +// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa 
[[FLOAT_TBAA16]] // FINITEONLY-NEXT: [[CALL1_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] -// FINITEONLY-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// FINITEONLY-NEXT: store float [[CALL1_I]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // FINITEONLY-NEXT: ret void // -// APPROX-LABEL: @test___sincosf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] -// APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]] +// APPROX-LABEL: define dso_local void @test___sincosf( +// APPROX-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] +// APPROX-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA16]] // APPROX-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] -// APPROX-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] +// APPROX-NEXT: store float [[CALL1_I]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA16]] // APPROX-NEXT: ret void // -// NCRDIV-LABEL: @test___sincosf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] -// NCRDIV-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA17]] +// NCRDIV-LABEL: define dso_local void @test___sincosf( +// NCRDIV-SAME: float noundef [[X:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr #[[ATTR6]] { +// 
NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] +// NCRDIV-NEXT: store float [[CALL_I]], ptr [[Y]], align 4, !tbaa [[FLOAT_TBAA17]] // NCRDIV-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] -// NCRDIV-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA17]] +// NCRDIV-NEXT: store float [[CALL1_I]], ptr [[Z]], align 4, !tbaa [[FLOAT_TBAA17]] // NCRDIV-NEXT: ret void // -// AMDGCNSPIRV-LABEL: @test___sincosf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func addrspace(4) float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-LABEL: define spir_func void @test___sincosf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]], ptr addrspace(4) noundef writeonly captures(none) initializes((0, 4)) [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func addrspace(4) float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y]], align 4, !tbaa [[FLOAT_TBAA17]] // AMDGCNSPIRV-NEXT: [[CALL1_I:%.*]] = tail call contract spir_func addrspace(4) float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: store float [[CALL1_I]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: store float [[CALL1_I]], ptr addrspace(4) [[Z]], align 4, !tbaa [[FLOAT_TBAA17]] // AMDGCNSPIRV-NEXT: ret void // extern "C" __device__ void test___sincosf(float x, float *y, float *z) { __sincosf(x, y, z); } -// DEFAULT-LABEL: @test___sinf( -// DEFAULT-NEXT: entry: -// 
DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local noundef float @test___sinf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: ret float [[CALL_I]] // -// FINITEONLY-LABEL: @test___sinf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test___sinf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: ret float [[CALL_I]] // -// APPROX-LABEL: @test___sinf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local noundef float @test___sinf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: ret float [[CALL_I]] // -// NCRDIV-LABEL: @test___sinf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local noundef float @test___sinf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: 
[[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: ret float [[CALL_I]] // -// AMDGCNSPIRV-LABEL: @test___sinf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___sinf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // extern "C" __device__ float test___sinf(float x) { return __sinf(x); } -// DEFAULT-LABEL: @test___tanf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// DEFAULT-LABEL: define dso_local float @test___tanf( +// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] // DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]] // DEFAULT-NEXT: ret float [[MUL_I]] // -// FINITEONLY-LABEL: @test___tanf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) float 
@test___tanf( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]] // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[CALL_I3_I]], [[TMP0]] // FINITEONLY-NEXT: ret float [[MUL_I]] // -// APPROX-LABEL: @test___tanf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// APPROX-LABEL: define dso_local float @test___tanf( +// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] // APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]] // APPROX-NEXT: ret float [[MUL_I]] // -// NCRDIV-LABEL: @test___tanf( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]] +// NCRDIV-LABEL: define dso_local float @test___tanf( +// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float 
noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]] // NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]] // NCRDIV-NEXT: ret float [[MUL_I]] // -// AMDGCNSPIRV-LABEL: @test___tanf( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[CALL_I3_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-LABEL: define spir_func float @test___tanf( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR6]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[CALL_I3_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sin_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]]) // AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]] @@ -7901,319 +8914,491 @@ extern "C" __device__ float test___tanf(float x) { return __tanf(x); } -// DEFAULT-LABEL: @test___dadd_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[ADD_I:%.*]] = fadd contract double [[X:%.*]], [[Y:%.*]] +// DEFAULT-LABEL: define dso_local noundef double @test___dadd_rn( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[ADD_I:%.*]] = fadd contract double [[X]], [[Y]] // DEFAULT-NEXT: ret double [[ADD_I]] // -// FINITEONLY-LABEL: @test___dadd_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[ADD_I:%.*]] = fadd nnan ninf contract double [[X:%.*]], [[Y:%.*]] +// FINITEONLY-LABEL: 
define dso_local nofpclass(nan inf) double @test___dadd_rn( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[ADD_I:%.*]] = fadd nnan ninf contract double [[X]], [[Y]] // FINITEONLY-NEXT: ret double [[ADD_I]] // -// APPROX-LABEL: @test___dadd_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[ADD_I:%.*]] = fadd contract double [[X:%.*]], [[Y:%.*]] +// APPROX-LABEL: define dso_local noundef double @test___dadd_rn( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[ADD_I:%.*]] = fadd contract double [[X]], [[Y]] // APPROX-NEXT: ret double [[ADD_I]] // -// NCRDIV-LABEL: @test___dadd_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[ADD_I:%.*]] = fadd contract double [[X:%.*]], [[Y:%.*]] +// NCRDIV-LABEL: define dso_local noundef double @test___dadd_rn( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[ADD_I:%.*]] = fadd contract double [[X]], [[Y]] // NCRDIV-NEXT: ret double [[ADD_I]] // -// AMDGCNSPIRV-LABEL: @test___dadd_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[ADD_I:%.*]] = fadd contract double [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test___dadd_rn( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[ADD_I:%.*]] = fadd contract double [[X]], [[Y]] // AMDGCNSPIRV-NEXT: ret double [[ADD_I]] // extern "C" __device__ double test___dadd_rn(double x, double y) { return __dadd_rn(x, y); } -// DEFAULT-LABEL: @test___ddiv_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X:%.*]], [[Y:%.*]] +// DEFAULT-LABEL: define 
dso_local noundef double @test___ddiv_rn( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X]], [[Y]] // DEFAULT-NEXT: ret double [[DIV_I]] // -// FINITEONLY-LABEL: @test___ddiv_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double [[X:%.*]], [[Y:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) double @test___ddiv_rn( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double [[X]], [[Y]] // FINITEONLY-NEXT: ret double [[DIV_I]] // -// APPROX-LABEL: @test___ddiv_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X:%.*]], [[Y:%.*]] +// APPROX-LABEL: define dso_local noundef double @test___ddiv_rn( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X]], [[Y]] // APPROX-NEXT: ret double [[DIV_I]] // -// NCRDIV-LABEL: @test___ddiv_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X:%.*]], [[Y:%.*]] +// NCRDIV-LABEL: define dso_local noundef double @test___ddiv_rn( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X]], [[Y]] // NCRDIV-NEXT: ret double [[DIV_I]] // -// AMDGCNSPIRV-LABEL: @test___ddiv_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test___ddiv_rn( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef 
[[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X]], [[Y]] // AMDGCNSPIRV-NEXT: ret double [[DIV_I]] // extern "C" __device__ double test___ddiv_rn(double x, double y) { return __ddiv_rn(x, y); } -// DEFAULT-LABEL: @test___dmul_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[X:%.*]], [[Y:%.*]] +// DEFAULT-LABEL: define dso_local noundef double @test___dmul_rn( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[X]], [[Y]] // DEFAULT-NEXT: ret double [[MUL_I]] // -// FINITEONLY-LABEL: @test___dmul_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[X:%.*]], [[Y:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) double @test___dmul_rn( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[X]], [[Y]] // FINITEONLY-NEXT: ret double [[MUL_I]] // -// APPROX-LABEL: @test___dmul_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[X:%.*]], [[Y:%.*]] +// APPROX-LABEL: define dso_local noundef double @test___dmul_rn( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[X]], [[Y]] // APPROX-NEXT: ret double [[MUL_I]] // -// NCRDIV-LABEL: @test___dmul_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract double [[X:%.*]], [[Y:%.*]] +// NCRDIV-LABEL: define dso_local noundef double @test___dmul_rn( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) 
local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[MUL_I:%.*]] = fmul contract double [[X]], [[Y]] // NCRDIV-NEXT: ret double [[MUL_I]] // -// AMDGCNSPIRV-LABEL: @test___dmul_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract double [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test___dmul_rn( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract double [[X]], [[Y]] // AMDGCNSPIRV-NEXT: ret double [[MUL_I]] // extern "C" __device__ double test___dmul_rn(double x, double y) { return __dmul_rn(x, y); } -// DEFAULT-LABEL: @test___drcp_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X:%.*]] +// DEFAULT-LABEL: define dso_local noundef double @test___drcp_rn( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X]] // DEFAULT-NEXT: ret double [[DIV_I]] // -// FINITEONLY-LABEL: @test___drcp_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double 1.000000e+00, [[X:%.*]] +// FINITEONLY-LABEL: define dso_local nofpclass(nan inf) double @test___drcp_rn( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double 1.000000e+00, [[X]] // FINITEONLY-NEXT: ret double [[DIV_I]] // -// APPROX-LABEL: @test___drcp_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X:%.*]] +// APPROX-LABEL: define dso_local noundef double @test___drcp_rn( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: 
[[ENTRY:.*:]] +// APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X]] // APPROX-NEXT: ret double [[DIV_I]] // -// NCRDIV-LABEL: @test___drcp_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X:%.*]] +// NCRDIV-LABEL: define dso_local noundef double @test___drcp_rn( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X]] // NCRDIV-NEXT: ret double [[DIV_I]] // -// AMDGCNSPIRV-LABEL: @test___drcp_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X:%.*]] +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test___drcp_rn( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X]] // AMDGCNSPIRV-NEXT: ret double [[DIV_I]] // extern "C" __device__ double test___drcp_rn(double x) { return __drcp_rn(x); } -// DEFAULT-LABEL: @test___dsqrt_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test___dsqrt_rn( +// DEFAULT-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test___dsqrt_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test___dsqrt_rn( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: 
[[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test___dsqrt_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test___dsqrt_rn( +// APPROX-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test___dsqrt_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test___dsqrt_rn( +// NCRDIV-SAME: double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test___dsqrt_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.sqrt.f64(double [[X:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test___dsqrt_rn( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.sqrt.f64(double [[X]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test___dsqrt_rn(double x) { return __dsqrt_rn(x); } -// DEFAULT-LABEL: @test__fma_rn( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double 
@test__fma_rn( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test__fma_rn( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test__fma_rn( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X]], double nofpclass(nan inf) [[Y]], double nofpclass(nan inf) [[Z]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test__fma_rn( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test__fma_rn( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test__fma_rn( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test__fma_rn( +// 
NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test__fma_rn( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test__fma_rn( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X]], double [[Y]], double [[Z]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test__fma_rn(double x, double y, double z) { return __fma_rn(x, y, z); } -// DEFAULT-LABEL: @test_float_min( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_float_min( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X]], float [[Y]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_float_min( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_float_min( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef 
nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float nofpclass(nan inf) [[X]], float nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_float_min( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_float_min( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X]], float [[Y]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_float_min( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_float_min( +// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X]], float [[Y]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_float_min( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_float_min( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.minnum.f32(float [[X]], float [[Y]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_float_min(float x, float y) { return 
min(x, y); } -// DEFAULT-LABEL: @test_float_max( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef float @test_float_max( +// DEFAULT-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X]], float [[Y]]) // DEFAULT-NEXT: ret float [[TMP0]] // -// FINITEONLY-LABEL: @test_float_max( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float @test_float_max( +// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float nofpclass(nan inf) [[X]], float nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret float [[TMP0]] // -// APPROX-LABEL: @test_float_max( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef float @test_float_max( +// APPROX-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X]], float [[Y]]) // APPROX-NEXT: ret float [[TMP0]] // -// NCRDIV-LABEL: @test_float_max( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef float @test_float_max( 
+// NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X]], float [[Y]]) // NCRDIV-NEXT: ret float [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_float_max( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_float_max( +// AMDGCNSPIRV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.maxnum.f32(float [[X]], float [[Y]]) // AMDGCNSPIRV-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_float_max(float x, float y) { return max(x, y); } -// DEFAULT-LABEL: @test_double_min( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_double_min( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X]], double [[Y]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_double_min( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_double_min( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] 
+// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double nofpclass(nan inf) [[X]], double nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_double_min( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_double_min( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X]], double [[Y]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_double_min( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_double_min( +// NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X]], double [[Y]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_double_min( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_double_min( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.minnum.f64(double [[X]], double [[Y]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_double_min(double x, double y) { return min(x, y); } -// DEFAULT-LABEL: @test_double_max( -// 
DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// DEFAULT-LABEL: define dso_local noundef double @test_double_max( +// DEFAULT-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X]], double [[Y]]) // DEFAULT-NEXT: ret double [[TMP0]] // -// FINITEONLY-LABEL: @test_double_max( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) double @test_double_max( +// FINITEONLY-SAME: double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// FINITEONLY-NEXT: [[ENTRY:.*:]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double nofpclass(nan inf) [[X]], double nofpclass(nan inf) [[Y]]) // FINITEONLY-NEXT: ret double [[TMP0]] // -// APPROX-LABEL: @test_double_max( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// APPROX-LABEL: define dso_local noundef double @test_double_max( +// APPROX-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// APPROX-NEXT: [[ENTRY:.*:]] +// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X]], double [[Y]]) // APPROX-NEXT: ret double [[TMP0]] // -// NCRDIV-LABEL: @test_double_max( -// NCRDIV-NEXT: entry: -// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// NCRDIV-LABEL: define dso_local noundef double @test_double_max( +// 
NCRDIV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NCRDIV-NEXT: [[ENTRY:.*:]] +// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X]], double [[Y]]) // NCRDIV-NEXT: ret double [[TMP0]] // -// AMDGCNSPIRV-LABEL: @test_double_max( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef double @test_double_max( +// AMDGCNSPIRV-SAME: double noundef [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.maxnum.f64(double [[X]], double [[Y]]) // AMDGCNSPIRV-NEXT: ret double [[TMP0]] // extern "C" __device__ double test_double_max(double x, double y) { return max(x, y); } -// CHECK-LABEL: @test_int_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COND_I:%.*]] = tail call noundef i32 @llvm.smin.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) +// CHECK-LABEL: define dso_local noundef i32 @test_int_min( +// CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[COND_I:%.*]] = tail call noundef i32 @llvm.smin.i32(i32 [[X]], i32 [[Y]]) // CHECK-NEXT: ret i32 [[COND_I]] // -// AMDGCNSPIRV-LABEL: @test_int_min( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = tail call noundef addrspace(4) i32 @llvm.smin.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef i32 @test_int_min( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = tail call noundef addrspace(4) i32 @llvm.smin.i32(i32 [[X]], i32 [[Y]]) // AMDGCNSPIRV-NEXT: ret i32 [[COND_I]] // 
extern "C" __device__ int test_int_min(int x, int y) { return min(x, y); } -// CHECK-LABEL: @test_int_max( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COND_I:%.*]] = tail call noundef i32 @llvm.smax.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) +// CHECK-LABEL: define dso_local noundef i32 @test_int_max( +// CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[COND_I:%.*]] = tail call noundef i32 @llvm.smax.i32(i32 [[X]], i32 [[Y]]) // CHECK-NEXT: ret i32 [[COND_I]] // -// AMDGCNSPIRV-LABEL: @test_int_max( -// AMDGCNSPIRV-NEXT: entry: -// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = tail call noundef addrspace(4) i32 @llvm.smax.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-LABEL: define spir_func noundef i32 @test_int_max( +// AMDGCNSPIRV-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR3]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = tail call noundef addrspace(4) i32 @llvm.smax.i32(i32 [[X]], i32 [[Y]]) // AMDGCNSPIRV-NEXT: ret i32 [[COND_I]] // extern "C" __device__ int test_int_max(int x, int y) { return max(x, y); } +//. 
+// DEFAULT: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// DEFAULT: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// DEFAULT: [[META6]] = !{!"Simple C++ TBAA"} +// DEFAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +// DEFAULT: [[META8]] = !{!"llvm.loop.mustprogress"} +// DEFAULT: [[META9]] = !{!"llvm.loop.unroll.disable"} +// DEFAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} +// DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} +// DEFAULT: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// DEFAULT: [[META13]] = !{!"int", [[META5]], i64 0} +// DEFAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META8]], [[META9]]} +// DEFAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} +// DEFAULT: [[FLOAT_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// DEFAULT: [[META17]] = !{!"float", [[META5]], i64 0} +// DEFAULT: [[DOUBLE_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +// DEFAULT: [[META19]] = !{!"double", [[META5]], i64 0} +// DEFAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META8]], [[META9]]} +// DEFAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META8]], [[META9]]} +// DEFAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META8]], [[META9]]} +// DEFAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META8]], [[META9]]} +// DEFAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META8]], [[META9]]} +// DEFAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META8]], [[META9]]} +//. 
+// FINITEONLY: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// FINITEONLY: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// FINITEONLY: [[META6]] = !{!"Simple C++ TBAA"} +// FINITEONLY: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +// FINITEONLY: [[META8]] = !{!"llvm.loop.mustprogress"} +// FINITEONLY: [[META9]] = !{!"llvm.loop.unroll.disable"} +// FINITEONLY: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} +// FINITEONLY: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} +// FINITEONLY: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// FINITEONLY: [[META13]] = !{!"int", [[META5]], i64 0} +// FINITEONLY: [[LOOP14]] = distinct !{[[LOOP14]], [[META8]], [[META9]]} +// FINITEONLY: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} +// FINITEONLY: [[FLOAT_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// FINITEONLY: [[META17]] = !{!"float", [[META5]], i64 0} +// FINITEONLY: [[DOUBLE_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +// FINITEONLY: [[META19]] = !{!"double", [[META5]], i64 0} +// FINITEONLY: [[LOOP20]] = distinct !{[[LOOP20]], [[META8]], [[META9]]} +// FINITEONLY: [[LOOP21]] = distinct !{[[LOOP21]], [[META8]], [[META9]]} +// FINITEONLY: [[LOOP22]] = distinct !{[[LOOP22]], [[META8]], [[META9]]} +// FINITEONLY: [[LOOP23]] = distinct !{[[LOOP23]], [[META8]], [[META9]]} +// FINITEONLY: [[LOOP24]] = distinct !{[[LOOP24]], [[META8]], [[META9]]} +// FINITEONLY: [[LOOP25]] = distinct !{[[LOOP25]], [[META8]], [[META9]]} +//. 
+// APPROX: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// APPROX: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// APPROX: [[META6]] = !{!"Simple C++ TBAA"} +// APPROX: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +// APPROX: [[META8]] = !{!"llvm.loop.mustprogress"} +// APPROX: [[META9]] = !{!"llvm.loop.unroll.disable"} +// APPROX: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} +// APPROX: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} +// APPROX: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// APPROX: [[META13]] = !{!"int", [[META5]], i64 0} +// APPROX: [[LOOP14]] = distinct !{[[LOOP14]], [[META8]], [[META9]]} +// APPROX: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} +// APPROX: [[FLOAT_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// APPROX: [[META17]] = !{!"float", [[META5]], i64 0} +// APPROX: [[DOUBLE_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +// APPROX: [[META19]] = !{!"double", [[META5]], i64 0} +// APPROX: [[LOOP20]] = distinct !{[[LOOP20]], [[META8]], [[META9]]} +// APPROX: [[LOOP21]] = distinct !{[[LOOP21]], [[META8]], [[META9]]} +// APPROX: [[LOOP22]] = distinct !{[[LOOP22]], [[META8]], [[META9]]} +// APPROX: [[LOOP23]] = distinct !{[[LOOP23]], [[META8]], [[META9]]} +// APPROX: [[LOOP24]] = distinct !{[[LOOP24]], [[META8]], [[META9]]} +// APPROX: [[LOOP25]] = distinct !{[[LOOP25]], [[META8]], [[META9]]} +//. 
+// NCRDIV: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// NCRDIV: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// NCRDIV: [[META6]] = !{!"Simple C++ TBAA"} +// NCRDIV: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +// NCRDIV: [[META8]] = !{!"llvm.loop.mustprogress"} +// NCRDIV: [[META9]] = !{!"llvm.loop.unroll.disable"} +// NCRDIV: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} +// NCRDIV: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} +// NCRDIV: [[META12]] = !{float 2.500000e+00} +// NCRDIV: [[INT_TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} +// NCRDIV: [[META14]] = !{!"int", [[META5]], i64 0} +// NCRDIV: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} +// NCRDIV: [[LOOP16]] = distinct !{[[LOOP16]], [[META8]], [[META9]]} +// NCRDIV: [[FLOAT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// NCRDIV: [[META18]] = !{!"float", [[META5]], i64 0} +// NCRDIV: [[DOUBLE_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} +// NCRDIV: [[META20]] = !{!"double", [[META5]], i64 0} +// NCRDIV: [[LOOP21]] = distinct !{[[LOOP21]], [[META8]], [[META9]]} +// NCRDIV: [[LOOP22]] = distinct !{[[LOOP22]], [[META8]], [[META9]]} +// NCRDIV: [[LOOP23]] = distinct !{[[LOOP23]], [[META8]], [[META9]]} +// NCRDIV: [[LOOP24]] = distinct !{[[LOOP24]], [[META8]], [[META9]]} +// NCRDIV: [[META25]] = !{float 3.000000e+00} +// NCRDIV: [[LOOP26]] = distinct !{[[LOOP26]], [[META8]], [[META9]]} +// NCRDIV: [[LOOP27]] = distinct !{[[LOOP27]], [[META8]], [[META9]]} +//. 
+// AMDGCNSPIRV: [[CHAR_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +// AMDGCNSPIRV: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// AMDGCNSPIRV: [[META7]] = !{!"Simple C++ TBAA"} +// AMDGCNSPIRV: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]} +// AMDGCNSPIRV: [[META9]] = !{!"llvm.loop.mustprogress"} +// AMDGCNSPIRV: [[META10]] = !{!"llvm.loop.unroll.disable"} +// AMDGCNSPIRV: [[LOOP11]] = distinct !{[[LOOP11]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[LOOP12]] = distinct !{[[LOOP12]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[INT_TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} +// AMDGCNSPIRV: [[META14]] = !{!"int", [[META6]], i64 0} +// AMDGCNSPIRV: [[LOOP15]] = distinct !{[[LOOP15]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[LOOP16]] = distinct !{[[LOOP16]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[FLOAT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// AMDGCNSPIRV: [[META18]] = !{!"float", [[META6]], i64 0} +// AMDGCNSPIRV: [[DOUBLE_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} +// AMDGCNSPIRV: [[META20]] = !{!"double", [[META6]], i64 0} +// AMDGCNSPIRV: [[LOOP21]] = distinct !{[[LOOP21]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[LOOP22]] = distinct !{[[LOOP22]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[LOOP23]] = distinct !{[[LOOP23]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[LOOP24]] = distinct !{[[LOOP24]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[LOOP25]] = distinct !{[[LOOP25]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[LOOP26]] = distinct !{[[LOOP26]], [[META9]], [[META10]]} +//. 
diff --git a/clang/test/Headers/arm-acle-no-direct-include.c b/clang/test/Headers/arm-acle-no-direct-include.c new file mode 100644 index 0000000000000..d6a01a7417f0c --- /dev/null +++ b/clang/test/Headers/arm-acle-no-direct-include.c @@ -0,0 +1,8 @@ +// RUN: %clang_cl --target=aarch64-windows-msvc -Xclang -verify /E -U__STDC_HOSTED__ -Wno-builtin-macro-redefined -- %s 2>&1 | FileCheck %s + +// expected-no-diagnostics + +// CHECK: void __yield(void); +#include +void f() { __yield(); } + diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c index d27756259fa2f..7f427ca313ddc 100644 --- a/clang/test/Headers/wasm.c +++ b/clang/test/Headers/wasm.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: webassembly-registered-target, asserts // FIXME: This should not be using -O2 and implicitly testing the entire IR opt pipeline. 
@@ -7,18 +7,20 @@ #include -// CHECK-LABEL: @test_v128_load( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2:![0-9]+]] +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2:![0-9]+]] // CHECK-NEXT: ret <4 x i32> [[TMP0]] // v128_t test_v128_load(const void *mem) { return wasm_v128_load(mem); } -// CHECK-LABEL: @test_v128_load8_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load8_splat( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[VECINIT16_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VECINIT16_I]] to <4 x i32> @@ -28,9 +30,10 @@ v128_t test_v128_load8_splat(const void *mem) { return wasm_v128_load8_splat(mem); } -// CHECK-LABEL: @test_v128_load16_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load16_splat( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 // CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 
x i32> zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT8_I]] to <4 x i32> @@ -40,9 +43,10 @@ v128_t test_v128_load16_splat(const void *mem) { return wasm_v128_load16_splat(mem); } -// CHECK-LABEL: @test_v128_load32_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load32_splat( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 // CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: ret <4 x i32> [[VECINIT4_I]] @@ -51,9 +55,10 @@ v128_t test_v128_load32_splat(const void *mem) { return wasm_v128_load32_splat(mem); } -// CHECK-LABEL: @test_v128_load64_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load64_splat( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 // CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT2_I]] to <4 x i32> @@ -63,9 +68,10 @@ v128_t test_v128_load64_splat(const void *mem) { return wasm_v128_load64_splat(mem); } -// CHECK-LABEL: @test_i16x8_load8x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> 
@test_i16x8_load8x8( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[CONV_I:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -74,9 +80,10 @@ v128_t test_i16x8_load8x8(const void *mem) { return wasm_i16x8_load8x8(mem); } -// CHECK-LABEL: @test_u16x8_load8x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_load8x8( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[CONV_I:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -85,9 +92,10 @@ v128_t test_u16x8_load8x8(const void *mem) { return wasm_u16x8_load8x8(mem); } -// CHECK-LABEL: @test_i32x4_load16x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden range(i32 -32768, 32768) <4 x i32> @test_i32x4_load16x4( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[CONV_I:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[CONV_I]] // @@ -95,9 +103,10 @@ v128_t test_i32x4_load16x4(const void *mem) { return wasm_i32x4_load16x4(mem); } -// CHECK-LABEL: @test_u32x4_load16x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr 
[[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden range(i32 0, 65536) <4 x i32> @test_u32x4_load16x4( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[CONV_I:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[CONV_I]] // @@ -105,9 +114,10 @@ v128_t test_u32x4_load16x4(const void *mem) { return wasm_u32x4_load16x4(mem); } -// CHECK-LABEL: @test_i64x2_load32x2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_load32x2( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[CONV_I:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -116,9 +126,10 @@ v128_t test_i64x2_load32x2(const void *mem) { return wasm_i64x2_load32x2(mem); } -// CHECK-LABEL: @test_u64x2_load32x2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_load32x2( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[CONV_I:%.*]] = zext <2 x i32> [[TMP0]] to <2 x i64> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -127,9 +138,10 @@ v128_t test_u64x2_load32x2(const void *mem) { return wasm_u64x2_load32x2(mem); } -// CHECK-LABEL: 
@test_v128_load32_zero( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load32_zero( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i64 0 // CHECK-NEXT: ret <4 x i32> [[VECINIT4_I]] // @@ -137,9 +149,10 @@ v128_t test_v128_load32_zero(const void *mem) { return wasm_v128_load32_zero(mem); } -// CHECK-LABEL: @test_v128_load64_zero( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load64_zero( +// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT2_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -148,10 +161,11 @@ v128_t test_v128_load64_zero(const void *mem) { return wasm_v128_load64_zero(mem); } -// CHECK-LABEL: @test_v128_load8_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PTR:%.*]], align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load8_lane( +// CHECK-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC]] to <16 x i8> // CHECK-NEXT: [[VECINS_I:%.*]] = 
insertelement <16 x i8> [[TMP1]], i8 [[TMP0]], i64 15 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -160,10 +174,11 @@ v128_t test_v128_load8_lane(const uint8_t *ptr, v128_t vec) { return wasm_v128_load8_lane(ptr, vec, 15); } -// CHECK-LABEL: @test_v128_load16_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[PTR:%.*]], align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load16_lane( +// CHECK-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC]] to <8 x i16> // CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP0]], i64 7 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -172,20 +187,22 @@ v128_t test_v128_load16_lane(const uint16_t *ptr, v128_t vec) { return wasm_v128_load16_lane(ptr, vec, 7); } -// CHECK-LABEL: @test_v128_load32_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x i32> [[VEC:%.*]], i32 [[TMP0]], i64 3 +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load32_lane( +// CHECK-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x i32> [[VEC]], i32 [[TMP0]], i64 3 // CHECK-NEXT: ret <4 x i32> [[VECINS_I]] // v128_t test_v128_load32_lane(const uint32_t *ptr, v128_t vec) { return 
wasm_v128_load32_lane(ptr, vec, 3); } -// CHECK-LABEL: @test_v128_load64_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[PTR:%.*]], align 1, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden <4 x i32> @test_v128_load64_lane( +// CHECK-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC]] to <2 x i64> // CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP0]], i64 1 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -194,76 +211,82 @@ v128_t test_v128_load64_lane(const uint64_t *ptr, v128_t vec) { return wasm_v128_load64_lane(ptr, vec, 1); } -// CHECK-LABEL: @test_v128_store( -// CHECK-NEXT: entry: -// CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[MEM:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden void @test_v128_store( +// CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 16)) [[MEM:%.*]], <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: store <4 x i32> [[A]], ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // void test_v128_store(void *mem, v128_t a) { wasm_v128_store(mem, a); } -// CHECK-LABEL: @test_v128_store8_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden void @test_v128_store8_lane( +// CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 1)) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC]] to <16 x i8> // CHECK-NEXT: 
[[VECEXT_I:%.*]] = extractelement <16 x i8> [[TMP0]], i64 15 -// CHECK-NEXT: store i8 [[VECEXT_I]], ptr [[PTR:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-NEXT: store i8 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // void test_v128_store8_lane(uint8_t *ptr, v128_t vec) { wasm_v128_store8_lane(ptr, vec, 15); } -// CHECK-LABEL: @test_v128_store16_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden void @test_v128_store16_lane( +// CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 2)) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC]] to <8 x i16> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i16> [[TMP0]], i64 7 -// CHECK-NEXT: store i16 [[VECEXT_I]], ptr [[PTR:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-NEXT: store i16 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // void test_v128_store16_lane(uint16_t *ptr, v128_t vec) { wasm_v128_store16_lane(ptr, vec, 7); } -// CHECK-LABEL: @test_v128_store32_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[VEC:%.*]], i64 3 -// CHECK-NEXT: store i32 [[VECEXT_I]], ptr [[PTR:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-LABEL: define hidden void @test_v128_store32_lane( +// CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 4)) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[VEC]], i64 3 +// CHECK-NEXT: store i32 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // void test_v128_store32_lane(uint32_t *ptr, v128_t vec) { wasm_v128_store32_lane(ptr, vec, 3); } -// CHECK-LABEL: @test_v128_store64_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <4 x i32> [[VEC:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden void @test_v128_store64_lane( +// CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 8)) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC]] to <2 x i64> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 -// CHECK-NEXT: store i64 [[VECEXT_I]], ptr [[PTR:%.*]], align 1, !tbaa [[TBAA2]] +// CHECK-NEXT: store i64 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]] // CHECK-NEXT: ret void // void test_v128_store64_lane(uint64_t *ptr, v128_t vec) { wasm_v128_store64_lane(ptr, vec, 1); } -// CHECK-LABEL: @test_i8x16_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[C1:%.*]], i64 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[C2:%.*]], i64 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[C3:%.*]], i64 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[C4:%.*]], i64 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[C5:%.*]], i64 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[C6:%.*]], i64 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[C7:%.*]], i64 7 -// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[C8:%.*]], i64 8 -// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[C9:%.*]], i64 9 -// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[C10:%.*]], i64 10 -// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[C11:%.*]], i64 11 -// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement 
<16 x i8> [[VECINIT11_I]], i8 [[C12:%.*]], i64 12 -// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[C13:%.*]], i64 13 -// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[C14:%.*]], i64 14 -// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[C15:%.*]], i64 15 +// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_make( +// CHECK-SAME: i8 noundef signext [[C0:%.*]], i8 noundef signext [[C1:%.*]], i8 noundef signext [[C2:%.*]], i8 noundef signext [[C3:%.*]], i8 noundef signext [[C4:%.*]], i8 noundef signext [[C5:%.*]], i8 noundef signext [[C6:%.*]], i8 noundef signext [[C7:%.*]], i8 noundef signext [[C8:%.*]], i8 noundef signext [[C9:%.*]], i8 noundef signext [[C10:%.*]], i8 noundef signext [[C11:%.*]], i8 noundef signext [[C12:%.*]], i8 noundef signext [[C13:%.*]], i8 noundef signext [[C14:%.*]], i8 noundef signext [[C15:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[C1]], i64 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[C2]], i64 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[C3]], i64 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[C4]], i64 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[C5]], i64 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[C6]], i64 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[C7]], i64 7 +// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[C8]], i64 8 +// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[C9]], i64 9 +// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> 
[[VECINIT9_I]], i8 [[C10]], i64 10 +// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[C11]], i64 11 +// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[C12]], i64 12 +// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[C13]], i64 13 +// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[C14]], i64 14 +// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[C15]], i64 15 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[VECINIT15_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -271,24 +294,25 @@ v128_t test_i8x16_make(int8_t c0, int8_t c1, int8_t c2, int8_t c3, int8_t c4, in return wasm_i8x16_make(c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15); } -// CHECK-LABEL: @test_u8x16_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[C1:%.*]], i64 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[C2:%.*]], i64 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[C3:%.*]], i64 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[C4:%.*]], i64 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[C5:%.*]], i64 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[C6:%.*]], i64 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[C7:%.*]], i64 7 -// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[C8:%.*]], i64 8 -// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[C9:%.*]], i64 9 -// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[C10:%.*]], i64 10 -// CHECK-NEXT: 
[[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[C11:%.*]], i64 11 -// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[C12:%.*]], i64 12 -// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[C13:%.*]], i64 13 -// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[C14:%.*]], i64 14 -// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[C15:%.*]], i64 15 +// CHECK-LABEL: define hidden <4 x i32> @test_u8x16_make( +// CHECK-SAME: i8 noundef zeroext [[C0:%.*]], i8 noundef zeroext [[C1:%.*]], i8 noundef zeroext [[C2:%.*]], i8 noundef zeroext [[C3:%.*]], i8 noundef zeroext [[C4:%.*]], i8 noundef zeroext [[C5:%.*]], i8 noundef zeroext [[C6:%.*]], i8 noundef zeroext [[C7:%.*]], i8 noundef zeroext [[C8:%.*]], i8 noundef zeroext [[C9:%.*]], i8 noundef zeroext [[C10:%.*]], i8 noundef zeroext [[C11:%.*]], i8 noundef zeroext [[C12:%.*]], i8 noundef zeroext [[C13:%.*]], i8 noundef zeroext [[C14:%.*]], i8 noundef zeroext [[C15:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[C1]], i64 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[C2]], i64 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[C3]], i64 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[C4]], i64 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[C5]], i64 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[C6]], i64 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[C7]], i64 7 +// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[C8]], i64 8 +// CHECK-NEXT: 
[[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[C9]], i64 9 +// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[C10]], i64 10 +// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[C11]], i64 11 +// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[C12]], i64 12 +// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[C13]], i64 13 +// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[C14]], i64 14 +// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[C15]], i64 15 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[VECINIT15_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -296,16 +320,17 @@ v128_t test_u8x16_make(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3, uint8_t c return wasm_u8x16_make(c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15); } -// CHECK-LABEL: @test_i16x8_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C1:%.*]], i64 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C2:%.*]], i64 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C3:%.*]], i64 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C4:%.*]], i64 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C5:%.*]], i64 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C6:%.*]], i64 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C7:%.*]], i64 7 +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_make( +// CHECK-SAME: i16 noundef signext [[C0:%.*]], i16 noundef signext [[C1:%.*]], i16 noundef signext [[C2:%.*]], i16 
noundef signext [[C3:%.*]], i16 noundef signext [[C4:%.*]], i16 noundef signext [[C5:%.*]], i16 noundef signext [[C6:%.*]], i16 noundef signext [[C7:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C1]], i64 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C2]], i64 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C3]], i64 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C4]], i64 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C5]], i64 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C6]], i64 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C7]], i64 7 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -313,16 +338,17 @@ v128_t test_i16x8_make(int16_t c0, int16_t c1, int16_t c2, int16_t c3, int16_t c return wasm_i16x8_make(c0, c1, c2, c3, c4, c5, c6, c7); } -// CHECK-LABEL: @test_u16x8_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C1:%.*]], i64 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C2:%.*]], i64 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C3:%.*]], i64 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C4:%.*]], i64 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C5:%.*]], i64 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C6:%.*]], i64 6 -// 
CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C7:%.*]], i64 7 +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_make( +// CHECK-SAME: i16 noundef zeroext [[C0:%.*]], i16 noundef zeroext [[C1:%.*]], i16 noundef zeroext [[C2:%.*]], i16 noundef zeroext [[C3:%.*]], i16 noundef zeroext [[C4:%.*]], i16 noundef zeroext [[C5:%.*]], i16 noundef zeroext [[C6:%.*]], i16 noundef zeroext [[C7:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C1]], i64 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C2]], i64 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C3]], i64 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C4]], i64 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C5]], i64 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C6]], i64 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C7]], i64 7 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -330,34 +356,37 @@ v128_t test_u16x8_make(uint16_t c0, uint16_t c1, uint16_t c2, uint16_t c3, uint1 return wasm_u16x8_make(c0, c1, c2, c3, c4, c5, c6, c7); } -// CHECK-LABEL: @test_i32x4_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C1:%.*]], i64 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C2:%.*]], i64 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C3:%.*]], i64 3 +// CHECK-LABEL: define hidden <4 x 
i32> @test_i32x4_make( +// CHECK-SAME: i32 noundef [[C0:%.*]], i32 noundef [[C1:%.*]], i32 noundef [[C2:%.*]], i32 noundef [[C3:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C1]], i64 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C2]], i64 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C3]], i64 3 // CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]] // v128_t test_i32x4_make(int32_t c0, int32_t c1, int32_t c2, int32_t c3) { return wasm_i32x4_make(c0, c1, c2, c3); } -// CHECK-LABEL: @test_u32x4_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C1:%.*]], i64 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C2:%.*]], i64 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C3:%.*]], i64 3 +// CHECK-LABEL: define hidden <4 x i32> @test_u32x4_make( +// CHECK-SAME: i32 noundef [[C0:%.*]], i32 noundef [[C1:%.*]], i32 noundef [[C2:%.*]], i32 noundef [[C3:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C1]], i64 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C2]], i64 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C3]], i64 3 // CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]] // v128_t test_u32x4_make(uint32_t c0, uint32_t c1, uint32_t c2, uint32_t c3) { return wasm_u32x4_make(c0, c1, c2, c3); } -// CHECK-LABEL: @test_i64x2_make( -// CHECK-NEXT: entry: 
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[C1:%.*]], i64 1 +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_make( +// CHECK-SAME: i64 noundef [[C0:%.*]], i64 noundef [[C1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[C1]], i64 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VECINIT1_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -365,10 +394,11 @@ v128_t test_i64x2_make(int64_t c0, int64_t c1) { return wasm_i64x2_make(c0, c1); } -// CHECK-LABEL: @test_u64x2_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[C1:%.*]], i64 1 +// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_make( +// CHECK-SAME: i64 noundef [[C0:%.*]], i64 noundef [[C1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[C1]], i64 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VECINIT1_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -376,12 +406,13 @@ v128_t test_u64x2_make(uint64_t c0, uint64_t c1) { return wasm_u64x2_make(c0, c1); } -// CHECK-LABEL: @test_f32x4_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C1:%.*]], i64 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C2:%.*]], i64 2 -// CHECK-NEXT: 
[[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C3:%.*]], i64 3 +// CHECK-LABEL: define hidden <4 x i32> @test_f32x4_make( +// CHECK-SAME: float noundef [[C0:%.*]], float noundef [[C1:%.*]], float noundef [[C2:%.*]], float noundef [[C3:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C1]], i64 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C2]], i64 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C3]], i64 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -389,10 +420,11 @@ v128_t test_f32x4_make(float c0, float c1, float c2, float c3) { return wasm_f32x4_make(c0, c1, c2, c3); } -// CHECK-LABEL: @test_f64x2_make( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C0:%.*]], i64 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C1:%.*]], i64 1 +// CHECK-LABEL: define hidden <4 x i32> @test_f64x2_make( +// CHECK-SAME: double noundef [[C0:%.*]], double noundef [[C1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C0]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C1]], i64 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -400,169 +432,190 @@ v128_t test_f64x2_make(double c0, double c1) { return wasm_f64x2_make(c0, c1); } -// CHECK-LABEL: @test_i8x16_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 50462976, 252579085) <4 x i32> @test_i8x16_const( +// 
CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_i8x16_const(void) { return wasm_i8x16_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -// CHECK-LABEL: @test_u8x16_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 50462976, 252579085) <4 x i32> @test_u8x16_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_u8x16_const(void) { return wasm_u8x16_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -// CHECK-LABEL: @test_i16x8_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 65536, 458759) <4 x i32> @test_i16x8_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_i16x8_const(void) { return wasm_i16x8_const(0, 1, 2, 3, 4, 5, 6, 7); } -// CHECK-LABEL: @test_u16x8_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 65536, 458759) <4 x i32> @test_u16x8_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_u16x8_const(void) { return wasm_u16x8_const(0, 1, 2, 3, 4, 5, 6, 7); } -// CHECK-LABEL: @test_i32x4_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 4) <4 x i32> @test_i32x4_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_i32x4_const(void) { return wasm_i32x4_const(0, 1, 2, 3); } -// CHECK-LABEL: @test_u32x4_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 4) <4 x i32> @test_u32x4_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_u32x4_const(void) { return wasm_u32x4_const(0, 1, 2, 3); } -// CHECK-LABEL: @test_i64x2_const( -// 
CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 2) <4 x i32> @test_i64x2_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_i64x2_const(void) { return wasm_i64x2_const(0, 1); } -// CHECK-LABEL: @test_u64x2_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 2) <4 x i32> @test_u64x2_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_u64x2_const(void) { return wasm_u64x2_const(0, 1); } -// CHECK-LABEL: @test_f32x4_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 1077936129) <4 x i32> @test_f32x4_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_f32x4_const(void) { return wasm_f32x4_const(0, 1, 2, 3); } -// CHECK-LABEL: @test_f64x2_const( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 1072693249) <4 x i32> @test_f64x2_const( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_f64x2_const(void) { return wasm_f64x2_const(0, 1); } -// CHECK-LABEL: @test_i8x16_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> splat (i32 707406378) // v128_t test_i8x16_const_splat(void) { return wasm_i8x16_const_splat(42); } -// CHECK-LABEL: @test_u8x16_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> splat (i32 707406378) // v128_t test_u8x16_const_splat(void) { return wasm_u8x16_const_splat(42); } -// CHECK-LABEL: 
@test_i16x8_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> splat (i32 2752554) // v128_t test_i16x8_const_splat(void) { return wasm_i16x8_const_splat(42); } -// CHECK-LABEL: @test_u16x8_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> splat (i32 2752554) // v128_t test_u16x8_const_splat(void) { return wasm_u16x8_const_splat(42); } -// CHECK-LABEL: @test_i32x4_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> splat (i32 42) // v128_t test_i32x4_const_splat(void) { return wasm_i32x4_const_splat(42); } -// CHECK-LABEL: @test_u32x4_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u32x4_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> splat (i32 42) // v128_t test_u32x4_const_splat(void) { return wasm_u32x4_const_splat(42); } -// CHECK-LABEL: @test_i64x2_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 43) <4 x i32> @test_i64x2_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_i64x2_const_splat(void) { return wasm_i64x2_const_splat(42); } -// CHECK-LABEL: @test_u64x2_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 43) <4 x i32> @test_u64x2_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t 
test_u64x2_const_splat(void) { return wasm_u64x2_const_splat(42); } -// CHECK-LABEL: @test_f32x4_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> splat (i32 1109917696) // v128_t test_f32x4_const_splat(void) { return wasm_f32x4_const_splat(42); } -// CHECK-LABEL: @test_f64x2_const_splat( -// CHECK-NEXT: entry: +// CHECK-LABEL: define hidden noundef range(i32 0, 1078263809) <4 x i32> @test_f64x2_const_splat( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret <4 x i32> // v128_t test_f64x2_const_splat(void) { return wasm_f64x2_const_splat(42); } -// CHECK-LABEL: @test_i8x16_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_splat( +// CHECK-SAME: i8 noundef signext [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 // CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[VECINIT15_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -571,9 +624,10 @@ v128_t test_i8x16_splat(int8_t a) { return wasm_i8x16_splat(a); } -// CHECK-LABEL: @test_u8x16_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_u8x16_splat( +// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 // CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> 
zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[VECINIT15_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -582,9 +636,10 @@ v128_t test_u8x16_splat(uint8_t a) { return wasm_u8x16_splat(a); } -// CHECK-LABEL: @test_i8x16_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef signext i8 @test_i8x16_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <16 x i8> [[TMP0]], i64 15 // CHECK-NEXT: ret i8 [[VECEXT_I]] // @@ -592,9 +647,10 @@ int8_t test_i8x16_extract_lane(v128_t a) { return wasm_i8x16_extract_lane(a, 15); } -// CHECK-LABEL: @test_u8x16_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef zeroext i8 @test_u8x16_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <16 x i8> [[TMP0]], i64 15 // CHECK-NEXT: ret i8 [[VECEXT_I]] // @@ -602,10 +658,11 @@ uint8_t test_u8x16_extract_lane(v128_t a) { return wasm_u8x16_extract_lane(a, 15); } -// CHECK-LABEL: @test_i8x16_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[B:%.*]], i64 15 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i8 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <16 x 
i8> [[TMP0]], i8 [[B]], i64 15 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -613,10 +670,11 @@ v128_t test_i8x16_replace_lane(v128_t a, int8_t b) { return wasm_i8x16_replace_lane(a, 15, b); } -// CHECK-LABEL: @test_u8x16_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[B:%.*]], i64 15 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[B]], i64 15 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -624,9 +682,10 @@ v128_t test_u8x16_replace_lane(v128_t a, uint8_t b) { return wasm_u8x16_replace_lane(a, 15, b); } -// CHECK-LABEL: @test_i16x8_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_splat( +// CHECK-SAME: i16 noundef signext [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0 // CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -635,9 +694,10 @@ v128_t test_i16x8_splat(int16_t a) { return wasm_i16x8_splat(a); } -// CHECK-LABEL: @test_u16x8_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x 
i32> @test_u16x8_splat( +// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0 // CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -646,9 +706,10 @@ v128_t test_u16x8_splat(uint16_t a) { return wasm_u16x8_splat(a); } -// CHECK-LABEL: @test_i16x8_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef signext i16 @test_i16x8_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i16> [[TMP0]], i64 7 // CHECK-NEXT: ret i16 [[VECEXT_I]] // @@ -656,9 +717,10 @@ int16_t test_i16x8_extract_lane(v128_t a) { return wasm_i16x8_extract_lane(a, 7); } -// CHECK-LABEL: @test_u16x8_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef zeroext i16 @test_u16x8_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i16> [[TMP0]], i64 7 // CHECK-NEXT: ret i16 [[VECEXT_I]] // @@ -666,10 +728,11 @@ uint16_t test_u16x8_extract_lane(v128_t a) { return wasm_u16x8_extract_lane(a, 7); } -// CHECK-LABEL: @test_i16x8_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[B:%.*]], i64 7 +// 
CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[B]], i64 7 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -677,10 +740,11 @@ v128_t test_i16x8_replace_lane(v128_t a, int16_t b) { return wasm_i16x8_replace_lane(a, 7, b); } -// CHECK-LABEL: @test_u16x8_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[B:%.*]], i64 7 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[B]], i64 7 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -688,9 +752,10 @@ v128_t test_u16x8_replace_lane(v128_t a, uint16_t b) { return wasm_u16x8_replace_lane(a, 7, b); } -// CHECK-LABEL: @test_i32x4_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_i32x4_splat( +// CHECK-SAME: i32 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 // CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]] // @@ -698,9 
+763,10 @@ v128_t test_i32x4_splat(int32_t a) { return wasm_i32x4_splat(a); } -// CHECK-LABEL: @test_u32x4_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_u32x4_splat( +// CHECK-SAME: i32 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 // CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]] // @@ -708,45 +774,50 @@ v128_t test_u32x4_splat(uint32_t a) { return wasm_u32x4_splat(a); } -// CHECK-LABEL: @test_i32x4_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[A:%.*]], i64 3 +// CHECK-LABEL: define hidden noundef i32 @test_i32x4_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[A]], i64 3 // CHECK-NEXT: ret i32 [[VECEXT_I]] // int32_t test_i32x4_extract_lane(v128_t a) { return wasm_i32x4_extract_lane(a, 3); } -// CHECK-LABEL: @test_u32x4_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[A:%.*]], i64 3 +// CHECK-LABEL: define hidden noundef i32 @test_u32x4_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[A]], i64 3 // CHECK-NEXT: ret i32 [[VECEXT_I]] // uint32_t test_u32x4_extract_lane(v128_t a) { return wasm_u32x4_extract_lane(a, 3); } -// CHECK-LABEL: @test_i32x4_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x i32> [[A:%.*]], i32 [[B:%.*]], i64 3 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_replace_lane( +// 
CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x i32> [[A]], i32 [[B]], i64 3 // CHECK-NEXT: ret <4 x i32> [[VECINS_I]] // v128_t test_i32x4_replace_lane(v128_t a, int32_t b) { return wasm_i32x4_replace_lane(a, 3, b); } -// CHECK-LABEL: @test_u32x4_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x i32> [[A:%.*]], i32 [[B:%.*]], i64 3 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u32x4_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x i32> [[A]], i32 [[B]], i64 3 // CHECK-NEXT: ret <4 x i32> [[VECINS_I]] // v128_t test_u32x4_replace_lane(v128_t a, uint32_t b) { return wasm_u32x4_replace_lane(a, 3, b); } -// CHECK-LABEL: @test_i64x2_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_splat( +// CHECK-SAME: i64 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0 // CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VECINIT1_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -755,9 +826,10 @@ v128_t test_i64x2_splat(int64_t a) { return wasm_i64x2_splat(a); } -// CHECK-LABEL: @test_u64x2_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_splat( +// CHECK-SAME: i64 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = 
insertelement <2 x i64> poison, i64 [[A]], i64 0 // CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VECINIT1_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -766,9 +838,10 @@ v128_t test_u64x2_splat(uint64_t a) { return wasm_u64x2_splat(a); } -// CHECK-LABEL: @test_i64x2_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef i64 @test_i64x2_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 // CHECK-NEXT: ret i64 [[VECEXT_I]] // @@ -776,9 +849,10 @@ int64_t test_i64x2_extract_lane(v128_t a) { return wasm_i64x2_extract_lane(a, 1); } -// CHECK-LABEL: @test_u64x2_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef i64 @test_u64x2_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 // CHECK-NEXT: ret i64 [[VECEXT_I]] // @@ -786,10 +860,11 @@ uint64_t test_u64x2_extract_lane(v128_t a) { return wasm_u64x2_extract_lane(a, 1); } -// CHECK-LABEL: @test_i64x2_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B:%.*]], i64 1 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i64 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -797,10 +872,11 @@ v128_t test_i64x2_replace_lane(v128_t a, int64_t b) { return wasm_i64x2_replace_lane(a, 1, b); } -// CHECK-LABEL: @test_u64x2_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B:%.*]], i64 1 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u64x2_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i64 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -808,9 +884,10 @@ v128_t test_u64x2_replace_lane(v128_t a, uint64_t b) { return wasm_u64x2_replace_lane(a, 1, b); } -// CHECK-LABEL: @test_f32x4_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_f32x4_splat( +// CHECK-SAME: float noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[VECINIT_I]] to <4 x i32> // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -819,9 +896,10 @@ v128_t test_f32x4_splat(float a) { return wasm_f32x4_splat(a); } -// CHECK-LABEL: @test_f32x4_extract_lane( -// CHECK-NEXT: 
entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef float @test_f32x4_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP0]], i64 3 // CHECK-NEXT: ret float [[VECEXT_I]] // @@ -829,10 +907,11 @@ float test_f32x4_extract_lane(v128_t a) { return wasm_f32x4_extract_lane(a, 3); } -// CHECK-LABEL: @test_f32x4_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i64 3 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x float> [[TMP0]], float [[B]], i64 3 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -840,9 +919,10 @@ v128_t test_f32x4_replace_lane(v128_t a, float b) { return wasm_f32x4_replace_lane(a, 3, b); } -// CHECK-LABEL: @test_f64x2_splat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +// CHECK-LABEL: define hidden <4 x i32> @test_f64x2_splat( +// CHECK-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[A]], i64 0 // CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <4 x i32> 
// CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -851,9 +931,10 @@ v128_t test_f64x2_splat(double a) { return wasm_f64x2_splat(a); } -// CHECK-LABEL: @test_f64x2_extract_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef double @test_f64x2_extract_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP0]], i64 1 // CHECK-NEXT: ret double [[VECEXT_I]] // @@ -861,10 +942,11 @@ double test_f64x2_extract_lane(v128_t a) { return wasm_f64x2_extract_lane(a, 1); } -// CHECK-LABEL: @test_f64x2_replace_lane( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP0]], double [[B:%.*]], i64 1 +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_replace_lane( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP0]], double [[B]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[VECINS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -872,10 +954,11 @@ v128_t test_f64x2_replace_lane(v128_t a, double b) { return wasm_f64x2_replace_lane(a, 1, b); } -// CHECK-LABEL: @test_i8x16_eq( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_eq( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -885,10 +968,11 @@ v128_t test_i8x16_eq(v128_t a, v128_t b) { return wasm_i8x16_eq(a, b); } -// CHECK-LABEL: @test_i8x16_ne( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_ne( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp ne <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -898,10 +982,11 @@ v128_t test_i8x16_ne(v128_t a, v128_t b) { return wasm_i8x16_ne(a, b); } -// CHECK-LABEL: @test_i8x16_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_lt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x 
i8> [[SEXT_I]] to <4 x i32> @@ -911,10 +996,11 @@ v128_t test_i8x16_lt(v128_t a, v128_t b) { return wasm_i8x16_lt(a, b); } -// CHECK-LABEL: @test_u8x16_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_lt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -924,10 +1010,11 @@ v128_t test_u8x16_lt(v128_t a, v128_t b) { return wasm_u8x16_lt(a, b); } -// CHECK-LABEL: @test_i8x16_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -937,10 +1024,11 @@ v128_t test_i8x16_gt(v128_t a, v128_t b) { return wasm_i8x16_gt(a, b); } -// CHECK-LABEL: @test_u8x16_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] 
to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -950,10 +1038,11 @@ v128_t test_u8x16_gt(v128_t a, v128_t b) { return wasm_u8x16_gt(a, b); } -// CHECK-LABEL: @test_i8x16_le( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -963,10 +1052,11 @@ v128_t test_i8x16_le(v128_t a, v128_t b) { return wasm_i8x16_le(a, b); } -// CHECK-LABEL: @test_u8x16_le( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] 
= bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -976,10 +1066,11 @@ v128_t test_u8x16_le(v128_t a, v128_t b) { return wasm_u8x16_le(a, b); } -// CHECK-LABEL: @test_i8x16_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -989,10 +1080,11 @@ v128_t test_i8x16_ge(v128_t a, v128_t b) { return wasm_i8x16_ge(a, b); } -// CHECK-LABEL: @test_u8x16_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <4 x i32> @@ -1002,10 +1094,11 @@ v128_t test_u8x16_ge(v128_t a, v128_t 
b) { return wasm_u8x16_ge(a, b); } -// CHECK-LABEL: @test_i16x8_eq( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_eq( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1015,10 +1108,11 @@ v128_t test_i16x8_eq(v128_t a, v128_t b) { return wasm_i16x8_eq(a, b); } -// CHECK-LABEL: @test_i16x8_ne( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_ne( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp ne <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1028,10 +1122,11 @@ v128_t test_i16x8_ne(v128_t a, v128_t b) { return wasm_i16x8_ne(a, b); } -// CHECK-LABEL: @test_i16x8_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_lt( +// 
CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1041,10 +1136,11 @@ v128_t test_i16x8_lt(v128_t a, v128_t b) { return wasm_i16x8_lt(a, b); } -// CHECK-LABEL: @test_u16x8_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_lt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1054,10 +1150,11 @@ v128_t test_u16x8_lt(v128_t a, v128_t b) { return wasm_u16x8_lt(a, b); } -// CHECK-LABEL: @test_i16x8_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 
x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1067,10 +1164,11 @@ v128_t test_i16x8_gt(v128_t a, v128_t b) { return wasm_i16x8_gt(a, b); } -// CHECK-LABEL: @test_u16x8_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1080,10 +1178,11 @@ v128_t test_u16x8_gt(v128_t a, v128_t b) { return wasm_u16x8_gt(a, b); } -// CHECK-LABEL: @test_i16x8_le( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1093,10 +1192,11 @@ v128_t test_i16x8_le(v128_t a, v128_t b) { return wasm_i16x8_le(a, b); } -// CHECK-LABEL: @test_u16x8_le( -// 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1106,10 +1206,11 @@ v128_t test_u16x8_le(v128_t a, v128_t b) { return wasm_u16x8_le(a, b); } -// CHECK-LABEL: @test_i16x8_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1119,10 +1220,11 @@ v128_t test_i16x8_ge(v128_t a, v128_t b) { return wasm_i16x8_ge(a, b); } -// CHECK-LABEL: @test_u16x8_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <4 x i32> @@ -1132,9 +1234,10 @@ v128_t test_u16x8_ge(v128_t a, v128_t b) { return wasm_u16x8_ge(a, b); } -// CHECK-LABEL: @test_i32x4_eq( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_i32x4_eq( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1142,9 +1245,10 @@ v128_t test_i32x4_eq(v128_t a, v128_t b) { return wasm_i32x4_eq(a, b); } -// CHECK-LABEL: @test_i32x4_ne( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_i32x4_ne( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1152,9 +1256,10 @@ v128_t test_i32x4_ne(v128_t a, v128_t b) { return wasm_i32x4_ne(a, b); } -// CHECK-LABEL: @test_i32x4_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_i32x4_lt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1162,9 +1267,10 @@ v128_t test_i32x4_lt(v128_t a, v128_t b) { return wasm_i32x4_lt(a, b); } -// CHECK-LABEL: @test_u32x4_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_u32x4_lt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1172,9 +1278,10 @@ v128_t test_u32x4_lt(v128_t a, v128_t b) { return wasm_u32x4_lt(a, b); } -// CHECK-LABEL: @test_i32x4_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_i32x4_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1182,9 +1289,10 @@ v128_t test_i32x4_gt(v128_t a, v128_t b) { return wasm_i32x4_gt(a, b); } -// CHECK-LABEL: @test_u32x4_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_u32x4_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x 
i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1192,9 +1300,10 @@ v128_t test_u32x4_gt(v128_t a, v128_t b) { return wasm_u32x4_gt(a, b); } -// CHECK-LABEL: @test_i32x4_le( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_i32x4_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1202,9 +1311,10 @@ v128_t test_i32x4_le(v128_t a, v128_t b) { return wasm_i32x4_le(a, b); } -// CHECK-LABEL: @test_u32x4_le( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_u32x4_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1212,9 +1322,10 @@ v128_t test_u32x4_le(v128_t a, v128_t b) { return wasm_u32x4_le(a, b); } -// CHECK-LABEL: @test_i32x4_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_i32x4_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1222,9 +1333,10 @@ v128_t test_i32x4_ge(v128_t a, v128_t b) { return wasm_i32x4_ge(a, b); } -// 
CHECK-LABEL: @test_u32x4_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_u32x4_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i32> [[A]], [[B]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] // @@ -1232,10 +1344,11 @@ v128_t test_u32x4_ge(v128_t a, v128_t b) { return wasm_u32x4_ge(a, b); } -// CHECK-LABEL: @test_i64x2_eq( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_eq( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1245,10 +1358,11 @@ v128_t test_i64x2_eq(v128_t a, v128_t b) { return wasm_i64x2_eq(a, b); } -// CHECK-LABEL: @test_i64x2_ne( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_ne( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // 
CHECK-NEXT: [[CMP_I:%.*]] = icmp ne <2 x i64> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1258,10 +1372,11 @@ v128_t test_i64x2_ne(v128_t a, v128_t b) { return wasm_i64x2_ne(a, b); } -// CHECK-LABEL: @test_i64x2_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_lt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i64> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1271,10 +1386,11 @@ v128_t test_i64x2_lt(v128_t a, v128_t b) { return wasm_i64x2_lt(a, b); } -// CHECK-LABEL: @test_i64x2_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i64> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1284,10 +1400,11 @@ v128_t test_i64x2_gt(v128_t a, v128_t b) { return wasm_i64x2_gt(a, b); } -// 
CHECK-LABEL: @test_i64x2_le( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i64> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1297,10 +1414,11 @@ v128_t test_i64x2_le(v128_t a, v128_t b) { return wasm_i64x2_le(a, b); } -// CHECK-LABEL: @test_i64x2_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i64> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1310,10 +1428,11 @@ v128_t test_i64x2_ge(v128_t a, v128_t b) { return wasm_i64x2_ge(a, b); } -// CHECK-LABEL: @test_f32x4_eq( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_f32x4_eq( +// CHECK-SAME: <4 x i32> noundef 
[[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] @@ -1322,10 +1441,11 @@ v128_t test_f32x4_eq(v128_t a, v128_t b) { return wasm_f32x4_eq(a, b); } -// CHECK-LABEL: @test_f32x4_ne( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_f32x4_ne( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp une <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] @@ -1334,10 +1454,11 @@ v128_t test_f32x4_ne(v128_t a, v128_t b) { return wasm_f32x4_ne(a, b); } -// CHECK-LABEL: @test_f32x4_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_f32x4_lt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: 
[[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] @@ -1346,10 +1467,11 @@ v128_t test_f32x4_lt(v128_t a, v128_t b) { return wasm_f32x4_lt(a, b); } -// CHECK-LABEL: @test_f32x4_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_f32x4_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] @@ -1358,10 +1480,11 @@ v128_t test_f32x4_gt(v128_t a, v128_t b) { return wasm_f32x4_gt(a, b); } -// CHECK-LABEL: @test_f32x4_le( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_f32x4_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] @@ -1370,10 +1493,11 @@ v128_t test_f32x4_le(v128_t a, v128_t b) { return wasm_f32x4_le(a, b); } -// CHECK-LABEL: @test_f32x4_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden range(i32 -1, 1) <4 x i32> @test_f32x4_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[SEXT_I]] @@ -1382,10 +1506,11 @@ v128_t test_f32x4_ge(v128_t a, v128_t b) { return wasm_f32x4_ge(a, b); } -// CHECK-LABEL: @test_f64x2_eq( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_eq( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1395,10 +1520,11 @@ v128_t test_f64x2_eq(v128_t a, v128_t b) { return wasm_f64x2_eq(a, b); } -// CHECK-LABEL: @test_f64x2_ne( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_ne( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> 
[[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp une <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1408,10 +1534,11 @@ v128_t test_f64x2_ne(v128_t a, v128_t b) { return wasm_f64x2_ne(a, b); } -// CHECK-LABEL: @test_f64x2_lt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_lt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1421,10 +1548,11 @@ v128_t test_f64x2_lt(v128_t a, v128_t b) { return wasm_f64x2_lt(a, b); } -// CHECK-LABEL: @test_f64x2_gt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_gt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> 
[[SEXT_I]] to <4 x i32> @@ -1434,10 +1562,11 @@ v128_t test_f64x2_gt(v128_t a, v128_t b) { return wasm_f64x2_gt(a, b); } -// CHECK-LABEL: @test_f64x2_le( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_le( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1447,10 +1576,11 @@ v128_t test_f64x2_le(v128_t a, v128_t b) { return wasm_f64x2_le(a, b); } -// CHECK-LABEL: @test_f64x2_ge( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_ge( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SEXT_I]] to <4 x i32> @@ -1460,55 +1590,61 @@ v128_t test_f64x2_ge(v128_t a, v128_t b) { return wasm_f64x2_ge(a, b); } -// CHECK-LABEL: @test_v128_not( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A:%.*]], splat (i32 -1) +// CHECK-LABEL: define hidden 
noundef <4 x i32> @test_v128_not( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1) // CHECK-NEXT: ret <4 x i32> [[NOT_I]] // v128_t test_v128_not(v128_t a) { return wasm_v128_not(a); } -// CHECK-LABEL: @test_v128_and( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[B:%.*]], [[A:%.*]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_v128_and( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[B]], [[A]] // CHECK-NEXT: ret <4 x i32> [[AND_I]] // v128_t test_v128_and(v128_t a, v128_t b) { return wasm_v128_and(a, b); } -// CHECK-LABEL: @test_v128_or( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[B:%.*]], [[A:%.*]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_v128_or( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[B]], [[A]] // CHECK-NEXT: ret <4 x i32> [[OR_I]] // v128_t test_v128_or(v128_t a, v128_t b) { return wasm_v128_or(a, b); } -// CHECK-LABEL: @test_v128_xor( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i32> [[B:%.*]], [[A:%.*]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_v128_xor( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i32> [[B]], [[A]] // CHECK-NEXT: ret <4 x i32> [[XOR_I]] // v128_t test_v128_xor(v128_t a, v128_t b) { return wasm_v128_xor(a, b); } -// CHECK-LABEL: @test_v128_andnot( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B:%.*]], splat (i32 -1) -// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A:%.*]], [[NOT_I]] +// 
CHECK-LABEL: define hidden noundef <4 x i32> @test_v128_andnot( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1) +// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[NOT_I]] // CHECK-NEXT: ret <4 x i32> [[AND_I]] // v128_t test_v128_andnot(v128_t a, v128_t b) { return wasm_v128_andnot(a, b); } -// CHECK-LABEL: @test_v128_any_true( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden zeroext i1 @test_v128_any_true( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]]) // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] @@ -1517,18 +1653,20 @@ bool test_v128_any_true(v128_t a) { return wasm_v128_any_true(a); } -// CHECK-LABEL: @test_v128_bitselect( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]]) +// CHECK-LABEL: define hidden <4 x i32> @test_v128_bitselect( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[MASK:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[MASK]]) // CHECK-NEXT: ret <4 x i32> [[TMP0]] // v128_t test_v128_bitselect(v128_t a, v128_t b, v128_t mask) { return wasm_v128_bitselect(a, b, mask); } -// CHECK-LABEL: @test_i8x16_abs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> 
@test_i8x16_abs( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[ABS_I:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false) // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[ABS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -1537,9 +1675,10 @@ v128_t test_i8x16_abs(v128_t a) { return wasm_i8x16_abs(a); } -// CHECK-LABEL: @test_i8x16_neg( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_neg( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, [[TMP0]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[SUB_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -1548,9 +1687,10 @@ v128_t test_i8x16_neg(v128_t a) { return wasm_i8x16_neg(a); } -// CHECK-LABEL: @test_i8x16_all_true( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden zeroext i1 @test_i8x16_all_true( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]]) // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] @@ -1559,9 +1699,10 @@ bool test_i8x16_all_true(v128_t a) { return wasm_i8x16_all_true(a); } -// CHECK-LABEL: @test_i8x16_bitmask( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden i32 @test_i8x16_bitmask( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) 
local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]]) // CHECK-NEXT: ret i32 [[TMP1]] // @@ -1569,21 +1710,23 @@ uint32_t test_i8x16_bitmask(v128_t a) { return wasm_i8x16_bitmask(a); } -// CHECK-LABEL: @test_i8x16_popcnt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_popcnt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[ELT_CTPOP_I:%.*]] = tail call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[ELT_CTPOP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_i8x16_popcnt(v128_t a) { return wasm_i8x16_popcnt(a); } -// CHECK-LABEL: @test_i8x16_shl( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8 +// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_shl( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B]] to i8 // CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7 // CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0 // CHECK-NEXT: [[SH_PROM_I:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer @@ -1595,10 +1738,11 @@ 
v128_t test_i8x16_shl(v128_t a, uint32_t b) { return wasm_i8x16_shl(a, b); } -// CHECK-LABEL: @test_i8x16_shr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8 +// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_shr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B]] to i8 // CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7 // CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0 // CHECK-NEXT: [[SH_PROM_I:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer @@ -1610,10 +1754,11 @@ v128_t test_i8x16_shr(v128_t a, uint32_t b) { return wasm_i8x16_shr(a, b); } -// CHECK-LABEL: @test_u8x16_shr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8 +// CHECK-LABEL: define hidden <4 x i32> @test_u8x16_shr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B]] to i8 // CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7 // CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0 // CHECK-NEXT: [[SH_PROM_I:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer @@ -1625,10 +1770,11 @@ v128_t test_u8x16_shr(v128_t a, uint32_t b) { return wasm_u8x16_shr(a, b); } -// CHECK-LABEL: @test_i8x16_add( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 
x i32> @test_i8x16_add( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ADD_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -1637,34 +1783,37 @@ v128_t test_i8x16_add(v128_t a, v128_t b) { return wasm_i8x16_add(a, b); } -// CHECK-LABEL: @test_i8x16_add_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_add_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[ELT_SAT_I:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ELT_SAT_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i8x16_add_sat(v128_t a, v128_t b) { return wasm_i8x16_add_sat(a, b); } -// CHECK-LABEL: @test_u8x16_add_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) -// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_add_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[ELT_SAT_I:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ELT_SAT_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u8x16_add_sat(v128_t a, v128_t b) { return wasm_u8x16_add_sat(a, b); } -// CHECK-LABEL: @test_i8x16_sub( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_sub( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[SUB_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -1673,82 +1822,89 @@ v128_t test_i8x16_sub(v128_t a, v128_t b) { return wasm_i8x16_sub(a, b); } -// CHECK-LABEL: @test_i8x16_sub_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// 
CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_sub_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[ELT_SAT_I:%.*]] = tail call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ELT_SAT_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i8x16_sub_sat(v128_t a, v128_t b) { return wasm_i8x16_sub_sat(a, b); } -// CHECK-LABEL: @test_u8x16_sub_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_sub_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[ELT_SAT_I:%.*]] = tail call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ELT_SAT_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u8x16_sub_sat(v128_t a, v128_t b) { return wasm_u8x16_sub_sat(a, b); } -// CHECK-LABEL: @test_i8x16_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> 
@llvm.smin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_min( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ELT_MIN_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i8x16_min(v128_t a, v128_t b) { return wasm_i8x16_min(a, b); } -// CHECK-LABEL: @test_u8x16_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_min( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ELT_MIN_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u8x16_min(v128_t a, v128_t b) { return wasm_u8x16_min(a, b); } -// CHECK-LABEL: @test_i8x16_max( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x 
i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_max( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ELT_MAX_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i8x16_max(v128_t a, v128_t b) { return wasm_i8x16_max(a, b); } -// CHECK-LABEL: @test_u8x16_max( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_max( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[ELT_MAX_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u8x16_max(v128_t a, v128_t b) { return wasm_u8x16_max(a, b); 
} -// CHECK-LABEL: @test_u8x16_avgr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_u8x16_avgr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -1757,9 +1913,10 @@ v128_t test_u8x16_avgr(v128_t a, v128_t b) { return wasm_u8x16_avgr(a, b); } -// CHECK-LABEL: @test_i16x8_abs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_abs( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[ABS_I:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false) // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[ABS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -1768,9 +1925,10 @@ v128_t test_i16x8_abs(v128_t a) { return wasm_i16x8_abs(a); } -// CHECK-LABEL: @test_i16x8_neg( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_neg( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, [[TMP0]] // CHECK-NEXT: [[TMP1:%.*]] = 
bitcast <8 x i16> [[SUB_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -1779,9 +1937,10 @@ v128_t test_i16x8_neg(v128_t a) { return wasm_i16x8_neg(a); } -// CHECK-LABEL: @test_i16x8_all_true( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden zeroext i1 @test_i16x8_all_true( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]]) // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] @@ -1790,9 +1949,10 @@ bool test_i16x8_all_true(v128_t a) { return wasm_i16x8_all_true(a); } -// CHECK-LABEL: @test_i16x8_bitmask( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden i32 @test_i16x8_bitmask( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]]) // CHECK-NEXT: ret i32 [[TMP1]] // @@ -1800,10 +1960,11 @@ uint32_t test_i16x8_bitmask(v128_t a) { return wasm_i16x8_bitmask(a); } -// CHECK-LABEL: @test_i16x8_shl( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i16 +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_shl( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B]] to i16 // CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15 // CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> 
poison, i16 [[TMP2]], i64 0 // CHECK-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <8 x i32> zeroinitializer @@ -1815,10 +1976,11 @@ v128_t test_i16x8_shl(v128_t a, uint32_t b) { return wasm_i16x8_shl(a, b); } -// CHECK-LABEL: @test_i16x8_shr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i16 +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_shr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B]] to i16 // CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15 // CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i64 0 // CHECK-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <8 x i32> zeroinitializer @@ -1830,10 +1992,11 @@ v128_t test_i16x8_shr(v128_t a, uint32_t b) { return wasm_i16x8_shr(a, b); } -// CHECK-LABEL: @test_u16x8_shr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i16 +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_shr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B]] to i16 // CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15 // CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i64 0 // CHECK-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <8 x i32> zeroinitializer @@ -1845,10 +2008,11 @@ v128_t test_u16x8_shr(v128_t a, uint32_t b) { return wasm_u16x8_shr(a, b); } -// CHECK-LABEL: @test_i16x8_add( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_add( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ADD_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -1857,34 +2021,37 @@ v128_t test_i16x8_add(v128_t a, v128_t b) { return wasm_i16x8_add(a, b); } -// CHECK-LABEL: @test_i16x8_add_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_add_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> +// CHECK-NEXT: [[ELT_SAT_I:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ELT_SAT_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i16x8_add_sat(v128_t a, v128_t b) { return wasm_i16x8_add_sat(a, b); } -// CHECK-LABEL: @test_u16x8_add_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] 
= bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_add_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> +// CHECK-NEXT: [[ELT_SAT_I:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ELT_SAT_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u16x8_add_sat(v128_t a, v128_t b) { return wasm_u16x8_add_sat(a, b); } -// CHECK-LABEL: @test_i16x8_sub( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_sub( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[SUB_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -1893,34 +2060,37 @@ v128_t test_i16x8_sub(v128_t a, v128_t b) { return wasm_i16x8_sub(a, b); } -// CHECK-LABEL: @test_i16x8_sub_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> 
@llvm.ssub.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_sub_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> +// CHECK-NEXT: [[ELT_SAT_I:%.*]] = tail call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ELT_SAT_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i16x8_sub_sat(v128_t a, v128_t b) { return wasm_i16x8_sub_sat(a, b); } -// CHECK-LABEL: @test_u16x8_sub_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_sub_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> +// CHECK-NEXT: [[ELT_SAT_I:%.*]] = tail call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ELT_SAT_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u16x8_sub_sat(v128_t a, v128_t b) { return wasm_u16x8_sub_sat(a, b); } -// CHECK-LABEL: @test_i16x8_mul( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_mul( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[MUL_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -1929,58 +2099,63 @@ v128_t test_i16x8_mul(v128_t a, v128_t b) { return wasm_i16x8_mul(a, b); } -// CHECK-LABEL: @test_i16x8_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_min( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ELT_MIN_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i16x8_min(v128_t a, v128_t b) { return wasm_i16x8_min(a, b); } -// CHECK-LABEL: @test_u16x8_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x 
i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_min( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ELT_MIN_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u16x8_min(v128_t a, v128_t b) { return wasm_u16x8_min(a, b); } -// CHECK-LABEL: @test_i16x8_max( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_max( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ELT_MAX_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i16x8_max(v128_t a, v128_t b) { return wasm_i16x8_max(a, b); } -// CHECK-LABEL: @test_u16x8_max( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_max( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[ELT_MAX_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u16x8_max(v128_t a, v128_t b) { return wasm_u16x8_max(a, b); } -// CHECK-LABEL: @test_u16x8_avgr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_avgr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -1989,27 +2164,30 @@ v128_t test_u16x8_avgr(v128_t a, v128_t b) { return wasm_u16x8_avgr(a, b); } -// CHECK-LABEL: @test_i32x4_abs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ABS_I:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> 
[[A:%.*]], i1 false) +// CHECK-LABEL: define hidden noundef range(i32 0, -2147483647) <4 x i32> @test_i32x4_abs( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ABS_I:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A]], i1 false) // CHECK-NEXT: ret <4 x i32> [[ABS_I]] // v128_t test_i32x4_abs(v128_t a) { return wasm_i32x4_abs(a); } -// CHECK-LABEL: @test_i32x4_neg( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[A:%.*]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_neg( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[A]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // v128_t test_i32x4_neg(v128_t a) { return wasm_i32x4_neg(a); } -// CHECK-LABEL: @test_i32x4_all_true( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]]) +// CHECK-LABEL: define hidden zeroext i1 @test_i32x4_all_true( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP0]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -2017,118 +2195,130 @@ bool test_i32x4_all_true(v128_t a) { return wasm_i32x4_all_true(a); } -// CHECK-LABEL: @test_i32x4_bitmask( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]]) +// CHECK-LABEL: define hidden i32 @test_i32x4_bitmask( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[TMP0]] // uint32_t test_i32x4_bitmask(v128_t a) { return 
wasm_i32x4_bitmask(a); } -// CHECK-LABEL: @test_i32x4_shl( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AND_I:%.*]] = and i32 [[B:%.*]], 31 +// CHECK-LABEL: define hidden <4 x i32> @test_i32x4_shl( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[AND_I:%.*]] = and i32 [[B]], 31 // CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <4 x i32> poison, i32 [[AND_I]], i64 0 // CHECK-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <4 x i32> [[SPLAT_SPLATINSERT_I]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[SHL_I:%.*]] = shl <4 x i32> [[A:%.*]], [[SPLAT_SPLAT_I]] +// CHECK-NEXT: [[SHL_I:%.*]] = shl <4 x i32> [[A]], [[SPLAT_SPLAT_I]] // CHECK-NEXT: ret <4 x i32> [[SHL_I]] // v128_t test_i32x4_shl(v128_t a, uint32_t b) { return wasm_i32x4_shl(a, b); } -// CHECK-LABEL: @test_i32x4_shr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AND_I:%.*]] = and i32 [[B:%.*]], 31 +// CHECK-LABEL: define hidden <4 x i32> @test_i32x4_shr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[AND_I:%.*]] = and i32 [[B]], 31 // CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <4 x i32> poison, i32 [[AND_I]], i64 0 // CHECK-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <4 x i32> [[SPLAT_SPLATINSERT_I]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[SHR_I:%.*]] = ashr <4 x i32> [[A:%.*]], [[SPLAT_SPLAT_I]] +// CHECK-NEXT: [[SHR_I:%.*]] = ashr <4 x i32> [[A]], [[SPLAT_SPLAT_I]] // CHECK-NEXT: ret <4 x i32> [[SHR_I]] // v128_t test_i32x4_shr(v128_t a, uint32_t b) { return wasm_i32x4_shr(a, b); } -// CHECK-LABEL: @test_u32x4_shr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AND_I:%.*]] = and i32 [[B:%.*]], 31 +// CHECK-LABEL: define hidden <4 x i32> @test_u32x4_shr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[AND_I:%.*]] = and i32 [[B]], 31 // CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <4 x i32> poison, i32 [[AND_I]], i64 0 // CHECK-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <4 x i32> [[SPLAT_SPLATINSERT_I]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[SHR_I:%.*]] = lshr <4 x i32> [[A:%.*]], [[SPLAT_SPLAT_I]] +// CHECK-NEXT: [[SHR_I:%.*]] = lshr <4 x i32> [[A]], [[SPLAT_SPLAT_I]] // CHECK-NEXT: ret <4 x i32> [[SHR_I]] // v128_t test_u32x4_shr(v128_t a, uint32_t b) { return wasm_u32x4_shr(a, b); } -// CHECK-LABEL: @test_i32x4_add( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[B:%.*]], [[A:%.*]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_add( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[B]], [[A]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // v128_t test_i32x4_add(v128_t a, v128_t b) { return wasm_i32x4_add(a, b); } -// CHECK-LABEL: @test_i32x4_sub( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_sub( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[B]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // v128_t test_i32x4_sub(v128_t a, v128_t b) { return wasm_i32x4_sub(a, b); } -// CHECK-LABEL: @test_i32x4_mul( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[A:%.*]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_mul( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[A]] // CHECK-NEXT: ret <4 x i32> [[MUL_I]] // v128_t 
test_i32x4_mul(v128_t a, v128_t b) { return wasm_i32x4_mul(a, b); } -// CHECK-LABEL: @test_i32x4_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_min( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[ELT_MIN_I]] // v128_t test_i32x4_min(v128_t a, v128_t b) { return wasm_i32x4_min(a, b); } -// CHECK-LABEL: @test_u32x4_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u32x4_min( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[ELT_MIN_I]] // v128_t test_u32x4_min(v128_t a, v128_t b) { return wasm_u32x4_min(a, b); } -// CHECK-LABEL: @test_i32x4_max( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_max( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[ELT_MAX_I]] // v128_t test_i32x4_max(v128_t a, v128_t b) { return wasm_i32x4_max(a, b); } -// CHECK-LABEL: 
@test_u32x4_max( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) -// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u32x4_max( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[ELT_MAX_I]] // v128_t test_u32x4_max(v128_t a, v128_t b) { return wasm_u32x4_max(a, b); } -// CHECK-LABEL: @test_i32x4_dot_i16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden <4 x i32> @test_i32x4_dot_i16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2136,9 +2326,10 @@ v128_t test_i32x4_dot_i16x8(v128_t a, v128_t b) { return wasm_i32x4_dot_i16x8(a, b); } -// CHECK-LABEL: @test_i64x2_abs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_abs( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> // CHECK-NEXT: [[ABS_I:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false) // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[ABS_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ 
-2147,9 +2338,10 @@ v128_t test_i64x2_abs(v128_t a) { return wasm_i64x2_abs(a); } -// CHECK-LABEL: @test_i64x2_neg( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_neg( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, [[TMP0]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[SUB_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -2158,9 +2350,10 @@ v128_t test_i64x2_neg(v128_t a) { return wasm_i64x2_neg(a); } -// CHECK-LABEL: @test_i64x2_all_true( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden zeroext i1 @test_i64x2_all_true( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]]) // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] @@ -2169,9 +2362,10 @@ bool test_i64x2_all_true(v128_t a) { return wasm_i64x2_all_true(a); } -// CHECK-LABEL: @test_i64x2_bitmask( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden i32 @test_i64x2_bitmask( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]]) // CHECK-NEXT: ret i32 [[TMP1]] // @@ -2179,10 +2373,11 @@ uint32_t test_i64x2_bitmask(v128_t a) { return wasm_i64x2_bitmask(a); } -// CHECK-LABEL: @test_i64x2_shl( -// CHECK-NEXT: 
entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B:%.*]], 63 +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_shl( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B]], 63 // CHECK-NEXT: [[AND_I:%.*]] = zext nneg i32 [[TMP1]] to i64 // CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i64> poison, i64 [[AND_I]], i64 0 // CHECK-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i64> [[SPLAT_SPLATINSERT_I]], <2 x i64> poison, <2 x i32> zeroinitializer @@ -2194,10 +2389,11 @@ v128_t test_i64x2_shl(v128_t a, uint32_t b) { return wasm_i64x2_shl(a, b); } -// CHECK-LABEL: @test_i64x2_shr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B:%.*]], 63 +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_shr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B]], 63 // CHECK-NEXT: [[AND_I:%.*]] = zext nneg i32 [[TMP1]] to i64 // CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i64> poison, i64 [[AND_I]], i64 0 // CHECK-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i64> [[SPLAT_SPLATINSERT_I]], <2 x i64> poison, <2 x i32> zeroinitializer @@ -2209,10 +2405,11 @@ v128_t test_i64x2_shr(v128_t a, uint32_t b) { return wasm_i64x2_shr(a, b); } -// CHECK-LABEL: @test_u64x2_shr( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B:%.*]], 63 +// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_shr( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 
noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B]], 63 // CHECK-NEXT: [[AND_I:%.*]] = zext nneg i32 [[TMP1]] to i64 // CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i64> poison, i64 [[AND_I]], i64 0 // CHECK-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i64> [[SPLAT_SPLATINSERT_I]], <2 x i64> poison, <2 x i32> zeroinitializer @@ -2224,10 +2421,11 @@ v128_t test_u64x2_shr(v128_t a, uint32_t b) { return wasm_u64x2_shr(a, b); } -// CHECK-LABEL: @test_i64x2_add( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_add( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[ADD_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2236,10 +2434,11 @@ v128_t test_i64x2_add(v128_t a, v128_t b) { return wasm_i64x2_add(a, b); } -// CHECK-LABEL: @test_i64x2_sub( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_sub( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[TMP0]], 
[[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[SUB_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2248,10 +2447,11 @@ v128_t test_i64x2_sub(v128_t a, v128_t b) { return wasm_i64x2_sub(a, b); } -// CHECK-LABEL: @test_i64x2_mul( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x i64> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_mul( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64> // CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i64> [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[MUL_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2260,9 +2460,10 @@ v128_t test_i64x2_mul(v128_t a, v128_t b) { return wasm_i64x2_mul(a, b); } -// CHECK-LABEL: @test_f32x4_abs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_abs( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2271,9 +2472,10 @@ v128_t test_f32x4_abs(v128_t a) { return wasm_f32x4_abs(a); } -// CHECK-LABEL: @test_f32x4_neg( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_neg( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[TMP0]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -2282,9 +2484,10 @@ v128_t test_f32x4_neg(v128_t a) { return wasm_f32x4_neg(a); } -// CHECK-LABEL: @test_f32x4_sqrt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_sqrt( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2293,9 +2496,10 @@ v128_t test_f32x4_sqrt(v128_t a) { return wasm_f32x4_sqrt(a); } -// CHECK-LABEL: @test_f32x4_ceil( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_ceil( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2304,9 +2508,10 @@ v128_t test_f32x4_ceil(v128_t a) { return wasm_f32x4_ceil(a); } -// CHECK-LABEL: @test_f32x4_floor( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_floor( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 
x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2315,9 +2520,10 @@ v128_t test_f32x4_floor(v128_t a) { return wasm_f32x4_floor(a); } -// CHECK-LABEL: @test_f32x4_trunc( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_trunc( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2326,9 +2532,10 @@ v128_t test_f32x4_trunc(v128_t a) { return wasm_f32x4_trunc(a); } -// CHECK-LABEL: @test_f32x4_nearest( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_nearest( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2337,10 +2544,11 @@ v128_t test_f32x4_nearest(v128_t a) { return wasm_f32x4_nearest(a); } -// CHECK-LABEL: @test_f32x4_add( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_add( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> 
noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ADD_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2349,10 +2557,11 @@ v128_t test_f32x4_add(v128_t a, v128_t b) { return wasm_f32x4_add(a, b); } -// CHECK-LABEL: @test_f32x4_sub( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_sub( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[SUB_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2361,10 +2570,11 @@ v128_t test_f32x4_sub(v128_t a, v128_t b) { return wasm_f32x4_sub(a, b); } -// CHECK-LABEL: @test_f32x4_mul( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_mul( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[MUL_I]] to 
<4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2373,10 +2583,11 @@ v128_t test_f32x4_mul(v128_t a, v128_t b) { return wasm_f32x4_mul(a, b); } -// CHECK-LABEL: @test_f32x4_div( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_div( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[DIV_I:%.*]] = fdiv <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[DIV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2385,10 +2596,11 @@ v128_t test_f32x4_div(v128_t a, v128_t b) { return wasm_f32x4_div(a, b); } -// CHECK-LABEL: @test_f32x4_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_min( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2397,10 +2609,11 @@ v128_t test_f32x4_min(v128_t a, v128_t b) { return wasm_f32x4_min(a, b); } -// CHECK-LABEL: @test_f32x4_max( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_max( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2409,10 +2622,11 @@ v128_t test_f32x4_max(v128_t a, v128_t b) { return wasm_f32x4_max(a, b); } -// CHECK-LABEL: @test_f32x4_pmin( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden <4 x i32> @test_f32x4_pmin( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2421,10 +2635,11 @@ v128_t test_f32x4_pmin(v128_t a, v128_t b) { return wasm_f32x4_pmin(a, b); } -// CHECK-LABEL: @test_f32x4_pmax( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-LABEL: define hidden <4 x i32> @test_f32x4_pmax( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <4 x float> // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2433,9 +2648,10 @@ v128_t test_f32x4_pmax(v128_t a, v128_t b) { return wasm_f32x4_pmax(a, b); } -// CHECK-LABEL: @test_f64x2_abs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_abs( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2444,9 +2660,10 @@ v128_t test_f64x2_abs(v128_t a) { return wasm_f64x2_abs(a); } -// CHECK-LABEL: @test_f64x2_neg( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_neg( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[TMP0]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] @@ -2455,9 +2672,10 @@ v128_t test_f64x2_neg(v128_t a) { return wasm_f64x2_neg(a); } -// CHECK-LABEL: @test_f64x2_sqrt( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_sqrt( +// CHECK-SAME: <4 x i32> noundef 
[[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2466,9 +2684,10 @@ v128_t test_f64x2_sqrt(v128_t a) { return wasm_f64x2_sqrt(a); } -// CHECK-LABEL: @test_f64x2_ceil( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_ceil( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2477,9 +2696,10 @@ v128_t test_f64x2_ceil(v128_t a) { return wasm_f64x2_ceil(a); } -// CHECK-LABEL: @test_f64x2_floor( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_floor( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2488,9 +2708,10 @@ v128_t test_f64x2_floor(v128_t a) { return wasm_f64x2_floor(a); } -// CHECK-LABEL: @test_f64x2_trunc( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_trunc( +// CHECK-SAME: <4 
x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2499,9 +2720,10 @@ v128_t test_f64x2_trunc(v128_t a) { return wasm_f64x2_trunc(a); } -// CHECK-LABEL: @test_f64x2_nearest( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_nearest( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2510,10 +2732,11 @@ v128_t test_f64x2_nearest(v128_t a) { return wasm_f64x2_nearest(a); } -// CHECK-LABEL: @test_f64x2_add( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_add( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[ADD_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2522,10 +2745,11 @@ v128_t test_f64x2_add(v128_t a, v128_t b) { return wasm_f64x2_add(a, b); } -// CHECK-LABEL: 
@test_f64x2_sub( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_sub( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[SUB_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2534,10 +2758,11 @@ v128_t test_f64x2_sub(v128_t a, v128_t b) { return wasm_f64x2_sub(a, b); } -// CHECK-LABEL: @test_f64x2_mul( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_mul( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[MUL_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2546,10 +2771,11 @@ v128_t test_f64x2_mul(v128_t a, v128_t b) { return wasm_f64x2_mul(a, b); } -// CHECK-LABEL: @test_f64x2_div( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_div( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[DIV_I:%.*]] = fdiv <2 x double> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[DIV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2558,10 +2784,11 @@ v128_t test_f64x2_div(v128_t a, v128_t b) { return wasm_f64x2_div(a, b); } -// CHECK-LABEL: @test_f64x2_min( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_min( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2570,10 +2797,11 @@ v128_t test_f64x2_min(v128_t a, v128_t b) { return wasm_f64x2_min(a, b); } -// CHECK-LABEL: @test_f64x2_max( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_max( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> 
[[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2582,10 +2810,11 @@ v128_t test_f64x2_max(v128_t a, v128_t b) { return wasm_f64x2_max(a, b); } -// CHECK-LABEL: @test_f64x2_pmin( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden <4 x i32> @test_f64x2_pmin( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2594,10 +2823,11 @@ v128_t test_f64x2_pmin(v128_t a, v128_t b) { return wasm_f64x2_pmin(a, b); } -// CHECK-LABEL: @test_f64x2_pmax( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-LABEL: define hidden <4 x i32> @test_f64x2_pmax( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <2 x double> // CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2606,9 +2836,10 @@ v128_t test_f64x2_pmax(v128_t a, v128_t b) { return wasm_f64x2_pmax(a, b); } 
-// CHECK-LABEL: @test_i32x4_trunc_sat_f32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_trunc_sat_f32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]]) // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2616,9 +2847,10 @@ v128_t test_i32x4_trunc_sat_f32x4(v128_t a) { return wasm_i32x4_trunc_sat_f32x4(a); } -// CHECK-LABEL: @test_u32x4_trunc_sat_f32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u32x4_trunc_sat_f32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]]) // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2626,9 +2858,10 @@ v128_t test_u32x4_trunc_sat_f32x4(v128_t a) { return wasm_u32x4_trunc_sat_f32x4(a); } -// CHECK-LABEL: @test_f32x4_convert_i32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CONV_I:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_convert_i32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CONV_I:%.*]] = sitofp <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -2636,9 +2869,10 @@ v128_t test_f32x4_convert_i32x4(v128_t a) { return wasm_f32x4_convert_i32x4(a); } -// CHECK-LABEL: @test_f32x4_convert_u32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[CONV_I:%.*]] = uitofp <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_convert_u32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CONV_I:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // @@ -2646,9 +2880,10 @@ v128_t test_f32x4_convert_u32x4(v128_t a) { return wasm_f32x4_convert_u32x4(a); } -// CHECK-LABEL: @test_f64x2_convert_low_i32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_f64x2_convert_low_i32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = sitofp <2 x i32> [[VECINIT2_I]] to <2 x double> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -2657,9 +2892,10 @@ v128_t test_f64x2_convert_low_i32x4(v128_t a) { return wasm_f64x2_convert_low_i32x4(a); } -// CHECK-LABEL: @test_f64x2_convert_low_u32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_f64x2_convert_low_u32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = uitofp <2 x i32> [[VECINIT2_I]] to <2 x double> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -2668,9 +2904,10 @@ v128_t test_f64x2_convert_low_u32x4(v128_t a) { 
return wasm_f64x2_convert_low_u32x4(a); } -// CHECK-LABEL: @test_i32x4_trunc_sat_f64x2_zero( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_trunc_sat_f64x2_zero( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2679,9 +2916,10 @@ v128_t test_i32x4_trunc_sat_f64x2_zero(v128_t a) { return wasm_i32x4_trunc_sat_f64x2_zero(a); } -// CHECK-LABEL: @test_u32x4_trunc_sat_f64x2_zero( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_u32x4_trunc_sat_f64x2_zero( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2690,9 +2928,10 @@ v128_t test_u32x4_trunc_sat_f64x2_zero(v128_t a) { return wasm_u32x4_trunc_sat_f64x2_zero(a); } -// CHECK-LABEL: @test_f32x4_demote_f64x2_zero( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define hidden noundef <4 x i32> @test_f32x4_demote_f64x2_zero( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double> // 
CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> zeroinitializer, <4 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = fptrunc <4 x double> [[SHUFFLE_I]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[CONV_I]] to <4 x i32> @@ -2702,9 +2941,10 @@ v128_t test_f32x4_demote_f64x2_zero(v128_t a) { return wasm_f32x4_demote_f64x2_zero(a); } -// CHECK-LABEL: @test_f64x2_promote_low_f32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-LABEL: define hidden <4 x i32> @test_f64x2_promote_low_f32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> // CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = fpext <2 x float> [[VECINIT2_I]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[CONV_I]] to <4 x i32> @@ -2714,10 +2954,11 @@ v128_t test_f64x2_promote_low_f32x4(v128_t a) { return wasm_f64x2_promote_low_f32x4(a); } -// CHECK-LABEL: @test_i8x16_shuffle( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_shuffle( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.shuffle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret 
<4 x i32> [[TMP3]] @@ -2726,10 +2967,11 @@ v128_t test_i8x16_shuffle(v128_t a, v128_t b) { return wasm_i8x16_shuffle(a, b, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -// CHECK-LABEL: @test_i16x8_shuffle( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_shuffle( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.shuffle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9, i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2738,10 +2980,11 @@ v128_t test_i16x8_shuffle(v128_t a, v128_t b) { return wasm_i16x8_shuffle(a, b, 7, 6, 5, 4, 3, 2, 1, 0); } -// CHECK-LABEL: @test_i32x4_shuffle( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i32x4_shuffle( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.shuffle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> 
[[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2750,10 +2993,11 @@ v128_t test_i32x4_shuffle(v128_t a, v128_t b) { return wasm_i32x4_shuffle(a, b, 3, 2, 1, 0); } -// CHECK-LABEL: @test_i64x2_shuffle( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_shuffle( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.shuffle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2762,10 +3006,11 @@ v128_t test_i64x2_shuffle(v128_t a, v128_t b) { return wasm_i64x2_shuffle(a, b, 1, 0); } -// CHECK-LABEL: @test_i8x16_swizzle( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_swizzle( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2774,10 +3019,11 @@ v128_t test_i8x16_swizzle(v128_t a, v128_t b) { 
return wasm_i8x16_swizzle(a, b); } -// CHECK-LABEL: @test_i8x16_narrow_i16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_narrow_i16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2786,10 +3032,11 @@ v128_t test_i8x16_narrow_i16x8(v128_t a, v128_t b) { return wasm_i8x16_narrow_i16x8(a, b); } -// CHECK-LABEL: @test_u8x16_narrow_i16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden <4 x i32> @test_u8x16_narrow_i16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2798,9 +3045,10 @@ v128_t test_u8x16_narrow_i16x8(v128_t a, v128_t b) { return wasm_u8x16_narrow_i16x8(a, b); } -// CHECK-LABEL: @test_i16x8_narrow_i32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x 
i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_narrow_i32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2808,9 +3056,10 @@ v128_t test_i16x8_narrow_i32x4(v128_t a, v128_t b) { return wasm_i16x8_narrow_i32x4(a, b); } -// CHECK-LABEL: @test_u16x8_narrow_i32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_narrow_i32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2818,9 +3067,10 @@ v128_t test_u16x8_narrow_i32x4(v128_t a, v128_t b) { return wasm_u16x8_narrow_i32x4(a, b); } -// CHECK-LABEL: @test_i16x8_extend_low_i8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_extend_low_i8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = sext <8 x i8> [[VECINIT14_I]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[CONV_I]] to <4 x i32> @@ -2830,9 +3080,10 @@ 
v128_t test_i16x8_extend_low_i8x16(v128_t a) { return wasm_i16x8_extend_low_i8x16(a); } -// CHECK-LABEL: @test_i16x8_extend_high_i8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_extend_high_i8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = sext <8 x i8> [[VECINIT14_I]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[CONV_I]] to <4 x i32> @@ -2842,9 +3093,10 @@ v128_t test_i16x8_extend_high_i8x16(v128_t a) { return wasm_i16x8_extend_high_i8x16(a); } -// CHECK-LABEL: @test_u16x8_extend_low_u8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_extend_low_u8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = zext <8 x i8> [[VECINIT14_I]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[CONV_I]] to <4 x i32> @@ -2854,9 +3106,10 @@ v128_t test_u16x8_extend_low_u8x16(v128_t a) { return wasm_u16x8_extend_low_u8x16(a); } -// CHECK-LABEL: @test_u16x8_extend_high_u8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_extend_high_u8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: 
[[VECINIT14_I:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = zext <8 x i8> [[VECINIT14_I]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[CONV_I]] to <4 x i32> @@ -2866,9 +3119,10 @@ v128_t test_u16x8_extend_high_u8x16(v128_t a) { return wasm_u16x8_extend_high_u8x16(a); } -// CHECK-LABEL: @test_i32x4_extend_low_i16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden range(i32 -32768, 32768) <4 x i32> @test_i32x4_extend_low_i16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = sext <4 x i16> [[VECINIT6_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[CONV_I]] @@ -2877,9 +3131,10 @@ v128_t test_i32x4_extend_low_i16x8(v128_t a) { return wasm_i32x4_extend_low_i16x8(a); } -// CHECK-LABEL: @test_i32x4_extend_high_i16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden range(i32 -32768, 32768) <4 x i32> @test_i32x4_extend_high_i16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = sext <4 x i16> [[VECINIT6_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[CONV_I]] @@ -2888,9 +3143,10 @@ v128_t test_i32x4_extend_high_i16x8(v128_t a) { return wasm_i32x4_extend_high_i16x8(a); } -// CHECK-LABEL: @test_u32x4_extend_low_u16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden 
range(i32 0, 65536) <4 x i32> @test_u32x4_extend_low_u16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = zext <4 x i16> [[VECINIT6_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[CONV_I]] @@ -2899,9 +3155,10 @@ v128_t test_u32x4_extend_low_u16x8(v128_t a) { return wasm_u32x4_extend_low_u16x8(a); } -// CHECK-LABEL: @test_u32x4_extend_high_u16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden range(i32 0, 65536) <4 x i32> @test_u32x4_extend_high_u16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = zext <4 x i16> [[VECINIT6_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[CONV_I]] @@ -2910,9 +3167,10 @@ v128_t test_u32x4_extend_high_u16x8(v128_t a) { return wasm_u32x4_extend_high_u16x8(a); } -// CHECK-LABEL: @test_i64x2_extend_low_i32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_extend_low_i32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = sext <2 x i32> [[VECINIT2_I]] to <2 x i64> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -2921,9 +3179,10 @@ v128_t test_i64x2_extend_low_i32x4(v128_t a) { 
return wasm_i64x2_extend_low_i32x4(a); } -// CHECK-LABEL: @test_i64x2_extend_high_i32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_extend_high_i32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = sext <2 x i32> [[VECINIT2_I]] to <2 x i64> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -2932,9 +3191,10 @@ v128_t test_i64x2_extend_high_i32x4(v128_t a) { return wasm_i64x2_extend_high_i32x4(a); } -// CHECK-LABEL: @test_u64x2_extend_low_u32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_extend_low_u32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I:%.*]] = zext <2 x i32> [[VECINIT2_I]] to <2 x i64> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -2943,9 +3203,10 @@ v128_t test_u64x2_extend_low_u32x4(v128_t a) { return wasm_u64x2_extend_low_u32x4(a); } -// CHECK-LABEL: @test_u64x2_extend_high_u32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_extend_high_u32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: 
[[CONV_I:%.*]] = zext <2 x i32> [[VECINIT2_I]] to <2 x i64> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[CONV_I]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] @@ -2954,9 +3215,10 @@ v128_t test_u64x2_extend_high_u32x4(v128_t a) { return wasm_u64x2_extend_high_u32x4(a); } -// CHECK-LABEL: @test_i16x8_extadd_pairwise_i8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_extadd_pairwise_i8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2965,9 +3227,10 @@ v128_t test_i16x8_extadd_pairwise_i8x16(v128_t a) { return wasm_i16x8_extadd_pairwise_i8x16(a); } -// CHECK-LABEL: @test_u16x8_extadd_pairwise_u8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_extadd_pairwise_u8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] @@ -2976,9 +3239,10 @@ v128_t test_u16x8_extadd_pairwise_u8x16(v128_t a) { return wasm_u16x8_extadd_pairwise_u8x16(a); } -// CHECK-LABEL: @test_i32x4_extadd_pairwise_i16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden <4 x i32> @test_i32x4_extadd_pairwise_i16x8( +// CHECK-SAME: 
<4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]]) // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2986,9 +3250,10 @@ v128_t test_i32x4_extadd_pairwise_i16x8(v128_t a) { return wasm_i32x4_extadd_pairwise_i16x8(a); } -// CHECK-LABEL: @test_u32x4_extadd_pairwise_u16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden <4 x i32> @test_u32x4_extadd_pairwise_u16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]]) // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2996,12 +3261,13 @@ v128_t test_u32x4_extadd_pairwise_u16x8(v128_t a) { return wasm_u32x4_extadd_pairwise_u16x8(a); } -// CHECK-LABEL: @test_i16x8_extmul_low_i8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_extmul_low_i8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I2_I:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = sext <8 x i8> [[VECINIT14_I2_I]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I_I:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = sext <8 x i8> 
[[VECINIT14_I_I]] to <8 x i16> // CHECK-NEXT: [[MUL_I:%.*]] = mul nsw <8 x i16> [[CONV_I_I]], [[CONV_I3_I]] @@ -3012,12 +3278,13 @@ v128_t test_i16x8_extmul_low_i8x16(v128_t a, v128_t b) { return wasm_i16x8_extmul_low_i8x16(a, b); } -// CHECK-LABEL: @test_i16x8_extmul_high_i8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_extmul_high_i8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I2_I:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = sext <8 x i8> [[VECINIT14_I2_I]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I_I:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = sext <8 x i8> [[VECINIT14_I_I]] to <8 x i16> // CHECK-NEXT: [[MUL_I:%.*]] = mul nsw <8 x i16> [[CONV_I_I]], [[CONV_I3_I]] @@ -3028,12 +3295,13 @@ v128_t test_i16x8_extmul_high_i8x16(v128_t a, v128_t b) { return wasm_i16x8_extmul_high_i8x16(a, b); } -// CHECK-LABEL: @test_u16x8_extmul_low_u8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_extmul_low_u8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I2_I:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = zext <8 x i8> [[VECINIT14_I2_I]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
<4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I_I:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = zext <8 x i8> [[VECINIT14_I_I]] to <8 x i16> // CHECK-NEXT: [[MUL_I:%.*]] = mul nuw <8 x i16> [[CONV_I_I]], [[CONV_I3_I]] @@ -3044,12 +3312,13 @@ v128_t test_u16x8_extmul_low_u8x16(v128_t a, v128_t b) { return wasm_u16x8_extmul_low_u8x16(a, b); } -// CHECK-LABEL: @test_u16x8_extmul_high_u8x16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_extmul_high_u8x16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I2_I:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = zext <8 x i8> [[VECINIT14_I2_I]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> // CHECK-NEXT: [[VECINIT14_I_I:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <8 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = zext <8 x i8> [[VECINIT14_I_I]] to <8 x i16> // CHECK-NEXT: [[MUL_I:%.*]] = mul nuw <8 x i16> [[CONV_I_I]], [[CONV_I3_I]] @@ -3060,12 +3329,13 @@ v128_t test_u16x8_extmul_high_u8x16(v128_t a, v128_t b) { return wasm_u16x8_extmul_high_u8x16(a, b); } -// CHECK-LABEL: @test_i32x4_extmul_low_i16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden range(i32 -1073709056, 1073741825) <4 x i32> @test_i32x4_extmul_low_i16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I2_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = sext <4 x i16> [[VECINIT6_I2_I]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I_I:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = sext <4 x i16> [[VECINIT6_I_I]] to <4 x i32> // CHECK-NEXT: [[MUL_I:%.*]] = mul nsw <4 x i32> [[CONV_I_I]], [[CONV_I3_I]] @@ -3075,12 +3345,13 @@ v128_t test_i32x4_extmul_low_i16x8(v128_t a, v128_t b) { return wasm_i32x4_extmul_low_i16x8(a, b); } -// CHECK-LABEL: @test_i32x4_extmul_high_i16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden range(i32 -1073709056, 1073741825) <4 x i32> @test_i32x4_extmul_high_i16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I2_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = sext <4 x i16> [[VECINIT6_I2_I]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I_I:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = sext <4 x i16> [[VECINIT6_I_I]] to <4 x i32> // CHECK-NEXT: [[MUL_I:%.*]] = mul nsw <4 x i32> [[CONV_I_I]], [[CONV_I3_I]] @@ -3090,12 +3361,13 @@ v128_t test_i32x4_extmul_high_i16x8(v128_t a, v128_t b) { return wasm_i32x4_extmul_high_i16x8(a, b); } -// CHECK-LABEL: @test_u32x4_extmul_low_u16x8( -// CHECK-NEXT: entry: -// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden range(i32 0, -131070) <4 x i32> @test_u32x4_extmul_low_u16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I2_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = zext <4 x i16> [[VECINIT6_I2_I]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I_I:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = zext <4 x i16> [[VECINIT6_I_I]] to <4 x i32> // CHECK-NEXT: [[MUL_I:%.*]] = mul nuw <4 x i32> [[CONV_I_I]], [[CONV_I3_I]] @@ -3105,12 +3377,13 @@ v128_t test_u32x4_extmul_low_u16x8(v128_t a, v128_t b) { return wasm_u32x4_extmul_low_u16x8(a, b); } -// CHECK-LABEL: @test_u32x4_extmul_high_u16x8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden range(i32 0, -131070) <4 x i32> @test_u32x4_extmul_high_u16x8( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I2_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = zext <4 x i16> [[VECINIT6_I2_I]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[VECINIT6_I_I:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = zext <4 x i16> [[VECINIT6_I_I]] to 
<4 x i32> // CHECK-NEXT: [[MUL_I:%.*]] = mul nuw <4 x i32> [[CONV_I_I]], [[CONV_I3_I]] @@ -3120,11 +3393,12 @@ v128_t test_u32x4_extmul_high_u16x8(v128_t a, v128_t b) { return wasm_u32x4_extmul_high_u16x8(a, b); } -// CHECK-LABEL: @test_i64x2_extmul_low_i32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_extmul_low_i32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = sext <2 x i32> [[VECINIT2_I2_I]] to <2 x i64> -// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = sext <2 x i32> [[VECINIT2_I_I]] to <2 x i64> // CHECK-NEXT: [[MUL_I:%.*]] = mul nsw <2 x i64> [[CONV_I_I]], [[CONV_I3_I]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[MUL_I]] to <4 x i32> @@ -3134,11 +3408,12 @@ v128_t test_i64x2_extmul_low_i32x4(v128_t a, v128_t b) { return wasm_i64x2_extmul_low_i32x4(a, b); } -// CHECK-LABEL: @test_i64x2_extmul_high_i32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_extmul_high_i32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = sext <2 x i32> [[VECINIT2_I2_I]] to <2 x i64> -// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// 
CHECK-NEXT: [[VECINIT2_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = sext <2 x i32> [[VECINIT2_I_I]] to <2 x i64> // CHECK-NEXT: [[MUL_I:%.*]] = mul nsw <2 x i64> [[CONV_I_I]], [[CONV_I3_I]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[MUL_I]] to <4 x i32> @@ -3148,11 +3423,12 @@ v128_t test_i64x2_extmul_high_i32x4(v128_t a, v128_t b) { return wasm_i64x2_extmul_high_i32x4(a, b); } -// CHECK-LABEL: @test_u64x2_extmul_low_u32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_extmul_low_u32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I2_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = zext <2 x i32> [[VECINIT2_I2_I]] to <2 x i64> -// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = zext <2 x i32> [[VECINIT2_I_I]] to <2 x i64> // CHECK-NEXT: [[MUL_I:%.*]] = mul nuw <2 x i64> [[CONV_I_I]], [[CONV_I3_I]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[MUL_I]] to <4 x i32> @@ -3162,11 +3438,12 @@ v128_t test_u64x2_extmul_low_u32x4(v128_t a, v128_t b) { return wasm_u64x2_extmul_low_u32x4(a, b); } -// CHECK-LABEL: @test_u64x2_extmul_high_u32x4( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT2_I2_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_extmul_high_u32x4( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT2_I2_I:%.*]] = shufflevector <4 
x i32> [[A]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I3_I:%.*]] = zext <2 x i32> [[VECINIT2_I2_I]] to <2 x i64> -// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[CONV_I_I:%.*]] = zext <2 x i32> [[VECINIT2_I_I]] to <2 x i64> // CHECK-NEXT: [[MUL_I:%.*]] = mul nuw <2 x i64> [[CONV_I_I]], [[CONV_I3_I]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[MUL_I]] to <4 x i32> @@ -3176,10 +3453,11 @@ v128_t test_u64x2_extmul_high_u32x4(v128_t a, v128_t b) { return wasm_u64x2_extmul_high_u32x4(a, b); } -// CHECK-LABEL: @test_i16x8_q15mulr_sat( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> +// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_q15mulr_sat( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <8 x i16> // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -3187,3 +3465,8 @@ v128_t test_u64x2_extmul_high_u32x4(v128_t a, v128_t b) { v128_t test_i16x8_q15mulr_sat(v128_t a, v128_t b) { return wasm_i16x8_q15mulr_sat(a, b); } +//. +// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +//. 
diff --git a/clang/test/Index/preamble-reparse-changed-module.m b/clang/test/Index/preamble-reparse-changed-module.m index 349ed0db27d01..88e837afe6b0a 100644 --- a/clang/test/Index/preamble-reparse-changed-module.m +++ b/clang/test/Index/preamble-reparse-changed-module.m @@ -1,4 +1,4 @@ -// REQUIRES: shell +// UNSUPPORTED: system-windows // RUN: rm -rf %t // RUN: mkdir -p %t/mod diff --git a/clang/test/Index/preamble.c b/clang/test/Index/preamble.c index 08e62ecf449b9..324436cfa190b 100644 --- a/clang/test/Index/preamble.c +++ b/clang/test/Index/preamble.c @@ -21,7 +21,7 @@ void f(int x) { // CHECK: preamble.h:5:10: IntegerLiteral= Extent=[5:10 - 5:11] // CHECK: preamble.c:8:5: FunctionDecl=wibble:8:5 Extent=[8:1 - 8:16] // CHECK: preamble.c:8:15: ParmDecl=:8:15 (Definition) Extent=[8:12 - 8:15] -// CHECK-DIAG: preamble.h:4:7:{4:9-4:13}: warning: incompatible pointer types assigning to 'int *' from 'float *' +// CHECK-DIAG: preamble.h:4:7:{4:9-4:13}: error: incompatible pointer types assigning to 'int *' from 'float *' // FIXME: Should see: // preamble.c:5:9: warning: macro is not used // CHECK-DIAG-NOT: preamble.c:6:9: warning: macro is not used diff --git a/clang/test/Index/warning-flags.c b/clang/test/Index/warning-flags.c index 1694c6abab562..d7501543ab8a6 100644 --- a/clang/test/Index/warning-flags.c +++ b/clang/test/Index/warning-flags.c @@ -1,5 +1,5 @@ int foo(void) { } -int *bar(float *f) { return f; } +void bar(void) { int; } // RUN: c-index-test -test-load-source all %s 2>&1|FileCheck -check-prefix=CHECK-BOTH-WARNINGS %s // RUN: c-index-test -test-load-source-reparse 5 all %s 2>&1|FileCheck -check-prefix=CHECK-BOTH-WARNINGS %s @@ -10,9 +10,9 @@ int *bar(float *f) { return f; } // RUN: c-index-test -test-load-source all -w -O4 %s 2>&1 | FileCheck -check-prefix=NOWARNINGS %s // CHECK-BOTH-WARNINGS: warning: non-void function does not return a value -// CHECK-BOTH-WARNINGS: warning: incompatible pointer types returning 'float *' from a function with result 
type 'int *' +// CHECK-BOTH-WARNINGS: warning: declaration does not declare anything // CHECK-SECOND-WARNING-NOT:non-void function does not return a value -// CHECK-SECOND-WARNING: warning: incompatible pointer types returning 'float *' from a function with result type 'int *' +// CHECK-SECOND-WARNING: warning: declaration does not declare anything // NOWARNINGS-NOT: warning: diff --git a/clang/test/InterfaceStubs/driver-test.c b/clang/test/InterfaceStubs/driver-test.c index 741cdab3e9d31..9080890f85cde 100644 --- a/clang/test/InterfaceStubs/driver-test.c +++ b/clang/test/InterfaceStubs/driver-test.c @@ -1,5 +1,4 @@ // REQUIRES: x86-registered-target -// REQUIRES: shell // NOTE: -fno-integrated-cc1 has been added to work around an ASAN failure // caused by in-process cc1 invocation. Clang InterfaceStubs is not the diff --git a/clang/test/InterfaceStubs/driver-test2.c b/clang/test/InterfaceStubs/driver-test2.c index 905b27922264c..6c0eb00a62f37 100644 --- a/clang/test/InterfaceStubs/driver-test2.c +++ b/clang/test/InterfaceStubs/driver-test2.c @@ -1,5 +1,4 @@ // REQUIRES: x86-registered-target -// REQUIRES: shell // NOTE: -fno-integrated-cc1 has been added to work around an ASAN failure // caused by in-process cc1 invocation. 
Clang InterfaceStubs is not the diff --git a/clang/test/InterfaceStubs/driver-test3.c b/clang/test/InterfaceStubs/driver-test3.c index 407fb5c20cb41..0973f1ffc18f4 100644 --- a/clang/test/InterfaceStubs/driver-test3.c +++ b/clang/test/InterfaceStubs/driver-test3.c @@ -1,5 +1,4 @@ // REQUIRES: x86-registered-target -// REQUIRES: shell // RUN: mkdir -p %t; cd %t // RUN: %clang -target x86_64-unknown-linux-gnu -c -emit-interface-stubs %s -o %t/driver-test3.o diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp index 8c1867d5c7365..ced5bcaf0db16 100644 --- a/clang/test/Lexer/cxx-features.cpp +++ b/clang/test/Lexer/cxx-features.cpp @@ -49,6 +49,10 @@ #error "wrong value for __cpp_placeholder_variables" #endif +#if check(trivial_relocatability, 202502, 202502, 202502, 202502, 202502, 202502, 202502) +#error "wrong value for __cpp_trivial_relocatability" +#endif + // --- C++23 features --- #if check(auto_cast, 0, 0, 0, 0, 0, 202110, 202110) diff --git a/clang/test/Misc/remap-file.c b/clang/test/Misc/remap-file.c index d6b925a5e0ab0..45ccac910e11b 100644 --- a/clang/test/Misc/remap-file.c +++ b/clang/test/Misc/remap-file.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -remap-file "%s;%S/Inputs/remapped-file" -fsyntax-only %s 2>&1 | FileCheck -check-prefix=CHECK-EXIST %s -// RUN: %clang_cc1 -remap-file "%S/nonexistent.c;%S/Inputs/remapped-file" -fsyntax-only %S/nonexistent.c 2>&1 | FileCheck -check-prefix=CHECK-NONEXIST %s -// RUN: %clang_cc1 -remap-file "%S/nonexistent.c;%S/Inputs/remapped-file-2" -remap-file "%S%{fs-sep}nonexistent.h;%S/Inputs/remapped-file-3" -fsyntax-only %S/nonexistent.c 2>&1 | FileCheck -check-prefix=CHECK-HEADER %s +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -remap-file "%s;%S/Inputs/remapped-file" -fsyntax-only %s 2>&1 | FileCheck -check-prefix=CHECK-EXIST %s +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -remap-file "%S/nonexistent.c;%S/Inputs/remapped-file" -fsyntax-only %S/nonexistent.c 2>&1 | 
FileCheck -check-prefix=CHECK-NONEXIST %s +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -remap-file "%S/nonexistent.c;%S/Inputs/remapped-file-2" -remap-file "%S%{fs-sep}nonexistent.h;%S/Inputs/remapped-file-3" -fsyntax-only %S/nonexistent.c 2>&1 | FileCheck -check-prefix=CHECK-HEADER %s // CHECK-EXIST: remap-file.c:1:28: warning: incompatible pointer types // CHECK-NONEXIST: nonexistent.c:1:28: warning: incompatible pointer types diff --git a/clang/test/Modules/Inputs/glob-delete-with-virtual-dtor/glob-delete-with-virtual-dtor.h b/clang/test/Modules/Inputs/glob-delete-with-virtual-dtor/glob-delete-with-virtual-dtor.h new file mode 100644 index 0000000000000..405f955f4d50e --- /dev/null +++ b/clang/test/Modules/Inputs/glob-delete-with-virtual-dtor/glob-delete-with-virtual-dtor.h @@ -0,0 +1,18 @@ +class H { + void operator delete(void *); +public: + virtual ~H(); +}; +H::~H() { } + +class S : public H { + void operator delete(void *); +public: + virtual ~S(); +}; +S::~S() { } + +void in_h_tests() { + H* h = new H(); + ::delete h; +} diff --git a/clang/test/Modules/Inputs/glob-delete-with-virtual-dtor/module.modulemap b/clang/test/Modules/Inputs/glob-delete-with-virtual-dtor/module.modulemap new file mode 100644 index 0000000000000..31c1f69b01693 --- /dev/null +++ b/clang/test/Modules/Inputs/glob-delete-with-virtual-dtor/module.modulemap @@ -0,0 +1 @@ +module glob_delete_with_virtual_dtor { header "glob-delete-with-virtual-dtor.h" export * } diff --git a/clang/test/Modules/added-visible-decls.cppm b/clang/test/Modules/added-visible-decls.cppm index 2f387db452905..28df3bf6f8543 100644 --- a/clang/test/Modules/added-visible-decls.cppm +++ b/clang/test/Modules/added-visible-decls.cppm @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -std=c++20 %t/b.cppm -emit-reduced-module-interface -o %t/b.pcm -fprebuilt-module-path=%t // RUN: %clang_cc1 -std=c++20 %t/c.cppm -emit-reduced-module-interface -o %t/c.pcm -fprebuilt-module-path=%t // RUN: %clang_cc1 -std=c++20 %t/d.cpp 
-fprebuilt-module-path=%t -fsyntax-only -verify +// RUN: %clang_cc1 -std=c++20 %t/d.cpp -fprebuilt-module-path=%t -fsyntax-only -verify -fexperimental-new-constant-interpreter //--- a.h template diff --git a/clang/test/Modules/crash-vfs-headermaps.m b/clang/test/Modules/crash-vfs-headermaps.m index 0afa0dee63792..26ff3f26450b5 100644 --- a/clang/test/Modules/crash-vfs-headermaps.m +++ b/clang/test/Modules/crash-vfs-headermaps.m @@ -1,4 +1,4 @@ -// REQUIRES: crash-recovery, shell, system-darwin +// REQUIRES: crash-recovery, system-darwin // RUN: rm -rf %t // RUN: mkdir -p %t/m %t/i/Foo.framework/Headers diff --git a/clang/test/Modules/crash-vfs-include-pch.m b/clang/test/Modules/crash-vfs-include-pch.m index 9ca10020094b0..2610b06974578 100644 --- a/clang/test/Modules/crash-vfs-include-pch.m +++ b/clang/test/Modules/crash-vfs-include-pch.m @@ -1,4 +1,4 @@ -// REQUIRES: crash-recovery, shell, system-darwin +// REQUIRES: crash-recovery, system-darwin // // RUN: rm -rf %t // RUN: mkdir -p %t/m %t/out diff --git a/clang/test/Modules/crash-vfs-path-emptydir-entries.m b/clang/test/Modules/crash-vfs-path-emptydir-entries.m index a7ee1fe176fb0..9564f11cdbcff 100644 --- a/clang/test/Modules/crash-vfs-path-emptydir-entries.m +++ b/clang/test/Modules/crash-vfs-path-emptydir-entries.m @@ -1,4 +1,5 @@ -// REQUIRES: crash-recovery, shell +// UNSUPPORTED: system-windows +// REQUIRES: crash-recovery // FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it? // XFAIL: target={{.*-windows-gnu}} diff --git a/clang/test/Modules/crash-vfs-path-symlink-topheader.m b/clang/test/Modules/crash-vfs-path-symlink-topheader.m index 5c2d502b209da..bab754fcb749b 100644 --- a/clang/test/Modules/crash-vfs-path-symlink-topheader.m +++ b/clang/test/Modules/crash-vfs-path-symlink-topheader.m @@ -1,4 +1,4 @@ -// REQUIRES: crash-recovery, shell +// REQUIRES: crash-recovery, symlinks // FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it? 
// XFAIL: target={{.*-windows-gnu}} diff --git a/clang/test/Modules/crash-vfs-umbrella-frameworks.m b/clang/test/Modules/crash-vfs-umbrella-frameworks.m index 3861dfb36819b..9f79fb1c09b0d 100644 --- a/clang/test/Modules/crash-vfs-umbrella-frameworks.m +++ b/clang/test/Modules/crash-vfs-umbrella-frameworks.m @@ -1,7 +1,6 @@ -// REQUIRES: crash-recovery, shell - -// FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it? -// XFAIL: target={{.*-windows-gnu}} +// REQUIRES: crash-recovery +// File path separator differences. +// UNSUPPORTED: system-windows // RUN: rm -rf %t // RUN: mkdir -p %t/i %t/m %t @@ -44,4 +43,4 @@ // RUN: rm -rf i // RUN: rm -rf crash-vfs-umbrella-*.cache/modules/* // RUN: chmod 755 crash-vfs-*.sh -// RUN: ./crash-vfs-*.sh +// RUN: bash ./crash-vfs-*.sh diff --git a/clang/test/Modules/diamond-pch.c b/clang/test/Modules/diamond-pch.c index b60420cb8ed0b..afa8f7a3cf643 100644 --- a/clang/test/Modules/diamond-pch.c +++ b/clang/test/Modules/diamond-pch.c @@ -13,7 +13,7 @@ void test_diamond(int i, float f, double d, char c) { right(&d); bottom(&c); bottom(&d); - // expected-warning@-1{{incompatible pointer types passing 'double *' to parameter of type 'char *'}} + // expected-error@-1{{incompatible pointer types passing 'double *' to parameter of type 'char *'}} // expected-note@Inputs/diamond_bottom.h:4{{passing argument to parameter 'x' here}} // Names in multiple places in the diamond. 
diff --git a/clang/test/Modules/diamond.c b/clang/test/Modules/diamond.c index 1e0641cafd87b..48a82ebf1722c 100644 --- a/clang/test/Modules/diamond.c +++ b/clang/test/Modules/diamond.c @@ -14,7 +14,7 @@ void test_diamond(int i, float f, double d, char c) { right(&d); bottom(&c); bottom(&d); - // expected-warning@-1{{incompatible pointer types passing 'double *' to parameter of type 'char *'}} + // expected-error@-1{{incompatible pointer types passing 'double *' to parameter of type 'char *'}} // expected-note@Inputs/diamond_bottom.h:4{{passing argument to parameter 'x' here}} // Names in multiple places in the diamond. diff --git a/clang/test/Modules/embed-files-compressed.cpp b/clang/test/Modules/embed-files-compressed.cpp index aca9983ff160b..5318aeb10a81e 100644 --- a/clang/test/Modules/embed-files-compressed.cpp +++ b/clang/test/Modules/embed-files-compressed.cpp @@ -1,5 +1,4 @@ // REQUIRES: zlib || zstd -// REQUIRES: shell // // RUN: rm -rf %t // RUN: mkdir %t diff --git a/clang/test/Modules/embed-files.cpp b/clang/test/Modules/embed-files.cpp index 8e5a16e544008..946daaee9991e 100644 --- a/clang/test/Modules/embed-files.cpp +++ b/clang/test/Modules/embed-files.cpp @@ -13,7 +13,7 @@ // FIXME: This test is flaky on Windows because attempting to delete a file // after writing it just doesn't seem to work well, at least not in the lit // shell. 
-// REQUIRES: shell +// UNSUPPORTED: system-windows // RUN: rm %t/x.h // RUN: %clang_cc1 -fmodules -I%t -fmodule-map-file=%t/modulemap -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm %s -verify #include "a.h" diff --git a/clang/test/Modules/exponential-paths.cpp b/clang/test/Modules/exponential-paths.cpp index b5641933f8d08..05b586bf5cd18 100644 --- a/clang/test/Modules/exponential-paths.cpp +++ b/clang/test/Modules/exponential-paths.cpp @@ -1,5 +1,3 @@ -// REQUIRES: shell -// // RUN: rm -rf %t // RUN: mkdir %t // diff --git a/clang/test/Modules/framework-name.m b/clang/test/Modules/framework-name.m index 52e68f12de5ae..3e0c45971f969 100644 --- a/clang/test/Modules/framework-name.m +++ b/clang/test/Modules/framework-name.m @@ -1,4 +1,5 @@ -// REQUIRES: shell +// REQUIRES: symlinks + // RUN: rm -rf %t.mcp %t // RUN: mkdir -p %t // RUN: ln -s %S/Inputs/NameInDir2.framework %t/NameInImport.framework diff --git a/clang/test/Modules/glob-delete-with-virtual-dtor.cpp b/clang/test/Modules/glob-delete-with-virtual-dtor.cpp new file mode 100644 index 0000000000000..fb2e2a4decf60 --- /dev/null +++ b/clang/test/Modules/glob-delete-with-virtual-dtor.cpp @@ -0,0 +1,44 @@ +// RUN: rm -rf %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps %s -x c++ -fmodules-cache-path=%t -I %S/Inputs/glob-delete-with-virtual-dtor -emit-llvm -triple=i386-pc-win32 -o - | FileCheck %s --check-prefixes CHECK,CHECK32 +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps %s -x c++ -fmodules-cache-path=%t -I %S/Inputs/glob-delete-with-virtual-dtor -emit-llvm -triple=x86_64-pc-win32 -o - | FileCheck %s --check-prefixes CHECK,CHECK64 + +#include "glob-delete-with-virtual-dtor.h" + +static void call_in_module_function(void) { + in_h_tests(); +} + +void out_of_module_tests() { + S* s = new S(); + ::delete s; +} + +// CHECK32: define {{.*}} @"??_GH@@UAEPAXI@Z" +// CHECK64: define {{.*}} @"??_GH@@UEAAPEAXI@Z" +// CHECK: store i32 %should_call_delete, ptr %[[SHOULD_DELETE_VAR:[0-9a-z._]+]], align 4 +// 
CHECK: store ptr %{{.*}}, ptr %[[RETVAL:retval]] +// CHECK: %[[SHOULD_DELETE_VALUE:[0-9a-z._]+]] = load i32, ptr %[[SHOULD_DELETE_VAR]] +// CHECK32: call x86_thiscallcc void @"??1H@@UAE@XZ"(ptr {{[^,]*}} %[[THIS:[0-9a-z]+]]) +// CHECK64: call void @"??1H@@UEAA@XZ"(ptr {{[^,]*}} %[[THIS:[0-9a-z]+]]) +// CHECK-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 1 +// CHECK-NEXT: %[[CONDITION:[0-9]+]] = icmp eq i32 %[[AND]], 0 +// CHECK-NEXT: br i1 %[[CONDITION]], label %[[CONTINUE_LABEL:[0-9a-z._]+]], label %[[CALL_DELETE_LABEL:[0-9a-z._]+]] +// +// CHECK: [[CALL_DELETE_LABEL]] +// CHECK-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 4 +// CHECK-NEXT: %[[CONDITION1:[0-9]+]] = icmp eq i32 %[[AND]], 0 +// CHECK-NEXT: br i1 %[[CONDITION1]], label %[[CALL_CLASS_DELETE:[0-9a-z._]+]], label %[[CALL_GLOB_DELETE:[0-9a-z._]+]] +// +// CHECK: [[CALL_GLOB_DELETE]] +// CHECK32-NEXT: call void @"??3@YAXPAXI@Z" +// CHECK64-NEXT: call void @"??3@YAXPEAX_K@Z" +// CHECK-NEXT: br label %[[CONTINUE_LABEL]] +// +// CHECK: [[CALL_CLASS_DELETE]] +// CHECK32-NEXT: call void @"??3H@@CAXPAX@Z" +// CHECK64-NEXT: call void @"??3H@@CAXPEAX@Z" +// CHECK-NEXT: br label %[[CONTINUE_LABEL]] +// +// CHECK: [[CONTINUE_LABEL]] +// CHECK-NEXT: %[[RET:.*]] = load ptr, ptr %[[RETVAL]] +// CHECK-NEXT: ret ptr %[[RET]] diff --git a/clang/test/Modules/implicit-private-without-public.m b/clang/test/Modules/implicit-private-without-public.m index e4920bcc7ec6f..ee0674e64a252 100644 --- a/clang/test/Modules/implicit-private-without-public.m +++ b/clang/test/Modules/implicit-private-without-public.m @@ -1,4 +1,3 @@ -// REQUIRES: shell // RUN: rm -rf %t // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t \ // RUN: -F%S/Inputs/implicit-private-without-public \ diff --git a/clang/test/Modules/inferred-framework-case.m b/clang/test/Modules/inferred-framework-case.m index 2ed443f2b5a18..64828b5cdd868 100644 --- a/clang/test/Modules/inferred-framework-case.m +++ 
b/clang/test/Modules/inferred-framework-case.m @@ -1,7 +1,7 @@ // RUN: rm -rf %t // RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -F %S/Inputs %s -verify -DA // FIXME: PR20299 - getCanonicalName() is not implemented on Windows. -// REQUIRES: shell +// UNSUPPORTED: system-windows @import MOdule; // expected-error{{module 'MOdule' not found}} @import Module; diff --git a/clang/test/Modules/lambda-merge.cpp b/clang/test/Modules/lambda-merge.cpp index e996c9c0d5d1f..6b61d356ec581 100644 --- a/clang/test/Modules/lambda-merge.cpp +++ b/clang/test/Modules/lambda-merge.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fmodules -std=c++17 -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s +// RUN: %clang_cc1 -fmodules -std=c++17 -emit-llvm %s -o - -triple x86_64-linux-gnu -fexperimental-new-constant-interpreter | FileCheck %s #pragma clang module build A module A {} diff --git a/clang/test/Modules/module-file-modified.c b/clang/test/Modules/module-file-modified.c index 57160f34a46cf..1a02b3fa511b3 100644 --- a/clang/test/Modules/module-file-modified.c +++ b/clang/test/Modules/module-file-modified.c @@ -9,4 +9,3 @@ int foo = 0; // redefinition of 'foo' // CHECK: fatal error: file {{.*}} has been modified since the module file {{.*}} was built // CHECK: note: please rebuild precompiled file -// REQUIRES: shell diff --git a/clang/test/Modules/module-symlink.m b/clang/test/Modules/module-symlink.m index efdaf3db0dfef..9de1cf9b5fb5d 100644 --- a/clang/test/Modules/module-symlink.m +++ b/clang/test/Modules/module-symlink.m @@ -1,4 +1,4 @@ -// REQUIRES: shell +// REQUIRES: symlinks // RUN: rm -rf %t // RUN: %clang_cc1 -fmodules-cache-path=%t/modules -fmodules -fimplicit-module-maps -I %S/Inputs -emit-pch -o %t.pch %s -verify diff --git a/clang/test/Modules/modulemap-collision.m b/clang/test/Modules/modulemap-collision.m index 5ada45da3dae1..2778386dfd331 100644 --- a/clang/test/Modules/modulemap-collision.m +++ b/clang/test/Modules/modulemap-collision.m @@ 
-1,4 +1,5 @@ -// REQUIRES: shell +// Most likely platform specific sed differences +// UNSUPPORTED: system-windows // RUN: rm -rf %t // RUN: mkdir -p %t/sources %t/build diff --git a/clang/test/Modules/serialized-diags.m b/clang/test/Modules/serialized-diags.m index 18bce06047e90..c961260e3d078 100644 --- a/clang/test/Modules/serialized-diags.m +++ b/clang/test/Modules/serialized-diags.m @@ -8,7 +8,7 @@ double *double_ptr = &float_val; // RUN: rm -rf %t %t.diag %t.out -// RUN: %clang -fmodules -fmodules-cache-path=%t/ModuleCache -I %S/Inputs/ModuleDiags -fsyntax-only %s --serialize-diagnostics %t.diag > /dev/null 2>&1 +// RUN: %clang -Wno-error=incompatible-pointer-types -fmodules -fmodules-cache-path=%t/ModuleCache -I %S/Inputs/ModuleDiags -fsyntax-only %s --serialize-diagnostics %t.diag > /dev/null 2>&1 // RUN: c-index-test -read-diagnostics %t.diag > %t.out 2>&1 // RUN: FileCheck --input-file=%t.out %s @@ -18,7 +18,7 @@ // CHECK: Number of diagnostics: 2 // RUN: rm -rf %t %t.diag_errors %t.out_errors -// RUN: not %clang -fmodules -fmodules-cache-path=%t/ModuleCache -I %S/Inputs/ModuleDiags -fsyntax-only -DWITH_ERRORS %s --serialize-diagnostics %t.diag_errors > /dev/null 2>&1 +// RUN: not %clang -Wno-error=incompatible-pointer-types -fmodules -fmodules-cache-path=%t/ModuleCache -I %S/Inputs/ModuleDiags -fsyntax-only -DWITH_ERRORS %s --serialize-diagnostics %t.diag_errors > /dev/null 2>&1 // RUN: c-index-test -read-diagnostics %t.diag_errors > %t.out_errors 2>&1 // RUN: FileCheck -check-prefix=CHECK-WITH-ERRORS --input-file=%t.out_errors %s diff --git a/clang/test/Modules/validate-file-content.m b/clang/test/Modules/validate-file-content.m index 9977aa4665f04..cff89884552b7 100644 --- a/clang/test/Modules/validate-file-content.m +++ b/clang/test/Modules/validate-file-content.m @@ -1,5 +1,3 @@ -// REQUIRES: shell -// // Check driver works // RUN: %clang -fmodules -fsyntax-only -fmodules-validate-input-files-content %s -### 2>&1 | FileCheck --check-prefix=CHECK-CC1 
%s // CHECK-CC1: -fvalidate-ast-input-files-content diff --git a/clang/test/OpenMP/bug54082.c b/clang/test/OpenMP/bug54082.c index bda4bd29b9e66..ef3e7153545bf 100644 --- a/clang/test/OpenMP/bug54082.c +++ b/clang/test/OpenMP/bug54082.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --prefix-filecheck-ir-name _ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 6 // RUN: %clang_cc1 -fopenmp -O1 -x c -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK typedef enum omp_allocator_handle_t { @@ -63,47 +63,47 @@ void foo() { (void)x; } } -// CHECK-LABEL: define {{[^@]+}}@foo -// CHECK-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define void @foo( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_TRAITS:%.*]] = alloca [1 x %struct.omp_alloctrait_t], align 16 // CHECK-NEXT: [[X_ALLOC:%.*]] = alloca i64, align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[X_TRAITS]]) #[[ATTR5:[0-9]+]] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) [[X_TRAITS]], ptr noundef nonnull align 16 dereferenceable(16) @__const.foo.x_traits, i64 16, i1 false) // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[X_ALLOC]]) #[[ATTR5]] // CHECK-NEXT: [[CALL:%.*]] = call i64 @omp_init_allocator(i64 noundef 0, i32 noundef 1, ptr noundef nonnull [[X_TRAITS]]) #[[ATTR5]] -// CHECK-NEXT: store i64 [[CALL]], ptr [[X_ALLOC]], align 8, !tbaa [[TBAA3:![0-9]+]] +// CHECK-NEXT: store i64 [[CALL]], ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA3:![0-9]+]] // CHECK-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr nonnull @[[GLOB2:[0-9]+]], i32 1, ptr nonnull @foo.omp_outlined, ptr nonnull [[X_ALLOC]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X_ALLOC]]) #[[ATTR5]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X_TRAITS]]) #[[ATTR5]] // CHECK-NEXT: ret void // // -// CHECK-LABEL: define {{[^@]+}}@foo.omp_outlined -// CHECK-SAME: (ptr noalias noundef readonly captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[X_ALLOC:%.*]]) #[[ATTR4:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define internal void @foo.omp_outlined( +// CHECK-SAME: ptr noalias noundef readonly captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[X_ALLOC:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DOTOMP_LB]]) #[[ATTR5]] -// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA7:![0-9]+]] +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA7:![0-9]+]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DOTOMP_UB]]) #[[ATTR5]] -// CHECK-NEXT: store i32 1023, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: store i32 1023, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA7]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DOTOMP_STRIDE]]) #[[ATTR5]] -// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA7]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr 
nonnull [[DOTOMP_IS_LAST]]) #[[ATTR5]] -// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[X_ALLOC]], align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA7]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA3]] // CHECK-NEXT: [[CONV:%.*]] = inttoptr i64 [[TMP1]] to ptr // CHECK-NEXT: [[DOTX__VOID_ADDR:%.*]] = tail call ptr @__kmpc_alloc(i32 [[TMP0]], i64 8, ptr [[CONV]]) // CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr nonnull @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 34, ptr nonnull [[DOTOMP_IS_LAST]], ptr nonnull [[DOTOMP_LB]], ptr nonnull [[DOTOMP_UB]], ptr nonnull [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA7]] // CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 1023) -// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA7]] // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[X_ALLOC]], align 8, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA3]] // CHECK-NEXT: [[CONV5:%.*]] = inttoptr i64 [[TMP3]] to ptr // CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTX__VOID_ADDR]], ptr [[CONV5]]) // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[DOTOMP_IS_LAST]]) #[[ATTR5]] @@ -112,3 +112,11 @@ void foo() { // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[DOTOMP_LB]]) #[[ATTR5]] // 
CHECK-NEXT: ret void // +//. +// CHECK: [[LONG_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[INT_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +// CHECK: [[META8]] = !{!"int", [[META5]], i64 0} +//. diff --git a/clang/test/OpenMP/bug56913.c b/clang/test/OpenMP/bug56913.c index fad9e17ac4dd8..fa5e46d30ae85 100644 --- a/clang/test/OpenMP/bug56913.c +++ b/clang/test/OpenMP/bug56913.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --prefix-filecheck-ir-name _ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 6 // RUN: %clang_cc1 -fopenmp-simd -O1 -x c -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK int j; @@ -12,21 +12,31 @@ void loop(int n) { u = &j; } } -// CHECK-LABEL: define {{[^@]+}}@loop -// CHECK-SAME: (i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define void @loop( +// CHECK-SAME: i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 -// CHECK-NEXT: br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]] -// CHECK: simd.if.then: -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @j, align 4, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: br i1 [[CMP]], label %[[SIMD_IF_THEN:.*]], label %[[SIMD_IF_END:.*]] +// CHECK: [[SIMD_IF_THEN]]: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @j, align 4, !tbaa [[INT_TBAA2:![0-9]+]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[J]]) #[[ATTR2:[0-9]+]] -// CHECK-NEXT: store ptr [[J]], 
ptr @u, align 8, !tbaa [[TBAA6:![0-9]+]], !llvm.access.group [[ACC_GRP8:![0-9]+]] +// CHECK-NEXT: store ptr [[J]], ptr @u, align 8, !tbaa [[INTPTR_TBAA6:![0-9]+]], !llvm.access.group [[ACC_GRP9:![0-9]+]] // CHECK-NEXT: [[INC_LE:%.*]] = add i32 [[TMP0]], [[N]] -// CHECK-NEXT: store i32 [[INC_LE]], ptr [[J]], align 4, !tbaa [[TBAA2]] -// CHECK-NEXT: store i32 [[INC_LE]], ptr @j, align 4, !tbaa [[TBAA2]] +// CHECK-NEXT: store i32 [[INC_LE]], ptr [[J]], align 4, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: store i32 [[INC_LE]], ptr @j, align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[J]]) #[[ATTR2]] -// CHECK-NEXT: br label [[SIMD_IF_END]] -// CHECK: simd.if.end: +// CHECK-NEXT: br label %[[SIMD_IF_END]] +// CHECK: [[SIMD_IF_END]]: // CHECK-NEXT: ret void // +//. +// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[INTPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"p1 int", [[META8:![0-9]+]], i64 0} +// CHECK: [[META8]] = !{!"any pointer", [[META4]], i64 0} +// CHECK: [[ACC_GRP9]] = distinct !{} +//. 
diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp index caf53a5b62c1c..5b61e143a0548 100644 --- a/clang/test/OpenMP/bug57757.cpp +++ b/clang/test/OpenMP/bug57757.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --prefix-filecheck-ir-name _ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 6 // RUN: %clang_cc1 -fopenmp -O1 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK template @@ -14,42 +14,42 @@ void foo() { float b; run_task(bar, a, b); } -// CHECK-LABEL: define {{[^@]+}}@_Z3foov -// CHECK-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define void @_Z3foov( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1:[0-9]+]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 0, i64 56, i64 1, ptr nonnull @.omp_task_entry.) // CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -// CHECK-NEXT: store ptr @_Z3barif, ptr [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] +// CHECK-NEXT: store ptr @_Z3barif, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA3:![0-9]+]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -// CHECK-NEXT: store i32 0, ptr [[TMP3]], align 8, !tbaa [[TBAA12:![0-9]+]] +// CHECK-NEXT: store i32 0, ptr [[TMP3]], align 8, !tbaa [[INT_TBAA12:![0-9]+]] // CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) // CHECK-NEXT: ret void // // -// CHECK-LABEL: define {{[^@]+}}@.omp_task_entry. 
-// CHECK-SAME: (i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define internal noundef i32 @.omp_task_entry.( +// CHECK-SAME: i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]]) -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA16:![0-9]+]], !alias.scope [[META13]], !noalias [[META17:![0-9]+]] -// CHECK-NEXT: switch i32 [[TMP3]], label [[DOTOMP_OUTLINED__EXIT:%.*]] [ -// CHECK-NEXT: i32 0, label [[DOTUNTIED_JMP__I:%.*]] -// CHECK-NEXT: i32 1, label [[DOTUNTIED_NEXT__I:%.*]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA16:![0-9]+]], !alias.scope [[META13]], !noalias [[META17:![0-9]+]] +// CHECK-NEXT: switch i32 [[TMP3]], [[DOTOMP_OUTLINED__EXIT:label %.*]] [ +// CHECK-NEXT: i32 0, [[DOTUNTIED_JMP__I:label %.*]] +// CHECK-NEXT: i32 1, [[DOTUNTIED_NEXT__I:label %.*]] // CHECK-NEXT: ] -// CHECK: .untied.jmp..i: -// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope [[META13]], !noalias [[META17]] +// CHECK: [[_UNTIED_JMP__I:.*:]] +// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA16]], !alias.scope [[META13]], !noalias [[META17]] // CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr nonnull [[TMP1]]), !noalias [[META13]] -// CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] -// CHECK: .untied.next..i: +// CHECK-NEXT: br [[DOTOMP_OUTLINED__EXIT]] +// CHECK: [[_UNTIED_NEXT__I:.*:]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52 // CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], 
i64 48 -// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA19:![0-9]+]], !noalias [[META13]] -// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 8, !tbaa [[TBAA16]], !noalias [[META13]] -// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA20:![0-9]+]], !noalias [[META13]] +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA19:![0-9]+]], !noalias [[META13]] +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 8, !tbaa [[INT_TBAA16]], !noalias [[META13]] +// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[FLOAT_TBAA20:![0-9]+]], !noalias [[META13]] // CHECK-NEXT: tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias [[META13]] -// CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] -// CHECK: .omp_outlined..exit: +// CHECK-NEXT: br [[DOTOMP_OUTLINED__EXIT]] +// CHECK: [[_OMP_OUTLINED__EXIT:.*:]] // CHECK-NEXT: ret i32 0 // diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp index 20e344f0a34a0..46c87eb31969d 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 6 // RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify 
-fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1 // RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc @@ -30,30 +30,30 @@ void test() { complex_reduction(); } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16 -// CHECK1-SAME: (ptr noalias [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define weak_odr protected ptx_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16( +// CHECK1-SAME: ptr noalias [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]] +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[ANYPTR_TBAA6:![0-9]+]] // CHECK1-NEXT: [[TMP0:%.*]] = call i32 
@__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_kernel_environment, ptr [[DYN_PTR]]) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -// CHECK1: user_code.entry: +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[WORKER_EXIT:.*]] +// CHECK1: [[USER_CODE_ENTRY]]: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA15:![0-9]+]] +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA10:![0-9]+]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]] // CHECK1-NEXT: call void @__kmpc_target_deinit() // CHECK1-NEXT: ret void -// CHECK1: worker.exit: +// CHECK1: [[WORKER_EXIT]]: // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined( +// CHECK1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 @@ -66,82 +66,82 @@ void test() { // CHECK1-NEXT: [[REF_TMP:%.*]] = alloca float, align 4 // CHECK1-NEXT: [[REF_TMP2:%.*]] = alloca float, align 
4 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8 -// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA17:![0-9]+]] -// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA12:![0-9]+]] +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] // CHECK1-NEXT: [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 8) // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[IB]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], 
align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK1: [[COND_TRUE]]: +// CHECK1-NEXT: br label %[[COND_END:.*]] +// CHECK1: [[COND_FALSE]]: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[COND_END]] +// CHECK1: [[COND_END]]: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// 
CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK1: [[OMP_INNER_FOR_COND]]: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] -// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] -// CHECK1: omp.inner.for.cond.cleanup: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_COND_CLEANUP:.*]] +// CHECK1: [[OMP_INNER_FOR_COND_CLEANUP]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_END:.*]] +// CHECK1: [[OMP_INNER_FOR_BODY]]: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA19:![0-9]+]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[FLOAT_TBAA14:![0-9]+]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP2]]) #[[ATTR4]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP2]], align 4, !tbaa [[TBAA19]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP2]], align 4, !tbaa [[FLOAT_TBAA14]] // CHECK1-NEXT: call void 
@_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR11:[0-9]+]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP2]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4 -// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1 // CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA21:![0-9]+]] +// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[ANYPTR_TBAA16:![0-9]+]] // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA21]] +// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[ANYPTR_TBAA16]] // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA21]] +// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, 
!tbaa [[ANYPTR_TBAA16]] // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3) -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK1: [[OMP_BODY_CONTINUE]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK1: [[OMP_INNER_FOR_INC]]: +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: +// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK1: [[OMP_INNER_FOR_END]]: +// CHECK1-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK1: [[OMP_LOOP_EXIT]]: // CHECK1-NEXT: call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP1]]) // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[IB]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] @@ -155,15 +155,15 @@ void test() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIfEC1ERKfS2_ -// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr nonnull align 4 dereferenceable(4) [[__RE:%.*]], ptr nonnull align 4 dereferenceable(4) [[__IM:%.*]]) unnamed_addr 
#[[ATTR5:[0-9]+]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden void @_ZNSt7complexIfEC1ERKfS2_( +// CHECK1-SAME: ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr nonnull align 4 dereferenceable(4) [[__RE:%.*]], ptr nonnull align 4 dereferenceable(4) [[__IM:%.*]]) unnamed_addr #[[ATTR5:[0-9]+]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23:![0-9]+]] -// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA25:![0-9]+]] -// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA25]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18:![0-9]+]] +// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[FLOATPTR_TBAA20:![0-9]+]] +// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[FLOATPTR_TBAA20]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8 @@ -171,9 +171,9 @@ void test() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[ISTART:%.*]], ptr nonnull align 4 dereferenceable(4) [[IEND:%.*]], ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined( +// CHECK1-SAME: ptr noalias 
[[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[ISTART:%.*]], ptr nonnull align 4 dereferenceable(4) [[IEND:%.*]], ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[ISTART_ADDR:%.*]] = alloca ptr, align 8 @@ -197,155 +197,155 @@ void test() { // CHECK1-NEXT: [[REF_TMP15:%.*]] = alloca float, align 4 // CHECK1-NEXT: [[REF_TMP16:%.*]] = alloca float, align 4 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA23]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA23]] +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr 
[[ISTART_ADDR]], align 8, !tbaa [[INTPTR_TBAA12]], !nonnull [[META22:![0-9]+]], !align [[META23:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[INTPTR_TBAA12]], !nonnull [[META22]], !align [[META23]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]], !nonnull [[META22]], !align [[META23]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]] // CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 1 // CHECK1-NEXT: 
[[DIV:%.*]] = udiv i32 [[ADD]], 1 // CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: +// CHECK1-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]] +// CHECK1: [[OMP_PRECOND_THEN]]: // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr 
[[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[PARTIAL_SUM5]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA19]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[FLOAT_TBAA14]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP6]]) #[[ATTR4]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP6]], align 4, !tbaa [[TBAA19]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP6]], align 4, !tbaa [[FLOAT_TBAA14]] // CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR11]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP6]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[I7]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @__kmpc_for_static_init_4u(ptr 
@[[GLOB3:[0-9]+]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br label %[[OMP_DISPATCH_COND:.*]] +// CHECK1: [[OMP_DISPATCH_COND]]: +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]] -// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP8]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK1: [[COND_TRUE]]: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[COND_END:.*]] +// CHECK1: [[COND_FALSE]]: +// CHECK1-NEXT: 
[[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[COND_END]] +// CHECK1: [[COND_END]]: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], %[[COND_TRUE]] ], [ [[TMP16]], %[[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP19]], 1 // CHECK1-NEXT: [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]] -// CHECK1-NEXT: br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]] -// CHECK1: omp.dispatch.cleanup: -// CHECK1-NEXT: br label [[OMP_DISPATCH_END:%.*]] -// CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP10]], label %[[OMP_DISPATCH_BODY:.*]], label %[[OMP_DISPATCH_CLEANUP:.*]] +// CHECK1: [[OMP_DISPATCH_CLEANUP]]: +// CHECK1-NEXT: br label %[[OMP_DISPATCH_END:.*]] +// CHECK1: [[OMP_DISPATCH_BODY]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK1: [[OMP_INNER_FOR_COND]]: +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP21]], 1 // CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]] -// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label 
[[OMP_INNER_FOR_COND_CLEANUP:%.*]] -// CHECK1: omp.inner.for.cond.cleanup: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP12]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_COND_CLEANUP:.*]] +// CHECK1: [[OMP_INNER_FOR_COND_CLEANUP]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_END:.*]] +// CHECK1: [[OMP_INNER_FOR_BODY]]: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP23]], 1 // CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]] -// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP14]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP15]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP24]] to float -// CHECK1-NEXT: store float [[CONV]], ptr [[REF_TMP15]], align 4, !tbaa [[TBAA19]] +// CHECK1-NEXT: store float [[CONV]], ptr [[REF_TMP15]], align 4, !tbaa [[FLOAT_TBAA14]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP16]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP25]] to float -// CHECK1-NEXT: store float [[CONV17]], ptr [[REF_TMP16]], align 4, !tbaa [[TBAA19]] +// 
CHECK1-NEXT: store float [[CONV17]], ptr [[REF_TMP16]], align 4, !tbaa [[FLOAT_TBAA14]] // CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP15]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR11]] // CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR11]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP16]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP15]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP14]]) #[[ATTR4]] -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK1: [[OMP_BODY_CONTINUE]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK1: [[OMP_INNER_FOR_INC]]: +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP26]], 1 -// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK1: [[OMP_INNER_FOR_END]]: +// CHECK1-NEXT: br label 
%[[OMP_DISPATCH_INC:.*]] +// CHECK1: [[OMP_DISPATCH_INC]]: +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]] -// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]] -// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[OMP_DISPATCH_COND]] +// CHECK1: [[OMP_DISPATCH_END]]: // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]]) // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 // CHECK1-NEXT: store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func) // CHECK1-NEXT: 
[[TMP35:%.*]] = icmp eq i32 [[TMP34]], 1 -// CHECK1-NEXT: br i1 [[TMP35]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: br i1 [[TMP35]], [[DOTOMP_REDUCTION_THEN:label %.*]], [[DOTOMP_REDUCTION_DONE:label %.*]] +// CHECK1: [[_OMP_REDUCTION_THEN:.*:]] // CHECK1-NEXT: [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[TMP2]], ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR11]] -// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: br [[DOTOMP_REDUCTION_DONE]] +// CHECK1: [[_OMP_REDUCTION_DONE:.*:]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[I7]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[PARTIAL_SUM5]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: +// CHECK1-NEXT: br label %[[OMP_PRECOND_END]] +// CHECK1: [[OMP_PRECOND_END]]: // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]] @@ -353,32 +353,32 @@ void test() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIfEpLIfEERS0_RKS_IT_E -// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr nonnull align 4 dereferenceable(8) [[__C:%.*]]) #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden nonnull align 4 
dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E( +// CHECK1-SAME: ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr nonnull align 4 dereferenceable(8) [[__C:%.*]]) #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]] -// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]] +// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]], !nonnull [[META22]], !align [[META23]] // CHECK1-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(ptr nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR11]] // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds nuw %"class.std::complex", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA27:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[FLOAT_TBAA24:![0-9]+]] // CHECK1-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]] -// CHECK1-NEXT: store float [[ADD]], ptr [[__RE_]], align 4, !tbaa [[TBAA27]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA23]] +// CHECK1-NEXT: store float [[ADD]], ptr [[__RE_]], align 4, !tbaa [[FLOAT_TBAA24]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]], !nonnull [[META22]], !align [[META23]] // CHECK1-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(ptr nonnull align 4 
dereferenceable(8) [[TMP2]]) #[[ATTR11]] // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds nuw %"class.std::complex", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA29:![0-9]+]] +// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[FLOAT_TBAA26:![0-9]+]] // CHECK1-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]] -// CHECK1-NEXT: store float [[ADD3]], ptr [[__IM_]], align 4, !tbaa [[TBAA29]] +// CHECK1-NEXT: store float [[ADD3]], ptr [[__IM_]], align 4, !tbaa [[FLOAT_TBAA26]] // CHECK1-NEXT: ret ptr [[THIS1]] // // -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @_omp_reduction_shuffle_and_reduce_func( +// CHECK1-SAME: ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 @@ -417,33 +417,33 @@ void test() { // CHECK1-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]] // CHECK1-NEXT: [[TMP28:%.*]] = or i1 [[TMP18]], [[TMP21]] // CHECK1-NEXT: [[TMP29:%.*]] = or i1 [[TMP28]], [[TMP27]] -// CHECK1-NEXT: br i1 [[TMP29]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: +// CHECK1-NEXT: br i1 [[TMP29]], label %[[THEN:.*]], label %[[ELSE:.*]] +// CHECK1: [[THEN]]: // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]] -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// 
CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: +// CHECK1-NEXT: br label %[[IFCONT:.*]] +// CHECK1: [[ELSE]]: +// CHECK1-NEXT: br label %[[IFCONT]] +// CHECK1: [[IFCONT]]: // CHECK1-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 1 // CHECK1-NEXT: [[TMP31:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]] // CHECK1-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK1-NEXT: br i1 [[TMP32]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: +// CHECK1-NEXT: br i1 [[TMP32]], label %[[THEN4:.*]], label %[[ELSE5:.*]] +// CHECK1: [[THEN4]]: // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[TMP35]], align 8 // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP36]], ptr align 8 [[TMP34]], i64 8, i1 false) -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: +// CHECK1-NEXT: br label %[[IFCONT6:.*]] +// CHECK1: [[ELSE5]]: +// CHECK1-NEXT: br label %[[IFCONT6]] +// CHECK1: [[IFCONT6]]: // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @_omp_reduction_inter_warp_copy_func( +// CHECK1-SAME: ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 @@ -456,96 +456,96 @@ void test() { // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP4]], 5 // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTADDR]], align 8 // 
CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: br label [[PRECOND:%.*]] -// CHECK1: precond: +// CHECK1-NEXT: br label %[[PRECOND:.*]] +// CHECK1: [[PRECOND]]: // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 2 -// CHECK1-NEXT: br i1 [[TMP7]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK1: body: +// CHECK1-NEXT: br i1 [[TMP7]], label %[[BODY:.*]], label %[[EXIT:.*]] +// CHECK1: [[BODY]]: // CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]]) // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label %[[THEN:.*]], label %[[ELSE:.*]] +// CHECK1: [[THEN]]: // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 [[TMP6]] // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 // CHECK1-NEXT: store volatile i32 [[TMP12]], ptr addrspace(3) [[TMP11]], align 4 -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: +// CHECK1-NEXT: br label %[[IFCONT:.*]] +// CHECK1: [[ELSE]]: +// CHECK1-NEXT: br label %[[IFCONT]] +// CHECK1: [[IFCONT]]: // CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) // CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTADDR1]], align 4 
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP13]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN3:%.*]], label [[ELSE4:%.*]] -// CHECK1: then3: +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label %[[THEN3:.*]], label %[[ELSE4:.*]] +// CHECK1: [[THEN3]]: // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP2]] // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0 // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP6]] // CHECK1-NEXT: [[TMP18:%.*]] = load volatile i32, ptr addrspace(3) [[TMP14]], align 4 // CHECK1-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4 -// CHECK1-NEXT: br label [[IFCONT5:%.*]] -// CHECK1: else4: -// CHECK1-NEXT: br label [[IFCONT5]] -// CHECK1: ifcont5: +// CHECK1-NEXT: br label %[[IFCONT5:.*]] +// CHECK1: [[ELSE4]]: +// CHECK1-NEXT: br label %[[IFCONT5]] +// CHECK1: [[IFCONT5]]: // CHECK1-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK1-NEXT: store i32 [[TMP19]], ptr [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: br label [[PRECOND]] -// CHECK1: exit: +// CHECK1-NEXT: br label %[[PRECOND]] +// CHECK1: [[EXIT]]: // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8:[0-9]+]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper( +// CHECK1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8:[0-9]+]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = 
alloca i32, align 4 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA30:![0-9]+]] -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[SHORT_TBAA27:![0-9]+]] +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA32:![0-9]+]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA6]] // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1 -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA32]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA6]] // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA34:![0-9]+]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[ANYPTR_TBAA6]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16 -// CHECK1-SAME: (ptr noalias [[DYN_PTR:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define weak_odr protected ptx_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16( +// CHECK1-SAME: ptr noalias 
[[DYN_PTR:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[ANYPTR_TBAA6]] // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_kernel_environment, ptr [[DYN_PTR]]) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -// CHECK1: user_code.entry: +// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[WORKER_EXIT:.*]] +// CHECK1: [[USER_CODE_ENTRY]]: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] // CHECK1-NEXT: call void @__kmpc_target_deinit() // CHECK1-NEXT: ret void -// CHECK1: worker.exit: +// CHECK1: [[WORKER_EXIT]]: // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined( +// CHECK1-SAME: ptr noalias 
[[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 @@ -558,82 +558,82 @@ void test() { // CHECK1-NEXT: [[REF_TMP:%.*]] = alloca double, align 8 // CHECK1-NEXT: [[REF_TMP2:%.*]] = alloca double, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8 -// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] // CHECK1-NEXT: [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 16) // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void 
@llvm.lifetime.start.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[IB]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK1: 
[[COND_TRUE]]: +// CHECK1-NEXT: br label %[[COND_END:.*]] +// CHECK1: [[COND_FALSE]]: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[COND_END]] +// CHECK1: [[COND_END]]: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK1: [[OMP_INNER_FOR_COND]]: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] -// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] -// CHECK1: omp.inner.for.cond.cleanup: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_COND_CLEANUP:.*]] +// CHECK1: [[OMP_INNER_FOR_COND_CLEANUP]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_END:.*]] +// CHECK1: [[OMP_INNER_FOR_BODY]]: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa 
[[TBAA36:![0-9]+]] +// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[DOUBLE_TBAA29:![0-9]+]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP2]]) #[[ATTR4]] -// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP2]], align 8, !tbaa [[TBAA36]] +// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP2]], align 8, !tbaa [[DOUBLE_TBAA29]] // CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR11]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP2]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4 -// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1 // CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA21]] +// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[ANYPTR_TBAA16]] // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: store ptr [[IEND]], ptr 
[[TMP11]], align 8, !tbaa [[TBAA21]] +// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[ANYPTR_TBAA16]] // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA21]] +// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[ANYPTR_TBAA16]] // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3) -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK1: [[OMP_BODY_CONTINUE]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK1: [[OMP_INNER_FOR_INC]]: +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: +// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK1: [[OMP_INNER_FOR_END]]: +// CHECK1-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK1: [[OMP_LOOP_EXIT]]: // CHECK1-NEXT: call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP1]]) // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[IB]]) #[[ATTR4]] // CHECK1-NEXT: call 
void @llvm.lifetime.end.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] @@ -647,15 +647,15 @@ void test() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIdEC1ERKdS2_ -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr nonnull align 8 dereferenceable(8) [[__RE:%.*]], ptr nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden void @_ZNSt7complexIdEC1ERKdS2_( +// CHECK1-SAME: ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr nonnull align 8 dereferenceable(8) [[__RE:%.*]], ptr nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA38:![0-9]+]] -// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA40:![0-9]+]] -// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA40]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31:![0-9]+]] +// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[DOUBLEPTR_TBAA33:![0-9]+]] +// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[DOUBLEPTR_TBAA33]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8 @@ -663,9 +663,9 @@ void test() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull 
align 4 dereferenceable(4) [[ISTART:%.*]], ptr nonnull align 4 dereferenceable(4) [[IEND:%.*]], ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined( +// CHECK1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[ISTART:%.*]], ptr nonnull align 4 dereferenceable(4) [[IEND:%.*]], ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[ISTART_ADDR:%.*]] = alloca ptr, align 8 @@ -689,155 +689,155 @@ void test() { // CHECK1-NEXT: [[REF_TMP15:%.*]] = alloca double, align 8 // CHECK1-NEXT: [[REF_TMP16:%.*]] = alloca double, align 8 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA38]] -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA17]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA38]] +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa 
[[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[INTPTR_TBAA12]] +// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[INTPTR_TBAA12]], !nonnull [[META22]], !align [[META23]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[INTPTR_TBAA12]], !nonnull [[META22]], !align [[META23]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]], !nonnull [[META22]], !align [[META35:![0-9]+]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] +// 
CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]] // CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 1 // CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 1 // CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: +// CHECK1-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]] +// CHECK1: [[OMP_PRECOND_THEN]]: // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store 
i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[PARTIAL_SUM5]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA36]] +// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[DOUBLE_TBAA29]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP6]]) #[[ATTR4]] -// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP6]], align 8, !tbaa [[TBAA36]] +// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP6]], align 8, !tbaa [[DOUBLE_TBAA29]] // CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR11]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP6]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: 
call void @llvm.lifetime.start.p0(ptr [[I7]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB3]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br label %[[OMP_DISPATCH_COND:.*]] +// CHECK1: [[OMP_DISPATCH_COND]]: +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]] -// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP19:%.*]] 
= load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP8]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK1: [[COND_TRUE]]: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[COND_END:.*]] +// CHECK1: [[COND_FALSE]]: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[COND_END]] +// CHECK1: [[COND_END]]: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], %[[COND_TRUE]] ], [ [[TMP16]], %[[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP19]], 1 // CHECK1-NEXT: [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]] -// CHECK1-NEXT: br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]] -// CHECK1: omp.dispatch.cleanup: -// CHECK1-NEXT: br label [[OMP_DISPATCH_END:%.*]] -// CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP10]], label %[[OMP_DISPATCH_BODY:.*]], label %[[OMP_DISPATCH_CLEANUP:.*]] +// CHECK1: [[OMP_DISPATCH_CLEANUP]]: +// CHECK1-NEXT: br label %[[OMP_DISPATCH_END:.*]] +// CHECK1: [[OMP_DISPATCH_BODY]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK1: [[OMP_INNER_FOR_COND]]: +// CHECK1-NEXT: [[TMP20:%.*]] = load 
i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP21]], 1 // CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]] -// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] -// CHECK1: omp.inner.for.cond.cleanup: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br i1 [[CMP12]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_COND_CLEANUP:.*]] +// CHECK1: [[OMP_INNER_FOR_COND_CLEANUP]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_END:.*]] +// CHECK1: [[OMP_INNER_FOR_BODY]]: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP23]], 1 // CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]] -// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP14]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP15]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP24]] to double -// CHECK1-NEXT: store double [[CONV]], ptr [[REF_TMP15]], align 8, !tbaa [[TBAA36]] +// CHECK1-NEXT: store double [[CONV]], ptr [[REF_TMP15]], align 8, !tbaa [[DOUBLE_TBAA29]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr 
[[REF_TMP16]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP25]] to double -// CHECK1-NEXT: store double [[CONV17]], ptr [[REF_TMP16]], align 8, !tbaa [[TBAA36]] +// CHECK1-NEXT: store double [[CONV17]], ptr [[REF_TMP16]], align 8, !tbaa [[DOUBLE_TBAA29]] // CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP15]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR11]] // CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR11]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP16]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP15]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP14]]) #[[ATTR4]] -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK1: [[OMP_BODY_CONTINUE]]: +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK1: [[OMP_INNER_FOR_INC]]: +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP26]], 1 -// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr 
[[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK1: [[OMP_INNER_FOR_END]]: +// CHECK1-NEXT: br label %[[OMP_DISPATCH_INC:.*]] +// CHECK1: [[OMP_DISPATCH_INC]]: +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]] -// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]] -// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA15]] -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: br label %[[OMP_DISPATCH_COND]] +// CHECK1: [[OMP_DISPATCH_END]]: // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]]) // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr 
[[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 // CHECK1-NEXT: store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i64 16, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2) // CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP34]], 1 -// CHECK1-NEXT: br i1 [[TMP35]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: br i1 [[TMP35]], [[DOTOMP_REDUCTION_THEN:label %.*]], [[DOTOMP_REDUCTION_DONE:label %.*]] +// CHECK1: [[_OMP_REDUCTION_THEN:.*:]] // CHECK1-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[TMP2]], ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR11]] -// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: br [[DOTOMP_REDUCTION_DONE]] +// CHECK1: [[_OMP_REDUCTION_DONE:.*:]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[I7]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[PARTIAL_SUM5]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: +// CHECK1-NEXT: br label %[[OMP_PRECOND_END]] +// CHECK1: [[OMP_PRECOND_END]]: // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]] @@ -845,32 +845,32 @@ 
void test() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIdEpLIdEERS0_RKS_IT_E -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E( +// CHECK1-SAME: ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA38]] -// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA38]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]] +// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA38]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]], !nonnull [[META22]], !align [[META35]] // CHECK1-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(ptr nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR11]] // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds nuw %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA42:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[DOUBLE_TBAA36:![0-9]+]] // CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]] -// CHECK1-NEXT: store double [[ADD]], ptr [[__RE_]], align 8, !tbaa [[TBAA42]] -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, 
ptr [[__C_ADDR]], align 8, !tbaa [[TBAA38]] +// CHECK1-NEXT: store double [[ADD]], ptr [[__RE_]], align 8, !tbaa [[DOUBLE_TBAA36]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]], !nonnull [[META22]], !align [[META35]] // CHECK1-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(ptr nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR11]] // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds nuw %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA44:![0-9]+]] +// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[DOUBLE_TBAA38:![0-9]+]] // CHECK1-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]] -// CHECK1-NEXT: store double [[ADD3]], ptr [[__IM_]], align 8, !tbaa [[TBAA44]] +// CHECK1-NEXT: store double [[ADD3]], ptr [[__IM_]], align 8, !tbaa [[DOUBLE_TBAA38]] // CHECK1-NEXT: ret ptr [[THIS1]] // // -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func1 -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @_omp_reduction_shuffle_and_reduce_func1( +// CHECK1-SAME: ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: [[ENTRY:.*]]: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 @@ -889,17 +889,17 @@ void test() { // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr %"class.std::complex.0", ptr [[TMP9]], i64 1 -// 
CHECK1-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]] -// CHECK1: .shuffle.pre_cond: -// CHECK1-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ] -// CHECK1-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ] +// CHECK1-NEXT: br [[DOTSHUFFLE_PRE_COND:label %.*]] +// CHECK1: [[_SHUFFLE_PRE_COND:.*:]] +// CHECK1-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DOTSHUFFLE_THEN:.*]] ] +// CHECK1-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT]], %[[ENTRY]] ], [ [[TMP24:%.*]], %[[DOTSHUFFLE_THEN]] ] // CHECK1-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64 // CHECK1-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64 // CHECK1-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]] // CHECK1-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7 -// CHECK1-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]] -// CHECK1: .shuffle.then: +// CHECK1-NEXT: br i1 [[TMP18]], label %[[DOTSHUFFLE_THEN]], [[DOTSHUFFLE_EXIT:label %.*]] +// CHECK1: [[_SHUFFLE_THEN:.*:]] // CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size() // CHECK1-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 @@ -907,8 +907,8 @@ void test() { // CHECK1-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 8 // CHECK1-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1 // CHECK1-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1 -// CHECK1-NEXT: br label [[DOTSHUFFLE_PRE_COND]] -// CHECK1: .shuffle.exit: +// CHECK1-NEXT: br [[DOTSHUFFLE_PRE_COND]] +// CHECK1: [[_SHUFFLE_EXIT:.*:]] // CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0 // CHECK1-NEXT: [[TMP26:%.*]] = icmp eq 
i16 [[TMP7]], 1 @@ -922,33 +922,33 @@ void test() { // CHECK1-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]] // CHECK1-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]] // CHECK1-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]] -// CHECK1-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: +// CHECK1-NEXT: br i1 [[TMP36]], label %[[THEN:.*]], label %[[ELSE:.*]] +// CHECK1: [[THEN]]: // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]] -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: +// CHECK1-NEXT: br label %[[IFCONT:.*]] +// CHECK1: [[ELSE]]: +// CHECK1-NEXT: br label %[[IFCONT]] +// CHECK1: [[IFCONT]]: // CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1 // CHECK1-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]] // CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK1-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: +// CHECK1-NEXT: br i1 [[TMP39]], label %[[THEN4:.*]], label %[[ELSE5:.*]] +// CHECK1: [[THEN4]]: // CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 // CHECK1-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8 // CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0 // CHECK1-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8 // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP43]], ptr align 8 [[TMP41]], i64 16, i1 false) -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: +// CHECK1-NEXT: br label %[[IFCONT6:.*]] +// CHECK1: [[ELSE5]]: +// CHECK1-NEXT: br label %[[IFCONT6]] +// CHECK1: [[IFCONT6]]: // CHECK1-NEXT: ret void // // -// 
CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func2 -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @_omp_reduction_inter_warp_copy_func2( +// CHECK1-SAME: ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 @@ -961,154 +961,189 @@ void test() { // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP4]], 5 // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: br label [[PRECOND:%.*]] -// CHECK1: precond: +// CHECK1-NEXT: br label %[[PRECOND:.*]] +// CHECK1: [[PRECOND]]: // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4 // CHECK1-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 4 -// CHECK1-NEXT: br i1 [[TMP7]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK1: body: +// CHECK1-NEXT: br i1 [[TMP7]], label %[[BODY:.*]], label %[[EXIT:.*]] +// CHECK1: [[BODY]]: // CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM]]) // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label %[[THEN:.*]], label %[[ELSE:.*]] +// CHECK1: [[THEN]]: // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 [[TMP6]] // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) 
@__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] // CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 // CHECK1-NEXT: store volatile i32 [[TMP12]], ptr addrspace(3) [[TMP11]], align 4 -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: +// CHECK1-NEXT: br label %[[IFCONT:.*]] +// CHECK1: [[ELSE]]: +// CHECK1-NEXT: br label %[[IFCONT]] +// CHECK1: [[IFCONT]]: // CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) // CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP13]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN3:%.*]], label [[ELSE4:%.*]] -// CHECK1: then3: +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label %[[THEN3:.*]], label %[[ELSE4:.*]] +// CHECK1: [[THEN3]]: // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP2]] // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0 // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP6]] // CHECK1-NEXT: [[TMP18:%.*]] = load volatile i32, ptr addrspace(3) [[TMP14]], align 4 // CHECK1-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4 -// CHECK1-NEXT: br label [[IFCONT5:%.*]] -// CHECK1: else4: -// CHECK1-NEXT: br label [[IFCONT5]] -// CHECK1: ifcont5: +// CHECK1-NEXT: br label %[[IFCONT5:.*]] +// CHECK1: [[ELSE4]]: +// CHECK1-NEXT: br label %[[IFCONT5]] +// CHECK1: [[IFCONT5]]: // CHECK1-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK1-NEXT: store i32 [[TMP19]], ptr [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: br label [[PRECOND]] -// CHECK1: exit: +// CHECK1-NEXT: br 
label %[[PRECOND]] +// CHECK1: [[EXIT]]: // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8]] { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper( +// CHECK1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8]] { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA30]] -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA15]] +// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[SHORT_TBAA27]] +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[INT_TBAA10]] // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA32]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA6]] // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1 -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA32]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA6]] // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA45:![0-9]+]] +// 
CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[ANYPTR_TBAA6]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIfEC2ERKfS2_ -// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr nonnull align 4 dereferenceable(4) [[__RE:%.*]], ptr nonnull align 4 dereferenceable(4) [[__IM:%.*]]) unnamed_addr #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden void @_ZNSt7complexIfEC2ERKfS2_( +// CHECK1-SAME: ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr nonnull align 4 dereferenceable(4) [[__RE:%.*]], ptr nonnull align 4 dereferenceable(4) [[__IM:%.*]]) unnamed_addr #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]] -// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA25]] -// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA25]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]] +// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[FLOATPTR_TBAA20]] +// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[FLOATPTR_TBAA20]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds nuw %"class.std::complex", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA25]] -// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa 
[[TBAA19]] -// CHECK1-NEXT: store float [[TMP1]], ptr [[__RE_]], align 4, !tbaa [[TBAA27]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[FLOATPTR_TBAA20]], !nonnull [[META22]], !align [[META23]] +// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[FLOAT_TBAA14]] +// CHECK1-NEXT: store float [[TMP1]], ptr [[__RE_]], align 4, !tbaa [[FLOAT_TBAA24]] // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds nuw %"class.std::complex", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA25]] -// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4, !tbaa [[TBAA19]] -// CHECK1-NEXT: store float [[TMP3]], ptr [[__IM_]], align 4, !tbaa [[TBAA29]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[FLOATPTR_TBAA20]], !nonnull [[META22]], !align [[META23]] +// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4, !tbaa [[FLOAT_TBAA14]] +// CHECK1-NEXT: store float [[TMP3]], ptr [[__IM_]], align 4, !tbaa [[FLOAT_TBAA26]] // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIfE4realEv -// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden float @_ZNKSt7complexIfE4realEv( +// CHECK1-SAME: ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds nuw %"class.std::complex", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa 
[[TBAA27]] +// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[FLOAT_TBAA24]] // CHECK1-NEXT: ret float [[TMP0]] // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIfE4imagEv -// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden float @_ZNKSt7complexIfE4imagEv( +// CHECK1-SAME: ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA23]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds nuw %"class.std::complex", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA29]] +// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[FLOAT_TBAA26]] // CHECK1-NEXT: ret float [[TMP0]] // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIdEC2ERKdS2_ -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr nonnull align 8 dereferenceable(8) [[__RE:%.*]], ptr nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden void @_ZNSt7complexIdEC2ERKdS2_( +// CHECK1-SAME: ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr nonnull align 8 dereferenceable(8) [[__RE:%.*]], ptr nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr 
[[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA38]] -// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA40]] -// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA40]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]] +// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[DOUBLEPTR_TBAA33]] +// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[DOUBLEPTR_TBAA33]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds nuw %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA40]] -// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA36]] -// CHECK1-NEXT: store double [[TMP1]], ptr [[__RE_]], align 8, !tbaa [[TBAA42]] +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[DOUBLEPTR_TBAA33]], !nonnull [[META22]], !align [[META35]] +// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[DOUBLE_TBAA29]] +// CHECK1-NEXT: store double [[TMP1]], ptr [[__RE_]], align 8, !tbaa [[DOUBLE_TBAA36]] // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds nuw %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA40]] -// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8, !tbaa [[TBAA36]] -// CHECK1-NEXT: store double [[TMP3]], ptr [[__IM_]], align 8, !tbaa [[TBAA44]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[DOUBLEPTR_TBAA33]], !nonnull [[META22]], !align [[META35]] +// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8, !tbaa [[DOUBLE_TBAA29]] +// CHECK1-NEXT: store double [[TMP3]], ptr [[__IM_]], align 8, !tbaa [[DOUBLE_TBAA38]] // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define 
{{[^@]+}}@_ZNKSt7complexIdE4realEv -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden double @_ZNKSt7complexIdE4realEv( +// CHECK1-SAME: ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA38]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds nuw %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA42]] +// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[DOUBLE_TBAA36]] // CHECK1-NEXT: ret double [[TMP0]] // // -// CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4imagEv -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 { -// CHECK1-NEXT: entry: +// CHECK1-LABEL: define linkonce_odr hidden double @_ZNKSt7complexIdE4imagEv( +// CHECK1-SAME: ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 { +// CHECK1-NEXT: [[ENTRY:.*:]] // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA38]] +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]] // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds nuw %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA44]] +// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa 
[[DOUBLE_TBAA38]] // CHECK1-NEXT: ret double [[TMP0]] // +//. +// CHECK1: [[ANYPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK1: [[META7]] = !{!"any pointer", [[META8:![0-9]+]], i64 0} +// CHECK1: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK1: [[META9]] = !{!"Simple C++ TBAA"} +// CHECK1: [[INT_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0} +// CHECK1: [[META11]] = !{!"int", [[META8]], i64 0} +// CHECK1: [[INTPTR_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// CHECK1: [[META13]] = !{!"p1 int", [[META7]], i64 0} +// CHECK1: [[FLOAT_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} +// CHECK1: [[META15]] = !{!"float", [[META8]], i64 0} +// CHECK1: [[ANYPTR_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// CHECK1: [[META17]] = !{!"any p2 pointer", [[META7]], i64 0} +// CHECK1: [[_ZTSST7COMPLEXIFEPTR_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +// CHECK1: [[META19]] = !{!"p1 _ZTSSt7complexIfE", [[META7]], i64 0} +// CHECK1: [[FLOATPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +// CHECK1: [[META21]] = !{!"p1 float", [[META7]], i64 0} +// CHECK1: [[META22]] = !{} +// CHECK1: [[META23]] = !{i64 4} +// CHECK1: [[FLOAT_TBAA24]] = !{[[META25:![0-9]+]], [[META15]], i64 0} +// CHECK1: [[META25]] = !{!"_ZTSSt7complexIfE", [[META15]], i64 0, [[META15]], i64 4} +// CHECK1: [[FLOAT_TBAA26]] = !{[[META25]], [[META15]], i64 4} +// CHECK1: [[SHORT_TBAA27]] = !{[[META28:![0-9]+]], [[META28]], i64 0} +// CHECK1: [[META28]] = !{!"short", [[META8]], i64 0} +// CHECK1: [[DOUBLE_TBAA29]] = !{[[META30:![0-9]+]], [[META30]], i64 0} +// CHECK1: [[META30]] = !{!"double", [[META8]], i64 0} +// CHECK1: [[_ZTSST7COMPLEXIDEPTR_TBAA31]] = !{[[META32:![0-9]+]], [[META32]], i64 0} +// CHECK1: [[META32]] = !{!"p1 _ZTSSt7complexIdE", [[META7]], i64 0} +// CHECK1: [[DOUBLEPTR_TBAA33]] = !{[[META34:![0-9]+]], [[META34]], i64 0} +// CHECK1: [[META34]] = !{!"p1 double", [[META7]], i64 0} +// CHECK1: [[META35]] = !{i64 8} +// 
CHECK1: [[DOUBLE_TBAA36]] = !{[[META37:![0-9]+]], [[META30]], i64 0} +// CHECK1: [[META37]] = !{!"_ZTSSt7complexIdE", [[META30]], i64 0, [[META30]], i64 8} +// CHECK1: [[DOUBLE_TBAA38]] = !{[[META37]], [[META30]], i64 8} +//. diff --git a/clang/test/OpenMP/parallel_ast_print.cpp b/clang/test/OpenMP/parallel_ast_print.cpp index 15439ea31215a..28dc611bf864d 100644 --- a/clang/test/OpenMP/parallel_ast_print.cpp +++ b/clang/test/OpenMP/parallel_ast_print.cpp @@ -21,11 +21,19 @@ // RUN: %clang_cc1 -DOMP60 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -ast-print %s | FileCheck -check-prefixes=CHECK,OMP60 %s // RUN: %clang_cc1 -DOMP60 -fopenmp-simd -fopenmp-version=60 -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -DOMP60 -fopenmp-simd -fopenmp-version=60 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck -check-prefixes=CHECK,OMP60 %s +// RUN: %clang_cc1 -DOMP60 -fopenmp-simd -fopenmp-version=60 -std=c++11 -verify -Wno-vla %s -ast-dump | FileCheck -check-prefixes=OMP60_DUMP %s // expected-no-diagnostics #ifndef HEADER #define HEADER +#ifdef OMP60 +int global; +int global2; + +void bar(int j) { }; +#endif + void foo() {} struct S1 { @@ -185,6 +193,32 @@ T tmain(T argc, T *argv) { return 0; } + +#ifdef OMP60 +// OMP60_DUMP: FunctionDecl {{.*}}mainVC {{.*}} +// OMP60_DUMP: OMPParallelDirective {{.*}} +// OMP60_DUMP-NEXT: OMPSharedClause{{.*}} +// OMP60_DUMP-NEXT: {{.*}}DeclRefExpr{{.*}} 'global' 'int'{{.*}} +// OMP60_DUMP-NEXT: OMPDefaultClause {{.*}} +// OMP60_DUMP-NEXT: OMPFirstprivateClause{{.*}} +// OMP60_DUMP-NEXT: {{.*}}DeclRefExpr{{.*}} 'h' 'int[20]'{{.*}} +// OMP60_DUMP: OMPParallelDirective {{.*}} +// OMP60_DUMP-NEXT: OMPPrivateClause{{.*}} +// OMP60_DUMP-NEXT: {{.*}}DeclRefExpr{{.*}} 'global2' 'int'{{.*}} +// OMP60_DUMP-NEXT: OMPDefaultClause {{.*}} +// OMP60_DUMP-NEXT: OMPPrivateClause {{.*}} +// OMP60_DUMP-NEXT: {{.*}}DeclRefExpr{{.*}} 'j' 'int'{{.*}} +int mainVC(int argc, int *argv) { + int h[20]; + int j; +#pragma omp parallel 
shared(global) default(firstprivate: aggregate) + bar(h[1]), h[1] = global; +#pragma omp parallel private(global2) default(private: scalar) + bar(global2), j = global2; + return 0; +} +#endif + // CHECK: template T tmain(T argc, T *argv) { // CHECK-NEXT: T b = argc, c, d, e, f, g; // CHECK-NEXT: static T a; @@ -237,6 +271,14 @@ T tmain(T argc, T *argv) { // OMP60-NEXT: #pragma omp parallel if(1) num_threads(strict: s) proc_bind(close) reduction(^: e,f,arr[0:1][:argc]) reduction(default, &&: g) reduction(task, +: argc) message("msg") severity(warning) // OMP60-NEXT: foo() +// OMP60: int mainVC(int argc, int *argv) { +// OMP60-NEXT: int h[20]; +// OMP60-NEXT: int j; +// OMP60-NEXT: #pragma omp parallel shared(global) default(firstprivate:aggregate) +// OMP60-NEXT: bar(h[1]) , h[1] = global; +// OMP60-NEXT: #pragma omp parallel private(global2) default(private:scalar) +// OMP60-NEXT: bar(global2) , j = global2; + enum Enum { }; int main (int argc, char **argv) { diff --git a/clang/test/OpenMP/parallel_default_messages.cpp b/clang/test/OpenMP/parallel_default_messages.cpp index 37d3b5565f83c..842b1ac5a96b8 100644 --- a/clang/test/OpenMP/parallel_default_messages.cpp +++ b/clang/test/OpenMP/parallel_default_messages.cpp @@ -6,6 +6,7 @@ // RUN: %clang_cc1 -verify -fopenmp-version=30 -fopenmp -ferror-limit 100 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,ge40 -fopenmp -DOMP51 -ferror-limit 100 -o - %s -Wuninitialized // RUN: %clang_cc1 -verify=expected,ge40 -fopenmp-simd -DOMP51 -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,ge40 -fopenmp-version=60 -fopenmp -DOMP60 -ferror-limit 100 -o - %s -Wuninitialized void foo(); @@ -47,6 +48,23 @@ int main(int argc, char **argv) { } #endif +#ifdef OMP60 +#pragma omp parallel default(shared:) private(x,y) // expected-error {{wrong variable category specified with modifier shared in the default clause}} + { + ++x; + ++y; + } +#pragma omp parallel default(shared: junk) private(x,y) // 
expected-error {{wrong variable category specified with modifier shared in the default clause}} + { + ++x; + ++y; + } +#pragma omp parallel default(firstprivate: junk) private(x,y) // expected-error {{wrong variable category specified with modifier firstprivate in the default clause}} + { + ++x; + ++y; + } +#endif return 0; } diff --git a/clang/test/OpenMP/parallel_if_codegen_PR51349.cpp b/clang/test/OpenMP/parallel_if_codegen_PR51349.cpp index 1c6a56239204c..268b39087f4bd 100644 --- a/clang/test/OpenMP/parallel_if_codegen_PR51349.cpp +++ b/clang/test/OpenMP/parallel_if_codegen_PR51349.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --include-generated-funcs +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --include-generated-funcs --version 6 // RUN: %clang_cc1 -x c++ -O1 -fopenmp-version=45 -disable-llvm-optzns -verify -fopenmp -triple x86_64-unknown-linux -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK // RUN: %clang_cc1 -x c++ -O1 -fopenmp-version=45 -disable-llvm-optzns -verify -fopenmp -triple x86_64-unknown-linux -emit-llvm -fno-inline %s -o - | FileCheck %s --check-prefix=CHECK-NOINLINE // expected-no-diagnostics @@ -15,14 +15,14 @@ void foo() { #endif // CHECK: Function Attrs: mustprogress nounwind -// CHECK-LABEL: define {{[^@]+}}@_Z3foov -// CHECK-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @_Z3foov( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) // CHECK-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa 
[[TBAA3:![0-9]+]] +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] // CHECK-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 // CHECK-NEXT: call void @_Z3foov.omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTBOUND_ZERO_ADDR]]) #[[ATTR2:[0-9]+]] // CHECK-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB1]], i32 [[TMP0]]) @@ -31,36 +31,36 @@ void foo() { // // // CHECK: Function Attrs: noinline norecurse nounwind -// CHECK-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined -// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define internal void @_Z3foov.omp_outlined( +// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA7:![0-9]+]] -// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA7:![0-9]+]] +// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA7]] // CHECK-NEXT: ret void // // // CHECK: Function Attrs: alwaysinline norecurse nounwind -// CHECK-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined.1 -// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] { -// CHECK-NEXT: entry: +// CHECK-LABEL: define internal void @_Z3foov.omp_outlined.1( +// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA7]] -// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA7]] +// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA7]] // CHECK-NEXT: ret void // // // CHECK-NOINLINE: Function Attrs: mustprogress noinline nounwind -// CHECK-NOINLINE-LABEL: define {{[^@]+}}@_Z3foov -// CHECK-NOINLINE-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK-NOINLINE-NEXT: entry: +// CHECK-NOINLINE-LABEL: define dso_local void @_Z3foov( +// CHECK-NOINLINE-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NOINLINE-NEXT: [[ENTRY:.*:]] // CHECK-NOINLINE-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK-NOINLINE-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK-NOINLINE-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) // CHECK-NOINLINE-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK-NOINLINE-NEXT: store i32 [[TMP0]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA3:![0-9]+]] +// CHECK-NOINLINE-NEXT: store i32 [[TMP0]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] // CHECK-NOINLINE-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 // CHECK-NOINLINE-NEXT: call void @_Z3foov.omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTBOUND_ZERO_ADDR]]) #[[ATTR2:[0-9]+]] // CHECK-NOINLINE-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB1]], i32 [[TMP0]]) @@ -69,23 +69,40 @@ void foo() { // // // CHECK-NOINLINE: Function Attrs: noinline norecurse nounwind -// CHECK-NOINLINE-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined -// CHECK-NOINLINE-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef 
[[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK-NOINLINE-NEXT: entry: +// CHECK-NOINLINE-LABEL: define internal void @_Z3foov.omp_outlined( +// CHECK-NOINLINE-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NOINLINE-NEXT: [[ENTRY:.*:]] // CHECK-NOINLINE-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NOINLINE-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NOINLINE-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA7:![0-9]+]] -// CHECK-NOINLINE-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA7]] +// CHECK-NOINLINE-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA7:![0-9]+]] +// CHECK-NOINLINE-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA7]] // CHECK-NOINLINE-NEXT: ret void // // // CHECK-NOINLINE: Function Attrs: alwaysinline norecurse nounwind -// CHECK-NOINLINE-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined.1 -// CHECK-NOINLINE-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] { -// CHECK-NOINLINE-NEXT: entry: +// CHECK-NOINLINE-LABEL: define internal void @_Z3foov.omp_outlined.1( +// CHECK-NOINLINE-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NOINLINE-NEXT: [[ENTRY:.*:]] // CHECK-NOINLINE-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NOINLINE-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NOINLINE-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA7]] -// CHECK-NOINLINE-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA7]] +// CHECK-NOINLINE-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA7]] +// 
CHECK-NOINLINE-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[INTPTR_TBAA7]] // CHECK-NOINLINE-NEXT: ret void // +//. +// CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C++ TBAA"} +// CHECK: [[INTPTR_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +// CHECK: [[META8]] = !{!"p1 int", [[META9:![0-9]+]], i64 0} +// CHECK: [[META9]] = !{!"any pointer", [[META5]], i64 0} +//. +// CHECK-NOINLINE: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK-NOINLINE: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +// CHECK-NOINLINE: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK-NOINLINE: [[META6]] = !{!"Simple C++ TBAA"} +// CHECK-NOINLINE: [[INTPTR_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +// CHECK-NOINLINE: [[META8]] = !{!"p1 int", [[META9:![0-9]+]], i64 0} +// CHECK-NOINLINE: [[META9]] = !{!"any pointer", [[META5]], i64 0} +//. 
diff --git a/clang/test/OpenMP/spirv_locstr.cpp b/clang/test/OpenMP/spirv_locstr.cpp index 20d9c9d2f7393..80f05029dd8a4 100644 --- a/clang/test/OpenMP/spirv_locstr.cpp +++ b/clang/test/OpenMP/spirv_locstr.cpp @@ -4,7 +4,7 @@ // expected-no-diagnostics // CHECK: @[[#LOC:]] = private unnamed_addr addrspace(1) constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 -// CHECK: = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 {{.*}}, ptr addrspacecast (ptr addrspace(1) @[[#LOC]] to ptr) }, align 8 +// CHECK: = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 {{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @[[#LOC]] to ptr addrspace(4)) }, align 8 int main() { int ret = 0; diff --git a/clang/test/OpenMP/target_defaultmap_codegen_01.cpp b/clang/test/OpenMP/target_defaultmap_codegen_01.cpp index ce862632dd940..0936aa08e21e7 100644 --- a/clang/test/OpenMP/target_defaultmap_codegen_01.cpp +++ b/clang/test/OpenMP/target_defaultmap_codegen_01.cpp @@ -6,6 +6,8 @@ ///==========================================================================/// // RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -Wno-vla -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 +// Verify implicit-behavior 'storage' as alias for 'alloc' in OpenMP 6.0 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -DOMP6 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 // RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s // RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 // RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -Wno-vla -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 @@ -43,8 +45,12 @@ void implicit_maps_double_complex (int a){ // CK1-DAG: store ptr [[PTR]], ptr [[P1]] // CK1: call void [[KERNEL:@.+]](ptr [[PTR]]) -#pragma omp target defaultmap(alloc \ - : scalar) +#ifdef OMP6 +// 'storage' is an alias for 'alloc' in OpenMP 6.0 +#pragma omp target defaultmap(storage : scalar) +#else +#pragma omp target defaultmap(alloc : scalar) +#endif // OMP6 { dc *= dc; } @@ -235,6 +241,8 @@ void implicit_maps_double (int a){ #endif ///==========================================================================/// // RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -verify -Wno-vla -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 +// Verify implicit-behavior 'alloc' still accepted in OpenMP 6.0 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 // RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s // RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s 
--check-prefix CK5 // RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -verify -Wno-vla -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 diff --git a/clang/test/OpenMP/target_defaultmap_codegen_03.cpp b/clang/test/OpenMP/target_defaultmap_codegen_03.cpp new file mode 100644 index 0000000000000..05a144e576e38 --- /dev/null +++ b/clang/test/OpenMP/target_defaultmap_codegen_03.cpp @@ -0,0 +1,764 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +///==========================================================================/// +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 
-fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1-32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY1-32 %s +#ifdef CK1 +void foo1(int a){ + double d = 
(double)a; + + #pragma omp target defaultmap(private : scalar) + { + d += 1.0; + } +} +#endif + +///==========================================================================/// +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY2-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis 
-DCK2 -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY2-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY2-32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY2-32 %s + +#ifdef CK2 +void foo2(){ + int pvtArr[10]; + + #pragma omp target defaultmap(private : aggregate) + { + pvtArr[5]++; + } +} +#endif + +///==========================================================================/// +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY3-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY3-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY3-32 %s 
+// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY3-32 %s +#ifdef CK3 +void foo3(){ + int *pa; + + #pragma omp target defaultmap(private : pointer) + { + pa[50]++; + } +} +#endif + +///==========================================================================/// +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY4-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY4-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY4-32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY4-32 %s + +// Specified variable-category doesn't apply to referenced variable, so +// normal implicitly determined data-sharing applies. 
+#ifdef CK4 +void foo4(){ + int p; + + #pragma omp target defaultmap(private : pointer) + { + p++; + } +} +#endif + +#endif // HEADER +// CK1-64-LABEL: define dso_local void @_Z4foo1i( +// CK1-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CK1-64-NEXT: [[ENTRY:.*:]] +// CK1-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK1-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK1-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK1-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK1-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK1-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK1-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK1-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK1-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK1-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK1-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK1-64-NEXT: store i64 [[TMP2]], ptr [[TMP3]], align 8 +// CK1-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK1-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK1-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK1-64-NEXT: store ptr null, ptr [[TMP5]], align 8 +// CK1-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS]], i32 0, i32 0 +// CK1-64-NEXT: store i32 3, ptr [[TMP8]], align 4 +// CK1-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK1-64-NEXT: store i32 1, ptr [[TMP9]], align 4 +// CK1-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK1-64-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CK1-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK1-64-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 +// CK1-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK1-64-NEXT: store ptr @.offload_sizes, ptr [[TMP12]], align 8 +// CK1-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK1-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP13]], align 8 +// CK1-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK1-64-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CK1-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK1-64-NEXT: store ptr null, ptr [[TMP15]], align 8 +// CK1-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK1-64-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK1-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK1-64-NEXT: store i64 0, ptr [[TMP17]], align 8 +// CK1-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP18]], align 4 +// CK1-64-NEXT: 
[[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK1-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4 +// CK1-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK1-64-NEXT: store i32 0, ptr [[TMP20]], align 4 +// CK1-64-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l24.region_id, ptr [[KERNEL_ARGS]]) +// CK1-64-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// CK1-64-NEXT: br i1 [[TMP22]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK1-64: [[OMP_OFFLOAD_FAILED]]: +// CK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l24(i64 [[TMP2]]) #[[ATTR2:[0-9]+]] +// CK1-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK1-64: [[OMP_OFFLOAD_CONT]]: +// CK1-64-NEXT: ret void +// +// +// CK1-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l24( +// CK1-64-SAME: i64 [[D:%.*]]) #[[ATTR1:[0-9]+]] { +// CK1-64-NEXT: [[ENTRY:.*:]] +// CK1-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK1-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK1-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load double, ptr [[D1]], align 8 +// CK1-64-NEXT: [[ADD:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// CK1-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK1-64-NEXT: ret void +// +// +// CK1-32-LABEL: define dso_local void @_Z4foo1i( +// CK1-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CK1-32-NEXT: [[ENTRY:.*:]] +// CK1-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK1-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK1-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x 
ptr], align 4 +// CK1-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK1-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK1-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK1-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK1-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[D]], ptr [[TMP1]], align 4 +// CK1-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK1-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK1-32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK1-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK1-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK1-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK1-32-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK1-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK1-32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK1-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK1-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK1-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK1-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK1-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK1-32-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK1-32-NEXT: 
[[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK1-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK1-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK1-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK1-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK1-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK1-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK1-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK1-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK1-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK1-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK1-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK1-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK1-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK1-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK1-32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK1-32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l24.region_id, ptr [[KERNEL_ARGS]]) +// CK1-32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK1-32-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK1-32: [[OMP_OFFLOAD_FAILED]]: +// CK1-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l24(ptr [[D]]) 
#[[ATTR2:[0-9]+]] +// CK1-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK1-32: [[OMP_OFFLOAD_CONT]]: +// CK1-32-NEXT: ret void +// +// +// CK1-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l24( +// CK1-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]]) #[[ATTR1:[0-9]+]] { +// CK1-32-NEXT: [[ENTRY:.*:]] +// CK1-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK1-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]] +// CK1-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// CK1-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK1-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK1-32-NEXT: ret void +// +// +// SIMD-ONLY1-64-LABEL: define dso_local void @_Z4foo1i( +// SIMD-ONLY1-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY1-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY1-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY1-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY1-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY1-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY1-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY1-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY1-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY1-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY1-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY1-64-NEXT: ret void +// +// +// SIMD-ONLY1-32-LABEL: define dso_local void @_Z4foo1i( +// SIMD-ONLY1-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY1-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY1-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-32-NEXT: [[D:%.*]] = alloca double, align 8 
+// SIMD-ONLY1-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY1-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY1-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY1-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY1-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY1-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY1-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY1-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY1-32-NEXT: ret void +// +// +// CK2-64-LABEL: define dso_local void @_Z4foo2v( +// CK2-64-SAME: ) #[[ATTR0:[0-9]+]] { +// CK2-64-NEXT: [[ENTRY:.*:]] +// CK2-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK2-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK2-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK2-64-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK2-64-NEXT: store ptr [[PVTARR]], ptr [[TMP0]], align 8 +// CK2-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK2-64-NEXT: store ptr [[PVTARR]], ptr [[TMP1]], align 8 +// CK2-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK2-64-NEXT: store ptr null, ptr [[TMP2]], align 8 +// CK2-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK2-64-NEXT: store i32 3, ptr [[TMP5]], align 4 +// CK2-64-NEXT: [[TMP6:%.*]] = 
getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK2-64-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK2-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK2-64-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8 +// CK2-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK2-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK2-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK2-64-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8 +// CK2-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK2-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8 +// CK2-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK2-64-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CK2-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK2-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK2-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK2-64-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK2-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK2-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK2-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK2-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK2-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// 
CK2-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK2-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK2-64-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK2-64-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l50.region_id, ptr [[KERNEL_ARGS]]) +// CK2-64-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK2-64-NEXT: br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK2-64: [[OMP_OFFLOAD_FAILED]]: +// CK2-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l50(ptr [[PVTARR]]) #[[ATTR2:[0-9]+]] +// CK2-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK2-64: [[OMP_OFFLOAD_CONT]]: +// CK2-64-NEXT: ret void +// +// +// CK2-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l50( +// CK2-64-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1:[0-9]+]] { +// CK2-64-NEXT: [[ENTRY:.*:]] +// CK2-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// CK2-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK2-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] +// CK2-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5 +// CK2-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK2-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK2-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK2-64-NEXT: ret void +// +// +// CK2-32-LABEL: define dso_local void @_Z4foo2v( +// CK2-32-SAME: ) #[[ATTR0:[0-9]+]] { +// CK2-32-NEXT: [[ENTRY:.*:]] +// CK2-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// 
CK2-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK2-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK2-32-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK2-32-NEXT: store ptr [[PVTARR]], ptr [[TMP0]], align 4 +// CK2-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK2-32-NEXT: store ptr [[PVTARR]], ptr [[TMP1]], align 4 +// CK2-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK2-32-NEXT: store ptr null, ptr [[TMP2]], align 4 +// CK2-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK2-32-NEXT: store i32 3, ptr [[TMP5]], align 4 +// CK2-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK2-32-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK2-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK2-32-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4 +// CK2-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK2-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK2-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK2-32-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 4 +// CK2-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// 
CK2-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 4 +// CK2-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK2-32-NEXT: store ptr null, ptr [[TMP11]], align 4 +// CK2-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK2-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK2-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK2-32-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK2-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK2-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK2-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK2-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK2-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK2-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK2-32-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK2-32-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l50.region_id, ptr [[KERNEL_ARGS]]) +// CK2-32-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK2-32-NEXT: br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK2-32: [[OMP_OFFLOAD_FAILED]]: +// CK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l50(ptr [[PVTARR]]) #[[ATTR2:[0-9]+]] +// CK2-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK2-32: [[OMP_OFFLOAD_CONT]]: +// 
CK2-32-NEXT: ret void +// +// +// CK2-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l50( +// CK2-32-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1:[0-9]+]] { +// CK2-32-NEXT: [[ENTRY:.*:]] +// CK2-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// CK2-32-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4 +// CK2-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]] +// CK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5 +// CK2-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK2-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK2-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK2-32-NEXT: ret void +// +// +// SIMD-ONLY2-64-LABEL: define dso_local void @_Z4foo2v( +// SIMD-ONLY2-64-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY2-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY2-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY2-64-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY2-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5 +// SIMD-ONLY2-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY2-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY2-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY2-64-NEXT: ret void +// +// +// SIMD-ONLY2-32-LABEL: define dso_local void @_Z4foo2v( +// SIMD-ONLY2-32-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY2-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY2-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY2-32-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5 +// SIMD-ONLY2-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// 
SIMD-ONLY2-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY2-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY2-32-NEXT: ret void +// +// +// CK3-64-LABEL: define dso_local void @_Z4foo3v( +// CK3-64-SAME: ) #[[ATTR0:[0-9]+]] { +// CK3-64-NEXT: [[ENTRY:.*:]] +// CK3-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK3-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK3-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK3-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK3-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK3-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA]], align 8 +// CK3-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK3-64-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK3-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK3-64-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK3-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK3-64-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK3-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK3-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK3-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK3-64-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK3-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK3-64-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK3-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK3-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK3-64-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK3-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK3-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK3-64-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 8 +// CK3-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK3-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 8 +// CK3-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK3-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK3-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK3-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK3-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK3-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK3-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK3-64-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK3-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK3-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK3-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK3-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK3-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK3-64-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK3-64-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr 
@.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l75.region_id, ptr [[KERNEL_ARGS]]) +// CK3-64-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK3-64-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK3-64: [[OMP_OFFLOAD_FAILED]]: +// CK3-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l75(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK3-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK3-64: [[OMP_OFFLOAD_CONT]]: +// CK3-64-NEXT: ret void +// +// +// CK3-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l75( +// CK3-64-SAME: ptr [[PA:%.*]]) #[[ATTR1:[0-9]+]] { +// CK3-64-NEXT: [[ENTRY:.*:]] +// CK3-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK3-64-NEXT: [[PA1:%.*]] = alloca ptr, align 8 +// CK3-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK3-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8 +// CK3-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50 +// CK3-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK3-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK3-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK3-64-NEXT: ret void +// +// +// CK3-32-LABEL: define dso_local void @_Z4foo3v( +// CK3-32-SAME: ) #[[ATTR0:[0-9]+]] { +// CK3-32-NEXT: [[ENTRY:.*:]] +// CK3-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK3-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK3-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK3-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK3-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK3-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA]], align 4 +// CK3-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK3-32-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK3-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr 
[[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK3-32-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK3-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK3-32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK3-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK3-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK3-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK3-32-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK3-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK3-32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK3-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK3-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK3-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK3-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK3-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK3-32-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK3-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK3-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK3-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK3-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK3-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK3-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK3-32-NEXT: [[TMP14:%.*]] = 
getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK3-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK3-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK3-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK3-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK3-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK3-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK3-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK3-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK3-32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK3-32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l75.region_id, ptr [[KERNEL_ARGS]]) +// CK3-32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK3-32-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK3-32: [[OMP_OFFLOAD_FAILED]]: +// CK3-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l75(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK3-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK3-32: [[OMP_OFFLOAD_CONT]]: +// CK3-32-NEXT: ret void +// +// +// CK3-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l75( +// CK3-32-SAME: ptr [[PA:%.*]]) #[[ATTR1:[0-9]+]] { +// CK3-32-NEXT: [[ENTRY:.*:]] +// CK3-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK3-32-NEXT: [[PA1:%.*]] = alloca ptr, align 4 +// CK3-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK3-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4 +// CK3-32-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i32, ptr [[TMP0]], i32 50 +// CK3-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK3-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK3-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK3-32-NEXT: ret void +// +// +// SIMD-ONLY3-64-LABEL: define dso_local void @_Z4foo3v( +// SIMD-ONLY3-64-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY3-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY3-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY3-64-NEXT: [[PA1:%.*]] = alloca ptr, align 8 +// SIMD-ONLY3-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8 +// SIMD-ONLY3-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50 +// SIMD-ONLY3-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY3-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY3-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY3-64-NEXT: ret void +// +// +// SIMD-ONLY3-32-LABEL: define dso_local void @_Z4foo3v( +// SIMD-ONLY3-32-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY3-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY3-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY3-32-NEXT: [[PA1:%.*]] = alloca ptr, align 4 +// SIMD-ONLY3-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4 +// SIMD-ONLY3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50 +// SIMD-ONLY3-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY3-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY3-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY3-32-NEXT: ret void +// +// +// CK4-64-LABEL: define dso_local void @_Z4foo4v( +// CK4-64-SAME: ) #[[ATTR0:[0-9]+]] { +// CK4-64-NEXT: [[ENTRY:.*:]] +// CK4-64-NEXT: [[P:%.*]] = alloca i32, align 4 +// CK4-64-NEXT: [[P_CASTED:%.*]] = alloca i64, align 8 +// CK4-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK4-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK4-64-NEXT: 
[[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK4-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK4-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// CK4-64-NEXT: store i32 [[TMP0]], ptr [[P_CASTED]], align 4 +// CK4-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[P_CASTED]], align 8 +// CK4-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK4-64-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8 +// CK4-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK4-64-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8 +// CK4-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK4-64-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CK4-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK4-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK4-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK4-64-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK4-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK4-64-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK4-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK4-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK4-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK4-64-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CK4-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK4-64-NEXT: store ptr @.offload_sizes, ptr [[TMP11]], align 8 +// CK4-64-NEXT: 
[[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK4-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP12]], align 8 +// CK4-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK4-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK4-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK4-64-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CK4-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK4-64-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK4-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK4-64-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK4-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK4-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4 +// CK4-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK4-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK4-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK4-64-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK4-64-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l103.region_id, ptr [[KERNEL_ARGS]]) +// CK4-64-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK4-64-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK4-64: [[OMP_OFFLOAD_FAILED]]: +// CK4-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l103(i64 
[[TMP1]]) #[[ATTR2:[0-9]+]] +// CK4-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK4-64: [[OMP_OFFLOAD_CONT]]: +// CK4-64-NEXT: ret void +// +// +// CK4-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l103( +// CK4-64-SAME: i64 [[P:%.*]]) #[[ATTR1:[0-9]+]] { +// CK4-64-NEXT: [[ENTRY:.*:]] +// CK4-64-NEXT: [[P_ADDR:%.*]] = alloca i64, align 8 +// CK4-64-NEXT: store i64 [[P]], ptr [[P_ADDR]], align 8 +// CK4-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4 +// CK4-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// CK4-64-NEXT: store i32 [[INC]], ptr [[P_ADDR]], align 4 +// CK4-64-NEXT: ret void +// +// +// CK4-32-LABEL: define dso_local void @_Z4foo4v( +// CK4-32-SAME: ) #[[ATTR0:[0-9]+]] { +// CK4-32-NEXT: [[ENTRY:.*:]] +// CK4-32-NEXT: [[P:%.*]] = alloca i32, align 4 +// CK4-32-NEXT: [[P_CASTED:%.*]] = alloca i32, align 4 +// CK4-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK4-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK4-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK4-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK4-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// CK4-32-NEXT: store i32 [[TMP0]], ptr [[P_CASTED]], align 4 +// CK4-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[P_CASTED]], align 4 +// CK4-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK4-32-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4 +// CK4-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK4-32-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4 +// CK4-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK4-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK4-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK4-32-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK4-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK4-32-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK4-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK4-32-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK4-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK4-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK4-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK4-32-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4 +// CK4-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK4-32-NEXT: store ptr @.offload_sizes, ptr [[TMP11]], align 4 +// CK4-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK4-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP12]], align 4 +// CK4-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK4-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK4-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK4-32-NEXT: store ptr null, ptr [[TMP14]], align 4 +// CK4-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK4-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK4-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK4-32-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK4-32-NEXT: [[TMP17:%.*]] = getelementptr 
inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK4-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4 +// CK4-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK4-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK4-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK4-32-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK4-32-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l103.region_id, ptr [[KERNEL_ARGS]]) +// CK4-32-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK4-32-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK4-32: [[OMP_OFFLOAD_FAILED]]: +// CK4-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l103(i32 [[TMP1]]) #[[ATTR2:[0-9]+]] +// CK4-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK4-32: [[OMP_OFFLOAD_CONT]]: +// CK4-32-NEXT: ret void +// +// +// CK4-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l103( +// CK4-32-SAME: i32 [[P:%.*]]) #[[ATTR1:[0-9]+]] { +// CK4-32-NEXT: [[ENTRY:.*:]] +// CK4-32-NEXT: [[P_ADDR:%.*]] = alloca i32, align 4 +// CK4-32-NEXT: store i32 [[P]], ptr [[P_ADDR]], align 4 +// CK4-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4 +// CK4-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// CK4-32-NEXT: store i32 [[INC]], ptr [[P_ADDR]], align 4 +// CK4-32-NEXT: ret void +// +// +// SIMD-ONLY4-64-LABEL: define dso_local void @_Z4foo4v( +// SIMD-ONLY4-64-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY4-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY4-64-NEXT: [[P:%.*]] = alloca i32, align 4 +// SIMD-ONLY4-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// SIMD-ONLY4-64-NEXT: [[INC:%.*]] = add nsw i32 
[[TMP0]], 1 +// SIMD-ONLY4-64-NEXT: store i32 [[INC]], ptr [[P]], align 4 +// SIMD-ONLY4-64-NEXT: ret void +// +// +// SIMD-ONLY4-32-LABEL: define dso_local void @_Z4foo4v( +// SIMD-ONLY4-32-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY4-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY4-32-NEXT: [[P:%.*]] = alloca i32, align 4 +// SIMD-ONLY4-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// SIMD-ONLY4-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY4-32-NEXT: store i32 [[INC]], ptr [[P]], align 4 +// SIMD-ONLY4-32-NEXT: ret void +// +//. +// CK1-32: [[META6]] = !{} +// CK1-32: [[META7]] = !{i64 4} +//. +// CK2-64: [[META5]] = !{} +// CK2-64: [[META6]] = !{i64 4} +//. +// CK2-32: [[META6]] = !{} +// CK2-32: [[META7]] = !{i64 4} +//. diff --git a/clang/test/OpenMP/target_defaultmap_messages.cpp b/clang/test/OpenMP/target_defaultmap_messages.cpp index 88ae3b7962d55..67dfb4717e179 100644 --- a/clang/test/OpenMP/target_defaultmap_messages.cpp +++ b/clang/test/OpenMP/target_defaultmap_messages.cpp @@ -1,5 +1,8 @@ -// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized -// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized +// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=60 -verify=expected,omp-ge52,omp60 -Wuninitialized +// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=60 -verify=expected,omp-ge52,omp60 -Wuninitialized + +// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp-ge52 -Wuninitialized +// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp-ge52 -Wuninitialized // RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51 @@ -33,23 
+36,23 @@ template T tmain(T argc, S **argv) { #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}} foo(); -#pragma omp target defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} +#pragma omp target defaultmap( // omp60-error {{expected 'storage', 'from', 'to', 'tofrom', 'firstprivate', 'private', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} foo(); -#pragma omp target defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} +#pragma omp target defaultmap() // omp60-error {{expected 'storage', 'from', 'to', 'tofrom', 'firstprivate', 'private', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} foo(); 
#pragma omp target defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); - #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} + #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp-ge52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); #pragma omp target defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); #pragma omp target defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); - #pragma omp target defaultmap (scalar: // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP 
clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} + #pragma omp target defaultmap (scalar: // omp60-error {{expected 'storage', 'from', 'to', 'tofrom', 'firstprivate', 'private', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp-ge52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} foo(); #pragma omp target defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); - #pragma omp target defaultmap(tofrom:scalar) defaultmap(tofrom:scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp5x-error {{at most one defaultmap clause for each variable-category can appear on the directive}} + #pragma omp target defaultmap(tofrom:scalar) defaultmap(tofrom:scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category 
can appear on the directive}} omp5x-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp60-error {{at most one defaultmap clause for each variable-category can appear on the directive}} foo(); @@ -96,23 +99,26 @@ T tmain(T argc, S **argv) { int main(int argc, char **argv) { #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}} foo(); -#pragma omp target defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} +#pragma omp target defaultmap( // omp60-error {{expected 'storage', 'from', 'to', 'tofrom', 'firstprivate', 'private', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} foo(); -#pragma omp target defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} +#pragma omp target defaultmap() // omp60-error {{expected 'storage', 'from', 'to', 'tofrom', 'firstprivate', 'private', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 
'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} foo(); #pragma omp target defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); -#pragma omp target defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} +#pragma omp target defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp-ge52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); #pragma omp target defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); #pragma omp target defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); -#pragma omp target defaultmap(scalar: // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 
'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} +#pragma omp target defaultmap(scalar: // omp60-error {{expected 'storage', 'from', 'to', 'tofrom', 'firstprivate', 'private', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp-ge52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} foo(); #pragma omp target defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} foo(); -#pragma omp target defaultmap(tofrom: scalar) defaultmap(tofrom: scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp5x-error {{at most one defaultmap clause for each variable-category can 
appear on the directive}} +#pragma omp target defaultmap(tofrom: scalar) defaultmap(tofrom: scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp5x-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp60-error {{at most one defaultmap clause for each variable-category can appear on the directive}} + foo(); + // Verify 'storage' causes an error prior to OpenMP 6.0 +#pragma omp target defaultmap(storage) // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} foo(); #ifdef OMP5 diff --git a/clang/test/OpenMP/taskloop_strictmodifier_codegen.cpp b/clang/test/OpenMP/taskloop_strictmodifier_codegen.cpp index 82dd07a1a63bb..cddd31da1b7fb 100644 --- a/clang/test/OpenMP/taskloop_strictmodifier_codegen.cpp +++ b/clang/test/OpenMP/taskloop_strictmodifier_codegen.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 6 // RUN: %clang_cc1 -fopenmp -O1 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK // expected-no-diagnostics #ifndef HEADER @@ -34,31 +34,32 @@ struct S { #endif + // CHECK-LABEL: define noundef i32 @main( // CHECK-SAME: i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 
// CHECK-NEXT: [[ARGV_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1:[0-9]+]]) -// CHECK-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4, !tbaa [[TBAA3:![0-9]+]] -// CHECK-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] +// CHECK-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8, !tbaa [[CHARPTR_TBAA7:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 1, ptr nonnull @.omp_task_entry..2) // CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !tbaa [[TBAA3]] -// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 8, !tbaa [[TBAA10:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !tbaa [[INT_TBAA3]] +// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 8, !tbaa [[INT_TBAA11:![0-9]+]] // CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @__kmpc_master(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) // CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP5]], 0 // CHECK-NEXT: br i1 [[DOTNOT]], label %[[OMP_IF_END:.*]], label %[[OMP_IF_THEN:.*]] // CHECK: [[OMP_IF_THEN]]: -// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !tbaa [[INT_TBAA3]] // CHECK-NEXT: [[TMP7:%.*]] = tail call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i64 80, i64 1, ptr nonnull @.omp_task_entry..4) // CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 40 -// CHECK-NEXT: store i64 0, ptr [[TMP8]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-NEXT: 
store i64 0, ptr [[TMP8]], align 8, !tbaa [[LONG_TBAA15:![0-9]+]] // CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 48 -// CHECK-NEXT: store i64 9, ptr [[TMP9]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 9, ptr [[TMP9]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 56 -// CHECK-NEXT: store i64 1, ptr [[TMP10]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 1, ptr [[TMP10]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 72 // CHECK-NEXT: store i64 0, ptr [[TMP11]], align 8 // CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP6]] to i64 @@ -71,32 +72,32 @@ struct S { // CHECK-NEXT: br i1 [[DOTNOT22]], label %[[OMP_IF_END17:.*]], label %[[OMP_IF_THEN2:.*]] // CHECK: [[OMP_IF_THEN2]]: // CHECK-NEXT: tail call void @__kmpc_taskgroup(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !tbaa [[TBAA3]] -// CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !tbaa [[INT_TBAA3]] +// CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8, !tbaa [[CHARPTR_TBAA7]] // CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP15]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA17:![0-9]+]] +// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[CHARPTR_TBAA17:![0-9]+]] // CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1, !tbaa [[TBAA19:![0-9]+]] +// CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1, !tbaa [[CHAR_TBAA19:![0-9]+]] // CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP17]] to i32 // 
CHECK-NEXT: [[SUB12:%.*]] = sub i32 [[CONV]], [[TMP14]] // CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[SUB12]] to i64 // CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV15]], [[IDXPROM]] // CHECK-NEXT: [[SUB16:%.*]] = add nsw i64 [[MUL]], -1 // CHECK-NEXT: [[TMP18:%.*]] = tail call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i64 80, i64 16, ptr nonnull @.omp_task_entry..6) -// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 8, !tbaa [[TBAA20:![0-9]+]] -// CHECK-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP19]], align 8, !tbaa [[TBAA23:![0-9]+]] +// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]] +// CHECK-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP19]], align 8, !tbaa [[INTPTR_TBAA23:![0-9]+]] // CHECK-NEXT: [[AGG_CAPTURED3_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP19]], i64 8 -// CHECK-NEXT: store ptr [[ARGV_ADDR]], ptr [[AGG_CAPTURED3_SROA_2_0__SROA_IDX]], align 8, !tbaa [[TBAA25:![0-9]+]] -// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !tbaa [[TBAA3]] +// CHECK-NEXT: store ptr [[ARGV_ADDR]], ptr [[AGG_CAPTURED3_SROA_2_0__SROA_IDX]], align 8, !tbaa [[CHARPTR_TBAA25:![0-9]+]] +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !tbaa [[INT_TBAA3]] // CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK-NEXT: [[TMP21:%.*]] = sext i1 [[TOBOOL]] to i32 // CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP18]], i64 40 -// CHECK-NEXT: store i64 0, ptr [[TMP22]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 0, ptr [[TMP22]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP18]], i64 48 -// CHECK-NEXT: store i64 [[SUB16]], ptr [[TMP23]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 [[SUB16]], ptr [[TMP23]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP18]], i64 56 -// CHECK-NEXT: store i64 
1, ptr [[TMP24]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 1, ptr [[TMP24]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP18]], i64 72 // CHECK-NEXT: store i64 0, ptr [[TMP25]], align 8 // CHECK-NEXT: call void @__kmpc_taskloop_5(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr nonnull [[TMP18]], i32 [[TMP21]], ptr nonnull [[TMP22]], ptr nonnull [[TMP23]], i64 1, i32 1, i32 2, i64 4, i32 1, ptr null) #[[ATTR1]] @@ -111,11 +112,11 @@ struct S { // CHECK-NEXT: call void @__kmpc_taskgroup(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP27:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i64 80, i64 1, ptr nonnull @.omp_task_entry..8) // CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 40 -// CHECK-NEXT: store i64 0, ptr [[TMP28]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 0, ptr [[TMP28]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 48 -// CHECK-NEXT: store i64 9, ptr [[TMP29]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 9, ptr [[TMP29]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 56 -// CHECK-NEXT: store i64 1, ptr [[TMP30]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 1, ptr [[TMP30]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 72 // CHECK-NEXT: store i64 0, ptr [[TMP31]], align 8 // CHECK-NEXT: call void @__kmpc_taskloop(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP27]], i32 1, ptr nonnull [[TMP28]], ptr nonnull [[TMP29]], i64 1, i32 1, i32 0, i64 0, ptr null) @@ -126,33 +127,160 @@ struct S { // CHECK-NEXT: ret i32 0 // // +// CHECK-LABEL: define internal noundef i32 @.omp_task_entry.( +// CHECK-SAME: i32 [[TMP0:%.*]], ptr noalias noundef readonly captures(none) [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +// 
CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8, !tbaa [[LONG_TBAA28:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 48 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8, !tbaa [[LONG_TBAA29:![0-9]+]] +// CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[TMP3]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = ashr exact i64 [[SEXT]], 32 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND_I:.*]] +// CHECK: [[OMP_INNER_FOR_COND_I]]: +// CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[OMP_INNER_FOR_COND_I]] ], [ [[TMP6]], %[[ENTRY]] ] +// CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp ult i64 [[TMP5]], [[INDVARS_IV]] +// CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 +// CHECK-NEXT: br i1 [[CMP_NOT_I]], [[DOTOMP_OUTLINED__1_EXIT:label %.*]], label %[[OMP_INNER_FOR_COND_I]] +// CHECK: [[_OMP_OUTLINED__1_EXIT:.*:]] +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define internal noundef i32 @.omp_task_entry..2( +// CHECK-SAME: i32 noundef [[TMP0:%.*]], ptr noalias noundef readonly captures(none) [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_master(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTNOT_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[DOTNOT_I]], [[DOTOMP_OUTLINED__EXIT:label %.*]], label %[[OMP_IF_THEN_I:.*]] +// CHECK: [[OMP_IF_THEN_I]]: +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +// CHECK-NEXT: tail call void @__kmpc_taskgroup(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[INT_TBAA3]] +// CHECK-NEXT: [[TMP5:%.*]] = tail call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 33, i64 80, i64 1, ptr nonnull @.omp_task_entry.) 
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 32 +// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[CHAR_TBAA19]] +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 40 +// CHECK-NEXT: store i64 0, ptr [[TMP7]], align 8, !tbaa [[LONG_TBAA15]] +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 48 +// CHECK-NEXT: store i64 9, ptr [[TMP8]], align 8, !tbaa [[LONG_TBAA15]] +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 56 +// CHECK-NEXT: store i64 1, ptr [[TMP9]], align 8, !tbaa [[LONG_TBAA15]] +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 72 +// CHECK-NEXT: store i64 0, ptr [[TMP10]], align 8 +// CHECK-NEXT: tail call void @__kmpc_taskloop(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP5]], i32 1, ptr nonnull [[TMP7]], ptr nonnull [[TMP8]], i64 1, i32 1, i32 0, i64 0, ptr null) +// CHECK-NEXT: tail call void @__kmpc_end_taskgroup(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: tail call void @__kmpc_end_master(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: br [[DOTOMP_OUTLINED__EXIT]] +// CHECK: [[_OMP_OUTLINED__EXIT:.*:]] +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define internal noundef i32 @.omp_task_entry..4( +// CHECK-SAME: i32 [[TMP0:%.*]], ptr noalias noundef readonly captures(none) [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8, !tbaa [[LONG_TBAA28]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 48 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8, !tbaa [[LONG_TBAA29]] +// CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[TMP3]], 32 +// CHECK-NEXT: [[TMP6:%.*]] = ashr exact i64 [[SEXT]], 32 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND_I:.*]] +// CHECK: [[OMP_INNER_FOR_COND_I]]: 
+// CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[OMP_INNER_FOR_COND_I]] ], [ [[TMP6]], %[[ENTRY]] ] +// CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp ult i64 [[TMP5]], [[INDVARS_IV]] +// CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 +// CHECK-NEXT: br i1 [[CMP_NOT_I]], [[DOTOMP_OUTLINED__3_EXIT:label %.*]], label %[[OMP_INNER_FOR_COND_I]] +// CHECK: [[_OMP_OUTLINED__3_EXIT:.*:]] +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define internal noundef i32 @.omp_task_entry..6( +// CHECK-SAME: i32 [[TMP0:%.*]], ptr noalias noundef readonly captures(none) [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[ANYPTR_TBAA20]] +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8, !tbaa [[LONG_TBAA28]] +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 48 +// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa [[LONG_TBAA29]] +// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]]) +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[INTPTR_TBAA33:![0-9]+]], !alias.scope [[META30]], !nonnull [[META35:![0-9]+]], !align [[META36:![0-9]+]] +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[INT_TBAA3]], !noalias [[META30]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP8]], 0 +// CHECK-NEXT: br i1 [[CMP_I]], label %[[LAND_LHS_TRUE_I:.*]], [[DOTOMP_OUTLINED__5_EXIT:label %.*]] +// CHECK: [[LAND_LHS_TRUE_I]]: +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8, !tbaa [[CHARPTR_TBAA37:![0-9]+]], !alias.scope [[META30]], !nonnull [[META35]], !align [[META38:![0-9]+]] +// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8, !tbaa [[CHARPTR_TBAA7]], 
!noalias [[META30]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = zext nneg i32 [[TMP8]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP11]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[ARRAYIDX_I]], align 8, !tbaa [[CHARPTR_TBAA17]], !noalias [[META30]] +// CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX5_I]], align 1, !tbaa [[CHAR_TBAA19]], !noalias [[META30]] +// CHECK-NEXT: [[CONV_I:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[TMP8]], [[CONV_I]] +// CHECK-NEXT: br i1 [[CMP13_I]], label %[[OMP_INNER_FOR_COND_I:.*]], [[DOTOMP_OUTLINED__5_EXIT]] +// CHECK: [[OMP_INNER_FOR_COND_I]]: +// CHECK-NEXT: [[DOTOMP_IV_0_I:%.*]] = phi i64 [ [[ADD46_I:%.*]], %[[OMP_INNER_FOR_COND_I]] ], [ [[TMP4]], %[[LAND_LHS_TRUE_I]] ] +// CHECK-NEXT: [[CMP16_NOT_I:%.*]] = icmp ugt i64 [[DOTOMP_IV_0_I]], [[TMP6]] +// CHECK-NEXT: [[ADD46_I]] = add nsw i64 [[DOTOMP_IV_0_I]], 1 +// CHECK-NEXT: br i1 [[CMP16_NOT_I]], [[DOTOMP_OUTLINED__5_EXIT]], label %[[OMP_INNER_FOR_COND_I]] +// CHECK: [[_OMP_OUTLINED__5_EXIT:.*:]] +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define internal noundef i32 @.omp_task_entry..8( +// CHECK-SAME: i32 noundef [[TMP0:%.*]], ptr noalias noundef readonly captures(none) [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8, !tbaa [[LONG_TBAA28]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 48 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8, !tbaa [[LONG_TBAA29]] +// CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[TMP3]], 32 +// CHECK-NEXT: [[CONV1_I2:%.*]] = ashr exact i64 [[SEXT]], 32 +// CHECK-NEXT: [[CMP_NOT_I3:%.*]] = icmp ult i64 [[TMP5]], [[CONV1_I2]] +// CHECK-NEXT: br i1 
[[CMP_NOT_I3]], [[DOTOMP_OUTLINED__7_EXIT:label %.*]], label %[[OMP_INNER_FOR_BODY_I:.*]] +// CHECK: [[OMP_INNER_FOR_BODY_I]]: +// CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTCANCEL_CONTINUE_I:.*]] ], [ [[CONV1_I2]], %[[ENTRY]] ] +// CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @__kmpc_cancel(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: [[DOTNOT_I:%.*]] = icmp eq i32 [[TMP6]], 0 +// CHECK-NEXT: br i1 [[DOTNOT_I]], label %[[DOTCANCEL_CONTINUE_I]], [[DOTOMP_OUTLINED__7_EXIT]] +// CHECK: [[_CANCEL_CONTINUE_I:.*:]] +// CHECK-NEXT: [[TMP7:%.*]] = tail call i32 @__kmpc_cancellationpoint(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: [[DOTNOT12_I:%.*]] = icmp ne i32 [[TMP7]], 0 +// CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 +// CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp ult i64 [[TMP5]], [[INDVARS_IV_NEXT]] +// CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[DOTNOT12_I]], i1 true, i1 [[CMP_NOT_I]] +// CHECK-NEXT: br i1 [[OR_COND]], [[DOTOMP_OUTLINED__7_EXIT]], label %[[OMP_INNER_FOR_BODY_I]] +// CHECK: [[_OMP_OUTLINED__7_EXIT:.*:]] +// CHECK-NEXT: ret i32 0 +// // // CHECK-LABEL: define linkonce_odr void @_ZN1SC2Ei( // CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[C:%.*]]) unnamed_addr #[[ATTR6:[0-9]+]] align 2 { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) -// CHECK-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4, !tbaa [[TBAA3]] +// CHECK-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4, !tbaa [[INT_TBAA3]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_master(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) // CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 0 // CHECK-NEXT: br i1 [[DOTNOT]], label %[[OMP_IF_END:.*]], label %[[OMP_IF_THEN:.*]] // CHECK: [[OMP_IF_THEN]]: -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[THIS]], align 4, !tbaa 
[[TBAA35:![0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[THIS]], align 4, !tbaa [[INT_TBAA39:![0-9]+]] // CHECK-NEXT: tail call void @__kmpc_taskgroup(ptr nonnull @[[GLOB1]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[C_ADDR]], align 4, !tbaa [[TBAA3]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[C_ADDR]], align 4, !tbaa [[INT_TBAA3]] // CHECK-NEXT: [[SUB4:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i64 80, i64 16, ptr nonnull @.omp_task_entry..10) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa [[TBAA20]] -// CHECK-NEXT: store ptr [[THIS]], ptr [[TMP5]], align 8, !tbaa [[TBAA37:![0-9]+]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa [[ANYPTR_TBAA20]] +// CHECK-NEXT: store ptr [[THIS]], ptr [[TMP5]], align 8, !tbaa [[_ZTS1SPTR_TBAA41:![0-9]+]] // CHECK-NEXT: [[AGG_CAPTURED_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 8 -// CHECK-NEXT: store ptr [[C_ADDR]], ptr [[AGG_CAPTURED_SROA_2_0__SROA_IDX]], align 8, !tbaa [[TBAA23]] +// CHECK-NEXT: store ptr [[C_ADDR]], ptr [[AGG_CAPTURED_SROA_2_0__SROA_IDX]], align 8, !tbaa [[INTPTR_TBAA23]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 40 -// CHECK-NEXT: store i64 0, ptr [[TMP6]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 0, ptr [[TMP6]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48 // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[SUB4]] to i64 -// CHECK-NEXT: store i64 [[CONV]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 [[CONV]], ptr [[TMP7]], align 8, !tbaa [[LONG_TBAA15]] // CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 56 -// CHECK-NEXT: store i64 1, ptr [[TMP8]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: store i64 1, ptr [[TMP8]], align 8, !tbaa [[LONG_TBAA15]] 
// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 72 // CHECK-NEXT: store i64 0, ptr [[TMP9]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP2]] to i64 @@ -162,4 +290,85 @@ struct S { // CHECK-NEXT: br label %[[OMP_IF_END]] // CHECK: [[OMP_IF_END]]: // CHECK-NEXT: ret void - +// +// +// CHECK-LABEL: define internal noundef i32 @.omp_task_entry..10( +// CHECK-SAME: i32 [[TMP0:%.*]], ptr noalias noundef readonly captures(none) [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[ANYPTR_TBAA20]] +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8, !tbaa [[LONG_TBAA28]] +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 48 +// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa [[LONG_TBAA29]] +// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META43:![0-9]+]]) +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8 +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[INTPTR_TBAA46:![0-9]+]], !alias.scope [[META43]], !nonnull [[META35]], !align [[META36]] +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[INT_TBAA3]], !noalias [[META43]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP9]], 0 +// CHECK-NEXT: br i1 [[CMP_I]], label %[[TASKLOOP_IF_THEN_I:.*]], [[DOTOMP_OUTLINED__9_EXIT:label %.*]] +// CHECK: [[TASKLOOP_IF_THEN_I]]: +// CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[TMP4]], 32 +// CHECK-NEXT: [[TMP10:%.*]] = ashr exact i64 [[SEXT]], 32 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND_I:.*]] +// CHECK: [[OMP_INNER_FOR_COND_I]]: +// CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[OMP_INNER_FOR_COND_I]] ], [ [[TMP10]], %[[TASKLOOP_IF_THEN_I]] ] +// CHECK-NEXT: [[CMP8_NOT_I:%.*]] = icmp ult i64 [[TMP6]], 
[[INDVARS_IV]] +// CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 +// CHECK-NEXT: br i1 [[CMP8_NOT_I]], [[DOTOMP_OUTLINED__9_EXIT]], label %[[OMP_INNER_FOR_COND_I]] +// CHECK: [[_OMP_OUTLINED__9_EXIT:.*:]] +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define internal void @_GLOBAL__sub_I_taskloop_strictmodifier_codegen.cpp( +// CHECK-SAME: ) #[[ATTR7:[0-9]+]] section "__TEXT,__StaticInit,regular,pure_instructions" { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @_ZN1SC2Ei(ptr noundef nonnull align 4 dereferenceable(4) @s, i32 noundef 1) +// CHECK-NEXT: ret void +// +//. +// CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C++ TBAA"} +// CHECK: [[CHARPTR_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +// CHECK: [[META8]] = !{!"p2 omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK: [[META9]] = !{!"any p2 pointer", [[META10:![0-9]+]], i64 0} +// CHECK: [[META10]] = !{!"any pointer", [[META5]], i64 0} +// CHECK: [[INT_TBAA11]] = !{[[META12:![0-9]+]], [[META4]], i64 40} +// CHECK: [[META12]] = !{!"_ZTS24kmp_task_t_with_privates", [[META13:![0-9]+]], i64 0, [[META14:![0-9]+]], i64 40} +// CHECK: [[META13]] = !{!"_ZTS10kmp_task_t", [[META10]], i64 0, [[META10]], i64 8, [[META4]], i64 16, [[META5]], i64 24, [[META5]], i64 32} +// CHECK: [[META14]] = !{!"_ZTS15.kmp_privates.t", [[META4]], i64 0} +// CHECK: [[LONG_TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0} +// CHECK: [[META16]] = !{!"long", [[META5]], i64 0} +// CHECK: [[CHARPTR_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECK: [[META18]] = !{!"p1 omnipotent char", [[META10]], i64 0} +// CHECK: [[CHAR_TBAA19]] = !{[[META5]], [[META5]], i64 0} +// CHECK: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META10]], i64 0} +// CHECK: [[META21]] = !{!"_ZTS24kmp_task_t_with_privates", 
[[META22:![0-9]+]], i64 0} +// CHECK: [[META22]] = !{!"_ZTS10kmp_task_t", [[META10]], i64 0, [[META10]], i64 8, [[META4]], i64 16, [[META5]], i64 24, [[META5]], i64 32, [[META16]], i64 40, [[META16]], i64 48, [[META16]], i64 56, [[META4]], i64 64, [[META10]], i64 72} +// CHECK: [[INTPTR_TBAA23]] = !{[[META24:![0-9]+]], [[META24]], i64 0} +// CHECK: [[META24]] = !{!"p1 int", [[META10]], i64 0} +// CHECK: [[CHARPTR_TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0} +// CHECK: [[META26]] = !{!"p3 omnipotent char", [[META27:![0-9]+]], i64 0} +// CHECK: [[META27]] = !{!"any p3 pointer", [[META9]], i64 0} +// CHECK: [[LONG_TBAA28]] = !{[[META21]], [[META16]], i64 40} +// CHECK: [[LONG_TBAA29]] = !{[[META21]], [[META16]], i64 48} +// CHECK: [[META30]] = !{[[META31:![0-9]+]]} +// CHECK: [[META31]] = distinct !{[[META31]], [[META32:![0-9]+]], !".omp_outlined..5: %__context"} +// CHECK: [[META32]] = distinct !{[[META32]], !".omp_outlined..5"} +// CHECK: [[INTPTR_TBAA33]] = !{[[META34:![0-9]+]], [[META24]], i64 0} +// CHECK: [[META34]] = !{!"_ZTSZ4mainE3$_3", [[META24]], i64 0, [[META26]], i64 8} +// CHECK: [[META35]] = !{} +// CHECK: [[META36]] = !{i64 4} +// CHECK: [[CHARPTR_TBAA37]] = !{[[META34]], [[META26]], i64 8} +// CHECK: [[META38]] = !{i64 8} +// CHECK: [[INT_TBAA39]] = !{[[META40:![0-9]+]], [[META4]], i64 0} +// CHECK: [[META40]] = !{!"_ZTS1S", [[META4]], i64 0} +// CHECK: [[_ZTS1SPTR_TBAA41]] = !{[[META42:![0-9]+]], [[META42]], i64 0} +// CHECK: [[META42]] = !{!"p1 _ZTS1S", [[META10]], i64 0} +// CHECK: [[META43]] = !{[[META44:![0-9]+]]} +// CHECK: [[META44]] = distinct !{[[META44]], [[META45:![0-9]+]], !".omp_outlined..9: %__context"} +// CHECK: [[META45]] = distinct !{[[META45]], !".omp_outlined..9"} +// CHECK: [[INTPTR_TBAA46]] = !{[[META47:![0-9]+]], [[META24]], i64 8} +// CHECK: [[META47]] = !{!"_ZTSZN1SC1EiEUt_", [[META42]], i64 0, [[META24]], i64 8} +//. 
diff --git a/clang/test/OpenMP/vla_crash.c b/clang/test/OpenMP/vla_crash.c index 6eaa3bdab564b..31cf097523f19 100644 --- a/clang/test/OpenMP/vla_crash.c +++ b/clang/test/OpenMP/vla_crash.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -verify -triple powerpc64le-unknown-linux-gnu -fopenmp -x c -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -verify -triple powerpc64le-unknown-linux-gnu -fopenmp -x c -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -verify -triple powerpc64le-unknown-linux-gnu -fopenmp-simd -x c -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -Wno-error=incompatible-pointer-types -verify -triple powerpc64le-unknown-linux-gnu -fopenmp-simd -x c -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" int a; diff --git a/clang/test/PCH/Inputs/glob-delete-with-virtual-dtor.h b/clang/test/PCH/Inputs/glob-delete-with-virtual-dtor.h new file mode 100644 index 0000000000000..b5469f014fb38 --- /dev/null +++ b/clang/test/PCH/Inputs/glob-delete-with-virtual-dtor.h @@ -0,0 +1,18 @@ +class H { + void operator delete(void *); +public: + virtual ~H(); +}; +H::~H() { } + +class S : public H { + void operator delete(void *); +public: + virtual ~S(); +}; +S::~S() { } + +void in_pch_tests() { + H* h = new H(); + ::delete h; +} diff --git a/clang/test/PCH/functions.c b/clang/test/PCH/functions.c index fa2ba8d29c038..d06f56a207426 100644 --- a/clang/test/PCH/functions.c +++ b/clang/test/PCH/functions.c @@ -16,7 +16,7 @@ float *test_f1(int val, double x, double y) { } void test_g0(int *x, float * y) { - g0(y); // expected-warning{{incompatible pointer types 
passing 'float *' to parameter of type 'int *'}} + g0(y); // expected-error{{incompatible pointer types passing 'float *' to parameter of type 'int *'}} // expected-note@functions.h:9{{passing argument to parameter here}} g0(x); } diff --git a/clang/test/PCH/glob-delete-with-virtual-dtor.cpp b/clang/test/PCH/glob-delete-with-virtual-dtor.cpp new file mode 100644 index 0000000000000..29242b04c4a7f --- /dev/null +++ b/clang/test/PCH/glob-delete-with-virtual-dtor.cpp @@ -0,0 +1,47 @@ +// Test this without pch. +// RUN: %clang_cc1 -x c++ -include %S/Inputs/glob-delete-with-virtual-dtor.h -emit-llvm -o - %s + +// Test with pch. +// RUN: %clang_cc1 -x c++ -fno-rtti -emit-pch -o %t -triple=i386-pc-win32 %S/Inputs/glob-delete-with-virtual-dtor.h +// RUN: %clang_cc1 -x c++ -fno-rtti -include-pch %t -emit-llvm -triple=i386-pc-win32 -o - %s | FileCheck %s --check-prefixes CHECK,CHECK32 +// RUN: %clang_cc1 -x c++ -fno-rtti -emit-pch -o %t -triple=x86_64-pc-win32 %S/Inputs/glob-delete-with-virtual-dtor.h +// RUN: %clang_cc1 -x c++ -fno-rtti -include-pch %t -emit-llvm -triple=x86_64-pc-win32 -o - %s | FileCheck %s --check-prefixes CHECK,CHECK64 + +static void call_in_pch_function(void) { + in_pch_tests(); +} + +void out_of_pch_tests() { + S* s = new S(); + ::delete s; +} + +// CHECK32: define {{.*}} @"??_GH@@UAEPAXI@Z" +// CHECK64: define {{.*}} @"??_GH@@UEAAPEAXI@Z" +// CHECK: store i32 %should_call_delete, ptr %[[SHOULD_DELETE_VAR:[0-9a-z._]+]], align 4 +// CHECK: store ptr %{{.*}}, ptr %[[RETVAL:retval]] +// CHECK: %[[SHOULD_DELETE_VALUE:[0-9a-z._]+]] = load i32, ptr %[[SHOULD_DELETE_VAR]] +// CHECK32: call x86_thiscallcc void @"??1H@@UAE@XZ"(ptr {{[^,]*}} %[[THIS:[0-9a-z]+]]) +// CHECK64: call void @"??1H@@UEAA@XZ"(ptr {{[^,]*}} %[[THIS:[0-9a-z]+]]) +// CHECK-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 1 +// CHECK-NEXT: %[[CONDITION:[0-9]+]] = icmp eq i32 %[[AND]], 0 +// CHECK-NEXT: br i1 %[[CONDITION]], label %[[CONTINUE_LABEL:[0-9a-z._]+]], label 
%[[CALL_DELETE_LABEL:[0-9a-z._]+]] +// +// CHECK: [[CALL_DELETE_LABEL]] +// CHECK-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 4 +// CHECK-NEXT: %[[CONDITION1:[0-9]+]] = icmp eq i32 %[[AND]], 0 +// CHECK-NEXT: br i1 %[[CONDITION1]], label %[[CALL_CLASS_DELETE:[0-9a-z._]+]], label %[[CALL_GLOB_DELETE:[0-9a-z._]+]] +// +// CHECK: [[CALL_GLOB_DELETE]] +// CHECK32-NEXT: call void @"??3@YAXPAXI@Z" +// CHECK64-NEXT: call void @"??3@YAXPEAX_K@Z" +// CHECK-NEXT: br label %[[CONTINUE_LABEL]] +// +// CHECK: [[CALL_CLASS_DELETE]] +// CHECK32-NEXT: call void @"??3H@@CAXPAX@Z" +// CHECK64-NEXT: call void @"??3H@@CAXPEAX@Z" +// CHECK-NEXT: br label %[[CONTINUE_LABEL]] +// +// CHECK: [[CONTINUE_LABEL]] +// CHECK-NEXT: %[[RET:.*]] = load ptr, ptr %[[RETVAL]] +// CHECK-NEXT: ret ptr %[[RET]] diff --git a/clang/test/PCH/leakfiles.test b/clang/test/PCH/leakfiles.test index dc4047ac3ff48..45dc36f6708bf 100644 --- a/clang/test/PCH/leakfiles.test +++ b/clang/test/PCH/leakfiles.test @@ -1,9 +1,8 @@ // Test that compiling using a PCH doesn't leak file descriptors. // https://bugs.chromium.org/p/chromium/issues/detail?id=924225 // -// This test requires bash loops and ulimit. -// REQUIRES: shell -// UNSUPPORTED: target={{.*win32.*}} +// This test uses ulimit. +// UNSUPPORTED: system-windows // // Set up source files. lib/lib.h includes lots of lib*.h files in that dir. // client.c includes lib/lib.h, and also the individual files directly. 
@@ -12,10 +11,10 @@ // RUN: mkdir %t // RUN: cd %t // RUN: mkdir lib -// RUN: for i in {1..300}; do touch lib/lib$i.h; done -// RUN: for i in {1..300}; do echo "#include \"lib$i.h\"" >> lib/lib.h; done +// RUN: %python -c "from pathlib import Path; list(map(lambda i: Path(f'lib/lib{i}.h').touch(), range(1, 301)))" +// RUN: %python -c "for i in range(1, 301): print(f'#include \"lib{i}.h\"')" > lib/lib.h // RUN: echo "#include \"lib/lib.h\"" > client.c -// RUN: for i in {1..300}; do echo "#include \"lib/lib$i.h\"" >> client.c; done +// RUN: %python -c "for i in range(1, 301): print(f'#include \"lib/lib{i}.h\"')" > client.c // // We want to verify that we don't hold all the files open at the same time. // This is important e.g. on mac, which has a low default FD limit. diff --git a/clang/test/PCH/objc_exprs.m b/clang/test/PCH/objc_exprs.m index 7dbe3195f8ea8..b927e8df59b08 100644 --- a/clang/test/PCH/objc_exprs.m +++ b/clang/test/PCH/objc_exprs.m @@ -6,19 +6,19 @@ // RUN: %clang_cc1 -fblocks -include-pch %t -fsyntax-only -verify %s // Expressions -int *A1 = (objc_string)0; // expected-warning {{aka 'NSString *'}} +int *A1 = (objc_string)0; // expected-error {{aka 'NSString *'}} char A2 = (objc_encode){}; // expected-error {{not a compile-time constant}} \ expected-error {{char[2]}} -int *A3 = (objc_protocol)0; // expected-warning {{aka 'Protocol *'}} +int *A3 = (objc_protocol)0; // expected-error {{aka 'Protocol *'}} // Types. 
-int *T0 = (objc_id_protocol_ty)0; // expected-warning {{aka 'id'}} +int *T0 = (objc_id_protocol_ty)0; // expected-error {{aka 'id'}} -int *T1 = (objc_interface_ty)0; // expected-warning {{aka 'itf *'}} -int *T2 = (objc_qual_interface_ty)0; // expected-warning {{aka 'itf *'}} +int *T1 = (objc_interface_ty)0; // expected-error {{aka 'itf *'}} +int *T2 = (objc_qual_interface_ty)0; // expected-error {{aka 'itf *'}} objc_selector_noArgs s1; objc_selector_oneArg s2; diff --git a/clang/test/PCH/objc_kindof.m b/clang/test/PCH/objc_kindof.m index 437c417104087..0d8ecc3455fd2 100644 --- a/clang/test/PCH/objc_kindof.m +++ b/clang/test/PCH/objc_kindof.m @@ -27,7 +27,7 @@ @interface NSNumber : NSObject #else void testPrettyPrint(int *ip) { - ip = kindof_NSObject_NSCopying; // expected-warning{{from '__kindof NSObject *'}} + ip = kindof_NSObject_NSCopying; // expected-error{{from '__kindof NSObject *'}} } #endif diff --git a/clang/test/PCH/validate-file-content.m b/clang/test/PCH/validate-file-content.m index b98979341b76a..8863b7abea3af 100644 --- a/clang/test/PCH/validate-file-content.m +++ b/clang/test/PCH/validate-file-content.m @@ -1,5 +1,3 @@ -// REQUIRES: shell -// // Check driver works // RUN: %clang -x objective-c-header -fsyntax-only -fpch-validate-input-files-content %s -### 2>&1 | FileCheck --check-prefix=CHECK-CC1 %s // CHECK-CC1: -fvalidate-ast-input-files-content diff --git a/clang/test/PCH/variables.c b/clang/test/PCH/variables.c index 2c8c1368e173a..7bf06aed11629 100644 --- a/clang/test/PCH/variables.c +++ b/clang/test/PCH/variables.c @@ -31,7 +31,7 @@ int UNIQUE(a); // a1 #else int *ip2 = &x; -float *fp = &ip; // expected-warning{{incompatible pointer types}} +float *fp = &ip; // expected-error{{incompatible pointer types}} double z; // expected-error{{redefinition}} expected-note@14{{previous}} int z2 = 18; // expected-error{{redefinition}} expected-note@16{{previous}} double VeryHappy; // expected-error{{redefinition}} expected-note@19{{previous definition 
is here}} diff --git a/clang/test/Parser/cxx03-attributes.cpp b/clang/test/Parser/cxx03-attributes.cpp new file mode 100644 index 0000000000000..d3afef76366a3 --- /dev/null +++ b/clang/test/Parser/cxx03-attributes.cpp @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++03 %s +// expected-no-diagnostics + +struct S { + S([[clang::lifetimebound]] int&) {} +}; diff --git a/clang/test/Parser/declarators.c b/clang/test/Parser/declarators.c index 365d52bd48570..0a3580596646d 100644 --- a/clang/test/Parser/declarators.c +++ b/clang/test/Parser/declarators.c @@ -57,7 +57,7 @@ myenum c; // expected-error {{must use 'enum' tag to refer to type 'myenum' float *test7(void) { // We should recover 'b' by parsing it with a valid type of "struct xyz", which // allows us to diagnose other bad things done with y, such as this. - return &b.y; // expected-warning {{incompatible pointer types returning 'int *' from a function with result type 'float *'}} + return &b.y; // expected-error {{incompatible pointer types returning 'int *' from a function with result type 'float *'}} } struct xyz test8(void) { return a; } // a should be marked invalid, no diag. 
diff --git a/clang/test/Parser/x64-windows-calling-convention-handling.c b/clang/test/Parser/x64-windows-calling-convention-handling.c index c027663414829..224931c4eb91d 100644 --- a/clang/test/Parser/x64-windows-calling-convention-handling.c +++ b/clang/test/Parser/x64-windows-calling-convention-handling.c @@ -1,4 +1,6 @@ -// RUN: %clang_cc1 -triple x86_64-windows -fms-compatibility -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-windows -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-mingw -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-cygwin -fsyntax-only -verify %s int __cdecl cdecl(int a, int b, int c, int d) { // expected-no-diagnostics return a + b + c + d; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 36430d4e2b628..e1ceec93407a0 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -3,7 +3,7 @@ // CHECK: ClassTemplateSpecializationDecl {{.*}} class RWBuffer definition implicit_instantiation // CHECK: TemplateArgument type 'float' // CHECK: BuiltinType {{.*}} 'float' -// CHECK: FieldDecl {{.*}} implicit referenced __handle '__hlsl_resource_t +// CHECK: FieldDecl {{.*}} implicit{{.*}} __handle '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] RWBuffer Buffer1; @@ -12,7 +12,7 @@ RWBuffer Buffer1; // CHECK: TemplateArgument type 'vector' // CHECK: ExtVectorType {{.*}} 'vector' 4 // CHECK: BuiltinType {{.*}} 'float' -// CHECK: FieldDecl {{.*}} implicit referenced __handle '__hlsl_resource_t +// CHECK: FieldDecl {{.*}} implicit{{.*}} __handle '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)] // CHECK-SAME{LITERAL}: [[hlsl::is_rov]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector)]] diff --git a/clang/test/Preprocessor/embed_zos.c 
b/clang/test/Preprocessor/embed_zos.c index 564a65f42afcd..12f9bf439ee8b 100644 --- a/clang/test/Preprocessor/embed_zos.c +++ b/clang/test/Preprocessor/embed_zos.c @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -std=c23 %s -fsyntax-only --embed-dir=%t -verify // expected-no-diagnostics -// REQUIRES: shell, system-zos +// REQUIRES: system-zos const char data[] = { #embed diff --git a/clang/test/Preprocessor/nonportable-include-with-hmap.c b/clang/test/Preprocessor/nonportable-include-with-hmap.c index 07907dfb40d5b..f7e1abf69ad1e 100644 --- a/clang/test/Preprocessor/nonportable-include-with-hmap.c +++ b/clang/test/Preprocessor/nonportable-include-with-hmap.c @@ -1,4 +1,5 @@ -// REQUIRES: shell +// Most likely platform specific sed differences +// UNSUPPORTED: system-windows // REQUIRES: case-insensitive-filesystem // RUN: rm -f %t.hmap diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 204c9851e680c..0dcdb29445b4b 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -96,6 +96,7 @@ // CHECK-NOT: __riscv_zfinx {{.*$}} // CHECK-NOT: __riscv_zhinx {{.*$}} // CHECK-NOT: __riscv_zhinxmin {{.*$}} +// CHECK-NOT: __riscv_zibi {{.*$}} // CHECK-NOT: __riscv_zic64b {{.*$}} // CHECK-NOT: __riscv_zicbom {{.*$}} // CHECK-NOT: __riscv_zicbop {{.*$}} @@ -812,6 +813,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZHINXMIN-EXT %s // CHECK-ZHINXMIN-EXT: __riscv_zhinxmin 1000000{{$}} +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zibi0p1 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZIBI-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zibi0p1 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZIBI-EXT %s +// CHECK-ZIBI-EXT: __riscv_zibi + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izic64b -E -dM %s \ // RUN: -o - | FileCheck 
--check-prefix=CHECK-ZIC64B-EXT %s diff --git a/clang/test/Profile/cxx-hash-v2.cpp b/clang/test/Profile/cxx-hash-v2.cpp index 995fe008f5236..cb633d53f6f30 100644 --- a/clang/test/Profile/cxx-hash-v2.cpp +++ b/clang/test/Profile/cxx-hash-v2.cpp @@ -1,5 +1,3 @@ -// REQUIRES: shell - // Check that all of the hashes in this file are unique (i.e, that none of the // profiles for these functions are mutually interchangeable). // diff --git a/clang/test/Rewriter/objc-string-concat-1.m b/clang/test/Rewriter/objc-string-concat-1.m index 9a23abcc7ff97..8b88a5e8bbaf4 100644 --- a/clang/test/Rewriter/objc-string-concat-1.m +++ b/clang/test/Rewriter/objc-string-concat-1.m @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -rewrite-objc -fobjc-runtime=macosx-fragile-10.5 %s -o - +// -Wno-incompatible-pointer-types is needed to disable warning triggered by assigning NSString to NSConstantString +// RUN: %clang_cc1 -rewrite-objc -fobjc-runtime=macosx-fragile-10.5 -Wno-incompatible-pointer-types %s -o - @class NSString; @@ -11,4 +12,3 @@ @interface NSConstantString; NSConstantString *t = @"123" @"4567"; // concat NSConstantString *t1 = @"123" @"4567" /* COMMENT */ @"89"; // concat NSConstantString *t2 = @"123" @/* COMMENT */ "4567"; // concat - diff --git a/clang/test/Sema/MicrosoftCompatibility-x64.c b/clang/test/Sema/MicrosoftCompatibility-x64.c index 7d1f64996eb3c..a422b549dcc00 100644 --- a/clang/test/Sema/MicrosoftCompatibility-x64.c +++ b/clang/test/Sema/MicrosoftCompatibility-x64.c @@ -1,4 +1,6 @@ -// RUN: %clang_cc1 %s -Wmicrosoft -verify -fms-compatibility -triple x86_64-pc-win32 +// RUN: %clang_cc1 %s -Wmicrosoft -verify -triple x86_64-pc-win32 +// RUN: %clang_cc1 %s -Wmicrosoft -verify -triple x86_64-w64-mingw32 +// RUN: %clang_cc1 %s -Wmicrosoft -verify -triple x86_64-pc-cygwin // None of these should warn. stdcall is treated as equivalent to cdecl on // x64. 
diff --git a/clang/test/Sema/MicrosoftExtensions.c b/clang/test/Sema/MicrosoftExtensions.c index cf7463d2e76eb..e13c71570408e 100644 --- a/clang/test/Sema/MicrosoftExtensions.c +++ b/clang/test/Sema/MicrosoftExtensions.c @@ -206,7 +206,7 @@ void myprintf(const char *f, ...) { vmyprintf(f, ap); ap = 0; } else { - __va_start(ap, f); // expected-warning {{incompatible pointer types passing 'my_va_list'}} + __va_start(ap, f); // expected-error {{incompatible pointer types passing 'my_va_list'}} } } diff --git a/clang/test/Sema/arm-neon-types.c b/clang/test/Sema/arm-neon-types.c index 499cd271293b7..48df609e2f7a1 100644 --- a/clang/test/Sema/arm-neon-types.c +++ b/clang/test/Sema/arm-neon-types.c @@ -31,10 +31,10 @@ int32x4_t test4(int32x4_t a, vSInt32 b) { // Warn for incompatible pointer types used with vld/vst intrinsics. int16x8_t test5(int *p) { - return vld1q_s16(p); // expected-warning {{incompatible pointer types}} + return vld1q_s16(p); // expected-error {{incompatible pointer types}} } void test6(float *p, int32x2_t v) { - return vst1_s32(p, v); // expected-warning {{incompatible pointer types}} + return vst1_s32(p, v); // expected-error {{incompatible pointer types}} } #define INCLUDE diff --git a/clang/test/Sema/atomic-expr.c b/clang/test/Sema/atomic-expr.c index 96571e3e68c87..9ef5510e41b35 100644 --- a/clang/test/Sema/atomic-expr.c +++ b/clang/test/Sema/atomic-expr.c @@ -159,7 +159,7 @@ void func_17(void) { _Atomic(const int *) acip = cip; _Atomic(const int *) bad_acip = vcip; // expected-warning {{initializing '_Atomic(const int *)' with an expression of type 'const volatile int *' discards qualifiers}} _Atomic(const int *) acip2 = cicp; - _Atomic(int *) aip = &i; // expected-warning {{incompatible pointer types initializing '_Atomic(int *)' with an expression of type '_Atomic(int) *'}} \ + _Atomic(int *) aip = &i; // expected-error {{incompatible pointer types initializing '_Atomic(int *)' with an expression of type '_Atomic(int) *'}} \ // the left 
operand has atomic ... pointer type, and (considering the type // the left operand would have after lvalue conversion) one operand is a @@ -220,7 +220,7 @@ _Atomic(const int *) acip2 = cvp; // expected-error {{initializer element is not // initializer, but the bit-cast inserted due to the pointer conversion is // tripping up the test for whether the initializer is a constant expression. // The warning is correct but the error is not. -_Atomic(int *) aip3 = &ai; /* expected-warning {{incompatible pointer types initializing '_Atomic(int *)' with an expression of type '_Atomic(int) *'}} +_Atomic(int *) aip3 = &ai; /* expected-error {{incompatible pointer types initializing '_Atomic(int *)' with an expression of type '_Atomic(int) *'}} expected-error {{initializer element is not a compile-time constant}} */ diff --git a/clang/test/Sema/atomic-ops.c b/clang/test/Sema/atomic-ops.c index aae7aced2628a..ddeb29e19f760 100644 --- a/clang/test/Sema/atomic-ops.c +++ b/clang/test/Sema/atomic-ops.c @@ -181,7 +181,7 @@ void f(_Atomic(int) *i, const _Atomic(int) *ci, __atomic_load(i, I, memory_order_relaxed); // expected-error {{must be a pointer to a trivially-copyable type}} __atomic_load(CI, I, memory_order_relaxed); - __atomic_load(I, i, memory_order_relaxed); // expected-warning {{passing '_Atomic(int) *' to parameter of type 'int *'}} + __atomic_load(I, i, memory_order_relaxed); // expected-error {{passing '_Atomic(int) *' to parameter of type 'int *'}} __atomic_load(I, *P, memory_order_relaxed); __atomic_load(I, *P, memory_order_relaxed, 42); // expected-error {{too many arguments}} (int)__atomic_load(I, I, memory_order_seq_cst); // expected-error {{operand of type 'void'}} @@ -213,7 +213,7 @@ void f(_Atomic(int) *i, const _Atomic(int) *ci, int exchange_4 = __atomic_exchange_n(I, 1, memory_order_seq_cst); __atomic_exchange(s1, s2, s2, memory_order_seq_cst); - __atomic_exchange(s1, I, P, memory_order_seq_cst); // expected-warning 2{{parameter of type 'struct S *'}} + 
__atomic_exchange(s1, I, P, memory_order_seq_cst); // expected-error 2{{parameter of type 'struct S *'}} (int)__atomic_exchange(s1, s2, s2, memory_order_seq_cst); // expected-error {{operand of type 'void'}} __atomic_exchange(I, I, I, memory_order_seq_cst); __atomic_exchange(CI, I, I, memory_order_seq_cst); // expected-error {{address argument to atomic operation must be a pointer to non-const type ('const int *' invalid)}} @@ -259,22 +259,22 @@ void f(_Atomic(int) *i, const _Atomic(int) *ci, _Bool cmpexch_1 = __c11_atomic_compare_exchange_strong(i, I, 1, memory_order_seq_cst, memory_order_seq_cst); _Bool cmpexch_2 = __c11_atomic_compare_exchange_strong(p, P, (int*)1, memory_order_seq_cst, memory_order_seq_cst); - _Bool cmpexch_3 = __c11_atomic_compare_exchange_strong(f, I, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-warning {{incompatible pointer types}} + _Bool cmpexch_3 = __c11_atomic_compare_exchange_strong(f, I, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{incompatible pointer types}} (void)__c11_atomic_compare_exchange_strong(i, CI, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}} _Bool cmpexchw_1 = __c11_atomic_compare_exchange_weak(i, I, 1, memory_order_seq_cst, memory_order_seq_cst); _Bool cmpexchw_2 = __c11_atomic_compare_exchange_weak(p, P, (int*)1, memory_order_seq_cst, memory_order_seq_cst); - _Bool cmpexchw_3 = __c11_atomic_compare_exchange_weak(f, I, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-warning {{incompatible pointer types}} + _Bool cmpexchw_3 = __c11_atomic_compare_exchange_weak(f, I, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{incompatible pointer types}} (void)__c11_atomic_compare_exchange_weak(i, CI, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}} _Bool cmpexch_4 = 
__atomic_compare_exchange_n(I, I, 5, 1, memory_order_seq_cst, memory_order_seq_cst); - _Bool cmpexch_5 = __atomic_compare_exchange_n(I, P, 5, 0, memory_order_seq_cst, memory_order_seq_cst); // expected-warning {{; dereference with *}} + _Bool cmpexch_5 = __atomic_compare_exchange_n(I, P, 5, 0, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{; dereference with *}} _Bool cmpexch_6 = __atomic_compare_exchange_n(I, I, P, 0, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{passing 'int **' to parameter of type 'int'}} (void)__atomic_compare_exchange_n(CI, I, 5, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{address argument to atomic operation must be a pointer to non-const type ('const int *' invalid)}} (void)__atomic_compare_exchange_n(I, CI, 5, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}} _Bool cmpexch_7 = __atomic_compare_exchange(I, I, 5, 1, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{passing 'int' to parameter of type 'int *'}} - _Bool cmpexch_8 = __atomic_compare_exchange(I, P, I, 0, memory_order_seq_cst, memory_order_seq_cst); // expected-warning {{; dereference with *}} + _Bool cmpexch_8 = __atomic_compare_exchange(I, P, I, 0, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{; dereference with *}} _Bool cmpexch_9 = __atomic_compare_exchange(I, I, I, 0, memory_order_seq_cst, memory_order_seq_cst); (void)__atomic_compare_exchange(CI, I, I, 0, memory_order_seq_cst, memory_order_seq_cst); // expected-error {{address argument to atomic operation must be a pointer to non-const type ('const int *' invalid)}} (void)__atomic_compare_exchange(I, CI, I, 0, memory_order_seq_cst, memory_order_seq_cst); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}} @@ -375,7 +375,7 @@ void PR12527(void) { int *b = PR12527_a; } void PR16931(int* x) { // 
expected-note {{passing argument to parameter 'x' here}} typedef struct { _Atomic(_Bool) flag; } flag; flag flagvar = { 0 }; - PR16931(&flagvar); // expected-warning {{incompatible pointer types}} + PR16931(&flagvar); // expected-error {{incompatible pointer types}} } void memory_checks(_Atomic(int) *Ap, int *p, int val) { diff --git a/clang/test/Sema/attr-args.c b/clang/test/Sema/attr-args.c index 23815f3a4e675..01bfcc1951cd8 100644 --- a/clang/test/Sema/attr-args.c +++ b/clang/test/Sema/attr-args.c @@ -29,3 +29,9 @@ __attribute__ ((__format_arg__(2))) // expected-error {{'__format_arg__' attribu void test (int, ...); void __attribute__ ((alloc_size (2, 3))) *test2(int, ...); // expected-error {{'alloc_size' attribute parameter 1 is out of bounds}} + +void gh159080_a(void); +void *gh159080_b(void) __attribute__((malloc(gh159080_a))); // expected-error{{'malloc' argument 'gh159080_a' must take a pointer type as its first argument}} +void gh159080_c(); +void *gh159080_d(void) __attribute__((malloc(gh159080_c))); // expected-error{{'malloc' argument 'gh159080_c' must take a pointer type as its first argument}} + diff --git a/clang/test/Sema/attr-format.c b/clang/test/Sema/attr-format.c index 5a8b1ac9eca5c..5b9e4d02bbaf9 100644 --- a/clang/test/Sema/attr-format.c +++ b/clang/test/Sema/attr-format.c @@ -55,7 +55,7 @@ void callnull(void){ null(0, 0); // no error null(0, (char*)0); // no error null(0, (void*)0); // no error - null(0, (int*)0); // expected-warning {{incompatible pointer types}} + null(0, (int*)0); // expected-error {{incompatible pointer types}} } // FreeBSD kernel extensions diff --git a/clang/test/Sema/c2x-auto.c b/clang/test/Sema/c2x-auto.c index 97754b2fb836b..7d62db9ea6c28 100644 --- a/clang/test/Sema/c2x-auto.c +++ b/clang/test/Sema/c2x-auto.c @@ -74,7 +74,7 @@ void test_qualifiers(const int y) { static auto c = 1UL; int* pa = &a; // expected-warning {{initializing 'int *' with an expression of type 'const int *' discards qualifiers}} const int* pb 
= &b; - int* pc = &c; // expected-warning {{incompatible pointer types initializing 'int *' with an expression of type 'unsigned long *'}} + int* pc = &c; // expected-error {{incompatible pointer types initializing 'int *' with an expression of type 'unsigned long *'}} _Static_assert(_Generic(a, int : 1)); _Static_assert(_Generic(b, int : 1)); diff --git a/clang/test/Sema/conditional-expr.c b/clang/test/Sema/conditional-expr.c index b54b689ec4f05..77f92b79dfcbc 100644 --- a/clang/test/Sema/conditional-expr.c +++ b/clang/test/Sema/conditional-expr.c @@ -18,11 +18,11 @@ void foo(void) { dp = vp; vp = dp; - ip = dp; // expected-warning {{incompatible pointer types assigning to 'int *' from 'double *'}} - dp = ip; // expected-warning {{incompatible pointer types assigning to 'double *' from 'int *'}} + ip = dp; // expected-error {{incompatible pointer types assigning to 'int *' from 'double *'}} + dp = ip; // expected-error {{incompatible pointer types assigning to 'double *' from 'int *'}} dp = 0 ? (double *)0 : (void *)0; vp = 0 ? (double *)0 : (void *)0; - ip = 0 ? (double *)0 : (void *)0; // expected-warning {{incompatible pointer types assigning to 'int *' from 'double *'}} + ip = 0 ? (double *)0 : (void *)0; // expected-error {{incompatible pointer types assigning to 'int *' from 'double *'}} const int *cip; vp = (0 ? 
vp : cip); // expected-warning {{discards qualifiers}} diff --git a/clang/test/Sema/constant-builtins-vector.cpp b/clang/test/Sema/constant-builtins-vector.cpp index 714a7fb753214..455284ef65e9b 100644 --- a/clang/test/Sema/constant-builtins-vector.cpp +++ b/clang/test/Sema/constant-builtins-vector.cpp @@ -731,6 +731,19 @@ permitted in a constexpr context}} vector4charConst1, vector4charConst2, -1, -1, -1, -1); +namespace UnaryShuffleUnsupported { + typedef int vi6 __attribute__((ext_vector_type(2))); + constexpr int foo() { // expected-error {{never produces a constant expression}} + vi6 a = {1,2}; + vi6 b = {3,4}; + vi6 r = __builtin_shufflevector(a, b); // expected-note 2{{subexpression not valid in a constant expression}} + + return r[0] + r[1]; + } + static_assert(foo() == 0); // expected-error {{not an integral constant expression}} \ + // expected-note {{in call to}} +} + static_assert(__builtin_reduce_add((vector4char){}) == 0); static_assert(__builtin_reduce_add((vector4char){1, 2, 3, 4}) == 10); static_assert(__builtin_reduce_add((vector4short){10, 20, 30, 40}) == 100); diff --git a/clang/test/Sema/decl-type-merging.c b/clang/test/Sema/decl-type-merging.c index 7576cac7cba21..1c81a7886b875 100644 --- a/clang/test/Sema/decl-type-merging.c +++ b/clang/test/Sema/decl-type-merging.c @@ -9,8 +9,8 @@ int (*a)(int (*x)[], int (*y)[5]); void b(void) { int x[10], y[5]; a(&x, &y); - a(&y, &y); // expected-warning {{incompatible pointer}} - a(&x, &x); // expected-warning {{incompatible pointer}} + a(&y, &y); // expected-error {{incompatible pointer}} + a(&x, &x); // expected-error {{incompatible pointer}} } diff --git a/clang/test/Sema/enum.c b/clang/test/Sema/enum.c index 01e41d4ebe956..f0da5f097fa80 100644 --- a/clang/test/Sema/enum.c +++ b/clang/test/Sema/enum.c @@ -111,7 +111,7 @@ void PR8694(int* e) // expected-note {{passing argument to parameter 'e' here}} void crash(enum E *e) // expected-warning {{declaration of 'enum E' will not be visible outside of this 
function}} \ // expected-warning {{ISO C forbids forward references to 'enum' types}} { - PR8694(e); // expected-warning {{incompatible pointer types passing 'enum E *' to parameter of type 'int *'}} + PR8694(e); // expected-error {{incompatible pointer types passing 'enum E *' to parameter of type 'int *'}} } typedef enum { NegativeShort = (short)-1 } NegativeShortEnum; diff --git a/clang/test/Sema/format-strings.c b/clang/test/Sema/format-strings.c index af30ad5d15fe2..4bff30c313c8f 100644 --- a/clang/test/Sema/format-strings.c +++ b/clang/test/Sema/format-strings.c @@ -230,8 +230,8 @@ void check_wide_string(char* b, ...) va_list ap; va_start(ap,b); - printf(L"foo %d",2); // expected-warning {{incompatible pointer types}}, expected-warning {{should not be a wide string}} - vsprintf(b,L"bar %d",ap); // expected-warning {{incompatible pointer types}}, expected-warning {{should not be a wide string}} + printf(L"foo %d",2); // expected-error {{incompatible pointer types}}, expected-warning {{should not be a wide string}} + vsprintf(b,L"bar %d",ap); // expected-error {{incompatible pointer types}}, expected-warning {{should not be a wide string}} } void check_asterisk_precision_width(int x) { diff --git a/clang/test/Sema/function.c b/clang/test/Sema/function.c index 5d803e03b3af9..990631ee69cd5 100644 --- a/clang/test/Sema/function.c +++ b/clang/test/Sema/function.c @@ -107,8 +107,8 @@ void decays(int a[3][3]); // expected-note {{passing argument to parameter 'a' void no_decay(int (*a)[3]); // expected-note {{passing argument to parameter 'a' here}} void t22(int *ptr, int (*array)[3]) { - decays(ptr); // expected-warning {{incompatible pointer types passing 'int *' to parameter of type 'int (*)[3]'}} - no_decay(ptr); // expected-warning {{incompatible pointer types passing 'int *' to parameter of type 'int (*)[3]'}} + decays(ptr); // expected-error {{incompatible pointer types passing 'int *' to parameter of type 'int (*)[3]'}} + no_decay(ptr); // expected-error 
{{incompatible pointer types passing 'int *' to parameter of type 'int (*)[3]'}} decays(array); no_decay(array); } diff --git a/clang/test/Sema/merge-decls.c b/clang/test/Sema/merge-decls.c index 652cd8fd2454b..7bf9fdab0aa48 100644 --- a/clang/test/Sema/merge-decls.c +++ b/clang/test/Sema/merge-decls.c @@ -81,14 +81,14 @@ void test6_f(a) {} void test6_g() { int arr[10]; - test6_f(&arr); // expected-warning {{incompatible pointer types passing 'int (*)[10]' to parameter of type 'int (*)[11]}} + test6_f(&arr); // expected-error {{incompatible pointer types passing 'int (*)[10]' to parameter of type 'int (*)[11]}} } void test7_f(int (*)[10]); void test7_f(int (*)[]); // expected-note {{passing argument to parameter here}} void test7_g() { int x[5]; - test7_f(&x); // expected-warning {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[10]}} + test7_f(&x); // expected-error {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[10]}} } char d; diff --git a/clang/test/Sema/nullability.c b/clang/test/Sema/nullability.c index 0401516233b6d..5af473ecc41b5 100644 --- a/clang/test/Sema/nullability.c +++ b/clang/test/Sema/nullability.c @@ -63,10 +63,10 @@ void acceptBlockPtr(_Nonnull int *(^)(void)); void testBlockFunctionPtrNullability(void) { float *fp; - fp = (function_pointer_type_3)0; // expected-warning{{from 'function_pointer_type_3' (aka 'int * _Nonnull (*)(int, int)')}} + fp = (function_pointer_type_3)0; // expected-error{{from 'function_pointer_type_3' (aka 'int * _Nonnull (*)(int, int)')}} fp = (block_type_3)0; // expected-error{{from incompatible type 'block_type_3' (aka 'int * _Nonnull (^)(int, int)')}} - fp = (function_pointer_type_4)0; // expected-warning{{from 'function_pointer_type_4' (aka 'int * _Nonnull (*)(int, int)')}} - fp = (function_pointer_type_5)0; // expected-warning{{from 'function_pointer_type_5' (aka 'void (*)(int * _Nonnull)')}} + fp = (function_pointer_type_4)0; // expected-error{{from 
'function_pointer_type_4' (aka 'int * _Nonnull (*)(int, int)')}} + fp = (function_pointer_type_5)0; // expected-error{{from 'function_pointer_type_5' (aka 'void (*)(int * _Nonnull)')}} fp = (block_type_4)0; // expected-error{{from incompatible type 'block_type_4' (aka 'int_ptr _Nonnull (^)(int, int)')}} acceptFunctionPtr(0); // no-warning @@ -86,18 +86,18 @@ typedef int * _Nullable ambiguous_int_ptr; // Printing of nullability. float f; -int * _Nonnull ip_1 = &f; // expected-warning{{incompatible pointer types initializing 'int * _Nonnull' with an expression of type 'float *'}} +int * _Nonnull ip_1 = &f; // expected-error{{incompatible pointer types initializing 'int * _Nonnull' with an expression of type 'float *'}} // Check printing of nullability specifiers. void printing_nullability(void) { int * _Nonnull iptr; - float *fptr = iptr; // expected-warning{{incompatible pointer types initializing 'float *' with an expression of type 'int * _Nonnull'}} + float *fptr = iptr; // expected-error{{incompatible pointer types initializing 'float *' with an expression of type 'int * _Nonnull'}} int * * _Nonnull iptrptr; - float **fptrptr = iptrptr; // expected-warning{{incompatible pointer types initializing 'float **' with an expression of type 'int ** _Nonnull'}} + float **fptrptr = iptrptr; // expected-error{{incompatible pointer types initializing 'float **' with an expression of type 'int ** _Nonnull'}} int * _Nullable * _Nonnull iptrptr2; - float * *fptrptr2 = iptrptr2; // expected-warning{{incompatible pointer types initializing 'float **' with an expression of type 'int * _Nullable * _Nonnull'}} + float * *fptrptr2 = iptrptr2; // expected-error{{incompatible pointer types initializing 'float **' with an expression of type 'int * _Nullable * _Nonnull'}} } // Check passing null to a _Nonnull argument. 
diff --git a/clang/test/Sema/pass-object-size.c b/clang/test/Sema/pass-object-size.c index 688290ca9890a..5360f4af71c25 100644 --- a/clang/test/Sema/pass-object-size.c +++ b/clang/test/Sema/pass-object-size.c @@ -58,7 +58,7 @@ void FunctionPtrs(void) { int P; (&NotOverloaded)(&P); //expected-error{{cannot take address of function 'NotOverloaded' because parameter 1 has pass_object_size attribute}} - (&IsOverloaded)(&P); //expected-warning{{incompatible pointer types passing 'int *' to parameter of type 'char *'}} + (&IsOverloaded)(&P); //expected-error{{incompatible pointer types passing 'int *' to parameter of type 'char *'}} } void mismatch(void *p __attribute__((pass_object_size(0)))); // expected-note {{previous declaration is here}} diff --git a/clang/test/Sema/ptrauth-atomic-ops.c b/clang/test/Sema/ptrauth-atomic-ops.c index 8872090d83b8d..a5411f2e197b3 100644 --- a/clang/test/Sema/ptrauth-atomic-ops.c +++ b/clang/test/Sema/ptrauth-atomic-ops.c @@ -50,7 +50,7 @@ void f() { __c11_atomic_store(ATOMIZE(non_addr_discriminatedauthenticated_ptr), 0, memory_order_relaxed); __c11_atomic_load(ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); __atomic_store(&j, ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_release); - // expected-warning@-1 {{incompatible pointer types passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'int *'}} + // expected-error@-1 {{incompatible pointer types passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'int *'}} __c11_atomic_exchange(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} __c11_atomic_fetch_add(ATOMIZE(non_addr_discriminatedauthenticated_ptr), ATOMIZE(j), memory_order_seq_cst); diff --git a/clang/test/Sema/ptrauth.c 
b/clang/test/Sema/ptrauth.c index b4e5214a7cb50..6c4b92b0104df 100644 --- a/clang/test/Sema/ptrauth.c +++ b/clang/test/Sema/ptrauth.c @@ -35,7 +35,7 @@ void test_strip(int *dp, int (*fp)(int)) { int (*fr)(int) = __builtin_ptrauth_strip(fp, VALID_CODE_KEY); fr = __builtin_ptrauth_strip(fp, INVALID_KEY); // expected-error {{does not identify a valid pointer authentication key for the current target}} - float *mismatch = __builtin_ptrauth_strip(dp, VALID_DATA_KEY); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} + float *mismatch = __builtin_ptrauth_strip(dp, VALID_DATA_KEY); // expected-error {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} } void test_blend_discriminator(int *dp, int (*fp)(int), int value) { @@ -55,7 +55,7 @@ void test_string_discriminator(const char *str) { (void) __builtin_ptrauth_string_discriminator("test string"); // no warning __builtin_ptrauth_string_discriminator(str); // expected-error {{argument must be a string literal}} - __builtin_ptrauth_string_discriminator(L"wide test"); // expected-error {{argument must be a string literal}} expected-warning {{incompatible pointer types passing 'int[10]' to parameter of type 'const char *'}} + __builtin_ptrauth_string_discriminator(L"wide test"); // expected-error {{argument must be a string literal}} expected-error {{incompatible pointer types passing 'int[10]' to parameter of type 'const char *'}} void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type '__size_t'}} } @@ -77,7 +77,7 @@ void test_sign_unauthenticated(int *dp, int (*fp)(int)) { int (*fr)(int) = __builtin_ptrauth_sign_unauthenticated(fp, VALID_CODE_KEY, 0); fr = __builtin_ptrauth_sign_unauthenticated(fp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current 
target}} - float *mismatch = __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, 0); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} + float *mismatch = __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, 0); // expected-error {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} } void test_auth(int *dp, int (*fp)(int)) { @@ -96,7 +96,7 @@ void test_auth(int *dp, int (*fp)(int)) { int (*fr)(int) = __builtin_ptrauth_auth(fp, VALID_CODE_KEY, 0); fr = __builtin_ptrauth_auth(fp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}} - float *mismatch = __builtin_ptrauth_auth(dp, VALID_DATA_KEY, 0); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} + float *mismatch = __builtin_ptrauth_auth(dp, VALID_DATA_KEY, 0); // expected-error {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} } void test_auth_and_resign(int *dp, int (*fp)(int)) { @@ -119,7 +119,7 @@ void test_auth_and_resign(int *dp, int (*fp)(int)) { fr = __builtin_ptrauth_auth_and_resign(fp, INVALID_KEY, 0, VALID_CODE_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}} fr = __builtin_ptrauth_auth_and_resign(fp, VALID_CODE_KEY, 0, INVALID_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}} - float *mismatch = __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} + float *mismatch = __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp); // expected-error {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} } void test_sign_generic_data(int *dp) { @@ 
-152,7 +152,7 @@ int *t_cst_sig3 = __builtin_ptrauth_sign_constant(mismatched_type, VALID_DATA_KE int *t_cst_sig4 = __builtin_ptrauth_sign_constant(&dv, mismatched_type, 0); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}} int *t_cst_sig5 = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}} -float *t_cst_result = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, 0); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} +float *t_cst_result = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, 0); // expected-error {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} int *t_cst_valid1 = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, 0); int *t_cst_valid2 = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, __builtin_ptrauth_blend_discriminator(&dv, 0)); @@ -184,7 +184,7 @@ void test_sign_constant(int *dp, fp_t fp) { int *sig4 = __builtin_ptrauth_sign_constant(&dv, mismatched_type, 0); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}} int *sig5 = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}} - float *result = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, 0); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} + float *result = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, 0); // expected-error {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}} int *valid1 = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, 0); int *valid2 = __builtin_ptrauth_sign_constant(&dv, VALID_DATA_KEY, __builtin_ptrauth_blend_discriminator(&dv, 0)); diff --git 
a/clang/test/Sema/static-array.c b/clang/test/Sema/static-array.c index 87efb4894f1bb..3574a106c7104 100644 --- a/clang/test/Sema/static-array.c +++ b/clang/test/Sema/static-array.c @@ -23,11 +23,11 @@ void f(int *p) { char d[4]; cat((int *)d); // expected-warning {{array argument is too small; is of size 4, callee requires at least 12}} - cat(d); // expected-warning {{array argument is too small; is of size 4, callee requires at least 12}} expected-warning {{incompatible pointer types}} + cat(d); // expected-warning {{array argument is too small; is of size 4, callee requires at least 12}} expected-error {{incompatible pointer types}} char e[12]; cat((int *)e); - cat(e); // expected-warning {{incompatible pointer types}} + cat(e); // expected-error {{incompatible pointer types}} } diff --git a/clang/test/Sema/struct-compat.c b/clang/test/Sema/struct-compat.c index 3bc66b0b9d88e..eb735f0bcfc90 100644 --- a/clang/test/Sema/struct-compat.c +++ b/clang/test/Sema/struct-compat.c @@ -8,7 +8,7 @@ struct x; int a(struct x* b) { // Per C99 6.7.2.3, since the outer and inner "struct x"es have different // scopes, they don't refer to the same type, and are therefore incompatible -struct x {int a;} *c = b; // expected-warning {{incompatible pointer types}} +struct x {int a;} *c = b; // expected-error {{incompatible pointer types}} } struct x {int a;} r; diff --git a/clang/test/Sema/var-redecl.c b/clang/test/Sema/var-redecl.c index 30f1fb229d8c8..f915403ba9255 100644 --- a/clang/test/Sema/var-redecl.c +++ b/clang/test/Sema/var-redecl.c @@ -54,7 +54,7 @@ void g18(void) { // expected-note{{'g18' declared here}} extern int g19; } int *p=&g19; // expected-error{{use of undeclared identifier 'g19'}} \ - // expected-warning{{incompatible pointer types}} + // expected-error{{incompatible pointer types}} // PR3645 static int a; diff --git a/clang/test/Sema/vector-assign.c b/clang/test/Sema/vector-assign.c index d7972ed8b4c7b..119a320585ef6 100644 --- a/clang/test/Sema/vector-assign.c 
+++ b/clang/test/Sema/vector-assign.c @@ -49,5 +49,5 @@ longlongvec; void test3a(longlongvec *); // expected-note{{passing argument to parameter here}} void test3(const unsigned *src) { - test3a(src); // expected-warning {{incompatible pointer types passing 'const unsigned int *' to parameter of type 'longlongvec *'}} + test3a(src); // expected-error {{incompatible pointer types passing 'const unsigned int *' to parameter of type 'longlongvec *'}} } diff --git a/clang/test/Sema/vector-bool-assign.c b/clang/test/Sema/vector-bool-assign.c new file mode 100644 index 0000000000000..8f9d7b17a1ecb --- /dev/null +++ b/clang/test/Sema/vector-bool-assign.c @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -triple x86_64 -fsyntax-only -verify %s + +typedef _Bool bool; + +typedef __attribute__((ext_vector_type(8))) int v8i; +typedef __attribute__((ext_vector_type(8))) bool v8b; +typedef __attribute__((ext_vector_type(4))) float v4f; +typedef __attribute__((ext_vector_type(4))) bool v4b; + +void foo(v8b); + +v8b integral(v8i v) { + v8b m1 = __builtin_convertvector(v, __attribute__((ext_vector_type(8))) int); + v8b m2 = __builtin_convertvector(v, __attribute__((ext_vector_type(8))) unsigned); + v8b m3 = __builtin_convertvector(v, __attribute__((ext_vector_type(8))) long); + v8b m4 = __builtin_convertvector(v, __attribute__((ext_vector_type(8))) unsigned long); + v8b m5 = __builtin_convertvector(v, __attribute__((ext_vector_type(8))) char); + v8b m6 = __builtin_convertvector(v, __attribute__((ext_vector_type(8))) unsigned char); + foo(v); + return v; +} + +v4b non_integral(v4f vf) { + return vf; // expected-error{{returning 'v4f' (vector of 4 'float' values) from a function with incompatible result type 'v4b' (vector of 4 'bool' values}} +} + +v4b size_mismatch(v8i v) { + return v; // expected-error{{returning 'v8i' (vector of 8 'int' values) from a function with incompatible result type 'v4b' (vector of 4 'bool' values)}} +} diff --git a/clang/test/Sema/vector-bool-assign.cpp 
b/clang/test/Sema/vector-bool-assign.cpp new file mode 100644 index 0000000000000..115ec50f6b448 --- /dev/null +++ b/clang/test/Sema/vector-bool-assign.cpp @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -triple x86_64 -fsyntax-only -verify %s + +using v8i = int [[clang::ext_vector_type(8)]]; +using v8b = bool [[clang::ext_vector_type(8)]]; +using v4f = float [[clang::ext_vector_type(4)]]; +using v4b = bool [[clang::ext_vector_type(4)]]; + +void foo(v8b); + +v8b integral(v8i v) { + v8b m1 = __builtin_convertvector(v, int [[clang::ext_vector_type(8)]]); + v8b m2 = __builtin_convertvector(v, unsigned [[clang::ext_vector_type(8)]]); + v8b m3 = __builtin_convertvector(v, long [[clang::ext_vector_type(8)]]); + v8b m4 = __builtin_convertvector(v, unsigned long [[clang::ext_vector_type(8)]]); + v8b m5 = __builtin_convertvector(v, char [[clang::ext_vector_type(8)]]); + v8b m6 = __builtin_convertvector(v, unsigned char [[clang::ext_vector_type(8)]]); + foo(v); + return v; +} + +v4b non_integral(v4f vf) { + return vf; // expected-error{{cannot initialize return object of type 'v4b' (vector of 4 'bool' values) with an lvalue of type 'v4f' (vector of 4 'float' values)}} +} + +v4b size_mismatch(v8i v) { + return v; // expected-error{{cannot initialize return object of type 'v4b' (vector of 4 'bool' values) with an lvalue of type 'v8i' (vector of 8 'int' values)}} +} diff --git a/clang/test/Sema/vla.c b/clang/test/Sema/vla.c index 54afe7c681bd0..c8b051e7b46de 100644 --- a/clang/test/Sema/vla.c +++ b/clang/test/Sema/vla.c @@ -82,9 +82,9 @@ void VLAPtrAssign(int size) { // This is well formed int (*p)[2][3][size][4][5] = array; // Last array dimension too large - int (*p2)[2][3][size][4][6] = array; // expected-warning {{incompatible pointer types}} + int (*p2)[2][3][size][4][6] = array; // expected-error {{incompatible pointer types}} // Second array dimension too large - int (*p3)[20][3][size][4][5] = array; // expected-warning {{incompatible pointer types}} + int (*p3)[20][3][size][4][5] 
= array; // expected-error {{incompatible pointer types}} // Not illegal in C, program _might_ be well formed if size == 3. int (*p4)[2][size][3][4][5] = array; diff --git a/clang/test/Sema/warn-lifetime-safety.cpp b/clang/test/Sema/warn-lifetime-safety.cpp index 660b9c9d5e243..bc8a5f3f7150f 100644 --- a/clang/test/Sema/warn-lifetime-safety.cpp +++ b/clang/test/Sema/warn-lifetime-safety.cpp @@ -6,6 +6,12 @@ struct MyObj { MyObj operator+(MyObj); }; +struct [[gsl::Pointer()]] View { + View(const MyObj&); // Borrows from MyObj + View(); + void use() const; +}; + //===----------------------------------------------------------------------===// // Basic Definite Use-After-Free (-W...permissive) // These are cases where the pointer is guaranteed to be dangling at the use site. @@ -20,12 +26,31 @@ void definite_simple_case() { (void)*p; // expected-note {{later used here}} } +void definite_simple_case_gsl() { + View v; + { + MyObj s; + v = s; // expected-warning {{object whose reference is captured does not live long enough}} + } // expected-note {{destroyed here}} + v.use(); // expected-note {{later used here}} +} + void no_use_no_error() { MyObj* p; { MyObj s; p = &s; } + // 'p' is dangling here, but since it is never used, no warning is issued. +} + +void no_use_no_error_gsl() { + View v; + { + MyObj s; + v = s; + } + // 'v' is dangling here, but since it is never used, no warning is issued. 
} void definite_pointer_chain() { @@ -39,6 +64,16 @@ void definite_pointer_chain() { (void)*q; // expected-note {{later used here}} } +void definite_propagation_gsl() { + View v1, v2; + { + MyObj s; + v1 = s; // expected-warning {{object whose reference is captured does not live long enough}} + v2 = v1; + } // expected-note {{destroyed here}} + v2.use(); // expected-note {{later used here}} +} + void definite_multiple_uses_one_warning() { MyObj* p; { @@ -78,6 +113,19 @@ void definite_single_pointer_multiple_loans(bool cond) { (void)*p; // expected-note 2 {{later used here}} } +void definite_single_pointer_multiple_loans_gsl(bool cond) { + View v; + if (cond){ + MyObj s; + v = s; // expected-warning {{object whose reference is captured does not live long enough}} + } // expected-note {{destroyed here}} + else { + MyObj t; + v = t; // expected-warning {{object whose reference is captured does not live long enough}} + } // expected-note {{destroyed here}} + v.use(); // expected-note 2 {{later used here}} +} + //===----------------------------------------------------------------------===// // Potential (Maybe) Use-After-Free (-W...strict) @@ -94,18 +142,14 @@ void potential_if_branch(bool cond) { (void)*p; // expected-note {{later used here}} } -// If all paths lead to a dangle, it becomes a definite error. 
-void potential_becomes_definite(bool cond) { - MyObj* p; +void potential_if_branch_gsl(bool cond) { + MyObj safe; + View v = safe; if (cond) { - MyObj temp1; - p = &temp1; // expected-warning {{does not live long enough}} - } // expected-note {{destroyed here}} - else { - MyObj temp2; - p = &temp2; // expected-warning {{does not live long enough}} + MyObj temp; + v = temp; // expected-warning {{object whose reference is captured may not live long enough}} } // expected-note {{destroyed here}} - (void)*p; // expected-note 2 {{later used here}} + v.use(); // expected-note {{later used here}} } void definite_potential_together(bool cond) { @@ -159,6 +203,16 @@ void potential_for_loop_use_after_loop_body(MyObj safe) { (void)*p; // expected-note {{later used here}} } +void potential_for_loop_gsl() { + MyObj safe; + View v = safe; + for (int i = 0; i < 1; ++i) { + MyObj s; + v = s; // expected-warning {{object whose reference is captured may not live long enough}} + } // expected-note {{destroyed here}} + v.use(); // expected-note {{later used here}} +} + void potential_for_loop_use_before_loop_body(MyObj safe) { MyObj* p = &safe; for (int i = 0; i < 1; ++i) { @@ -182,6 +236,19 @@ void potential_loop_with_break(bool cond) { (void)*p; // expected-note {{later used here}} } +void potential_loop_with_break_gsl(bool cond) { + MyObj safe; + View v = safe; + for (int i = 0; i < 10; ++i) { + if (cond) { + MyObj temp; + v = temp; // expected-warning {{object whose reference is captured may not live long enough}} + break; // expected-note {{destroyed here}} + } + } + v.use(); // expected-note {{later used here}} +} + void potential_multiple_expiry_of_same_loan(bool cond) { // Choose the last expiry location for the loan. 
MyObj safe; @@ -258,6 +325,28 @@ void definite_switch(int mode) { (void)*p; // expected-note 3 {{later used here}} } +void definite_switch_gsl(int mode) { + View v; + switch (mode) { + case 1: { + MyObj temp1; + v = temp1; // expected-warning {{object whose reference is captured does not live long enough}} + break; // expected-note {{destroyed here}} + } + case 2: { + MyObj temp2; + v = temp2; // expected-warning {{object whose reference is captured does not live long enough}} + break; // expected-note {{destroyed here}} + } + default: { + MyObj temp3; + v = temp3; // expected-warning {{object whose reference is captured does not live long enough}} + break; // expected-note {{destroyed here}} + } + } + v.use(); // expected-note 3 {{later used here}} +} + //===----------------------------------------------------------------------===// // No-Error Cases //===----------------------------------------------------------------------===// @@ -271,3 +360,14 @@ void no_error_if_dangle_then_rescue() { p = &safe; // p is "rescued" before use. (void)*p; // This is safe. } + +void no_error_if_dangle_then_rescue_gsl() { + MyObj safe; + View v; + { + MyObj temp; + v = temp; // 'v' is temporarily dangling. + } + v = safe; // 'v' is "rescued" before use by reassigning to a valid object. + v.use(); // This is safe. 
+} diff --git a/clang/test/SemaCUDA/consteval-func.cu b/clang/test/SemaCUDA/consteval-func.cu new file mode 100644 index 0000000000000..293c1ce85830a --- /dev/null +++ b/clang/test/SemaCUDA/consteval-func.cu @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %s + +// expected-no-diagnostics + +#include "Inputs/cuda.h" + +__device__ consteval int f() { return 0; } +int main() { return f(); } diff --git a/clang/test/SemaCXX/PR51712-large-array-constexpr-check-oom.cpp b/clang/test/SemaCXX/PR51712-large-array-constexpr-check-oom.cpp index 98e1a9afae6ea..df5d8c513d514 100644 --- a/clang/test/SemaCXX/PR51712-large-array-constexpr-check-oom.cpp +++ b/clang/test/SemaCXX/PR51712-large-array-constexpr-check-oom.cpp @@ -1,7 +1,6 @@ // Only run this test where ulimit is known to work well. // (There's nothing really platform-specific being tested, this is just ulimit). // -// REQUIRES: shell // REQUIRES: system-linux // UNSUPPORTED: msan // UNSUPPORTED: asan diff --git a/clang/test/SemaCXX/PR68605.cpp b/clang/test/SemaCXX/PR68605.cpp new file mode 100644 index 0000000000000..97eb858b77246 --- /dev/null +++ b/clang/test/SemaCXX/PR68605.cpp @@ -0,0 +1,72 @@ +// RUN: %clang_cc1 -verify -fsyntax-only -std=c++20 -Wshadow %s +// RUN: %clang_cc1 -verify=all -fsyntax-only -std=c++20 -Wshadow-all %s + +// Test for issue #68605: Inconsistent shadow warnings for lambda capture of structured bindings. +// +// The issue was that structured binding lambda captures were incorrectly classified +// as regular shadow warnings (shown with -Wshadow) while regular parameter captures +// were classified as uncaptured-local warnings (shown only with -Wshadow-all). +// +// This test validates that both VarDecl and BindingDecl lambda captures now +// behave consistently: no warnings with -Wshadow, but uncaptured-local warnings +// with -Wshadow-all. 
+ +namespace std { + template T&& move(T&& t) { return static_cast(t); } +} + +namespace issue_68605 { + +// Simple pair-like struct for testing +struct Pair { + int first; + int second; + Pair(int f, int s) : first(f), second(s) {} +}; + +// Test case 1: Regular parameter - consistent behavior +void foo1(Pair val) { // all-note {{previous declaration is here}} + [val = std::move(val)](){}(); // all-warning {{declaration shadows a local variable}} +} + +// Test case 2: Structured binding - now consistent with regular parameter +void foo2(Pair val) { + auto [a,b] = val; // all-note {{previous declaration is here}} + [a = std::move(a)](){}(); // all-warning {{declaration shadows a structured binding}} +} + +// Test case 3: Multiple captures showing consistent behavior +void foo3() { + Pair data{42, 100}; + auto [id, value] = data; // all-note 2{{previous declaration is here}} + + // Both show consistent uncaptured-local warnings with -Wshadow-all + auto lambda1 = [id = id](){ return id; }; // all-warning {{declaration shadows a structured binding}} + auto lambda2 = [value = value](){ return value; }; // all-warning {{declaration shadows a structured binding}} +} + +// Test case 4: Mixed scenario showing consistent behavior +void foo4() { + int regular_var = 10; // all-note {{previous declaration is here}} + Pair pair_data{1, 2}; + auto [x, y] = pair_data; // all-note 2{{previous declaration is here}} + + // All captures now show consistent uncaptured-local warnings with -Wshadow-all + auto lambda1 = [regular_var = regular_var](){}; // all-warning {{declaration shadows a local variable}} + auto lambda2 = [x = x](){}; // all-warning {{declaration shadows a structured binding}} + auto lambda3 = [y = y](){}; // all-warning {{declaration shadows a structured binding}} +} + +// Test case 5: Ensure we don't break existing shadow detection for actual shadowing +void foo5() { + int outer = 5; // expected-note {{previous declaration is here}} all-note {{previous declaration is 
here}} + auto [a, b] = Pair{1, 2}; // expected-note {{previous declaration is here}} all-note {{previous declaration is here}} + + // This SHOULD still warn - it's actual shadowing within the lambda body + auto lambda = [outer, a](){ // expected-note {{variable 'outer' is explicitly captured here}} all-note {{variable 'outer' is explicitly captured here}} expected-note {{variable 'a' is explicitly captured here}} all-note {{variable 'a' is explicitly captured here}} + int outer = 10; // expected-warning {{declaration shadows a local variable}} all-warning {{declaration shadows a local variable}} + int a = 20; // expected-warning {{declaration shadows a structured binding}} all-warning {{declaration shadows a structured binding}} + }; +} + +} // namespace issue_68605 diff --git a/clang/test/SemaCXX/reinterpret-cast.cpp b/clang/test/SemaCXX/reinterpret-cast.cpp index bfb808773b900..10b2ed183e2a5 100644 --- a/clang/test/SemaCXX/reinterpret-cast.cpp +++ b/clang/test/SemaCXX/reinterpret-cast.cpp @@ -167,6 +167,10 @@ void dereference_reinterpret_cast() { (void)reinterpret_cast(d); // expected-warning {{reinterpret_cast from 'double' to 'float &' has undefined behavior}} (void)*reinterpret_cast(&d); // expected-warning {{dereference of type 'float *' that was reinterpret_cast from type 'double *' has undefined behavior}} + // Look through parens + (void)*(reinterpret_cast(&l)); // expected-warning {{dereference of type 'double *' that was reinterpret_cast from type 'long *' has undefined behavior}} + (void)*((reinterpret_cast((&l)))); // expected-warning {{dereference of type 'double *' that was reinterpret_cast from type 'long *' has undefined behavior}} + // TODO: add warning for tag types (void)reinterpret_cast(b); (void)*reinterpret_cast(&b); diff --git a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp index d54b394df4eb8..2388c5f16e4ca 100644 --- a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp +++ 
b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp @@ -258,10 +258,15 @@ struct S { }; int foo() { - auto [a] = S{0}; // expected-note {{previous}} \ - // cxx14-warning {{decomposition declarations are a C++17 extension}} +#ifdef AVOID + auto [a] = S{0}; // cxx14-warning {{decomposition declarations are a C++17 extension}} + [a = a] () { // No warning with basic -Wshadow due to uncaptured-local classification + }(); +#else + auto [a] = S{0}; // cxx14-warning {{decomposition declarations are a C++17 extension}} expected-note {{previous declaration is here}} [a = a] () { // expected-warning {{declaration shadows a structured binding}} }(); +#endif } } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-debug-unclaimed/warn-unsafe-buffer-usage-debug-unclaimed.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-debug-unclaimed/warn-unsafe-buffer-usage-debug-unclaimed.cpp index ab3d925753d47..64dede2568df1 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-debug-unclaimed/warn-unsafe-buffer-usage-debug-unclaimed.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-debug-unclaimed/warn-unsafe-buffer-usage-debug-unclaimed.cpp @@ -13,7 +13,6 @@ // This debugging facility is only available in debug builds. 
// // REQUIRES: asserts -// REQUIRES: shell void test_unclaimed_use(int *p) { // expected-warning{{'p' is an unsafe pointer used for buffer access}} p++; // expected-note{{used in pointer arithmetic here}} \ diff --git a/clang/test/SemaObjC/arc-decls.m b/clang/test/SemaObjC/arc-decls.m index 2b4a9b0319def..3ff4fc0dd50dc 100644 --- a/clang/test/SemaObjC/arc-decls.m +++ b/clang/test/SemaObjC/arc-decls.m @@ -131,7 +131,7 @@ void test7(void) { id *px = &x; // expected-error {{pointer to non-const type 'id' with no explicit ownership}} I *y; - J **py = &y; // expected-error {{pointer to non-const type 'J *' with no explicit ownership}} expected-warning {{incompatible pointer types initializing}} + J **py = &y; // expected-error {{pointer to non-const type 'J *' with no explicit ownership}} expected-error {{incompatible pointer types initializing}} } void func(void) __attribute__((objc_ownership(none))); // expected-warning {{'objc_ownership' only applies to Objective-C object or block pointer types; type here is 'void (void)'}} diff --git a/clang/test/SemaObjC/arc-objcbridge-related-attribute.m b/clang/test/SemaObjC/arc-objcbridge-related-attribute.m index 7fd9f804a83e2..da291727e3d2b 100644 --- a/clang/test/SemaObjC/arc-objcbridge-related-attribute.m +++ b/clang/test/SemaObjC/arc-objcbridge-related-attribute.m @@ -24,9 +24,9 @@ - (NSColor *)backgroundColor; } NSColor * Test2(NSTextField *textField, CGColorRef1 newColor) { - foo(newColor); // expected-warning {{incompatible pointer types passing 'CGColorRef1' (aka 'struct CGColor1 *') to parameter of type 'NSColor *'}} - textField.backgroundColor = newColor; // expected-warning {{incompatible pointer types assigning to 'NSColor *__strong' from 'CGColorRef1' (aka 'struct CGColor1 *')}} - return newColor; // expected-warning {{incompatible pointer types returning 'CGColorRef1' (aka 'struct CGColor1 *') from a function with result type 'NSColor *'}} + foo(newColor); // expected-error {{incompatible pointer types passing 
'CGColorRef1' (aka 'struct CGColor1 *') to parameter of type 'NSColor *'}} + textField.backgroundColor = newColor; // expected-error {{incompatible pointer types assigning to 'NSColor *__strong' from 'CGColorRef1' (aka 'struct CGColor1 *')}} + return newColor; // expected-error {{incompatible pointer types returning 'CGColorRef1' (aka 'struct CGColor1 *') from a function with result type 'NSColor *'}} } CGColorRef Test3(NSTextField *textField, CGColorRef newColor) { @@ -35,6 +35,6 @@ CGColorRef Test3(NSTextField *textField, CGColorRef newColor) { } CGColorRef2 Test4(NSTextField *textField, CGColorRef2 newColor) { - newColor = textField.backgroundColor; // expected-warning {{incompatible pointer types assigning}} - return textField.backgroundColor; // expected-warning {{incompatible pointer types returning}} + newColor = textField.backgroundColor; // expected-error {{incompatible pointer types assigning}} + return textField.backgroundColor; // expected-error {{incompatible pointer types returning}} } diff --git a/clang/test/SemaObjC/arc.m b/clang/test/SemaObjC/arc.m index 7cc4d824ab52f..010d576f414f7 100644 --- a/clang/test/SemaObjC/arc.m +++ b/clang/test/SemaObjC/arc.m @@ -751,9 +751,9 @@ @interface NSMutableArray : NSArray @end typedef __strong NSMutableArray * PSNS; void test(NSArray *x) { - NSMutableArray *y = x; // expected-warning {{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} - __strong NSMutableArray *y1 = x; // expected-warning {{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} - PSNS y2 = x; // expected-warning {{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} + NSMutableArray *y = x; // expected-error {{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} + __strong NSMutableArray *y1 = x; // expected-error {{incompatible pointer types initializing 
'NSMutableArray *' with an expression of type 'NSArray *'}} + PSNS y2 = x; // expected-error {{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} } @class NSString; diff --git a/clang/test/SemaObjC/attr-objc-NSObject.m b/clang/test/SemaObjC/attr-objc-NSObject.m index 76a01dcef0163..6f5c369c56a7f 100644 --- a/clang/test/SemaObjC/attr-objc-NSObject.m +++ b/clang/test/SemaObjC/attr-objc-NSObject.m @@ -18,6 +18,6 @@ void good() { void bad() { BarRef object; NSArray *array; // expected-error {{type argument 'BarRef' (aka 'struct Bar *') is neither an Objective-C object nor a block type}} - [array containsObject:object]; // expected-warning {{incompatible pointer types sending 'BarRef' (aka 'struct Bar *') to parameter of type 'id'}} + [array containsObject:object]; // expected-error {{incompatible pointer types sending 'BarRef' (aka 'struct Bar *') to parameter of type 'id'}} [object description]; // expected-warning {{receiver type 'BarRef' (aka 'struct Bar *') is not 'id' or interface pointer, consider casting it to 'id'}} } diff --git a/clang/test/SemaObjC/check-objcbridge-related-attribute-lookup.m b/clang/test/SemaObjC/check-objcbridge-related-attribute-lookup.m index 89486f0336422..fe3851717ceae 100644 --- a/clang/test/SemaObjC/check-objcbridge-related-attribute-lookup.m +++ b/clang/test/SemaObjC/check-objcbridge-related-attribute-lookup.m @@ -20,21 +20,21 @@ - (NSColor *)backgroundColor; NSColor * Test1(NSTextField *textField, CGColorRef newColor) { textField.backgroundColor = newColor; // expected-error {{'CGColorRef' (aka 'struct CGColor *') must be explicitly converted to 'NSColor *'; use '+colorXWithCGColor:' method for this conversion}} \ - // expected-warning {{incompatible pointer types assigning to 'NSColor *' from 'CGColorRef' (aka 'struct CGColor *')}} + // expected-error {{incompatible pointer types assigning to 'NSColor *' from 'CGColorRef' (aka 'struct CGColor *')}} newColor = 
textField.backgroundColor; // expected-error {{'NSColor *' must be explicitly converted to 'CGColorRef' (aka 'struct CGColor *'); use '-CXGColor' method for this conversion}} \ - // expected-warning {{incompatible pointer types assigning to 'CGColorRef' (aka 'struct CGColor *') from 'NSColor *'}} + // expected-error {{incompatible pointer types assigning to 'CGColorRef' (aka 'struct CGColor *') from 'NSColor *'}} } NSColor * Test2(NSTextField *textField, CGColorRef1 newColor) { textField.backgroundColor = newColor; // expected-error {{could not find Objective-C class 'XNSColor' to convert 'CGColorRef1' (aka 'struct CGColor1 *') to 'NSColor *'}} \ - // expected-warning {{incompatible pointer types assigning to 'NSColor *' from 'CGColorRef1' (aka 'struct CGColor1 *')}} + // expected-error {{incompatible pointer types assigning to 'NSColor *' from 'CGColorRef1' (aka 'struct CGColor1 *')}} newColor = textField.backgroundColor ; // expected-error {{could not find Objective-C class 'XNSColor' to convert 'NSColor *' to 'CGColorRef1' (aka 'struct CGColor1 *')}} \ - // expected-warning {{incompatible pointer types assigning to 'CGColorRef1' (aka 'struct CGColor1 *') from 'NSColor *'}} + // expected-error {{incompatible pointer types assigning to 'CGColorRef1' (aka 'struct CGColor1 *') from 'NSColor *'}} } NSColor * Test3(NSTextField *textField, CGColorRef2 newColor) { textField.backgroundColor = newColor; // expected-error {{'PNsColor' must be name of an Objective-C class to be able to convert 'CGColorRef2' (aka 'struct CGColor2 *') to 'NSColor *'}} \ - // expected-warning {{incompatible pointer types assigning to 'NSColor *' from 'CGColorRef2' (aka 'struct CGColor2 *')}} + // expected-error {{incompatible pointer types assigning to 'NSColor *' from 'CGColorRef2' (aka 'struct CGColor2 *')}} newColor = textField.backgroundColor; // expected-error {{'PNsColor' must be name of an Objective-C class to be able to convert 'NSColor *' to 'CGColorRef2' (aka 'struct CGColor2 *')}} \ 
- // expected-warning {{incompatible pointer types assigning to 'CGColorRef2' (aka 'struct CGColor2 *') from 'NSColor *'}} + // expected-error {{incompatible pointer types assigning to 'CGColorRef2' (aka 'struct CGColor2 *') from 'NSColor *'}} } diff --git a/clang/test/SemaObjC/class-method-self.m b/clang/test/SemaObjC/class-method-self.m index 821160c884144..4c0bcc73c1f86 100644 --- a/clang/test/SemaObjC/class-method-self.m +++ b/clang/test/SemaObjC/class-method-self.m @@ -17,8 +17,8 @@ @implementation YY static XX *obj; + (void)classMethod { - [obj addObserver:self]; // expected-warning {{incompatible pointer types sending 'Class' to parameter of type 'XX *'}} + [obj addObserver:self]; // expected-error {{incompatible pointer types sending 'Class' to parameter of type 'XX *'}} Class whatever; - [obj addObserver:whatever]; // expected-warning {{incompatible pointer types sending 'Class' to parameter of type 'XX *'}} + [obj addObserver:whatever]; // expected-error {{incompatible pointer types sending 'Class' to parameter of type 'XX *'}} } @end diff --git a/clang/test/SemaObjC/comptypes-1.m b/clang/test/SemaObjC/comptypes-1.m index a18ebd13ce1b6..3829ad386ee3e 100644 --- a/clang/test/SemaObjC/comptypes-1.m +++ b/clang/test/SemaObjC/comptypes-1.m @@ -37,9 +37,9 @@ int main(void) warning, unless done from an 'id'. 
*/ obj_c = obj; /* Ok */ obj_c = obj_p; // expected-warning {{assigning to 'MyClass *' from incompatible type 'id'}} - obj_c = obj_cp; // expected-warning {{incompatible pointer types assigning to 'MyClass *' from 'MyOtherClass *'}} - obj_c = obj_C; // expected-warning {{incompatible pointer types assigning to 'MyClass *' from 'Class'}} - obj_c = obj_CP; // expected-warning {{incompatible pointer types assigning to 'MyClass *' from 'Class'}} + obj_c = obj_cp; // expected-error {{incompatible pointer types assigning to 'MyClass *' from 'MyOtherClass *'}} + obj_c = obj_C; // expected-error {{incompatible pointer types assigning to 'MyClass *' from 'Class'}} + obj_c = obj_CP; // expected-error {{incompatible pointer types assigning to 'MyClass *' from 'Class'}} /* Assigning to an 'id' variable should generate a warning if done from a 'MyClass *' (which doesn't implement @@ -48,28 +48,28 @@ int main(void) obj_p = obj; /* Ok */ obj_p = obj_c; // expected-warning {{assigning to 'id' from incompatible type 'MyClass *'}} obj_p = obj_cp; /* Ok */ - obj_p = obj_C; // expected-warning {{incompatible pointer types assigning to 'id' from 'Class'}} + obj_p = obj_C; // expected-error {{incompatible pointer types assigning to 'id' from 'Class'}} obj_p = obj_CP; // expected-warning {{assigning to 'id' from incompatible type 'Class'}} /* Assigning to a 'MyOtherClass *' variable should always generate a warning, unless done from an 'id' or an 'id' (since MyOtherClass implements MyProtocol). 
*/ obj_cp = obj; /* Ok */ - obj_cp = obj_c; // expected-warning {{incompatible pointer types assigning to 'MyOtherClass *' from 'MyClass *'}} + obj_cp = obj_c; // expected-error {{incompatible pointer types assigning to 'MyOtherClass *' from 'MyClass *'}} obj_cp = obj_p; /* Ok */ - obj_cp = obj_C; // expected-warning {{incompatible pointer types assigning to 'MyOtherClass *' from 'Class'}} - obj_cp = obj_CP; // expected-warning {{incompatible pointer types assigning to 'MyOtherClass *' from 'Class'}} + obj_cp = obj_C; // expected-error {{incompatible pointer types assigning to 'MyOtherClass *' from 'Class'}} + obj_cp = obj_CP; // expected-error {{incompatible pointer types assigning to 'MyOtherClass *' from 'Class'}} obj_C = obj; // Ok - obj_C = obj_p; // expected-warning {{incompatible pointer types assigning to 'Class' from 'id'}} - obj_C = obj_c; // expected-warning {{incompatible pointer types assigning to 'Class' from 'MyClass *'}} - obj_C = obj_cp; // expected-warning {{incompatible pointer types assigning to 'Class' from 'MyOtherClass *'}} + obj_C = obj_p; // expected-error {{incompatible pointer types assigning to 'Class' from 'id'}} + obj_C = obj_c; // expected-error {{incompatible pointer types assigning to 'Class' from 'MyClass *'}} + obj_C = obj_cp; // expected-error {{incompatible pointer types assigning to 'Class' from 'MyOtherClass *'}} obj_C = obj_CP; // Ok obj_CP = obj; // Ok obj_CP = obj_p; // expected-warning {{assigning to 'Class' from incompatible type 'id'}} - obj_CP = obj_c; // expected-warning {{incompatible pointer types assigning to 'Class' from 'MyClass *}} - obj_CP = obj_cp; // expected-warning {{incompatible pointer types assigning to 'Class' from 'MyOtherClass *'}} + obj_CP = obj_c; // expected-error {{incompatible pointer types assigning to 'Class' from 'MyClass *}} + obj_CP = obj_cp; // expected-error {{incompatible pointer types assigning to 'Class' from 'MyOtherClass *'}} obj_CP = obj_C; // Ok /* Any comparison involving an 'id' 
must be without warnings. */ diff --git a/clang/test/SemaObjC/comptypes-4.m b/clang/test/SemaObjC/comptypes-4.m index f1c2e54570c74..b939af2defead 100644 --- a/clang/test/SemaObjC/comptypes-4.m +++ b/clang/test/SemaObjC/comptypes-4.m @@ -12,7 +12,7 @@ int main(void) MyClass *obj_cp; obj_cp = obj_p; - obj_p = obj_cp; // expected-warning {{incompatible pointer types assigning to 'MyClass *' from 'MyClass *'}} + obj_p = obj_cp; // expected-error {{incompatible pointer types assigning to 'MyClass *' from 'MyClass *'}} if (obj_cp == obj_p) foo(); diff --git a/clang/test/SemaObjC/comptypes-5.m b/clang/test/SemaObjC/comptypes-5.m index 4b50684938c9b..fdf075b387bd7 100644 --- a/clang/test/SemaObjC/comptypes-5.m +++ b/clang/test/SemaObjC/comptypes-5.m @@ -38,7 +38,7 @@ int main(void) obj_c_cat_p = obj_c_super_p; // ok. obj_c_cat_p = obj_c_super_p_q; // ok. - obj_c_super_p = obj_c_cat_p_q; // expected-warning {{incompatible pointer types}} + obj_c_super_p = obj_c_cat_p_q; // expected-error {{incompatible pointer types}} obj_c_cat_p_q = obj_c_super_p; return 0; } diff --git a/clang/test/SemaObjC/comptypes-6.m b/clang/test/SemaObjC/comptypes-6.m index 98cf488792123..0e1c3e31035c1 100644 --- a/clang/test/SemaObjC/comptypes-6.m +++ b/clang/test/SemaObjC/comptypes-6.m @@ -9,7 +9,7 @@ @interface Object @end static Derived *test(void) { - Derived *m = foo(); // expected-warning {{incompatible pointer types initializing 'Derived *' with an expression of type 'Object *'}} + Derived *m = foo(); // expected-error {{incompatible pointer types initializing 'Derived *' with an expression of type 'Object *'}} return m; } diff --git a/clang/test/SemaObjC/comptypes-7.m b/clang/test/SemaObjC/comptypes-7.m index 5de24837c7bf8..eac06c89e8809 100644 --- a/clang/test/SemaObjC/comptypes-7.m +++ b/clang/test/SemaObjC/comptypes-7.m @@ -25,26 +25,26 @@ int main(void) incompatible integer to/from pointer conversions default to an error. 
*/ obj = i; // expected-error {{incompatible integer to pointer conversion assigning to 'id' from 'int'}} - obj = j; // expected-warning {{incompatible pointer types assigning to 'id' from 'int *'}} + obj = j; // expected-error {{incompatible pointer types assigning to 'id' from 'int *'}} obj_p = i; // expected-error {{incompatible integer to pointer conversion assigning to 'id' from 'int'}} - obj_p = j; // expected-warning {{incompatible pointer types assigning to 'id' from 'int *'}} + obj_p = j; // expected-error {{incompatible pointer types assigning to 'id' from 'int *'}} obj_c = i; // expected-error {{incompatible integer to pointer conversion assigning to 'MyClass *' from 'int'}} - obj_c = j; // expected-warning {{incompatible pointer types assigning to 'MyClass *' from 'int *'}} + obj_c = j; // expected-error {{incompatible pointer types assigning to 'MyClass *' from 'int *'}} obj_C = i; // expected-error {{incompatible integer to pointer conversion assigning to 'Class' from 'int'}} - obj_C = j; // expected-warning {{incompatible pointer types assigning to 'Class' from 'int *'}} + obj_C = j; // expected-error {{incompatible pointer types assigning to 'Class' from 'int *'}} i = obj; // expected-error {{incompatible pointer to integer conversion assigning to 'int' from 'id'}} i = obj_p; // expected-error {{incompatible pointer to integer conversion assigning to 'int' from 'id'}} i = obj_c; // expected-error {{incompatible pointer to integer conversion assigning to 'int' from 'MyClass *'}} i = obj_C; // expected-error {{incompatible pointer to integer conversion assigning to 'int' from 'Class'}} - j = obj; // expected-warning {{incompatible pointer types assigning to 'int *' from 'id'}} - j = obj_p; // expected-warning {{incompatible pointer types assigning to 'int *' from 'id'}} - j = obj_c; // expected-warning {{incompatible pointer types assigning to 'int *' from 'MyClass *'}} - j = obj_C; // expected-warning {{incompatible pointer types assigning to 'int *' 
from 'Class'}} + j = obj; // expected-error {{incompatible pointer types assigning to 'int *' from 'id'}} + j = obj_p; // expected-error {{incompatible pointer types assigning to 'int *' from 'id'}} + j = obj_c; // expected-error {{incompatible pointer types assigning to 'int *' from 'MyClass *'}} + j = obj_C; // expected-error {{incompatible pointer types assigning to 'int *' from 'Class'}} if (obj == i) foo() ; // expected-warning {{comparison between pointer and integer ('id' and 'int')}} if (i == obj) foo() ; // expected-warning {{comparison between pointer and integer ('int' and 'id')}} diff --git a/clang/test/SemaObjC/conditional-expr-2.m b/clang/test/SemaObjC/conditional-expr-2.m index fdf3d1381a87f..23371aa463ab0 100644 --- a/clang/test/SemaObjC/conditional-expr-2.m +++ b/clang/test/SemaObjC/conditional-expr-2.m @@ -25,5 +25,5 @@ void foo (int i, NSKey *NSKeyValueCoding_NullValue, UpdatesList *nukedUpdatesLis obj = i ? NSKeyValueCoding_NullValue : nukedUpdatesList; // expected-warning{{incompatible operand types ('NSKey *' and 'UpdatesList *')}} key = i ? NSKeyValueCoding_NullValue : nukedUpdatesList; // expected-warning{{incompatible operand types ('NSKey *' and 'UpdatesList *')}} key = i ? NSKeyValueCoding_NullValue : keysub; - keysub = i ? NSKeyValueCoding_NullValue : keysub; // expected-warning{{incompatible pointer types assigning to 'KeySub *' from 'NSKey *'}} + keysub = i ? NSKeyValueCoding_NullValue : keysub; // expected-error{{incompatible pointer types assigning to 'KeySub *' from 'NSKey *'}} } diff --git a/clang/test/SemaObjC/conditional-expr.m b/clang/test/SemaObjC/conditional-expr.m index 71bdb1b2d341b..eb72c9b50d491 100644 --- a/clang/test/SemaObjC/conditional-expr.m +++ b/clang/test/SemaObjC/conditional-expr.m @@ -110,7 +110,7 @@ int f8(int a, A *x, A *y) { void f9(int a, A *x, A *y) { id l0 = (a ? x : y ); // Ok. y is of A object type and A is qualified by P0. A *l1 = (a ? x : y ); // Ok. y is of A object type and A is qualified by P0. 
- A *l2 = (a ? x : y ); // expected-warning {{incompatible pointer types initializing 'A *' with an expression of type 'A *'}} + A *l2 = (a ? x : y ); // expected-error {{incompatible pointer types initializing 'A *' with an expression of type 'A *'}} (void)[ (a ? x : y ) intProp ]; // Ok. Common type is A * and P0's property intProp is accessed. } @@ -123,9 +123,9 @@ void f11(int a, id x, id y) { } void f12(int a, A *x, A *y) { - A* l0 = (a ? x : y ); // expected-warning {{incompatible pointer types initializing 'A *' with an expression of type 'A *'}} + A* l0 = (a ? x : y ); // expected-error {{incompatible pointer types initializing 'A *' with an expression of type 'A *'}} } void f13(int a, B *x, E *y) { - int *ip = a ? x : y; // expected-warning{{expression of type 'A *'}} + int *ip = a ? x : y; // expected-error{{incompatible pointer types initializing 'int *' with an expression of type 'A *'}} } diff --git a/clang/test/SemaObjC/id.m b/clang/test/SemaObjC/id.m index e599758dcafe7..dcebbec862f01 100644 --- a/clang/test/SemaObjC/id.m +++ b/clang/test/SemaObjC/id.m @@ -9,8 +9,8 @@ void foo(void) { // Test assignment compatibility of Class and id. No warning should be // produced. // Class and id are compatible. 
- S = T; // expected-warning {{incompatible pointer types assigning to 'id' from 'Class'}} - T = S; // expected-warning {{incompatible pointer types assigning to 'Class' from 'id'}} + S = T; // expected-error {{incompatible pointer types assigning to 'id' from 'Class'}} + T = S; // expected-error {{incompatible pointer types assigning to 'Class' from 'id'}} R = T; T = R; R = S; S = R; } diff --git a/clang/test/SemaObjC/incompatible-protocol-qualified-types.m b/clang/test/SemaObjC/incompatible-protocol-qualified-types.m index 494d23e8b2678..b8264d1565fe5 100644 --- a/clang/test/SemaObjC/incompatible-protocol-qualified-types.m +++ b/clang/test/SemaObjC/incompatible-protocol-qualified-types.m @@ -21,15 +21,15 @@ @interface INTF @end INTF * Func2(INTF *p2) { - Func(p2); // expected-warning {{incompatible pointer types passing 'INTF *' to parameter of type 'INTF *'}} - return p2; // expected-warning {{incompatible pointer types returning 'INTF *' from a function with result type 'INTF *'}} + Func(p2); // expected-error {{incompatible pointer types passing 'INTF *' to parameter of type 'INTF *'}} + return p2; // expected-error {{incompatible pointer types returning 'INTF *' from a function with result type 'INTF *'}} } INTF * Func3(INTF *p2) { - return p2; // expected-warning {{incompatible pointer types returning 'INTF *' from a function with result type 'INTF *'}} + return p2; // expected-error {{incompatible pointer types returning 'INTF *' from a function with result type 'INTF *'}} } diff --git a/clang/test/SemaObjC/instancetype.m b/clang/test/SemaObjC/instancetype.m index 2fe2f5cd8c0b4..a9dd4c7b5d173 100644 --- a/clang/test/SemaObjC/instancetype.m +++ b/clang/test/SemaObjC/instancetype.m @@ -143,7 +143,7 @@ - (int)otherMethodInProto2; // expected-warning{{protocol method is expected to @implementation Subclass4 + (id)alloc { - return self; // expected-warning{{incompatible pointer types returning 'Class' from a function with result type 'Subclass4 *'}} + return 
self; // expected-error{{incompatible pointer types returning 'Class' from a function with result type 'Subclass4 *'}} } - (Subclass3 *)init { return 0; } // don't complain: we lost the related return type @@ -166,12 +166,12 @@ void test_instancetype_inherited(void) { @implementation Subclass2 - (instancetype)initSubclass2 { // expected-note {{explicitly declared 'instancetype'}} Subclass1 *sc1 = [[Subclass1 alloc] init]; - return sc1; // expected-warning{{incompatible pointer types returning 'Subclass1 *' from a function with result type 'Subclass2 *'}} + return sc1; // expected-error{{incompatible pointer types returning 'Subclass1 *' from a function with result type 'Subclass2 *'}} } - (void)methodOnSubclass2 {} - (id)self { Subclass1 *sc1 = [[Subclass1 alloc] init]; - return sc1; // expected-warning{{incompatible pointer types returning 'Subclass1 *' from a function with result type 'Subclass2 *'}} + return sc1; // expected-error{{incompatible pointer types returning 'Subclass1 *' from a function with result type 'Subclass2 *'}} } @end @@ -201,10 +201,10 @@ @implementation A4 { B4 *_b; } - (id) foo { - return _b; // expected-warning {{incompatible pointer types returning 'B4 *' from a function with result type 'A4 *'}} + return _b; // expected-error {{incompatible pointer types returning 'B4 *' from a function with result type 'A4 *'}} } - (id) bar { - return _b; // expected-warning {{incompatible pointer types returning 'B4 *' from a function with result type 'A4 *'}} + return _b; // expected-error {{incompatible pointer types returning 'B4 *' from a function with result type 'A4 *'}} } // This is really just to ensure that we don't crash. 
diff --git a/clang/test/SemaObjC/ivar-lookup.m b/clang/test/SemaObjC/ivar-lookup.m index d88299e58e0f5..8854b1c212da6 100644 --- a/clang/test/SemaObjC/ivar-lookup.m +++ b/clang/test/SemaObjC/ivar-lookup.m @@ -29,7 +29,7 @@ - (int*)method; @implementation A - (int*)method { - int *ip = [Ivar method]; // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'float *'}} + int *ip = [Ivar method]; // expected-error{{incompatible pointer types initializing 'int *' with an expression of type 'float *'}} // Note that there is no warning in Objective-C++ return 0; } diff --git a/clang/test/SemaObjC/kindof.m b/clang/test/SemaObjC/kindof.m index 1462814e36984..84da9a7f5d33a 100644 --- a/clang/test/SemaObjC/kindof.m +++ b/clang/test/SemaObjC/kindof.m @@ -75,16 +75,16 @@ - (NSNumber *)numberByAddingNumber:(NSNumber *)number; // --------------------------------------------------------------------------- void test_pretty_print(int *ip) { __kindof NSObject *kindof_NSObject; - ip = kindof_NSObject; // expected-warning{{from '__kindof NSObject *'}} + ip = kindof_NSObject; // expected-error{{from '__kindof NSObject *'}} __kindof NSObject_ptr_typedef kindof_NSObject_ptr; - ip = kindof_NSObject_ptr; // expected-warning{{from '__kindof NSObject_ptr_typedef'}} + ip = kindof_NSObject_ptr; // expected-error{{from '__kindof NSObject_ptr_typedef'}} __kindof id *kindof_NSCopying; - ip = kindof_NSCopying; // expected-warning{{from '__kindof id *'}} + ip = kindof_NSCopying; // expected-error{{from '__kindof id *'}} __kindof NSObject_ptr_typedef *kindof_NSObject_ptr_typedef; - ip = kindof_NSObject_ptr_typedef; // expected-warning{{from '__kindof NSObject_ptr_typedef *'}} + ip = kindof_NSObject_ptr_typedef; // expected-error{{from '__kindof NSObject_ptr_typedef *'}} } // --------------------------------------------------------------------------- @@ -161,9 +161,9 @@ void test_downcast_conversions(void) { // Implicit downcasting. 
kindof_NSString_obj = kindof_NSObject_obj; - kindof_NSString_obj = NSObject_obj; // expected-warning{{assigning to '__kindof NSString *' from 'NSObject *'}} + kindof_NSString_obj = NSObject_obj; // expected-error{{assigning to '__kindof NSString *' from 'NSObject *'}} NSString_obj = kindof_NSObject_obj; - NSString_obj = NSObject_obj; // expected-warning{{assigning to 'NSString *' from 'NSObject *'}} + NSString_obj = NSObject_obj; // expected-error{{assigning to 'NSString *' from 'NSObject *'}} // Implicit downcasting with qualified id. __kindof id kindof_NSCopying_obj; @@ -184,7 +184,7 @@ void test_crosscast_conversions(void) { __kindof NSNumber *kindof_NSNumber_obj; NSNumber *NSNumber_obj; - NSString_obj = kindof_NSNumber_obj; // expected-warning{{from '__kindof NSNumber *'}} + NSString_obj = kindof_NSNumber_obj; // expected-error{{from '__kindof NSNumber *'}} } @interface NSCell : NSObject @@ -350,9 +350,9 @@ void implicit_convert_array(NSArray<__kindof NSString *> *kindofStringsArray, stringsArray = kindofStringsArray; // Other covariant and contravariant conversions still not permitted. - kindofStringsArray = mutStringsArray; // expected-warning{{incompatible pointer types}} - stringsArray = kindofMutStringsArray; // expected-warning{{incompatible pointer types}} - mutStringsArray = kindofStringsArray; // expected-warning{{incompatible pointer types}} + kindofStringsArray = mutStringsArray; // expected-error{{incompatible pointer types}} + stringsArray = kindofMutStringsArray; // expected-error{{incompatible pointer types}} + mutStringsArray = kindofStringsArray; // expected-error{{incompatible pointer types}} // Adding/removing nested __kindof is okay. NSArray *> *kindofStringsArrayArray; @@ -411,7 +411,7 @@ @interface NSDefaultGeneric : NSObject void testGeneric(NSGeneric *generic) { NSObject *NSObject_obj; // Assign from NSObject_obj to __kindof NSString*. 
- [generic test:NSObject_obj]; // expected-warning{{incompatible pointer types sending 'NSObject *' to parameter of type '__kindof NSString *'}} + [generic test:NSObject_obj]; // expected-error{{incompatible pointer types sending 'NSObject *' to parameter of type '__kindof NSString *'}} NSString *NSString_str; [generic test:NSString_str]; } @@ -421,29 +421,29 @@ void testGenericAssignment(void) { NSNumber *NSNumber_obj; NSGeneric *generic; - NSMutableString_str = generic.object; // expected-warning{{incompatible pointer types}} - NSNumber_obj = generic.object; // expected-warning{{incompatible pointer types}} + NSMutableString_str = generic.object; // expected-error{{incompatible pointer types}} + NSNumber_obj = generic.object; // expected-error{{incompatible pointer types}} NSMutableString_str = generic.kindof_object; - NSNumber_obj = generic.kindof_object; // expected-warning{{incompatible pointer types assigning to 'NSNumber *' from '__kindof NSString *'}} + NSNumber_obj = generic.kindof_object; // expected-error{{incompatible pointer types assigning to 'NSNumber *' from '__kindof NSString *'}} NSGeneric<__kindof NSString*> *kindof_generic; NSMutableString_str = kindof_generic.object; - NSNumber_obj = kindof_generic.object; // expected-warning{{incompatible pointer types assigning to 'NSNumber *' from '__kindof NSString *'}} + NSNumber_obj = kindof_generic.object; // expected-error{{incompatible pointer types assigning to 'NSNumber *' from '__kindof NSString *'}} NSMutableString_str = kindof_generic.kindof_object; - NSNumber_obj = kindof_generic.kindof_object; // expected-warning{{incompatible pointer types assigning to 'NSNumber *' from '__kindof __kindof NSString *'}} + NSNumber_obj = kindof_generic.kindof_object; // expected-error{{incompatible pointer types assigning to 'NSNumber *' from '__kindof __kindof NSString *'}} NSDefaultGeneric *default_generic; NSMutableString_str = default_generic.object; - NSNumber_obj = default_generic.object; // 
expected-warning{{incompatible pointer types}} + NSNumber_obj = default_generic.object; // expected-error{{incompatible pointer types}} NSMutableString_str = default_generic.kindof_object; - NSNumber_obj = default_generic.kindof_object; // expected-warning{{incompatible pointer types assigning to 'NSNumber *' from '__kindof __kindof NSString *'}} + NSNumber_obj = default_generic.kindof_object; // expected-error{{incompatible pointer types assigning to 'NSNumber *' from '__kindof __kindof NSString *'}} typedef NSString *Typedef_NSString; NSGeneric *typedef_generic; - NSMutableString_str = typedef_generic.object; // expected-warning{{incompatible pointer types}} - NSNumber_obj = typedef_generic.object; // expected-warning{{incompatible pointer types}} + NSMutableString_str = typedef_generic.object; // expected-error{{incompatible pointer types}} + NSNumber_obj = typedef_generic.object; // expected-error{{incompatible pointer types}} NSMutableString_str = typedef_generic.kindof_object; - NSNumber_obj = typedef_generic.kindof_object; // expected-warning{{incompatible pointer types assigning to 'NSNumber *' from '__kindof Typedef_NSString'}} + NSNumber_obj = typedef_generic.kindof_object; // expected-error{{incompatible pointer types assigning to 'NSNumber *' from '__kindof Typedef_NSString'}} } void testKindofNonObjectType(void) { diff --git a/clang/test/SemaObjC/method-prototype-scope.m b/clang/test/SemaObjC/method-prototype-scope.m index e1080cbfeaddd..2e4e1ce94ac90 100644 --- a/clang/test/SemaObjC/method-prototype-scope.m +++ b/clang/test/SemaObjC/method-prototype-scope.m @@ -18,7 +18,7 @@ @implementation Test - (NSString *)doSomethingWith:(NSString *)object and:(NSArray *)object // expected-warning {{redefinition of method parameter 'object'}} \ // expected-note {{previous declaration is here}} { - return object; // expected-warning {{incompatible pointer types returning 'NSArray *' from a function with result type 'NSString *'}} + return object; // expected-error 
{{incompatible pointer types returning 'NSArray *' from a function with result type 'NSString *'}} } - Func:(int)XXXX, id object { return object; } // expected-warning {{use of C-style parameters in Objective-C method declarations is deprecated}} diff --git a/clang/test/SemaObjC/nullability.m b/clang/test/SemaObjC/nullability.m index 0ab97a2cfbb24..da53335bf99f5 100644 --- a/clang/test/SemaObjC/nullability.m +++ b/clang/test/SemaObjC/nullability.m @@ -64,7 +64,7 @@ void test_accepts_nonnull_null_pointer_literal(NSFoo *foo, _Nonnull NSBar *bar) bar.property2 = 0; // expected-warning{{null passed to a callee that requires a non-null argument}} [bar setProperty1: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} [bar setProperty2: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}} - int *ptr = bar.property1; // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'NSFoo * _Nonnull'}} + int *ptr = bar.property1; // expected-error{{incompatible pointer types initializing 'int *' with an expression of type 'NSFoo * _Nonnull'}} } // Check returning nil from a nonnull-returning method. 
@@ -82,7 +82,7 @@ - (NSFoo *)conflictingMethod1 { } - (NSFoo *)redundantMethod1 { int *ip = 0; - return ip; // expected-warning{{result type 'NSFoo * _Nonnull'}} + return ip; // expected-error{{result type 'NSFoo * _Nonnull'}} } @end @@ -95,8 +95,8 @@ - (NSFoo *)methodC:(NSFoo*)foo; @implementation NSMerge - (NSFoo *)methodA:(NSFoo*)foo { - int *ptr = foo; // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'NSFoo * _Nonnull'}} - return ptr; // expected-warning{{result type 'NSFoo * _Nonnull'}} + int *ptr = foo; // expected-error{{incompatible pointer types initializing 'int *' with an expression of type 'NSFoo * _Nonnull'}} + return ptr; // expected-error{{result type 'NSFoo * _Nonnull'}} } - (nullable NSFoo *)methodB:(null_unspecified NSFoo*)foo { // expected-error{{nullability specifier 'nullable' conflicts with existing specifier 'nonnull'}} \ @@ -106,7 +106,7 @@ - (nullable NSFoo *)methodB:(null_unspecified NSFoo*)foo { // expected-error{{nu - (nonnull NSFoo *)methodC:(nullable NSFoo*)foo { int *ip = 0; - return ip; // expected-warning{{result type 'NSFoo * _Nonnull'}} + return ip; // expected-error{{result type 'NSFoo * _Nonnull'}} } @end @@ -126,31 +126,31 @@ void test_receiver_merge(NSMergeReceiver *none, _Null_unspecified NSMergeReceiver *null_unspecified) { int *ptr; - ptr = [nullable returnsNullable]; // expected-warning{{'id _Nullable'}} - ptr = [nullable returnsNullUnspecified]; // expected-warning{{'id _Nullable'}} - ptr = [nullable returnsNonNull]; // expected-warning{{'id _Nullable'}} - ptr = [nullable returnsNone]; // expected-warning{{'id _Nullable'}} - - ptr = [nullable_result returnsNullable]; // expected-warning{{'id _Nullable'}} - ptr = [nullable_result returnsNullUnspecified]; // expected-warning{{'id _Nullable'}} - ptr = [nullable_result returnsNonNull]; // expected-warning{{'id _Nullable'}} - ptr = [nullable_result returnsNone]; // expected-warning{{'id _Nullable'}} - ptr = [nullable_result 
returnsNullableResult]; // expected-warning{{'id _Nullable_result'}} - - ptr = [null_unspecified returnsNullable]; // expected-warning{{'id _Nullable'}} - ptr = [null_unspecified returnsNullUnspecified]; // expected-warning{{'id _Null_unspecified'}} - ptr = [null_unspecified returnsNonNull]; // expected-warning{{'id _Null_unspecified'}} - ptr = [null_unspecified returnsNone]; // expected-warning{{'id'}} - - ptr = [nonnull returnsNullable]; // expected-warning{{'id _Nullable'}} - ptr = [nonnull returnsNullUnspecified]; // expected-warning{{'id _Null_unspecified'}} - ptr = [nonnull returnsNonNull]; // expected-warning{{'id _Nonnull'}} - ptr = [nonnull returnsNone]; // expected-warning{{'id'}} - - ptr = [none returnsNullable]; // expected-warning{{'id _Nullable'}} - ptr = [none returnsNullUnspecified]; // expected-warning{{'id'}} - ptr = [none returnsNonNull]; // expected-warning{{'id'}} - ptr = [none returnsNone]; // expected-warning{{'id'}} + ptr = [nullable returnsNullable]; // expected-error{{'id _Nullable'}} + ptr = [nullable returnsNullUnspecified]; // expected-error{{'id _Nullable'}} + ptr = [nullable returnsNonNull]; // expected-error{{'id _Nullable'}} + ptr = [nullable returnsNone]; // expected-error{{'id _Nullable'}} + + ptr = [nullable_result returnsNullable]; // expected-error{{'id _Nullable'}} + ptr = [nullable_result returnsNullUnspecified]; // expected-error{{'id _Nullable'}} + ptr = [nullable_result returnsNonNull]; // expected-error{{'id _Nullable'}} + ptr = [nullable_result returnsNone]; // expected-error{{'id _Nullable'}} + ptr = [nullable_result returnsNullableResult]; // expected-error{{'id _Nullable_result'}} + + ptr = [null_unspecified returnsNullable]; // expected-error{{'id _Nullable'}} + ptr = [null_unspecified returnsNullUnspecified]; // expected-error{{'id _Null_unspecified'}} + ptr = [null_unspecified returnsNonNull]; // expected-error{{'id _Null_unspecified'}} + ptr = [null_unspecified returnsNone]; // expected-error{{'id'}} + + ptr = 
[nonnull returnsNullable]; // expected-error{{'id _Nullable'}} + ptr = [nonnull returnsNullUnspecified]; // expected-error{{'id _Null_unspecified'}} + ptr = [nonnull returnsNonNull]; // expected-error{{'id _Nonnull'}} + ptr = [nonnull returnsNone]; // expected-error{{'id'}} + + ptr = [none returnsNullable]; // expected-error{{'id _Nullable'}} + ptr = [none returnsNullUnspecified]; // expected-error{{'id'}} + ptr = [none returnsNonNull]; // expected-error{{'id'}} + ptr = [none returnsNone]; // expected-error{{'id'}} } @@ -171,13 +171,13 @@ + (_Nonnull instancetype)returnInstanceOfMe2; @end void test_instancetype(InitializableClass * _Nonnull ic, id _Nonnull object) { - int *ip = [ic returnMe]; // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'InitializableClass * _Nullable'}} - ip = [InitializableClass returnMe]; // expected-warning{{incompatible pointer types assigning to 'int *' from 'id _Nullable'}} - ip = [InitializableClass returnInstanceOfMe]; // expected-warning{{incompatible pointer types assigning to 'int *' from 'InitializableClass * _Nullable'}} - ip = [object returnMe]; // expected-warning{{incompatible pointer types assigning to 'int *' from 'id _Nullable'}} + int *ip = [ic returnMe]; // expected-error{{incompatible pointer types initializing 'int *' with an expression of type 'InitializableClass * _Nullable'}} + ip = [InitializableClass returnMe]; // expected-error{{incompatible pointer types assigning to 'int *' from 'id _Nullable'}} + ip = [InitializableClass returnInstanceOfMe]; // expected-error{{incompatible pointer types assigning to 'int *' from 'InitializableClass * _Nullable'}} + ip = [object returnMe]; // expected-error{{incompatible pointer types assigning to 'int *' from 'id _Nullable'}} - ip = [ic returnMe2]; // expected-warning{{incompatible pointer types assigning to 'int *' from 'InitializableClass * _Nullable'}} - ip = [InitializableClass returnInstanceOfMe2]; // 
expected-warning{{incompatible pointer types assigning to 'int *' from 'InitializableClass * _Nonnull'}} + ip = [ic returnMe2]; // expected-error{{incompatible pointer types assigning to 'int *' from 'InitializableClass * _Nullable'}} + ip = [InitializableClass returnInstanceOfMe2]; // expected-error{{incompatible pointer types assigning to 'int *' from 'InitializableClass * _Nonnull'}} } // Check null_resettable getters/setters. @@ -192,14 +192,14 @@ @interface NSResettable @end void test_null_resettable(NSResettable *r, int *ip) { - [r setResettable1:ip]; // expected-warning{{incompatible pointer types sending 'int *' to parameter of type 'NSResettable * _Nullable'}} - r.resettable1 = ip; // expected-warning{{incompatible pointer types assigning to 'NSResettable * _Nullable' from 'int *'}} + [r setResettable1:ip]; // expected-error{{incompatible pointer types sending 'int *' to parameter of type 'NSResettable * _Nullable'}} + r.resettable1 = ip; // expected-error{{incompatible pointer types assigning to 'NSResettable * _Nullable' from 'int *'}} } @implementation NSResettable // expected-warning{{synthesized setter 'setResettable4:' for null_resettable property 'resettable4' does not handle nil}} - (NSResettable *)resettable1 { int *ip = 0; - return ip; // expected-warning{{result type 'NSResettable * _Nonnull'}} + return ip; // expected-error{{result type 'NSResettable * _Nonnull'}} } - (void)setResettable1:(NSResettable *)param { @@ -225,8 +225,8 @@ @interface MultiProp void testMultiProp(MultiProp *foo) { int *ip; - ip = foo.a; // expected-warning{{from 'id _Nullable'}} - ip = foo.d; // expected-warning{{from 'MultiProp * _Nullable'}} + ip = foo.a; // expected-error{{from 'id _Nullable'}} + ip = foo.d; // expected-error{{from 'MultiProp * _Nullable'}} ip = foo.e; // expected-error{{incompatible type 'MultiProp *(^ _Nullable)(int)'}} } @@ -235,7 +235,7 @@ void testBlockLiterals(void) { (void)(^id _Nullable (void) { return 0; }); (void)(^ _Nullable id(void) { 
return 0; }); - int *x = (^ _Nullable id(void) { return 0; })(); // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'id _Nullable'}} + int *x = (^ _Nullable id(void) { return 0; })(); // expected-error{{incompatible pointer types initializing 'int *' with an expression of type 'id _Nullable'}} } // Check nullability of conditional expressions. diff --git a/clang/test/SemaObjC/objcbridge-related-attribute.m b/clang/test/SemaObjC/objcbridge-related-attribute.m index e56678436bf30..f281bc7f9ab22 100644 --- a/clang/test/SemaObjC/objcbridge-related-attribute.m +++ b/clang/test/SemaObjC/objcbridge-related-attribute.m @@ -24,9 +24,9 @@ - (NSColor *)backgroundColor; } NSColor * Test2(NSTextField *textField, CGColorRef1 newColor) { - foo(newColor); // expected-warning {{incompatible pointer types passing 'CGColorRef1'}} - textField.backgroundColor = newColor; // expected-warning {{incompatible pointer types assigning}} - return newColor; // expected-warning {{incompatible pointer types returning}} + foo(newColor); // expected-error {{incompatible pointer types passing 'CGColorRef1'}} + textField.backgroundColor = newColor; // expected-error {{incompatible pointer types assigning}} + return newColor; // expected-error {{incompatible pointer types returning}} } CGColorRef Test3(NSTextField *textField, CGColorRef newColor) { @@ -35,6 +35,6 @@ CGColorRef Test3(NSTextField *textField, CGColorRef newColor) { } CGColorRef2 Test4(NSTextField *textField, CGColorRef2 newColor) { - newColor = textField.backgroundColor; // expected-warning {{incompatible pointer types assigning}} - return textField.backgroundColor; // expected-warning {{incompatible pointer types returning}} + newColor = textField.backgroundColor; // expected-error {{incompatible pointer types assigning}} + return textField.backgroundColor; // expected-error {{incompatible pointer types returning}} } diff --git a/clang/test/SemaObjC/parameterized_classes.m 
b/clang/test/SemaObjC/parameterized_classes.m index 2b1dcc91a010f..b95bb4733c666 100644 --- a/clang/test/SemaObjC/parameterized_classes.m +++ b/clang/test/SemaObjC/parameterized_classes.m @@ -301,11 +301,11 @@ @interface PC15> : NSObject void testSpecializedTypePrinting(void) { int *ip; - ip = (typeArgs15*)0; // expected-warning{{'typeArgs15 *' (aka 'PC1 *')}} - ip = (typeArgsAndProtocolQuals4*)0; // expected-warning{{'typeArgsAndProtocolQuals4 *' (aka 'PC1 *')}} - ip = (typeArgsAndProtocolQuals5*)0; // expected-warning{{'typeArgsAndProtocolQuals5 *' (aka 'typeArgs15 *')}} + ip = (typeArgs15*)0; // expected-error{{'typeArgs15 *' (aka 'PC1 *')}} + ip = (typeArgsAndProtocolQuals4*)0; // expected-error{{'typeArgsAndProtocolQuals4 *' (aka 'PC1 *')}} + ip = (typeArgsAndProtocolQuals5*)0; // expected-error{{'typeArgsAndProtocolQuals5 *' (aka 'typeArgs15 *')}} ip = (typeArgsAndProtocolQuals6)0; // expected-error{{used type 'typeArgsAndProtocolQuals6' (aka 'typeArgs15')}} - ip = (typeArgsAndProtocolQuals6*)0;// expected-warning{{'typeArgsAndProtocolQuals6 *' (aka 'typeArgs15 *')}} + ip = (typeArgsAndProtocolQuals6*)0;// expected-error{{'typeArgsAndProtocolQuals6 *' (aka 'typeArgs15 *')}} } // -------------------------------------------------------------------------- diff --git a/clang/test/SemaObjC/parameterized_classes_arc.m b/clang/test/SemaObjC/parameterized_classes_arc.m index 623de5be3b4ce..f12842b73a400 100644 --- a/clang/test/SemaObjC/parameterized_classes_arc.m +++ b/clang/test/SemaObjC/parameterized_classes_arc.m @@ -31,18 +31,18 @@ void test1c(PC1 *obj) { // Test that this doesn't completely kill downstream type-checking. 
void test1d(PC1<__weak Forward*> *obj) { // expected-error {{type argument 'Forward *__weak' cannot be qualified with '__weak'}} - Forward2 *x = [obj get]; // expected-warning {{incompatible}} - [obj set: x]; // expected-warning {{incompatible}} + Forward2 *x = [obj get]; // expected-error {{incompatible}} + [obj set: x]; // expected-error {{incompatible}} } void test1e(PC1<__strong Forward*> *obj) { // expected-error {{type argument 'Forward *__strong' cannot be qualified with '__strong'}} - Forward2 *x = [obj get]; // expected-warning {{incompatible}} - [obj set: x]; // expected-warning {{incompatible}} + Forward2 *x = [obj get]; // expected-error {{incompatible}} + [obj set: x]; // expected-error {{incompatible}} } void test1f(PC1 *obj) { - Forward2 *x = [obj get]; // expected-warning {{incompatible}} - [obj set: x]; // expected-warning {{incompatible}} + Forward2 *x = [obj get]; // expected-error {{incompatible}} + [obj set: x]; // expected-error {{incompatible}} } // Typedefs are fine, just silently ignore them. @@ -54,8 +54,8 @@ void test1g(PC1 *obj) { typedef __strong Forward *StrongForward; void test1h(PC1 *obj) { - Forward2 *x = [obj get]; // expected-warning {{incompatible}} - [obj set: x]; // expected-warning {{incompatible}} + Forward2 *x = [obj get]; // expected-error {{incompatible}} + [obj set: x]; // expected-error {{incompatible}} } // These aren't really ARC-specific, but they're the same basic idea. 
diff --git a/clang/test/SemaObjC/parameterized_classes_subst.m b/clang/test/SemaObjC/parameterized_classes_subst.m index 98dd88c3bc98e..3358194845fc3 100644 --- a/clang/test/SemaObjC/parameterized_classes_subst.m +++ b/clang/test/SemaObjC/parameterized_classes_subst.m @@ -156,37 +156,37 @@ void test_message_send_result( NSArray<__kindof NSString *> *kindofStringArray, void (^block)(void)) { int *ip; - ip = [stringSet firstObject]; // expected-warning{{from 'NSString *'}} - ip = [mutStringSet firstObject]; // expected-warning{{from 'NSString *'}} - ip = [widgetSet firstObject]; // expected-warning{{from 'Widget *'}} - ip = [untypedMutSet firstObject]; // expected-warning{{from 'id'}} - ip = [mutStringArraySet firstObject]; // expected-warning{{from 'NSArray *'}} - ip = [set firstObject]; // expected-warning{{from 'id'}} - ip = [mutSet firstObject]; // expected-warning{{from 'id'}} - ip = [mutArraySet firstObject]; // expected-warning{{from 'id'}} - ip = [block firstObject]; // expected-warning{{from 'id'}} - - ip = [stringSet findObject:@"blah"]; // expected-warning{{from 'NSString *'}} + ip = [stringSet firstObject]; // expected-error{{from 'NSString *'}} + ip = [mutStringSet firstObject]; // expected-error{{from 'NSString *'}} + ip = [widgetSet firstObject]; // expected-error{{from 'Widget *'}} + ip = [untypedMutSet firstObject]; // expected-error{{from 'id'}} + ip = [mutStringArraySet firstObject]; // expected-error{{from 'NSArray *'}} + ip = [set firstObject]; // expected-error{{from 'id'}} + ip = [mutSet firstObject]; // expected-error{{from 'id'}} + ip = [mutArraySet firstObject]; // expected-error{{from 'id'}} + ip = [block firstObject]; // expected-error{{from 'id'}} + + ip = [stringSet findObject:@"blah"]; // expected-error{{from 'NSString *'}} // Class messages. 
- ip = [NSSet alloc]; // expected-warning{{from 'NSSet *'}} - ip = [NSSet alloc]; // expected-warning{{from 'NSSet *'}} - ip = [MutableSetOfArrays alloc]; // expected-warning{{from 'MutableSetOfArrays *'}} - ip = [MutableSetOfArrays alloc]; // expected-warning{{from 'MutableSetOfArrays *'}} - ip = [NSArray array]; // expected-warning{{from 'NSArray *'}} - ip = [NSArray array]; // expected-warning{{from 'NSArray *'}} + ip = [NSSet alloc]; // expected-error{{from 'NSSet *'}} + ip = [NSSet alloc]; // expected-error{{from 'NSSet *'}} + ip = [MutableSetOfArrays alloc]; // expected-error{{from 'MutableSetOfArrays *'}} + ip = [MutableSetOfArrays alloc]; // expected-error{{from 'MutableSetOfArrays *'}} + ip = [NSArray array]; // expected-error{{from 'NSArray *'}} + ip = [NSArray array]; // expected-error{{from 'NSArray *'}} - ip = [[NSMutableArray alloc] init]; // expected-warning{{from 'NSMutableArray *'}} + ip = [[NSMutableArray alloc] init]; // expected-error{{from 'NSMutableArray *'}} [[NSMutableArray alloc] initWithArray: stringArray]; // okay [[NSMutableArray alloc] initWithArray: stringArray]; // okay - [[NSMutableArray alloc] initWithArray: stringArray]; // expected-warning{{sending 'NSArray *' to parameter of type 'NSArray *'}} + [[NSMutableArray alloc] initWithArray: stringArray]; // expected-error{{sending 'NSArray *' to parameter of type 'NSArray *'}} - ip = [[[NSViewController alloc] init] view]; // expected-warning{{from '__kindof NSView *'}} + ip = [[[NSViewController alloc] init] view]; // expected-error{{from '__kindof NSView *'}} [[[[NSViewController alloc] init] view] toggle]; NSMutableString *mutStr = kindofStringArray[0]; - NSNumber *number = kindofStringArray[0]; // expected-warning{{of type '__kindof NSString *'}} + NSNumber *number = kindofStringArray[0]; // expected-error{{of type '__kindof NSString *'}} } void test_message_send_param( @@ -200,13 +200,13 @@ void test_message_send_param( void (^block)(void)) { Window *window; - [mutStringSet 
addObject: window]; // expected-warning{{parameter of type 'NSString *'}} - [widgetSet addObject: window]; // expected-warning{{parameter of type 'Widget *'}} + [mutStringSet addObject: window]; // expected-error{{parameter of type 'NSString *'}} + [widgetSet addObject: window]; // expected-error{{parameter of type 'Widget *'}} [untypedMutSet addObject: window]; // expected-warning{{parameter of incompatible type 'id'}} - [mutStringArraySet addObject: window]; // expected-warning{{parameter of type 'NSArray *'}} + [mutStringArraySet addObject: window]; // expected-error{{parameter of type 'NSArray *'}} [mutSet addObject: window]; // expected-warning{{parameter of incompatible type 'id'}} [mutArraySet addObject: window]; // expected-warning{{parameter of incompatible type 'id'}} - [typedefTypeParam test: window]; // expected-warning{{parameter of type 'NSString *'}} + [typedefTypeParam test: window]; // expected-error{{parameter of type 'NSString *'}} [block addObject: window]; // expected-warning{{parameter of incompatible type 'id'}} } @@ -224,18 +224,18 @@ void test_property_read( MutableSetOfArrays *mutArraySet, NSMutableDictionary *mutDict) { int *ip; - ip = stringSet.allObjects; // expected-warning{{from 'NSArray *'}} - ip = mutStringSet.allObjects; // expected-warning{{from 'NSArray *'}} - ip = widgetSet.allObjects; // expected-warning{{from 'NSArray *'}} - ip = untypedMutSet.allObjects; // expected-warning{{from 'NSArray *'}} - ip = mutStringArraySet.allObjects; // expected-warning{{from 'NSArray *> *'}} - ip = set.allObjects; // expected-warning{{from 'NSArray *'}} - ip = mutSet.allObjects; // expected-warning{{from 'NSArray *'}} - ip = mutArraySet.allObjects; // expected-warning{{from 'NSArray *'}} - - ip = mutDict.someRandomKey; // expected-warning{{from '__kindof id'}} - - ip = [[NSViewController alloc] init].view; // expected-warning{{from '__kindof NSView *'}} + ip = stringSet.allObjects; // expected-error{{from 'NSArray *'}} + ip = 
mutStringSet.allObjects; // expected-error{{from 'NSArray *'}} + ip = widgetSet.allObjects; // expected-error{{from 'NSArray *'}} + ip = untypedMutSet.allObjects; // expected-error{{from 'NSArray *'}} + ip = mutStringArraySet.allObjects; // expected-error{{from 'NSArray *> *'}} + ip = set.allObjects; // expected-error{{from 'NSArray *'}} + ip = mutSet.allObjects; // expected-error{{from 'NSArray *'}} + ip = mutArraySet.allObjects; // expected-error{{from 'NSArray *'}} + + ip = mutDict.someRandomKey; // expected-error{{from '__kindof id'}} + + ip = [[NSViewController alloc] init].view; // expected-error{{from '__kindof NSView *'}} } void test_property_write( @@ -248,14 +248,14 @@ void test_property_write( NSMutableDictionary *mutDict) { int *ip; - mutStringSet.allObjects = ip; // expected-warning{{to 'NSArray *'}} - widgetSet.allObjects = ip; // expected-warning{{to 'NSArray *'}} - untypedMutSet.allObjects = ip; // expected-warning{{to 'NSArray *'}} - mutStringArraySet.allObjects = ip; // expected-warning{{to 'NSArray *> *'}} - mutSet.allObjects = ip; // expected-warning{{to 'NSArray *'}} - mutArraySet.allObjects = ip; // expected-warning{{to 'NSArray *'}} + mutStringSet.allObjects = ip; // expected-error{{to 'NSArray *'}} + widgetSet.allObjects = ip; // expected-error{{to 'NSArray *'}} + untypedMutSet.allObjects = ip; // expected-error{{to 'NSArray *'}} + mutStringArraySet.allObjects = ip; // expected-error{{to 'NSArray *> *'}} + mutSet.allObjects = ip; // expected-error{{to 'NSArray *'}} + mutArraySet.allObjects = ip; // expected-error{{to 'NSArray *'}} - mutDict.someRandomKey = ip; // expected-warning{{to 'id'}} + mutDict.someRandomKey = ip; // expected-error{{to 'id'}} } // -------------------------------------------------------------------------- @@ -275,28 +275,28 @@ void test_subscripting( Widget *widget; Window *window; - ip = stringArray[0]; // expected-warning{{from 'NSString *'}} + ip = stringArray[0]; // expected-error{{from 'NSString *'}} - ip = 
mutStringArray[0]; // expected-warning{{from 'NSString *'}} - mutStringArray[0] = ip; // expected-warning{{parameter of type 'NSString *'}} + ip = mutStringArray[0]; // expected-error{{from 'NSString *'}} + mutStringArray[0] = ip; // expected-error{{parameter of type 'NSString *'}} - ip = array[0]; // expected-warning{{from 'id'}} + ip = array[0]; // expected-error{{from 'id'}} - ip = mutArray[0]; // expected-warning{{from 'id'}} - mutArray[0] = ip; // expected-warning{{parameter of type 'id'}} + ip = mutArray[0]; // expected-error{{from 'id'}} + mutArray[0] = ip; // expected-error{{parameter of type 'id'}} - ip = stringWidgetDict[string]; // expected-warning{{from 'Widget *'}} - widget = stringWidgetDict[widget]; // expected-warning{{to parameter of type 'NSString *'}} + ip = stringWidgetDict[string]; // expected-error{{from 'Widget *'}} + widget = stringWidgetDict[widget]; // expected-error{{to parameter of type 'NSString *'}} - ip = mutStringWidgetDict[string]; // expected-warning{{from 'Widget *'}} - widget = mutStringWidgetDict[widget]; // expected-warning{{to parameter of type 'NSString *'}} - mutStringWidgetDict[string] = ip; // expected-warning{{to parameter of type 'Widget *'}} - mutStringWidgetDict[widget] = widget; // expected-warning{{to parameter of type 'NSString *'}} + ip = mutStringWidgetDict[string]; // expected-error{{from 'Widget *'}} + widget = mutStringWidgetDict[widget]; // expected-error{{to parameter of type 'NSString *'}} + mutStringWidgetDict[string] = ip; // expected-error{{to parameter of type 'Widget *'}} + mutStringWidgetDict[widget] = widget; // expected-error{{to parameter of type 'NSString *'}} - ip = dict[string]; // expected-warning{{from 'id'}} + ip = dict[string]; // expected-error{{from 'id'}} - ip = mutDict[string]; // expected-warning{{from 'id'}} - mutDict[string] = ip; // expected-warning{{to parameter of type 'id'}} + ip = mutDict[string]; // expected-error{{from 'id'}} + mutDict[string] = ip; // expected-error{{to 
parameter of type 'id'}} widget = mutDict[window]; mutDict[window] = widget; // expected-warning{{parameter of incompatible type 'id'}} @@ -309,15 +309,15 @@ void test_instance_variable(NSArray *stringArray, NSArray *array) { int *ip; - ip = stringArray->data; // expected-warning{{from 'NSString **'}} - ip = array->data; // expected-warning{{from 'id *'}} + ip = stringArray->data; // expected-error{{from 'NSString **'}} + ip = array->data; // expected-error{{from 'id *'}} } @implementation WindowArray - (void)testInstanceVariable { int *ip; - ip = data; // expected-warning{{from 'Window **'}} + ip = data; // expected-error{{from 'Window **'}} } @end @@ -336,13 +336,13 @@ void test_implicit_conversions(NSArray *stringArray, stringArray = array; // Specialized -> specialized failure (same level). - stringArray = numberArray; // expected-warning{{incompatible pointer types assigning to 'NSArray *' from 'NSArray *'}} + stringArray = numberArray; // expected-error{{incompatible pointer types assigning to 'NSArray *' from 'NSArray *'}} // Specialized -> specialized (different levels). stringArray = mutStringArray; // Specialized -> specialized failure (different levels). - numberArray = mutStringArray; // expected-warning{{incompatible pointer types assigning to 'NSArray *' from 'NSMutableArray *'}} + numberArray = mutStringArray; // expected-error{{incompatible pointer types assigning to 'NSArray *' from 'NSMutableArray *'}} // Unspecialized -> specialized (different levels). 
stringArray = mutArray; @@ -366,10 +366,10 @@ void test_variance(NSCovariant1 *covariant1, NSContravariant1 *contravariant1, NSContravariant1 *contravariant2) { covariant1 = covariant2; // okay - covariant2 = covariant1; // expected-warning{{incompatible pointer types assigning to 'NSCovariant1 *' from 'NSCovariant1 *'}} + covariant2 = covariant1; // expected-error{{incompatible pointer types assigning to 'NSCovariant1 *' from 'NSCovariant1 *'}} covariant3 = covariant4; // okay - covariant4 = covariant3; // expected-warning{{incompatible pointer types assigning to 'NSCovariant1 *' from 'NSCovariant1 *'}} + covariant4 = covariant3; // expected-error{{incompatible pointer types assigning to 'NSCovariant1 *' from 'NSCovariant1 *'}} covariant5 = covariant1; // okay covariant1 = covariant5; // okay: id is promiscuous @@ -377,7 +377,7 @@ void test_variance(NSCovariant1 *covariant1, covariant5 = covariant3; // okay covariant3 = covariant5; // okay - contravariant1 = contravariant2; // expected-warning{{incompatible pointer types assigning to 'NSContravariant1 *' from 'NSContravariant1 *'}} + contravariant1 = contravariant2; // expected-error{{incompatible pointer types assigning to 'NSContravariant1 *' from 'NSContravariant1 *'}} contravariant2 = contravariant1; // okay } @@ -394,19 +394,19 @@ void test_ternary_operator(NSArray *stringArray, int *ip; id object; - ip = cond ? stringArray : mutStringArray; // expected-warning{{from 'NSArray *'}} - ip = cond ? mutStringArray : stringArray; // expected-warning{{from 'NSArray *'}} + ip = cond ? stringArray : mutStringArray; // expected-error{{from 'NSArray *'}} + ip = cond ? mutStringArray : stringArray; // expected-error{{from 'NSArray *'}} - ip = cond ? stringArray2 : mutStringArray; // expected-warning{{from 'NSArray *'}} - ip = cond ? mutStringArray : stringArray2; // expected-warning{{from 'NSArray *'}} + ip = cond ? stringArray2 : mutStringArray; // expected-error{{from 'NSArray *'}} + ip = cond ? 
mutStringArray : stringArray2; // expected-error{{from 'NSArray *'}} - ip = cond ? stringArray : mutArray; // expected-warning{{from 'NSArray *'}} + ip = cond ? stringArray : mutArray; // expected-error{{from 'NSArray *'}} - ip = cond ? stringArray2 : mutArray; // expected-warning{{from 'NSArray *'}} + ip = cond ? stringArray2 : mutArray; // expected-error{{from 'NSArray *'}} - ip = cond ? mutArray : stringArray; // expected-warning{{from 'NSArray *'}} + ip = cond ? mutArray : stringArray; // expected-error{{from 'NSArray *'}} - ip = cond ? mutArray : stringArray2; // expected-warning{{from 'NSArray *'}} + ip = cond ? mutArray : stringArray2; // expected-error{{from 'NSArray *'}} object = cond ? stringArray : numberArray; // expected-warning{{incompatible operand types ('NSArray *' and 'NSArray *')}} } @@ -417,16 +417,16 @@ void test_ternary_operator(NSArray *stringArray, @implementation NSStringArray - (void)useSuperMethod { int *ip; - ip = super.lastObject; // expected-warning{{from 'NSString *'}} - super.lastObject = ip; // expected-warning{{to 'NSString *'}} - ip = [super objectAtIndexedSubscript:0]; // expected-warning{{from 'NSString *'}} + ip = super.lastObject; // expected-error{{from 'NSString *'}} + super.lastObject = ip; // expected-error{{to 'NSString *'}} + ip = [super objectAtIndexedSubscript:0]; // expected-error{{from 'NSString *'}} } + (void)useSuperMethod { int *ip; - ip = super.array; // expected-warning{{from 'NSArray *'}} - super.array = ip; // expected-warning{{to 'NSArray *'}} - ip = [super array]; // expected-warning{{from 'NSArray *'}} + ip = super.array; // expected-error{{from 'NSArray *'}} + super.array = ip; // expected-error{{to 'NSArray *'}} + ip = [super array]; // expected-error{{from 'NSArray *'}} } @end @@ -443,8 +443,8 @@ - (void)setObject:(ObjectType)obj forKeyedSubscript:(KeyType )key; // void bar(MyMutableDictionary *stringsByString, NSNumber *n1, NSNumber *n2) { // We warn here when the key types do not match. 
- stringsByString[n1] = n2; // expected-warning{{incompatible pointer types sending 'NSNumber *' to parameter of type 'NSString *'}} \ - // expected-warning{{incompatible pointer types sending 'NSNumber *' to parameter of type 'NSString *'}} + stringsByString[n1] = n2; // expected-error{{incompatible pointer types sending 'NSNumber *' to parameter of type 'NSString *'}} \ + // expected-error{{incompatible pointer types sending 'NSNumber *' to parameter of type 'NSString *'}} } @interface MyTest : NSObject diff --git a/clang/test/SemaObjC/protocol-typecheck.m b/clang/test/SemaObjC/protocol-typecheck.m index 0166cc3c09f6e..9aa363f87cbf1 100644 --- a/clang/test/SemaObjC/protocol-typecheck.m +++ b/clang/test/SemaObjC/protocol-typecheck.m @@ -20,6 +20,6 @@ void func(void) { [obj setFlexElement:flexer]; // FIXME: GCC provides the following diagnostic (which is much better): // protocol-typecheck.m:21: warning: class 'NSObject ' does not implement the 'XCElementSpacerP' protocol - [obj setFlexElement2:flexer2]; // expected-warning{{incompatible pointer types sending 'NSObject *' to parameter of type 'NSObject *'}} + [obj setFlexElement2:flexer2]; // expected-error{{incompatible pointer types sending 'NSObject *' to parameter of type 'NSObject *'}} } diff --git a/clang/test/SemaObjC/protocol-warn.m b/clang/test/SemaObjC/protocol-warn.m index 2b900a4382d3f..51d0fda61c36a 100644 --- a/clang/test/SemaObjC/protocol-warn.m +++ b/clang/test/SemaObjC/protocol-warn.m @@ -50,5 +50,5 @@ @implementation UIWebPDFView { UIWebBrowserView *browserView; UIWebPDFView *pdfView; - return pdfView ? pdfView : browserView; // expected-warning {{incompatible pointer types returning 'UIView *' from a function with result type 'UIWebPDFView *'}} + return pdfView ? 
pdfView : browserView; // expected-error {{incompatible pointer types returning 'UIView *' from a function with result type 'UIWebPDFView *'}} } diff --git a/clang/test/SemaObjC/related-result-type-inference.m b/clang/test/SemaObjC/related-result-type-inference.m index 1eb7b17b67422..9035f5bf98384 100644 --- a/clang/test/SemaObjC/related-result-type-inference.m +++ b/clang/test/SemaObjC/related-result-type-inference.m @@ -66,7 +66,7 @@ void test_inference(void) { NSArray *arr = [[NSMutableArray alloc] init]; - NSMutableArray *marr = [arr retain]; // expected-warning{{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} + NSMutableArray *marr = [arr retain]; // expected-error{{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} } @implementation NSBlah @@ -192,6 +192,6 @@ @implementation UIViewController + (UIViewController *)newFilterViewControllerForType // expected-note {{compiler has implicitly changed method 'newFilterViewControllerForType' return type}} { UIViewController *filterVC; - return filterVC; // expected-warning {{incompatible pointer types casting 'UIViewController *' to type 'UIViewController *'}} + return filterVC; // expected-error {{incompatible pointer types casting 'UIViewController *' to type 'UIViewController *'}} } @end diff --git a/clang/test/SemaObjC/unqualified-to-qualified-class-warn.m b/clang/test/SemaObjC/unqualified-to-qualified-class-warn.m index cc56fcc9d20b8..67c5ac1e9854c 100644 --- a/clang/test/SemaObjC/unqualified-to-qualified-class-warn.m +++ b/clang/test/SemaObjC/unqualified-to-qualified-class-warn.m @@ -24,7 +24,7 @@ void functionTakingAClassConformingToAProtocol(AClass *instance) { // int main (void) { AClass *aobject = 0; BClass *bobject = 0; - functionTakingAClassConformingToAProtocol(aobject); // expected-warning {{incompatible pointer types passing 'AClass *' to parameter of type 'AClass *'}} + 
functionTakingAClassConformingToAProtocol(aobject); // expected-error {{incompatible pointer types passing 'AClass *' to parameter of type 'AClass *'}} functionTakingAClassConformingToAProtocol(bobject); // Shouldn't warn - does implement Fooable return 0; } diff --git a/clang/test/SemaObjC/warn-incompatible-builtin-types.m b/clang/test/SemaObjC/warn-incompatible-builtin-types.m index f69ca09b35673..882a3e1a50c93 100644 --- a/clang/test/SemaObjC/warn-incompatible-builtin-types.m +++ b/clang/test/SemaObjC/warn-incompatible-builtin-types.m @@ -9,8 +9,8 @@ void FUNC(void) { SEL s1, s2; id i, i1; Foo *f; - [f foo:f]; // expected-warning {{incompatible pointer types sending 'Foo *' to parameter of type 'Class'}} - c = f; // expected-warning {{incompatible pointer types assigning to 'Class' from 'Foo *'}} + [f foo:f]; // expected-error {{incompatible pointer types sending 'Foo *' to parameter of type 'Class'}} + c = f; // expected-error {{incompatible pointer types assigning to 'Class' from 'Foo *'}} c = i; @@ -20,22 +20,22 @@ void FUNC(void) { i = i1; - s1 = i; // expected-warning {{incompatible pointer types assigning to 'SEL' from 'id'}} - i = s1; // expected-warning {{incompatible pointer types assigning to 'id' from 'SEL'}} + s1 = i; // expected-error {{incompatible pointer types assigning to 'SEL' from 'id'}} + i = s1; // expected-error {{incompatible pointer types assigning to 'id' from 'SEL'}} s1 = s2; - s1 = c; // expected-warning {{incompatible pointer types assigning to 'SEL' from 'Class'}} + s1 = c; // expected-error {{incompatible pointer types assigning to 'SEL' from 'Class'}} - c = s1; // expected-warning {{incompatible pointer types assigning to 'Class' from 'SEL'}} + c = s1; // expected-error {{incompatible pointer types assigning to 'Class' from 'SEL'}} f = i; - f = c; // expected-warning {{incompatible pointer types assigning to 'Foo *' from 'Class'}} + f = c; // expected-error {{incompatible pointer types assigning to 'Foo *' from 'Class'}} - f = s1; 
// expected-warning {{incompatible pointer types assigning to 'Foo *' from 'SEL'}} + f = s1; // expected-error {{incompatible pointer types assigning to 'Foo *' from 'SEL'}} i = f; - s1 = f; // expected-warning {{incompatible pointer types assigning to 'SEL' from 'Foo *'}} + s1 = f; // expected-error {{incompatible pointer types assigning to 'SEL' from 'Foo *'}} } diff --git a/clang/test/SemaObjC/warn-superclass-method-mismatch.m b/clang/test/SemaObjC/warn-superclass-method-mismatch.m index 52054739d53ec..d0e57864c5020 100644 --- a/clang/test/SemaObjC/warn-superclass-method-mismatch.m +++ b/clang/test/SemaObjC/warn-superclass-method-mismatch.m @@ -38,7 +38,7 @@ void f(Base *base, Sub *sub) { Base *b; [base method1:b]; // if base is actuall 'Sub' it will use [Sub method1] with wrong argument. - [base method2:b]; // expected-warning {{}} + [base method2:b]; // expected-error {{incompatible pointer types}} Sub *s; [base method2:s]; // if base is actually 'Sub' OK. Either way OK. diff --git a/clang/test/SemaObjCXX/blocks.mm b/clang/test/SemaObjCXX/blocks.mm index 0ae91ab985ac1..f1b5b301087eb 100644 --- a/clang/test/SemaObjCXX/blocks.mm +++ b/clang/test/SemaObjCXX/blocks.mm @@ -8,7 +8,7 @@ void foo(id (^objectCreationBlock)(void)) { void bar2(id(*)(void)); void foo2(id (*objectCreationBlock)(void)) { - return bar2(objectCreationBlock); // expected-warning{{incompatible pointer types passing 'id (*)()' to parameter of type 'id (*)()'}} + return bar2(objectCreationBlock); // expected-error{{incompatible pointer types passing 'id (*)()' to parameter of type 'id (*)()'}} } void bar3(id(*)()); // expected-note{{candidate function}} diff --git a/clang/test/SemaObjCXX/objc-pointer-conv.mm b/clang/test/SemaObjCXX/objc-pointer-conv.mm index 611b7bc009ae5..d6073674b29b9 100644 --- a/clang/test/SemaObjCXX/objc-pointer-conv.mm +++ b/clang/test/SemaObjCXX/objc-pointer-conv.mm @@ -42,7 +42,7 @@ @interface DerivedFromI : I void accept_derived(DerivedFromI*); void 
test_base_to_derived(I* i) { - accept_derived(i); // expected-warning{{incompatible pointer types passing 'I *' to parameter of type 'DerivedFromI *'}} - DerivedFromI *di = i; // expected-warning{{incompatible pointer types initializing 'DerivedFromI *' with an expression of type 'I *'}} + accept_derived(i); // expected-error{{incompatible pointer types passing 'I *' to parameter of type 'DerivedFromI *'}} + DerivedFromI *di = i; // expected-error{{incompatible pointer types initializing 'DerivedFromI *' with an expression of type 'I *'}} DerivedFromI *di2 = (DerivedFromI *)i; } diff --git a/clang/test/SemaObjCXX/overload.mm b/clang/test/SemaObjCXX/overload.mm index 75423b431ae56..c848e3493f15c 100644 --- a/clang/test/SemaObjCXX/overload.mm +++ b/clang/test/SemaObjCXX/overload.mm @@ -51,13 +51,13 @@ void test0(A* a, B* b, id val) { } void test1(A* a) { - B* b = a; // expected-warning{{incompatible pointer types initializing 'B *' with an expression of type 'A *'}} - B *c; c = a; // expected-warning{{incompatible pointer types assigning to 'B *' from 'A *'}} + B* b = a; // expected-error{{incompatible pointer types initializing 'B *' with an expression of type 'A *'}} + B *c; c = a; // expected-error{{incompatible pointer types assigning to 'B *' from 'A *'}} } void test2(A** ap) { - B** bp = ap; // expected-warning{{incompatible pointer types initializing 'B **' with an expression of type 'A **'}} - bp = ap; // expected-warning{{incompatible pointer types assigning to 'B **' from 'A **'}} + B** bp = ap; // expected-error{{incompatible pointer types initializing 'B **' with an expression of type 'A **'}} + bp = ap; // expected-error{{incompatible pointer types assigning to 'B **' from 'A **'}} } int& cv(A*); @@ -97,7 +97,7 @@ void qualid_test(A *a, B *b, C *c) { void (*_NSExceptionRaiser(void))(NSException *) { objc_exception_functions_t exc_funcs; - return exc_funcs.throw_exc; // expected-warning{{incompatible pointer types returning 'void (*)(id)' from a function 
with result type 'void (*)(NSException *)'}} + return exc_funcs.throw_exc; // expected-error{{incompatible pointer types returning 'void (*)(id)' from a function with result type 'void (*)(NSException *)'}} } namespace test5 { diff --git a/clang/test/SemaObjCXX/parameterized_classes_subst.mm b/clang/test/SemaObjCXX/parameterized_classes_subst.mm index 8aacf21faf091..4658c3d617567 100644 --- a/clang/test/SemaObjCXX/parameterized_classes_subst.mm +++ b/clang/test/SemaObjCXX/parameterized_classes_subst.mm @@ -306,10 +306,10 @@ void test_variance(NSCovariant1 *covariant1, NSContravariant1 *contravariant1, NSContravariant1 *contravariant2) { covariant1 = covariant2; // okay - covariant2 = covariant1; // expected-warning{{incompatible pointer types assigning to 'NSCovariant1 *' from 'NSCovariant1 *'}} + covariant2 = covariant1; // expected-error{{incompatible pointer types assigning to 'NSCovariant1 *' from 'NSCovariant1 *'}} covariant3 = covariant4; // okay - covariant4 = covariant3; // expected-warning{{incompatible pointer types assigning to 'NSCovariant1 *' from 'NSCovariant1 *'}} + covariant4 = covariant3; // expected-error{{incompatible pointer types assigning to 'NSCovariant1 *' from 'NSCovariant1 *'}} covariant5 = covariant1; // okay covariant1 = covariant5; // okay: id is promiscuous @@ -317,7 +317,7 @@ void test_variance(NSCovariant1 *covariant1, covariant5 = covariant3; // okay covariant3 = covariant5; // okay - contravariant1 = contravariant2; // expected-warning{{incompatible pointer types assigning to 'NSContravariant1 *' from 'NSContravariant1 *'}} + contravariant1 = contravariant2; // expected-error{{incompatible pointer types assigning to 'NSContravariant1 *' from 'NSContravariant1 *'}} contravariant2 = contravariant1; // okay } diff --git a/clang/test/SemaObjCXX/related-result-type-inference.mm b/clang/test/SemaObjCXX/related-result-type-inference.mm index 675e6acbda143..9c25e07ab2033 100644 --- a/clang/test/SemaObjCXX/related-result-type-inference.mm 
+++ b/clang/test/SemaObjCXX/related-result-type-inference.mm @@ -65,7 +65,7 @@ void test_inference() { __typeof__(([NSBlah newUnrelated])) *unrelated2 = (Unrelated**)0; NSArray *arr = [[NSMutableArray alloc] init]; - NSMutableArray *marr = [arr retain]; // expected-warning{{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} - marr = [arr retain]; // expected-warning{{incompatible pointer types assigning to 'NSMutableArray *' from 'NSArray *'}} + NSMutableArray *marr = [arr retain]; // expected-error{{incompatible pointer types initializing 'NSMutableArray *' with an expression of type 'NSArray *'}} + marr = [arr retain]; // expected-error{{incompatible pointer types assigning to 'NSMutableArray *' from 'NSArray *'}} arr = [marr retain]; } diff --git a/clang/test/SemaOpenACC/combined-construct-if-clause.c b/clang/test/SemaOpenACC/combined-construct-if-clause.c index d5e42d6105963..4de517cdb16b9 100644 --- a/clang/test/SemaOpenACC/combined-construct-if-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-if-clause.c @@ -27,7 +27,7 @@ void BoolExpr(int *I, float *F) { #pragma acc kernels loop if (Array) for (unsigned i = 0; i < 5; ++i); - // expected-warning@+4{{incompatible pointer types assigning to 'int *' from 'float *'}} + // expected-error@+4{{incompatible pointer types assigning to 'int *' from 'float *'}} // expected-warning@+3{{using the result of an assignment as a condition without parentheses}} // expected-note@+2{{place parentheses around the assignment to silence this warning}} // expected-note@+1{{use '==' to turn this assignment into an equality comparison}} diff --git a/clang/test/SemaOpenACC/compute-construct-if-clause.c b/clang/test/SemaOpenACC/compute-construct-if-clause.c index 0064303e1e217..ca3b9b1c55f70 100644 --- a/clang/test/SemaOpenACC/compute-construct-if-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-if-clause.c @@ -27,7 +27,7 @@ void BoolExpr(int *I, float *F) { #pragma acc 
kernels if (Array) while(0); - // expected-warning@+4{{incompatible pointer types assigning to 'int *' from 'float *'}} + // expected-error@+4{{incompatible pointer types assigning to 'int *' from 'float *'}} // expected-warning@+3{{using the result of an assignment as a condition without parentheses}} // expected-note@+2{{place parentheses around the assignment to silence this warning}} // expected-note@+1{{use '==' to turn this assignment into an equality comparison}} diff --git a/clang/test/SemaOpenCL/address-spaces.cl b/clang/test/SemaOpenCL/address-spaces.cl index 70ad89eb3ce22..86435b7b15b34 100644 --- a/clang/test/SemaOpenCL/address-spaces.cl +++ b/clang/test/SemaOpenCL/address-spaces.cl @@ -144,7 +144,7 @@ void nested(__global int *g, __global int * __private *gg, __local int *l, __loc gg = g; // expected-error {{assigning '__global int *__private' to '__global int *__private *__private' changes address space of pointer}} gg = l; // expected-error {{assigning '__local int *__private' to '__global int *__private *__private' changes address space of pointer}} gg = ll; // expected-error {{assigning '__local int *__private *__private' to '__global int *__private *__private' changes address space of nested pointer}} - gg = gg_f; // expected-warning {{incompatible pointer types assigning to '__global int *__private *__private' from '__global float *__private *__private'}} + gg = gg_f; // expected-error {{incompatible pointer types assigning to '__global int *__private *__private' from '__global float *__private *__private'}} gg = (__global int * __private *)gg_f; l = g; // expected-error {{assigning '__global int *__private' to '__local int *__private' changes address space of pointer}} @@ -160,14 +160,13 @@ void nested(__global int *g, __global int * __private *gg, __local int *l, __loc ll = (__local int * __private *)gg_f; // expected-warning {{casting '__global float *__private *' to type '__local int *__private *' discards qualifiers in nested pointer types}} 
gg_f = g; // expected-error {{assigning '__global int *__private' to '__global float *__private *__private' changes address space of pointer}} - gg_f = gg; // expected-warning {{incompatible pointer types assigning to '__global float *__private *__private' from '__global int *__private *__private'}} + gg_f = gg; // expected-error {{incompatible pointer types assigning to '__global float *__private *__private' from '__global int *__private *__private'}} gg_f = l; // expected-error {{assigning '__local int *__private' to '__global float *__private *__private' changes address space of pointer}} gg_f = ll; // expected-error {{assigning '__local int *__private *__private' to '__global float *__private *__private' changes address space of nested pointer}} gg_f = (__global float * __private *)gg; - // FIXME: This doesn't seem right. This should be an error, not a warning. __local int * __global * __private * lll; - lll = gg; // expected-warning {{incompatible pointer types assigning to '__local int *__global *__private *__private' from '__global int *__private *__private'}} + lll = gg; // expected-error {{incompatible pointer types assigning to '__local int *__global *__private *__private' from '__global int *__private *__private'}} typedef __local int * l_t; typedef __global int * g_t; diff --git a/clang/test/SemaOpenCL/atomic-ops.cl b/clang/test/SemaOpenCL/atomic-ops.cl index babebba31e82b..59d8b32e9954f 100644 --- a/clang/test/SemaOpenCL/atomic-ops.cl +++ b/clang/test/SemaOpenCL/atomic-ops.cl @@ -68,12 +68,12 @@ void f(atomic_int *i, const atomic_int *ci, bool cmpexch_1 = __opencl_atomic_compare_exchange_strong(i, I, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); bool cmpexch_2 = __opencl_atomic_compare_exchange_strong(p, P, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); - bool cmpexch_3 = __opencl_atomic_compare_exchange_strong(f, I, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); // 
expected-warning {{incompatible pointer types passing '__generic int *__private' to parameter of type '__generic float *'}} + bool cmpexch_3 = __opencl_atomic_compare_exchange_strong(f, I, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); // expected-error {{incompatible pointer types passing '__generic int *__private' to parameter of type '__generic float *'}} (void)__opencl_atomic_compare_exchange_strong(i, CI, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); // expected-warning {{passing 'const __generic int *__private' to parameter of type '__generic int *' discards qualifiers}} bool cmpexchw_1 = __opencl_atomic_compare_exchange_weak(i, I, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); bool cmpexchw_2 = __opencl_atomic_compare_exchange_weak(p, P, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); - bool cmpexchw_3 = __opencl_atomic_compare_exchange_weak(f, I, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); // expected-warning {{incompatible pointer types passing '__generic int *__private' to parameter of type '__generic float *'}} + bool cmpexchw_3 = __opencl_atomic_compare_exchange_weak(f, I, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); // expected-error {{incompatible pointer types passing '__generic int *__private' to parameter of type '__generic float *'}} (void)__opencl_atomic_compare_exchange_weak(i, CI, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); // expected-warning {{passing 'const __generic int *__private' to parameter of type '__generic int *' discards qualifiers}} // Pointers to different address spaces are allowed. 
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index 273c65e6d106d..3cea47b66d6a6 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -75,21 +75,21 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2, *outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_fp6(src3, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk16_f32_fp6' must be a constant integer}} *outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_bf6(src3, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk16_f32_bf6' must be a constant integer}} - *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range 
[0, 7]}} - *outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_fp6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_fp6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_bf6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_bf6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_fp6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - *outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_bf6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 16); // expected-error {{argument value 16 is outside the valid range 
[0, 15]}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_fp6(src3, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_fp6(src3, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_bf6(src3, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_bf6(src3, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_fp6(src3, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + *outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_bf6(src3, scale, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} } void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global v4i* b128gaddr, int *b32faddr, v2i* b64faddr, v4i *b128faddr, diff --git a/clang/test/SemaOpenCL/to_addr_builtin.cl b/clang/test/SemaOpenCL/to_addr_builtin.cl index 51721a6434518..f8e1189cf2b1c 100644 --- a/clang/test/SemaOpenCL/to_addr_builtin.cl +++ b/clang/test/SemaOpenCL/to_addr_builtin.cl @@ -115,7 +115,7 @@ void test(void) { #if (__OPENCL_C_VERSION__ < CL_VERSION_2_0) || (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 && !defined(__opencl_c_generic_address_space)) // expected-error@-2{{use of undeclared identifier 'to_global'}} #else - // expected-warning@-4{{incompatible pointer types initializing '__global char *__private' with an expression of type '__global int *'}} + // expected-error@-4{{incompatible pointer types initializing '__global char *__private' with an expression of type '__global int *'}} // expected-warning@-5{{passing non-generic address space pointer to 
to_global may cause dynamic conversion affecting performance}} #endif @@ -123,7 +123,7 @@ void test(void) { #if (__OPENCL_C_VERSION__ < CL_VERSION_2_0) || (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 && !defined(__opencl_c_generic_address_space)) // expected-error@-2{{use of undeclared identifier 'to_global'}} #else - // expected-warning@-4{{incompatible pointer types assigning to '__global float *__private' from '__global int *'}} + // expected-error@-4{{incompatible pointer types assigning to '__global float *__private' from '__global int *'}} // expected-warning@-5{{passing non-generic address space pointer to to_global may cause dynamic conversion affecting performance}} #endif diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index d63ad01b35800..209e7dc69797d 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -1251,6 +1251,27 @@ int i = SVGPropertyOwnerRegistry::fastAnimatedPropertyLookup() } +namespace GH61824 { + +template // #T_Type +concept C = true; + +constexpr bool f(C auto) { // #GH61824_f + return true; +} + +C auto x = 0; +// expected-error@#T_Type {{type 'int' cannot be used prior to '::'}} \ +// expected-note@-1 {{in instantiation of default argument}} + +// This will be fixed when we merge https://github.com/llvm/llvm-project/pull/141776 +// Which makes us behave like GCC. 
+static_assert(f(0)); +// expected-error@-1 {{no matching function for call}} \ +// expected-note@#GH61824_f {{constraints not satisfied}} \ +// expected-note@#T_Type {{type 'int' cannot be used prior to '::'}} + +} namespace GH149986 { template concept PerfectSquare = [](){} // expected-note 2{{here}} diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp index 56ceb7af4ccd9..8450ff037e184 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp @@ -371,3 +371,18 @@ namespace ReportedRegression2 { fn(); } } + +namespace GH151531 { +struct w { + int n; +}; + +template void f() { static_assert(X->n == 42); } + +template void g() { f<&X>(); } + +void test() { + constexpr w X = {42}; + g(); +} +} diff --git a/clang/test/Tooling/auto-detect-from-source-parent-of-cwd.cpp b/clang/test/Tooling/auto-detect-from-source-parent-of-cwd.cpp index 762c89e9e52aa..cc017a08cffda 100644 --- a/clang/test/Tooling/auto-detect-from-source-parent-of-cwd.cpp +++ b/clang/test/Tooling/auto-detect-from-source-parent-of-cwd.cpp @@ -1,3 +1,5 @@ +// REQUIRES: symlinks + // RUN: rm -rf %t // RUN: mkdir -p %t/abc/def/ijk/qwe // RUN: echo "[{\"directory\":\".\",\"command\":\"clang++ -c %t/abc/def/ijk/qwe/test.cpp\",\"file\":\"%t/abc/def/ijk/qwe/test.cpp\"}]" | sed -e 's/\\/\\\\/g' > %t/compile_commands.json @@ -9,5 +11,3 @@ // CHECK: a type specifier is required // CHECK: /abc/def/ijk/qwe/test.cpp invalid; - -// REQUIRES: shell diff --git a/clang/test/Tooling/clang-check-pwd.cpp b/clang/test/Tooling/clang-check-pwd.cpp index 2e8d4a3fe12b6..309cee54aadd9 100644 --- a/clang/test/Tooling/clang-check-pwd.cpp +++ b/clang/test/Tooling/clang-check-pwd.cpp @@ -1,3 +1,5 @@ +// REQUIRES: symlinks + // RUN: rm -rf %t // RUN: mkdir %t // RUN: echo "[{\"directory\":\".\",\"command\":\"clang++ -c %t/test.cpp\",\"file\":\"%t/test.cpp\"}]" | sed -e 's/\\/\\\\/g' > %t/compile_commands.json 
diff --git a/clang/test/VFS/broken-vfs-module-dep.c b/clang/test/VFS/broken-vfs-module-dep.c index 2336306de8c6d..1c371a13e85c9 100644 --- a/clang/test/VFS/broken-vfs-module-dep.c +++ b/clang/test/VFS/broken-vfs-module-dep.c @@ -2,6 +2,5 @@ // RUN: mkdir -p %t // RUN: not %clang_cc1 -module-dependency-dir %t -ivfsoverlay %S/Inputs/invalid-yaml.yaml %s 2>&1 | FileCheck %s -// CHECK: error: Unexpected token // CHECK: error: Unexpected token // CHECK: 1 error generated diff --git a/clang/tools/clang-import-test/clang-import-test.cpp b/clang/tools/clang-import-test/clang-import-test.cpp index ab021a51bf295..910e08ca4dffa 100644 --- a/clang/tools/clang-import-test/clang-import-test.cpp +++ b/clang/tools/clang-import-test/clang-import-test.cpp @@ -207,8 +207,8 @@ std::unique_ptr BuildCompilerInstance() { auto Ins = std::make_unique(std::move(Inv)); - Ins->createDiagnostics(*llvm::vfs::getRealFileSystem(), DC.release(), - /*ShouldOwnClient=*/true); + Ins->createVirtualFileSystem(llvm::vfs::getRealFileSystem(), DC.get()); + Ins->createDiagnostics(DC.release(), /*ShouldOwnClient=*/true); TargetInfo *TI = TargetInfo::CreateTargetInfo( Ins->getDiagnostics(), Ins->getInvocation().getTargetOpts()); diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp index 049b0bd8f8dbf..16abeb10284c0 100644 --- a/clang/tools/clang-installapi/ClangInstallAPI.cpp +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp @@ -115,7 +115,7 @@ static bool run(ArrayRef Args, const char *ProgName) { // Set up compilation. 
std::unique_ptr CI(new CompilerInstance()); CI->setFileManager(FM); - CI->createDiagnostics(FM->getVirtualFileSystem()); + CI->createDiagnostics(); if (!CI->hasDiagnostics()) return EXIT_FAILURE; diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index f10b73278381b..0e2758d123edc 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -429,12 +429,12 @@ class FullDeps { auto Res = Modules.insert(I, {{MD.ID, InputIndex}, std::move(MD)}); NewMDs.push_back(&Res->second); } - // First call to \c getBuildArguments is somewhat expensive. Let's call it - // on the current thread (instead of the main one), and outside the - // critical section. - for (ModuleDeps *MD : NewMDs) - (void)MD->getBuildArguments(); } + // First call to \c getBuildArguments is somewhat expensive. Let's call it + // on the current thread (instead of the main one), and outside the + // critical section. + for (ModuleDeps *MD : NewMDs) + (void)MD->getBuildArguments(); } bool roundTripCommand(ArrayRef ArgStrs, diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index 854ab3e33555b..49f8843515a35 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -271,8 +271,11 @@ int cc1_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { Clang->getHeaderSearchOpts().ResourceDir = CompilerInvocation::GetResourcesPath(Argv0, MainAddr); + /// Create the actual file system. + Clang->createVirtualFileSystem(llvm::vfs::getRealFileSystem(), DiagsBuffer); + // Create the actual diagnostics engine. - Clang->createDiagnostics(*llvm::vfs::getRealFileSystem()); + Clang->createDiagnostics(); if (!Clang->hasDiagnostics()) return 1; @@ -319,8 +322,7 @@ int cc1_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { // options are stored in the compiler invocation and we can recreate the VFS // from the compiler invocation. 
if (!Clang->hasFileManager()) - Clang->createFileManager(createVFSFromCompilerInvocation( - Clang->getInvocation(), Clang->getDiagnostics())); + Clang->createFileManager(); if (auto profilerOutput = Clang->createOutputFile( Clang->getFrontendOpts().TimeTracePath, /*Binary=*/false, diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 0ed029c39885f..9526f629bda42 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1834,19 +1834,6 @@ bool CursorVisitor::VisitDependentNameTypeLoc(DependentNameTypeLoc TL) { return VisitNestedNameSpecifierLoc(TL.getQualifierLoc()); } -bool CursorVisitor::VisitDependentTemplateSpecializationTypeLoc( - DependentTemplateSpecializationTypeLoc TL) { - if (VisitNestedNameSpecifierLoc(TL.getQualifierLoc())) - return true; - - // Visit the template arguments. - for (unsigned I = 0, N = TL.getNumArgs(); I != N; ++I) - if (VisitTemplateArgumentLoc(TL.getArgLoc(I))) - return true; - - return false; -} - bool CursorVisitor::VisitPackExpansionTypeLoc(PackExpansionTypeLoc TL) { return Visit(TL.getPatternLoc()); } diff --git a/clang/tools/libclang/CXIndexDataConsumer.cpp b/clang/tools/libclang/CXIndexDataConsumer.cpp index 423dd1b25adad..932201a94cdae 100644 --- a/clang/tools/libclang/CXIndexDataConsumer.cpp +++ b/clang/tools/libclang/CXIndexDataConsumer.cpp @@ -393,8 +393,6 @@ SourceLocation CXIndexDataConsumer::CXXBasesListInfo::getBaseLoc( // TypeLoc::getNameLoc() if (auto TTL = TL.getAs()) return TTL.getNameLoc(); - if (auto TTL = TL.getAs()) - return TTL.getTemplateNameLoc(); if (auto TTL = TL.getAs()) return TTL.getTemplateNameLoc(); if (auto TTL = TL.getAs()) diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 5badbd7d65e48..e7160bcf2e0c2 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -774,8 +774,8 @@ TEST_P(ImportType, ImportDependentTemplateSpecialization) { " typename 
A::template B a;" "};", Lang_CXX03, "", Lang_CXX03, Verifier, - classTemplateDecl(has(cxxRecordDecl(has( - fieldDecl(hasType(dependentTemplateSpecializationType()))))))); + classTemplateDecl(has(cxxRecordDecl( + has(fieldDecl(hasType(templateSpecializationType()))))))); } TEST_P(ImportType, ImportDeducedTemplateSpecialization) { @@ -10025,7 +10025,8 @@ struct ImportTemplateParmDeclDefaultValue EXPECT_EQ(ToD->getPreviousDecl(), ToDInherited); } else { EXPECT_EQ(FromD, FromDInherited->getPreviousDecl()); - EXPECT_EQ(ToD, ToDInherited->getPreviousDecl()); + // The order is reversed by the import process. + EXPECT_EQ(ToD->getPreviousDecl(), ToDInherited); } } diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt index f27d34e8a0719..3a12a4a06a33a 100644 --- a/clang/unittests/AST/CMakeLists.txt +++ b/clang/unittests/AST/CMakeLists.txt @@ -26,6 +26,7 @@ add_clang_unittest(ASTTests ExternalASTSourceTest.cpp NamedDeclPrinterTest.cpp ProfilingTest.cpp + QualTypeNamesTest.cpp RandstructTest.cpp RawCommentForDeclTest.cpp RecursiveASTVisitorTest.cpp diff --git a/clang/unittests/AST/ExternalASTSourceTest.cpp b/clang/unittests/AST/ExternalASTSourceTest.cpp index 21d4ce4dcf212..15483ad250976 100644 --- a/clang/unittests/AST/ExternalASTSourceTest.cpp +++ b/clang/unittests/AST/ExternalASTSourceTest.cpp @@ -59,7 +59,8 @@ bool testExternalASTSource(llvm::IntrusiveRefCntPtr Source, CompilerInvocation::CreateFromArgs(*Invocation, Args, *InvocationDiags); CompilerInstance Compiler(std::move(Invocation)); - Compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + Compiler.createDiagnostics(); TestFrontendAction Action(Source); return Compiler.ExecuteAction(Action); diff --git a/clang/unittests/AST/QualTypeNamesTest.cpp b/clang/unittests/AST/QualTypeNamesTest.cpp new file mode 100644 index 0000000000000..5b88391c84d08 --- /dev/null +++ b/clang/unittests/AST/QualTypeNamesTest.cpp @@ -0,0 
+1,56 @@ +//===- unittests/AST/QualTypeNamesTest.cpp --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains tests for helpers from QualTypeNames.h. +// +//===----------------------------------------------------------------------===// + +#include "clang/AST/QualTypeNames.h" +#include "ASTPrint.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclarationName.h" +#include "clang/AST/TypeBase.h" +#include "clang/Tooling/Tooling.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "gtest/gtest.h" + +namespace clang { +namespace { + +TEST(QualTypeNamesTest, TemplateParameters) { + constexpr llvm::StringLiteral Code = R"cpp( + template class T> struct Foo { + using type_of_interest = T; + }; + )cpp"; + auto AST = tooling::buildASTFromCode(Code); + ASSERT_NE(AST, nullptr); + + auto &Ctx = AST->getASTContext(); + auto FooLR = Ctx.getTranslationUnitDecl()->lookup( + DeclarationName(AST->getPreprocessor().getIdentifierInfo("Foo"))); + ASSERT_TRUE(FooLR.isSingleResult()); + + auto TypeLR = + llvm::cast(FooLR.front()) + ->getTemplatedDecl() + ->lookup(DeclarationName( + AST->getPreprocessor().getIdentifierInfo("type_of_interest"))); + ASSERT_TRUE(TypeLR.isSingleResult()); + + auto Type = cast(TypeLR.front())->getUnderlyingType(); + ASSERT_TRUE(isa(Type)); + + EXPECT_EQ(TypeName::getFullyQualifiedName(Type, Ctx, Ctx.getPrintingPolicy()), + "T"); +} + +} // namespace +} // namespace clang diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp index d7df9cae01f33..9692d6e6fae97 100644 --- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp +++ 
b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp @@ -2031,7 +2031,7 @@ TEST_P(ASTMatchersTest, DependentTemplateSpecializationType) { typename A::template B a; }; )", - dependentTemplateSpecializationType())); + templateSpecializationType())); } TEST_P(ASTMatchersTest, RecordType) { diff --git a/clang/unittests/Analysis/LifetimeSafetyTest.cpp b/clang/unittests/Analysis/LifetimeSafetyTest.cpp index 13e5832d70050..bff5378c0a8a9 100644 --- a/clang/unittests/Analysis/LifetimeSafetyTest.cpp +++ b/clang/unittests/Analysis/LifetimeSafetyTest.cpp @@ -11,7 +11,6 @@ #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Testing/TestAST.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Testing/Support/Error.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include @@ -31,7 +30,13 @@ class LifetimeTestRunner { LifetimeTestRunner(llvm::StringRef Code) { std::string FullCode = R"( #define POINT(name) void("__lifetime_test_point_" #name) + struct MyObj { ~MyObj() {} int i; }; + + struct [[gsl::Pointer()]] View { + View(const MyObj&); + View(); + }; )"; FullCode += Code.str(); @@ -741,5 +746,180 @@ TEST_F(LifetimeAnalysisTest, NoDuplicateLoansForImplicitCastToConst) { EXPECT_THAT(Helper->getLoansForVar("a"), SizeIs(2)); } +TEST_F(LifetimeAnalysisTest, GslPointerSimpleLoan) { + SetupTest(R"( + void target() { + MyObj a; + View x = a; + POINT(p1); + } + )"); + EXPECT_THAT(Origin("x"), HasLoansTo({"a"}, "p1")); +} + +TEST_F(LifetimeAnalysisTest, GslPointerConstructFromOwner) { + SetupTest(R"( + void target() { + MyObj al, bl, cl, dl, el, fl; + View a = View(al); + View b = View{bl}; + View c = View(View(View(cl))); + View d = View{View(View(dl))}; + View e = View{View{View{el}}}; + View f = {fl}; + POINT(p1); + } + )"); + EXPECT_THAT(Origin("a"), HasLoansTo({"al"}, "p1")); + EXPECT_THAT(Origin("b"), HasLoansTo({"bl"}, "p1")); + EXPECT_THAT(Origin("c"), HasLoansTo({"cl"}, "p1")); + EXPECT_THAT(Origin("d"), HasLoansTo({"dl"}, "p1")); + EXPECT_THAT(Origin("e"), 
HasLoansTo({"el"}, "p1")); + EXPECT_THAT(Origin("f"), HasLoansTo({"fl"}, "p1")); +} + +TEST_F(LifetimeAnalysisTest, GslPointerConstructFromView) { + SetupTest(R"( + void target() { + MyObj a; + View x = View(a); + View y = View{x}; + View z = View(View(View(y))); + View p = View{View(View(x))}; + View q = {x}; + POINT(p1); + } + )"); + EXPECT_THAT(Origin("x"), HasLoansTo({"a"}, "p1")); + EXPECT_THAT(Origin("y"), HasLoansTo({"a"}, "p1")); + EXPECT_THAT(Origin("z"), HasLoansTo({"a"}, "p1")); + EXPECT_THAT(Origin("p"), HasLoansTo({"a"}, "p1")); + EXPECT_THAT(Origin("q"), HasLoansTo({"a"}, "p1")); +} + +// FIXME: Handle loans in ternary operator! +TEST_F(LifetimeAnalysisTest, GslPointerInConditionalOperator) { + SetupTest(R"( + void target(bool cond) { + MyObj a, b; + View v = cond ? a : b; + POINT(p1); + } + )"); + EXPECT_THAT(Origin("v"), HasLoansTo({}, "p1")); +} + +// FIXME: Handle temporaries. +TEST_F(LifetimeAnalysisTest, ViewFromTemporary) { + SetupTest(R"( + MyObj temporary(); + void target() { + View v = temporary(); + POINT(p1); + } + )"); + EXPECT_THAT(Origin("v"), HasLoansTo({}, "p1")); +} + +TEST_F(LifetimeAnalysisTest, GslPointerWithConstAndAuto) { + SetupTest(R"( + void target() { + MyObj a; + const View v1 = a; + auto v2 = v1; + const auto& v3 = v2; + POINT(p1); + } + )"); + EXPECT_THAT(Origin("v1"), HasLoansTo({"a"}, "p1")); + EXPECT_THAT(Origin("v2"), HasLoansTo({"a"}, "p1")); + EXPECT_THAT(Origin("v3"), HasLoansTo({"a"}, "p1")); +} + +TEST_F(LifetimeAnalysisTest, GslPointerPropagation) { + SetupTest(R"( + void target() { + MyObj a; + View x = a; + POINT(p1); + + View y = x; // Propagation via copy-construction + POINT(p2); + + View z; + z = x; // Propagation via copy-assignment + POINT(p3); + } + )"); + + EXPECT_THAT(Origin("x"), HasLoansTo({"a"}, "p1")); + EXPECT_THAT(Origin("y"), HasLoansTo({"a"}, "p2")); + EXPECT_THAT(Origin("z"), HasLoansTo({"a"}, "p3")); +} + +TEST_F(LifetimeAnalysisTest, GslPointerLoanExpiration) { + SetupTest(R"( + void 
target() { + View x; + { + MyObj a; + x = a; + POINT(before_expiry); + } // `a` is destroyed here. + POINT(after_expiry); + } + )"); + + EXPECT_THAT(NoLoans(), AreExpiredAt("before_expiry")); + EXPECT_THAT(LoansTo({"a"}), AreExpiredAt("after_expiry")); +} + +TEST_F(LifetimeAnalysisTest, GslPointerReassignment) { + SetupTest(R"( + void target() { + MyObj safe; + View v; + v = safe; + POINT(p1); + { + MyObj unsafe; + v = unsafe; + POINT(p2); + } // `unsafe` expires here. + POINT(p3); + } + )"); + + EXPECT_THAT(Origin("v"), HasLoansTo({"safe"}, "p1")); + EXPECT_THAT(Origin("v"), HasLoansTo({"unsafe"}, "p2")); + EXPECT_THAT(Origin("v"), HasLoansTo({"unsafe"}, "p3")); + EXPECT_THAT(LoansTo({"unsafe"}), AreExpiredAt("p3")); +} + +TEST_F(LifetimeAnalysisTest, GslPointerConversionOperator) { + SetupTest(R"( + struct String; + + struct [[gsl::Pointer()]] StringView { + StringView() = default; + }; + + struct String { + ~String() {} + operator StringView() const; + }; + + void target() { + String xl, yl; + StringView x = xl; + StringView y; + y = yl; + POINT(p1); + } + )"); + EXPECT_THAT(Origin("x"), HasLoansTo({"xl"}, "p1")); + EXPECT_THAT(Origin("y"), HasLoansTo({"yl"}, "p1")); +} + } // anonymous namespace } // namespace clang::lifetimes::internal diff --git a/clang/unittests/CodeGen/TestCompiler.h b/clang/unittests/CodeGen/TestCompiler.h index f6fada5f8a1f0..57b5b079a2e30 100644 --- a/clang/unittests/CodeGen/TestCompiler.h +++ b/clang/unittests/CodeGen/TestCompiler.h @@ -36,7 +36,8 @@ struct TestCompiler { clang::CodeGenOptions CGO = clang::CodeGenOptions()) { compiler.getLangOpts() = LO; compiler.getCodeGenOpts() = CGO; - compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + compiler.createDiagnostics(); std::string TrStr = llvm::Triple::normalize(llvm::sys::getProcessTriple()); llvm::Triple Tr(TrStr); diff --git a/clang/unittests/Driver/ToolChainTest.cpp b/clang/unittests/Driver/ToolChainTest.cpp 
index 4fa27297cfd87..c1c5c9604aa16 100644 --- a/clang/unittests/Driver/ToolChainTest.cpp +++ b/clang/unittests/Driver/ToolChainTest.cpp @@ -579,7 +579,8 @@ TEST(CompilerInvocation, SplitSwarfSingleCrash) { TEST(ToolChainTest, UEFICallingConventionTest) { clang::CompilerInstance compiler; - compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + compiler.createDiagnostics(); std::string TrStr = "x86_64-unknown-uefi"; llvm::Triple Tr(TrStr); diff --git a/clang/unittests/Format/CMakeLists.txt b/clang/unittests/Format/CMakeLists.txt index 5e5a7a0552993..03fff988d4663 100644 --- a/clang/unittests/Format/CMakeLists.txt +++ b/clang/unittests/Format/CMakeLists.txt @@ -28,6 +28,7 @@ add_distinct_clang_unittest(FormatTests MacroExpanderTest.cpp MatchFilePathTest.cpp NamespaceEndCommentsFixerTest.cpp + NumericLiteralCaseTest.cpp NumericLiteralInfoTest.cpp ObjCPropertyAttributeOrderFixerTest.cpp QualifierFixerTest.cpp diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 4e9d31895998f..d9db06667d802 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -5564,6 +5564,63 @@ TEST_F(FormatTest, IndentsPPDirectiveWithPPIndentWidth) { " }", style); + style.IndentPPDirectives = FormatStyle::PPDIS_Leave; + style.IndentWidth = 4; + verifyNoChange("#ifndef foo\n" + "#define foo\n" + "if (emacs) {\n" + "#ifdef is\n" + "#define lit \\\n" + " if (af) { \\\n" + " return duh(); \\\n" + " }\n" + "#endif\n" + "}\n" + "#endif", + style); + verifyNoChange("#ifndef foo\n" + " #define foo\n" + "if (emacs) {\n" + " #ifdef is\n" + "#define lit \\\n" + " if (af) { \\\n" + " return duh(); \\\n" + " }\n" + " #endif\n" + "}\n" + "#endif", + style); + verifyNoChange(" #ifndef foo\n" + "# define foo\n" + "if (emacs) {\n" + "#ifdef is\n" + " # define lit \\\n" + " if (af) { \\\n" + " return duh(); \\\n" + " }\n" + "#endif\n" + "}\n" + " #endif", + 
style); + verifyNoChange("#ifdef foo\n" + "#else\n" + "/* This is a comment */\n" + "#ifdef BAR\n" + "#endif\n" + "#endif", + style); + + style.IndentWidth = 1; + style.PPIndentWidth = 4; + verifyNoChange("# if 1\n" + " #define X \\\n" + " { \\\n" + " x; \\\n" + " x; \\\n" + " }\n" + "# endif", + style); + style.IndentWidth = 4; style.PPIndentWidth = 1; style.IndentPPDirectives = FormatStyle::PPDIS_AfterHash; @@ -25597,6 +25654,20 @@ TEST_F(FormatTest, SkipMacroDefinitionBody) { "a", Style); + Style.IndentPPDirectives = FormatStyle::PPDIS_Leave; + verifyNoChange("#if A\n" + "#define A a\n" + "#endif", + Style); + verifyNoChange("#if A\n" + " #define A a\n" + "#endif", + Style); + verifyNoChange("#if A\n" + "# define A a\n" + "#endif", + Style); + // Adjust indendations but don't change the definition. Style.IndentPPDirectives = FormatStyle::PPDIS_None; verifyNoChange("#if A\n" diff --git a/clang/unittests/Format/NumericLiteralCaseTest.cpp b/clang/unittests/Format/NumericLiteralCaseTest.cpp new file mode 100644 index 0000000000000..ecd230d73f692 --- /dev/null +++ b/clang/unittests/Format/NumericLiteralCaseTest.cpp @@ -0,0 +1,346 @@ +//===- unittest/Format/NumericLiteralCaseTest.cpp -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FormatTestBase.h" + +#define DEBUG_TYPE "numeric-literal-case-test" + +namespace clang { +namespace format { +namespace test { +namespace { + +class NumericLiteralCaseTest : public FormatTestBase {}; + +TEST_F(NumericLiteralCaseTest, Prefix) { + constexpr StringRef Bin0("b = 0b0'10'010uL;"); + constexpr StringRef Bin1("b = 0B010'010Ul;"); + constexpr StringRef Hex0("b = 0xdead'BEEFuL;"); + constexpr StringRef Hex1("b = 0Xdead'BEEFUl;"); + verifyFormat(Bin0); + verifyFormat(Bin1); + verifyFormat(Hex0); + verifyFormat(Hex1); + + auto Style = getLLVMStyle(); + EXPECT_EQ(Style.NumericLiteralCase.Prefix, FormatStyle::NLCS_Leave); + EXPECT_EQ(Style.NumericLiteralCase.HexDigit, FormatStyle::NLCS_Leave); + EXPECT_EQ(Style.NumericLiteralCase.ExponentLetter, FormatStyle::NLCS_Leave); + EXPECT_EQ(Style.NumericLiteralCase.Suffix, FormatStyle::NLCS_Leave); + + Style.NumericLiteralCase.Prefix = FormatStyle::NLCS_Upper; + verifyFormat("b = 0B0'10'010uL;", Bin0, Style); + verifyFormat(Bin1, Style); + verifyFormat("b = 0Xdead'BEEFuL;", Hex0, Style); + verifyFormat(Hex1, Style); + verifyFormat("i = 0XaBcD.a0Ebp123F;", Style); + verifyFormat("j = 0XaBcD.a0EbP123f;", Style); + + Style.NumericLiteralCase.Prefix = FormatStyle::NLCS_Lower; + verifyFormat(Bin0, Style); + verifyFormat("b = 0b010'010Ul;", Bin1, Style); + verifyFormat(Hex0, Style); + verifyFormat("b = 0xdead'BEEFUl;", Hex1, Style); +} + +TEST_F(NumericLiteralCaseTest, HexDigit) { + constexpr StringRef A("a = 0xaBc0'123fuL;"); + constexpr StringRef B("b = 0XaBc0'123FUl;"); + constexpr StringRef C("c = 0xa'Bc.0p12'3f32;"); + constexpr StringRef D("d = 0xa'Bc.0P12'3F128;"); + constexpr StringRef E("e = 0b0011'00Ull;"); + constexpr StringRef F("f = 0B0100'000zu;"); + constexpr StringRef G("g = 0.123e-19f;"); + constexpr StringRef H("h = 0.12'3E-19F16;"); + constexpr 
StringRef I("i = 0x.0000aBcp12'3F128;"); + constexpr StringRef J("j = 0xaa1'fP12'3F128;"); + constexpr StringRef K("k = 0x0;"); + constexpr StringRef L("l = 0xA;"); + verifyFormat(A); + verifyFormat(B); + verifyFormat(C); + verifyFormat(D); + verifyFormat(E); + verifyFormat(F); + verifyFormat(G); + verifyFormat(H); + verifyFormat(I); + verifyFormat(J); + verifyFormat(K); + verifyFormat(L); + + auto Style = getLLVMStyle(); + Style.NumericLiteralCase.HexDigit = FormatStyle::NLCS_Upper; + verifyFormat("a = 0xABC0'123FuL;", A, Style); + verifyFormat("b = 0XABC0'123FUl;", B, Style); + verifyFormat("c = 0xA'BC.0p12'3f32;", C, Style); + verifyFormat("d = 0xA'BC.0P12'3F128;", D, Style); + verifyFormat(E, Style); + verifyFormat(F, Style); + verifyFormat(G, Style); + verifyFormat(H, Style); + verifyFormat("i = 0x.0000ABCp12'3F128;", I, Style); + verifyFormat("j = 0xAA1'FP12'3F128;", J, Style); + verifyFormat(K, Style); + verifyFormat(L, Style); + + Style.NumericLiteralCase.HexDigit = FormatStyle::NLCS_Lower; + verifyFormat("a = 0xabc0'123fuL;", A, Style); + verifyFormat("b = 0Xabc0'123fUl;", B, Style); + verifyFormat("c = 0xa'bc.0p12'3f32;", C, Style); + verifyFormat("d = 0xa'bc.0P12'3F128;", D, Style); + verifyFormat(E, Style); + verifyFormat(F, Style); + verifyFormat(G, Style); + verifyFormat(H, Style); + verifyFormat("i = 0x.0000abcp12'3F128;", I, Style); + verifyFormat("j = 0xaa1'fP12'3F128;", J, Style); + verifyFormat(K, Style); + verifyFormat("l = 0xa;", Style); +} + +TEST_F(NumericLiteralCaseTest, ExponentLetter) { + constexpr StringRef A("a = .0'01e-19f;"); + constexpr StringRef B("b = .00'1E2F;"); + constexpr StringRef C("c = 10'2.e99;"); + constexpr StringRef D("d = 123.456E-1;"); + constexpr StringRef E("e = 0x12abEe3.456p-10'0;"); + constexpr StringRef F("f = 0x.deEfP23;"); + constexpr StringRef G("g = 0xe0E1.p-1;"); + verifyFormat(A); + verifyFormat(B); + verifyFormat(C); + verifyFormat(D); + verifyFormat(E); + verifyFormat(F); + verifyFormat(G); + + auto Style 
= getLLVMStyle(); + Style.NumericLiteralCase.ExponentLetter = FormatStyle::NLCS_Lower; + verifyFormat(A, Style); + verifyFormat("b = .00'1e2F;", B, Style); + verifyFormat(C, Style); + verifyFormat("d = 123.456e-1;", D, Style); + verifyFormat(E, Style); + verifyFormat("f = 0x.deEfp23;", F, Style); + verifyFormat(G, Style); + + Style.NumericLiteralCase.ExponentLetter = FormatStyle::NLCS_Upper; + verifyFormat("a = .0'01E-19f;", A, Style); + verifyFormat(B, Style); + verifyFormat("c = 10'2.E99;", C, Style); + verifyFormat(D, Style); + verifyFormat("e = 0x12abEe3.456P-10'0;", E, Style); + verifyFormat(F, Style); + verifyFormat("g = 0xe0E1.P-1;", G, Style); +} + +TEST_F(NumericLiteralCaseTest, IntegerSuffix) { + constexpr StringRef A("a = 102u;"); + constexpr StringRef B("b = 0177U;"); + constexpr StringRef C("c = 0b101'111llU;"); + constexpr StringRef D("d = 0xdead'BeefuZ;"); + constexpr StringRef E("e = 3lU;"); + constexpr StringRef F("f = 1zu;"); + constexpr StringRef G("g = 0uLL;"); + constexpr StringRef H("h = 10'233'213'0101uLL;"); + verifyFormat(A); + verifyFormat(B); + verifyFormat(C); + verifyFormat(D); + verifyFormat(E); + verifyFormat(F); + verifyFormat(G); + verifyFormat(H); + + auto Style = getLLVMStyle(); + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Lower; + verifyFormat(A, Style); + verifyFormat("b = 0177u;", B, Style); + verifyFormat("c = 0b101'111llu;", C, Style); + verifyFormat("d = 0xdead'Beefuz;", D, Style); + verifyFormat("e = 3lu;", E, Style); + verifyFormat(F, Style); + verifyFormat("g = 0ull;", G, Style); + verifyFormat("h = 10'233'213'0101ull;", H, Style); + + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Upper; + verifyFormat("a = 102U;", A, Style); + verifyFormat(B, Style); + verifyFormat("c = 0b101'111LLU;", C, Style); + verifyFormat("d = 0xdead'BeefUZ;", D, Style); + verifyFormat("e = 3LU;", E, Style); + verifyFormat("f = 1ZU;", F, Style); + verifyFormat("g = 0ULL;", G, Style); + verifyFormat("h = 10'233'213'0101ULL;", H, 
Style); +} + +TEST_F(NumericLiteralCaseTest, FloatingPointSuffix) { + auto Style = getLLVMStyle(); + // Floating point literals without suffixes. + constexpr std::array FloatingPointStatements = { + "a = 0.", "b = 1.0", "c = .123'45E-10", + "d = 12'3.0e1", "e = 0Xa0eE.P10", "f = 0xeE01.aFf3p6", + }; + + // All legal floating-point literal suffixes defined in the C++23 standard in + // lowercase. + constexpr std::array FloatingPointSuffixes = { + "f", "l", "f16", "f32", "f64", "f128", "bf16", + }; + + // Test all combinations of literals with suffixes. + for (const auto &Statement : FloatingPointStatements) { + for (const auto &Suffix : FloatingPointSuffixes) { + const auto LowerLine = Statement.str() + Suffix.str() + ";"; + const auto UpperLine = Statement.str() + Suffix.upper() + ";"; + + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Leave; + verifyFormat(LowerLine, Style); + verifyFormat(UpperLine, Style); + + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Lower; + verifyFormat(LowerLine, Style); + verifyFormat(LowerLine, UpperLine, Style); + + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Upper; + verifyFormat(UpperLine, LowerLine, Style); + verifyFormat(UpperLine, Style); + } + } +} + +TEST_F(NumericLiteralCaseTest, CppStandardAndUserDefinedLiteralsAreUntouched) { + auto Style = getLLVMStyle(); + Style.NumericLiteralCase.Prefix = FormatStyle::NLCS_Upper; + Style.NumericLiteralCase.HexDigit = FormatStyle::NLCS_Upper; + Style.NumericLiteralCase.ExponentLetter = FormatStyle::NLCS_Upper; + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Upper; + + // C++ user-defined suffixes begin with '_' or are reserved for the standard + // library. 
+ constexpr StringRef UDLiterals("a = 12.if;\n" + "b = -3i;\n" + "c = 100'01il;\n" + "d = 100'0.12il;\n" + "e = 12h;\n" + "f = 0XABE12h;\n" + "g = 0XFA03min;\n" + "h = 0X12B4Ds;\n" + "i = 20.13E-1ms;\n" + "j = 20.13E-1us;\n" + "k = 20.13E-1ns;\n" + "l = 20.13E-1y;\n" + "m = 20.13E-1d;\n" + "n = 20.13E-1d;\n" + "o = 1d;\n" + "p = 102_ffl_lzlz;\n" + "q = 10.2_l;\n" + "r = 0XABDE.0'1P-23_f;\n" + "s = 102_foo_bar;\n" + "t = 123.456_felfz_ballpen;\n" + "u = 0XBEAD1_spacebar;"); + + verifyFormat(UDLiterals, Style); + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Lower; + verifyFormat(UDLiterals, Style); +} + +TEST_F(NumericLiteralCaseTest, FixRanges) { + auto Style = getLLVMStyle(); + Style.NumericLiteralCase.Prefix = FormatStyle::NLCS_Lower; + Style.NumericLiteralCase.HexDigit = FormatStyle::NLCS_Lower; + Style.NumericLiteralCase.ExponentLetter = FormatStyle::NLCS_Lower; + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Lower; + + constexpr StringRef CodeBlock("a = 0xFea3duLL;\n" + "b = 0X.aEbp-12f;\n" + "c = 0uLL;\n" + "// clang-format off\n" + "e = 0xBeAdu;\n" + "// clang-format on\n" + "g = 0xabCDu;\n" + "h = 0b010uL;\n" + "// clang-format off\n" + "i = 0B1010'000Zu;\n" + "// clang-format on\n" + "k = 0XaBuL;"); + + verifyFormat("a = 0xfea3dull;\n" + "b = 0x.aebp-12f;\n" + "c = 0ull;\n" + "// clang-format off\n" + "e = 0xBeAdu;\n" + "// clang-format on\n" + "g = 0xabcdu;\n" + "h = 0b010ul;\n" + "// clang-format off\n" + "i = 0B1010'000Zu;\n" + "// clang-format on\n" + "k = 0xabul;", + CodeBlock, Style); +} + +TEST_F(NumericLiteralCaseTest, UnderScoreSeparatorLanguages) { + auto Style = getLLVMStyle(); + + constexpr StringRef CodeBlock("a = 0xFea_3dl;\n" + "b = 0123_345;\n" + "c = 0b11____00lU;\n" + "d = 0XB_e_A_du;\n" + "e = 123_456.333__456e-10f;\n" + "f = .1_0E-10D;\n" + "g = 1_0.F;\n" + "h = 0B1_0;"); + auto TestUnderscore = [&](auto Language) { + Style.Language = Language; + Style.NumericLiteralCase.Prefix = FormatStyle::NLCS_Lower; + 
Style.NumericLiteralCase.HexDigit = FormatStyle::NLCS_Upper; + Style.NumericLiteralCase.ExponentLetter = FormatStyle::NLCS_Lower; + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Upper; + verifyFormat("a = 0xFEA_3DL;\n" + "b = 0123_345;\n" + "c = 0b11____00LU;\n" + "d = 0xB_E_A_DU;\n" + "e = 123_456.333__456e-10F;\n" + "f = .1_0e-10D;\n" + "g = 1_0.F;\n" + "h = 0b1_0;", + CodeBlock, Style); + + Style.NumericLiteralCase.Prefix = FormatStyle::NLCS_Upper; + Style.NumericLiteralCase.HexDigit = FormatStyle::NLCS_Lower; + Style.NumericLiteralCase.ExponentLetter = FormatStyle::NLCS_Upper; + Style.NumericLiteralCase.Suffix = FormatStyle::NLCS_Lower; + + verifyFormat("a = 0Xfea_3dl;\n" + "b = 0123_345;\n" + "c = 0B11____00lu;\n" + "d = 0Xb_e_a_du;\n" + "e = 123_456.333__456E-10f;\n" + "f = .1_0E-10d;\n" + "g = 1_0.f;\n" + "h = 0B1_0;", + CodeBlock, Style); + }; + + TestUnderscore(FormatStyle::LK_CSharp); + TestUnderscore(FormatStyle::LK_Java); + TestUnderscore(FormatStyle::LK_JavaScript); + + Style.Language = FormatStyle::LK_JavaScript; + Style.NumericLiteralCase.Prefix = FormatStyle::NLCS_Upper; + verifyFormat("o = 0O0_10_010;", "o = 0o0_10_010;", Style); + Style.NumericLiteralCase.Prefix = FormatStyle::NLCS_Lower; + verifyFormat("o = 0o0_10_010;", "o = 0O0_10_010;", Style); +} + +} // namespace +} // namespace test +} // namespace format +} // namespace clang diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 141b0001cb52d..f6435f13f0791 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -4105,6 +4105,13 @@ TEST_F(TokenAnnotatorTest, UTF8StringLiteral) { EXPECT_TOKEN(Tokens[1], tok::utf8_string_literal, TT_Unknown); } +TEST_F(TokenAnnotatorTest, C23DigitSeparator) { + auto Tokens = annotate("return 1'000;", getLLVMStyle(FormatStyle::LK_C)); + ASSERT_EQ(Tokens.size(), 4u) << Tokens; + EXPECT_EQ(Tokens[1]->TokenText, "1'000"); + 
EXPECT_TOKEN(Tokens[2], tok::semi, TT_Unknown); +} + TEST_F(TokenAnnotatorTest, IdentifierPackage) { auto Tokens = annotate("auto package;"); ASSERT_EQ(Tokens.size(), 4u) << Tokens; diff --git a/clang/unittests/Frontend/CodeGenActionTest.cpp b/clang/unittests/Frontend/CodeGenActionTest.cpp index b2792c44ba5fe..182afdc7ea313 100644 --- a/clang/unittests/Frontend/CodeGenActionTest.cpp +++ b/clang/unittests/Frontend/CodeGenActionTest.cpp @@ -52,7 +52,8 @@ TEST(CodeGenTest, TestNullCodeGen) { Invocation->getFrontendOpts().ProgramAction = EmitLLVM; Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance Compiler(std::move(Invocation)); - Compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + Compiler.createDiagnostics(); EXPECT_TRUE(Compiler.hasDiagnostics()); std::unique_ptr Act(new NullCodeGenAction); @@ -69,7 +70,8 @@ TEST(CodeGenTest, CodeGenFromIRMemBuffer) { Invocation->getFrontendOpts().ProgramAction = frontend::EmitLLVMOnly; Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance Compiler(std::move(Invocation)); - Compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + Compiler.createDiagnostics(); EXPECT_TRUE(Compiler.hasDiagnostics()); EmitLLVMOnlyAction Action; diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp index 7c1b6539095fa..36cac5a5dd010 100644 --- a/clang/unittests/Frontend/CompilerInstanceTest.cpp +++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp @@ -72,6 +72,7 @@ TEST(CompilerInstance, DefaultVFSOverlayFromInvocation) { // in the CompilerInvocation (as we don't explicitly set our own). 
CompilerInstance Instance(std::move(CInvok)); Instance.setDiagnostics(Diags); + Instance.createVirtualFileSystem(); Instance.createFileManager(); // Check if the virtual file exists which means that our VFS is used by the @@ -135,8 +136,9 @@ TEST(CompilerInstance, MultipleInputsCleansFileIDs) { ASSERT_TRUE(CInvok) << "could not create compiler invocation"; CompilerInstance Instance(std::move(CInvok)); + Instance.setVirtualFileSystem(VFS); Instance.setDiagnostics(Diags); - Instance.createFileManager(VFS); + Instance.createFileManager(); // Run once for `a.cc` and then for `a.h`. This makes sure we get the same // file ID for `b.h` in the second run as `a.h` from first run. diff --git a/clang/unittests/Frontend/FrontendActionTest.cpp b/clang/unittests/Frontend/FrontendActionTest.cpp index 48c0cfd6a185a..c4003182c4b1d 100644 --- a/clang/unittests/Frontend/FrontendActionTest.cpp +++ b/clang/unittests/Frontend/FrontendActionTest.cpp @@ -91,7 +91,8 @@ TEST(ASTFrontendAction, Sanity) { invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance compiler(std::move(invocation)); - compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + compiler.createDiagnostics(); TestASTFrontendAction test_action; ASSERT_TRUE(compiler.ExecuteAction(test_action)); @@ -110,7 +111,8 @@ TEST(ASTFrontendAction, IncrementalParsing) { invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance compiler(std::move(invocation)); - compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + compiler.createDiagnostics(); TestASTFrontendAction test_action(/*enableIncrementalProcessing=*/true); ASSERT_TRUE(compiler.ExecuteAction(test_action)); @@ -136,7 +138,8 @@ 
TEST(ASTFrontendAction, LateTemplateIncrementalParsing) { invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance compiler(std::move(invocation)); - compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + compiler.createDiagnostics(); TestASTFrontendAction test_action(/*enableIncrementalProcessing=*/true, /*actOnEndOfTranslationUnit=*/true); @@ -181,7 +184,8 @@ TEST(PreprocessorFrontendAction, EndSourceFile) { Invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance Compiler(std::move(Invocation)); - Compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + Compiler.createDiagnostics(); TestPPCallbacks *Callbacks = new TestPPCallbacks; TestPPCallbacksFrontendAction TestAction(Callbacks); @@ -242,8 +246,8 @@ TEST(ASTFrontendAction, ExternalSemaSource) { Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance Compiler(std::move(Invocation)); auto *TDC = new TypoDiagnosticConsumer; - Compiler.createDiagnostics(*llvm::vfs::getRealFileSystem(), TDC, - /*ShouldOwnClient=*/true); + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + Compiler.createDiagnostics(TDC, /*ShouldOwnClient=*/true); Compiler.setExternalSemaSource( llvm::makeIntrusiveRefCnt(Compiler)); @@ -275,7 +279,8 @@ TEST(GeneratePCHFrontendAction, CacheGeneratedPCH) { Invocation->getFrontendOpts().ProgramAction = frontend::GeneratePCH; Invocation->getTargetOpts().Triple = "x86_64-apple-darwin19.0.0"; CompilerInstance Compiler(std::move(Invocation)); - Compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + Compiler.createDiagnostics(); GeneratePCHAction TestAction; 
ASSERT_TRUE(Compiler.ExecuteAction(TestAction)); diff --git a/clang/unittests/Frontend/OutputStreamTest.cpp b/clang/unittests/Frontend/OutputStreamTest.cpp index dfb5a544cb88a..9e288f86351ca 100644 --- a/clang/unittests/Frontend/OutputStreamTest.cpp +++ b/clang/unittests/Frontend/OutputStreamTest.cpp @@ -38,7 +38,8 @@ TEST(FrontendOutputTests, TestOutputStream) { new raw_svector_ostream(IRBuffer)); Compiler.setOutputStream(std::move(IRStream)); - Compiler.createDiagnostics(*llvm::vfs::getRealFileSystem()); + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + Compiler.createDiagnostics(); bool Success = ExecuteCompilerInvocation(&Compiler); EXPECT_TRUE(Success); @@ -62,8 +63,8 @@ TEST(FrontendOutputTests, TestVerboseOutputStreamShared) { Compiler.setOutputStream(std::make_unique()); DiagnosticOptions DiagOpts; - Compiler.createDiagnostics(*llvm::vfs::getRealFileSystem(), - new TextDiagnosticPrinter(llvm::nulls(), DiagOpts), + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); + Compiler.createDiagnostics(new TextDiagnosticPrinter(llvm::nulls(), DiagOpts), true); Compiler.setVerboseOutputStream(VerboseStream); @@ -92,8 +93,8 @@ TEST(FrontendOutputTests, TestVerboseOutputStreamOwned) { Compiler.setOutputStream(std::make_unique()); DiagnosticOptions DiagOpts; + Compiler.setVirtualFileSystem(llvm::vfs::getRealFileSystem()); Compiler.createDiagnostics( - *llvm::vfs::getRealFileSystem(), new TextDiagnosticPrinter(llvm::nulls(), DiagOpts), true); Compiler.setVerboseOutputStream(std::move(VerboseStream)); diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt index db9f80d9f53fe..7b8dcfc9b0546 100644 --- a/clang/unittests/Interpreter/CMakeLists.txt +++ b/clang/unittests/Interpreter/CMakeLists.txt @@ -29,12 +29,25 @@ set(CLANG_LIBS_TO_LINK ) endif() -add_distinct_clang_unittest(ClangReplInterpreterTests +set(CLANG_REPL_TEST_SOURCES IncrementalCompilerBuilderTest.cpp IncrementalProcessingTest.cpp 
InterpreterTest.cpp InterpreterExtensionsTest.cpp CodeCompletionTest.cpp +) + +if(TARGET compiler-rt) + list(APPEND CLANG_REPL_TEST_SOURCES + OutOfProcessInterpreterTests.cpp + ) + message(STATUS "Compiler-RT found, enabling out of process JIT tests") +endif() + +add_distinct_clang_unittest(ClangReplInterpreterTests + ${CLANG_REPL_TEST_SOURCES} + + PARTIAL_SOURCES_INTENDED EXPORT_SYMBOLS @@ -48,6 +61,14 @@ add_distinct_clang_unittest(ClangReplInterpreterTests ${LLVM_COMPONENTS_TO_LINK} ) +if(TARGET compiler-rt) + add_dependencies(ClangReplInterpreterTests + llvm-jitlink-executor + compiler-rt + ) + message(STATUS "Adding dependency on compiler-rt for out of process JIT tests") +endif() + if(EMSCRIPTEN) # Without the above you try to link to LLVMSupport twice, and end # up with a duplicate symbol error when creating the main module diff --git a/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp new file mode 100644 index 0000000000000..704ddc37e642e --- /dev/null +++ b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp @@ -0,0 +1,203 @@ +//===- unittests/Interpreter/OutOfProcessInterpreterTest.cpp --- Interpreter +// tests when Out-of-Process ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Unit tests for Clang's Interpreter library. 
+// +//===----------------------------------------------------------------------===// + +#include "InterpreterTestFixture.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclGroup.h" +#include "clang/AST/Mangle.h" +#include "clang/Basic/Version.h" +#include "clang/Config/config.h" +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Frontend/TextDiagnosticPrinter.h" +#include "clang/Interpreter/Interpreter.h" +#include "clang/Interpreter/Value.h" +#include "clang/Sema/Lookup.h" +#include "clang/Sema/Sema.h" +#include "llvm/Support/Error.h" +#include "llvm/TargetParser/Host.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include +#include +#include +#include + +using namespace clang; + +llvm::ExitOnError ExitOnError; + +namespace { + +using Args = std::vector; + +struct FileDeleter { + void operator()(FILE *f) { + if (f) + fclose(f); + } +}; + +struct IOContext { + std::unique_ptr stdin_file; + std::unique_ptr stdout_file; + std::unique_ptr stderr_file; + + bool initializeTempFiles() { + stdin_file.reset(tmpfile()); + stdout_file.reset(tmpfile()); + stderr_file.reset(tmpfile()); + return stdin_file && stdout_file && stderr_file; + } + + std::string readStdoutContent() { + if (!stdout_file) + return ""; + rewind(stdout_file.get()); + std::ostringstream content; + char buffer[1024]; + size_t bytes_read; + while ((bytes_read = fread(buffer, 1, sizeof(buffer), stdout_file.get())) > + 0) { + content.write(buffer, bytes_read); + } + return content.str(); + } + + std::string readStderrContent() { + if (!stderr_file) + return ""; + rewind(stderr_file.get()); + std::ostringstream content; + char buffer[1024]; + size_t bytes_read; + while ((bytes_read = fread(buffer, 1, sizeof(buffer), stderr_file.get())) > + 0) { + content.write(buffer, bytes_read); + } + return content.str(); + } +}; + +static void removePathComponent(unsigned N, llvm::SmallString<256> &Path) { + for (unsigned i = 0; i < N; ++i) + llvm::sys::path::remove_filename(Path); +} + +static 
std::string getExecutorPath() { + llvm::SmallString<256> ExecutorPath(llvm::sys::fs::getMainExecutable( + nullptr, reinterpret_cast(&getExecutorPath))); + removePathComponent(5, ExecutorPath); + llvm::sys::path::append(ExecutorPath, "bin", "llvm-jitlink-executor"); + return ExecutorPath.str().str(); +} + +static std::string getOrcRuntimePath() { + llvm::SmallString<256> RuntimePath(llvm::sys::fs::getMainExecutable( + nullptr, reinterpret_cast(&getOrcRuntimePath))); + removePathComponent(5, RuntimePath); + llvm::sys::path::append(RuntimePath, CLANG_INSTALL_LIBDIR_BASENAME, "clang", + CLANG_VERSION_MAJOR_STRING, "lib"); + + llvm::Triple SystemTriple(llvm::sys::getProcessTriple()); + if (SystemTriple.isOSBinFormatMachO()) { + llvm::sys::path::append(RuntimePath, "darwin", "liborc_rt_osx.a"); + } else if (SystemTriple.isOSBinFormatELF()) { + llvm::sys::path::append(RuntimePath, "x86_64-unknown-linux-gnu", + "liborc_rt.a"); + } + return RuntimePath.str().str(); +} + +static std::unique_ptr +createInterpreterWithRemoteExecution(std::shared_ptr io_ctx, + const Args &ExtraArgs = {}) { + Args ClangArgs = {"-Xclang", "-emit-llvm-only"}; + llvm::append_range(ClangArgs, ExtraArgs); + auto CB = clang::IncrementalCompilerBuilder(); + CB.SetCompilerArgs(ClangArgs); + auto CI = cantFail(CB.CreateCpp()); + + clang::Interpreter::JITConfig Config; + llvm::Triple SystemTriple(llvm::sys::getProcessTriple()); + + if (SystemTriple.isOSBinFormatELF() || SystemTriple.isOSBinFormatMachO()) { + Config.IsOutOfProcess = true; + Config.OOPExecutor = getExecutorPath(); + Config.UseSharedMemory = false; + Config.SlabAllocateSize = 0; + Config.OrcRuntimePath = getOrcRuntimePath(); + + int stdin_fd = fileno(io_ctx->stdin_file.get()); + int stdout_fd = fileno(io_ctx->stdout_file.get()); + int stderr_fd = fileno(io_ctx->stderr_file.get()); + + Config.CustomizeFork = [=] { + auto redirect = [](int from, int to) { + if (from != to) { + dup2(from, to); + close(from); + } + }; + + redirect(stdin_fd, 
STDIN_FILENO); + redirect(stdout_fd, STDOUT_FILENO); + redirect(stderr_fd, STDERR_FILENO); + + setvbuf(stdout, nullptr, _IONBF, 0); + setvbuf(stderr, nullptr, _IONBF, 0); + + printf("CustomizeFork executed\n"); + fflush(stdout); + }; + } + + return cantFail(clang::Interpreter::create(std::move(CI), Config)); +} + +static size_t DeclsSize(TranslationUnitDecl *PTUDecl) { + return std::distance(PTUDecl->decls().begin(), PTUDecl->decls().end()); +} + +TEST_F(InterpreterTestBase, SanityWithRemoteExecution) { + if (!HostSupportsJIT()) + GTEST_SKIP(); + + std::string OrcRuntimePath = getOrcRuntimePath(); + std::string ExecutorPath = getExecutorPath(); + + if (!llvm::sys::fs::exists(OrcRuntimePath) || + !llvm::sys::fs::exists(ExecutorPath)) + GTEST_SKIP(); + + auto io_ctx = std::make_shared(); + ASSERT_TRUE(io_ctx->initializeTempFiles()); + + std::unique_ptr Interp = + createInterpreterWithRemoteExecution(io_ctx); + ASSERT_TRUE(Interp); + + using PTU = PartialTranslationUnit; + PTU &R1(cantFail(Interp->Parse("void g(); void g() {}"))); + EXPECT_EQ(2U, DeclsSize(R1.TUPart)); + + PTU &R2(cantFail(Interp->Parse("int i = 42;"))); + EXPECT_EQ(1U, DeclsSize(R2.TUPart)); + + std::string captured_stdout = io_ctx->readStdoutContent(); + std::string captured_stderr = io_ctx->readStderrContent(); + + EXPECT_TRUE(captured_stdout.find("CustomizeFork executed") != + std::string::npos); +} + +} // end anonymous namespace \ No newline at end of file diff --git a/clang/unittests/Sema/HeuristicResolverTest.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp index 883a4e20e40a7..0eb14f032f16c 100644 --- a/clang/unittests/Sema/HeuristicResolverTest.cpp +++ b/clang/unittests/Sema/HeuristicResolverTest.cpp @@ -216,7 +216,6 @@ TEST(HeuristicResolver, MemberExpr_AutoTypeDeduction2) { struct B { int waldo; }; - template struct A { B b; @@ -251,6 +250,103 @@ TEST(HeuristicResolver, MemberExpr_Chained) { cxxMethodDecl(hasName("foo")).bind("output")); } +TEST(HeuristicResolver, 
MemberExpr_Chained_ReferenceType) { + std::string Code = R"cpp( + struct B { + int waldo; + }; + template + struct A { + B &foo(); + }; + template + void bar(A a) { + a.foo().waldo; + } + )cpp"; + // Test resolution of "waldo" in "a.foo().waldo" + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + +TEST(HeuristicResolver, MemberExpr_Chained_PointerArrow) { + std::string Code = R"cpp( + struct B { + int waldo; + }; + template + B* foo(T); + template + void bar(T t) { + foo(t)->waldo; + } + )cpp"; + // Test resolution of "waldo" in "foo(t)->waldo" + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + +TEST(HeuristicResolver, MemberExpr_Chained_PointerDeref) { + std::string Code = R"cpp( + struct B { + int waldo; + }; + template + B* foo(T); + template + void bar(T t) { + (*foo(t)).waldo; + } + )cpp"; + // Test resolution of "waldo" in "foo(t)->waldo" + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + +TEST(HeuristicResolver, MemberExpr_Chained_Overload) { + std::string Code = R"cpp( + struct B { + int waldo; + }; + B overloaded(int); + B overloaded(double); + template + void foo(T t) { + overloaded(t).waldo; + } + )cpp"; + // Test resolution of "waldo" in "overloaded(t).waldo" + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + +TEST(HeuristicResolver, MemberExpr_CallToFunctionTemplate) { + std::string Code = R"cpp( + struct B { + int waldo; + }; + template + B bar(T); + template + void foo(T t) { + bar(t).waldo; + } + )cpp"; + // Test 
resolution of "waldo" in "bar(t).waldo" + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + TEST(HeuristicResolver, MemberExpr_ReferenceType) { std::string Code = R"cpp( struct B { diff --git a/clang/unittests/Serialization/ForceCheckFileInputTest.cpp b/clang/unittests/Serialization/ForceCheckFileInputTest.cpp index 92ff76b016283..24e2fd65f3c0a 100644 --- a/clang/unittests/Serialization/ForceCheckFileInputTest.cpp +++ b/clang/unittests/Serialization/ForceCheckFileInputTest.cpp @@ -91,10 +91,8 @@ export int aa = 43; Instance.getFrontendOpts().OutputFile = BMIPath; - if (auto VFSWithRemapping = createVFSFromCompilerInvocation( - Instance.getInvocation(), Instance.getDiagnostics(), CIOpts.VFS)) - CIOpts.VFS = VFSWithRemapping; - Instance.createFileManager(CIOpts.VFS); + Instance.createVirtualFileSystem(CIOpts.VFS); + Instance.createFileManager(); Instance.getHeaderSearchOpts().ValidateASTInputFilesContent = true; @@ -123,7 +121,8 @@ export int aa = 43; CompilerInstance Clang(std::move(Invocation)); Clang.setDiagnostics(Diags); - FileManager *FM = Clang.createFileManager(CIOpts.VFS); + Clang.createVirtualFileSystem(CIOpts.VFS); + FileManager *FM = Clang.createFileManager(); Clang.createSourceManager(*FM); EXPECT_TRUE(Clang.createTarget()); diff --git a/clang/unittests/Serialization/ModuleCacheTest.cpp b/clang/unittests/Serialization/ModuleCacheTest.cpp index 1f64401a08314..e9b8da3dba6af 100644 --- a/clang/unittests/Serialization/ModuleCacheTest.cpp +++ b/clang/unittests/Serialization/ModuleCacheTest.cpp @@ -121,6 +121,7 @@ TEST_F(ModuleCacheTest, CachedModuleNewPath) { createInvocationAndEnableFree(Args, CIOpts); ASSERT_TRUE(Invocation); CompilerInstance Instance(std::move(Invocation)); + Instance.setVirtualFileSystem(CIOpts.VFS); Instance.setDiagnostics(Diags); SyntaxOnlyAction Action; ASSERT_TRUE(Instance.ExecuteAction(Action)); @@ 
-145,6 +146,7 @@ TEST_F(ModuleCacheTest, CachedModuleNewPath) { CompilerInstance Instance2(std::move(Invocation2), Instance.getPCHContainerOperations(), &Instance.getModuleCache()); + Instance2.setVirtualFileSystem(CIOpts.VFS); Instance2.setDiagnostics(Diags); SyntaxOnlyAction Action2; ASSERT_FALSE(Instance2.ExecuteAction(Action2)); @@ -171,6 +173,7 @@ TEST_F(ModuleCacheTest, CachedModuleNewPathAllowErrors) { createInvocationAndEnableFree(Args, CIOpts); ASSERT_TRUE(Invocation); CompilerInstance Instance(std::move(Invocation)); + Instance.setVirtualFileSystem(CIOpts.VFS); Instance.setDiagnostics(Diags); SyntaxOnlyAction Action; ASSERT_TRUE(Instance.ExecuteAction(Action)); @@ -189,6 +192,7 @@ TEST_F(ModuleCacheTest, CachedModuleNewPathAllowErrors) { CompilerInstance Instance2(std::move(Invocation2), Instance.getPCHContainerOperations(), &Instance.getModuleCache()); + Instance2.setVirtualFileSystem(CIOpts.VFS); Instance2.setDiagnostics(Diags); SyntaxOnlyAction Action2; ASSERT_FALSE(Instance2.ExecuteAction(Action2)); diff --git a/clang/unittests/Serialization/NoCommentsTest.cpp b/clang/unittests/Serialization/NoCommentsTest.cpp index ed96c7c7959a0..01bb6999a7c90 100644 --- a/clang/unittests/Serialization/NoCommentsTest.cpp +++ b/clang/unittests/Serialization/NoCommentsTest.cpp @@ -99,6 +99,7 @@ void foo() {} ASSERT_TRUE(Invocation); CompilerInstance Instance(std::move(Invocation)); + Instance.createVirtualFileSystem(CIOpts.VFS); Instance.setDiagnostics(Diags); Instance.getFrontendOpts().OutputFile = CacheBMIPath; GenerateReducedModuleInterfaceAction Action; diff --git a/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp b/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp index f9d7736a77ee1..55ee72875ead2 100644 --- a/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp +++ b/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp @@ -113,12 +113,8 @@ export using ::E; auto Clang = std::make_unique(std::move(Invocation)); 
Clang->setDiagnostics(Diags); - - if (auto VFSWithRemapping = createVFSFromCompilerInvocation( - Clang->getInvocation(), Clang->getDiagnostics(), VFS)) - VFS = VFSWithRemapping; - - Clang->createFileManager(VFS); + Clang->createVirtualFileSystem(VFS); + Clang->createFileManager(); EXPECT_TRUE(Clang->createTarget()); Buffer.release(); diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp index 871c59f650c82..e544c892635e8 100644 --- a/clang/unittests/Support/TimeProfilerTest.cpp +++ b/clang/unittests/Support/TimeProfilerTest.cpp @@ -52,7 +52,6 @@ bool compileFromString(StringRef Code, StringRef Standard, StringRef File, FS->addFile(Header.getKey(), 0, MemoryBuffer::getMemBuffer(Header.getValue())); } - auto Files = llvm::makeIntrusiveRefCnt(FileSystemOptions(), FS); auto Invocation = std::make_shared(); std::vector Args = {Standard.data(), File.data()}; @@ -62,8 +61,9 @@ bool compileFromString(StringRef Code, StringRef Standard, StringRef File, CompilerInvocation::CreateFromArgs(*Invocation, Args, *InvocationDiags); CompilerInstance Compiler(std::move(Invocation)); - Compiler.createDiagnostics(Files->getVirtualFileSystem()); - Compiler.setFileManager(Files); + Compiler.setVirtualFileSystem(std::move(FS)); + Compiler.createDiagnostics(); + Compiler.createFileManager(); class TestFrontendAction : public ASTFrontendAction { private: diff --git a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp index b16dd8e6e2b8f..80289efd374cf 100644 --- a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp +++ b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp @@ -61,8 +61,7 @@ class TestDependencyScanningAction : public tooling::ToolAction { std::move(PCHContainerOps)); Compiler.setFileManager(FileMgr); - Compiler.createDiagnostics(FileMgr->getVirtualFileSystem(), DiagConsumer, - /*ShouldOwnClient=*/false); 
+ Compiler.createDiagnostics(DiagConsumer, /*ShouldOwnClient=*/false); if (!Compiler.hasDiagnostics()) return false; diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index a4e4de32ba53f..1342e1a6ffb5b 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -5169,7 +5169,7 @@ enum class SpellingKind : size_t { static const size_t NumSpellingKinds = (size_t)SpellingKind::NumSpellingKinds; class SpellingList { - std::vector Spellings[NumSpellingKinds]; + std::array, NumSpellingKinds> Spellings; public: ArrayRef operator[](SpellingKind K) const { @@ -5217,11 +5217,7 @@ class SpellingList { } bool hasSpelling() const { - for (size_t Kind = 0; Kind < NumSpellingKinds; ++Kind) { - if (Spellings[Kind].size() > 0) - return true; - } - return false; + return llvm::any_of(Spellings, [](const auto &L) { return !L.empty(); }); } }; diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index ad82d7ab0cec6..b7da22cf9fb22 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -1063,7 +1063,7 @@

C++ defect report implementation status

170 - DRWP + CD7 Pointer-to-member conversions Clang 3.1 @@ -1442,7 +1442,7 @@

C++ defect report implementation status

233 - DRWP + CD7 References vs pointers in UDC overload resolution Unknown @@ -2763,7 +2763,7 @@

C++ defect report implementation status

453 - DRWP + CD7 References may only bind to “valid” objects Unknown @@ -4739,7 +4739,7 @@

C++ defect report implementation status

787 CD2 Unnecessary lexical undefined behavior - Clang 21 + Clang 21 788 @@ -6045,7 +6045,7 @@

C++ defect report implementation status

1038 - DRWP + CD7 Overload resolution of &x.static_func Unknown @@ -7155,7 +7155,7 @@

C++ defect report implementation status

1223 - DRWP + CD7 Syntactic disambiguation and trailing-return-types Clang 17 @@ -7935,7 +7935,7 @@

C++ defect report implementation status

1353 - DRWP + CD7 Array and variant members and deleted special member functions Unknown @@ -8815,7 +8815,7 @@

C++ defect report implementation status

1499 - DRWP + CD7 Missing case for deleted move assignment operator Unknown @@ -9677,7 +9677,7 @@

C++ defect report implementation status

1642 - DRWP + CD7 Missing requirements for prvalue operands Unknown @@ -9845,7 +9845,7 @@

C++ defect report implementation status

1670 - drafting + review auto as conversion-type-id Not resolved @@ -10013,7 +10013,7 @@

C++ defect report implementation status

1698 - DRWP + CD7 Files ending in \ Unknown @@ -11123,7 +11123,7 @@

C++ defect report implementation status

1883 - drafting + review Protected access to constructors in mem-initializers Not resolved @@ -11547,13 +11547,13 @@

C++ defect report implementation status

1953 - DRWP + CD7 Data races and common initial sequence Unknown 1954 - DRWP + CD7 typeid null dereference check in subexpressions Unknown @@ -11617,11 +11617,11 @@

C++ defect report implementation status

opaque-enum-declaration in alias-declaration? Unknown - + 1965 - open + CD7 Explicit casts to reference types - Not resolved + Unknown 1966 @@ -11667,7 +11667,7 @@

C++ defect report implementation status

1973 - DRWP + CD7 Which parameter-declaration-clause in a lambda-expression? Unknown @@ -11811,7 +11811,7 @@

C++ defect report implementation status

1997 - DRWP + CD7 Placement new and previous initialization Unknown @@ -12123,7 +12123,7 @@

C++ defect report implementation status

2049 - DRWP + CD7 List initializer in non-type template default argument Clang 18 @@ -12153,7 +12153,7 @@

C++ defect report implementation status

2054 - DRWP + CD7 Missing description of class SFINAE Unknown @@ -12441,7 +12441,7 @@

C++ defect report implementation status

2102 - DRWP + CD7 Constructor checking in new-expression Unknown @@ -12693,7 +12693,7 @@

C++ defect report implementation status

2144 - DRWP + CD7 Function/variable declaration ambiguity Unknown @@ -12723,7 +12723,7 @@

C++ defect report implementation status

2149 - DRWP + CD7 Brace elision and array length deduction Clang 3.1 @@ -13197,7 +13197,7 @@

C++ defect report implementation status

2228 - open + review Ambiguity resolution for cast to function type Not resolved @@ -13341,7 +13341,7 @@

C++ defect report implementation status

2252 - DRWP + CD7 Enumeration list-initialization from the same type Unknown @@ -13527,7 +13527,7 @@

C++ defect report implementation status

2283 - DRWP + CD7 Missing complete type requirements Unknown @@ -14091,7 +14091,7 @@

C++ defect report implementation status

2376 CD5 Class template argument deduction with array declarator - Clang 21 + Clang 21 2377 @@ -14533,7 +14533,7 @@

C++ defect report implementation status

2450 - DRWP + CD7 braced-init-list as a template-argument Clang 18 @@ -14587,7 +14587,7 @@

C++ defect report implementation status

2459 - DRWP + CD7 Template parameter initialization Clang 18 @@ -14689,7 +14689,7 @@

C++ defect report implementation status

2476 - DRWP + CD7 placeholder-type-specifiers and function declarators Unknown @@ -14743,7 +14743,7 @@

C++ defect report implementation status

2485 - DRWP + CD7 Bit-fields in integral promotions Unknown @@ -14811,7 +14811,7 @@

C++ defect report implementation status

2496 CD6 ref-qualifiers and virtual overriding - Clang 21 + Clang 21 2497 @@ -14857,7 +14857,7 @@

C++ defect report implementation status

2504 - DRWP + CD7 Inheriting constructors from virtual base classes No @@ -14937,7 +14937,7 @@

C++ defect report implementation status

2517 C++23 Useless restriction on use of parameter in constraint-expression - Clang 21 + Clang 21 2518 @@ -14947,7 +14947,7 @@

C++ defect report implementation status

2519 - DRWP + CD7 Object representation of a bit-field Unknown @@ -15019,7 +15019,7 @@

C++ defect report implementation status

2531 - DRWP + CD7 Static data members redeclared as constexpr Unknown @@ -15031,7 +15031,7 @@

C++ defect report implementation status

2533 - DRWP + CD7 Storage duration of implicitly created objects Unknown @@ -15085,7 +15085,7 @@

C++ defect report implementation status

2542 - DRWP + CD7 Is a closure type a structural type? Unknown @@ -15109,13 +15109,13 @@

C++ defect report implementation status

2546 - DRWP + CD7 Defaulted secondary comparison operators defined as deleted Unknown 2547 - DRWP + CD7 Defaulted comparison operator function for non-classes Clang 20 @@ -15127,13 +15127,13 @@

C++ defect report implementation status

2549 - DR + CD7 Implicitly moving the operand of a throw-expression in unevaluated contexts Unknown 2550 - DRWP + CD7 Type "reference to cv void" outside of a declarator Unknown @@ -15145,7 +15145,7 @@

C++ defect report implementation status

2552 - DRWP + CD7 Constant evaluation of non-defining variable declarations Unknown @@ -15177,7 +15177,7 @@

C++ defect report implementation status

2556 - DRWP + CD7 Unusable promise::return_void Unknown @@ -15201,13 +15201,13 @@

C++ defect report implementation status

2560 - DRWP + CD7 Parameter type determination in a requirement-parameter-list Unknown 2561 - DRWP + CD7 Conversion to function pointer for lambda with explicit object parameter No @@ -15219,7 +15219,7 @@

C++ defect report implementation status

2563 - drafting + review Initialization of coroutine result object Not resolved @@ -15253,7 +15253,7 @@

C++ defect report implementation status

2568 - DRWP + CD7 Access checking during synthesis of defaulted comparison operator Unknown @@ -15265,7 +15265,7 @@

C++ defect report implementation status

2570 - DRWP + CD7 Clarify constexpr for defaulted functions Unknown @@ -15283,13 +15283,13 @@

C++ defect report implementation status

2573 - DRWP + CD7 Undefined behavior when splicing results in a universal-character-name Unknown 2574 - DRWP + CD7 Undefined behavior when lexing unmatched quotes Unknown @@ -15373,7 +15373,7 @@

C++ defect report implementation status

2588 - DRWP + CD7 friend declarations and module linkage Unknown @@ -15391,7 +15391,7 @@

C++ defect report implementation status

2591 - DRWP + CD7 Implicit change of active union member for anonymous union in union Unknown @@ -15415,7 +15415,7 @@

C++ defect report implementation status

2595 - DRWP + CD7 "More constrained" for eligible special member functions Unknown @@ -15445,7 +15445,7 @@

C++ defect report implementation status

2600 - DRWP + CD7 Type dependency of placeholder types Unknown @@ -15613,7 +15613,7 @@

C++ defect report implementation status

2628 - DRWP + CD7 Implicit deduction guides should propagate constraints Clang 20 @@ -15649,7 +15649,7 @@

C++ defect report implementation status

2634 - DRWP + CD7 Avoid circularity in specification of scope for friend class declarations Unknown @@ -15667,13 +15667,13 @@

C++ defect report implementation status

2637 - DRWP + CD7 Injected-class-name as a simple-template-id Unknown 2638 - DRWP + CD7 Improve the example for initializing by initializer list Unknown @@ -15787,7 +15787,7 @@

C++ defect report implementation status

2657 - DRWP + CD7 Cv-qualification adjustment when binding reference to temporary Unknown @@ -15811,7 +15811,7 @@

C++ defect report implementation status

2661 - DRWP + CD7 Missing disambiguation rule for pure-specifier vs. brace-or-equal-initializer Unknown @@ -15823,7 +15823,7 @@

C++ defect report implementation status

2663 - DRWP + CD7 Example for member redeclarations with using-declarations Unknown @@ -15853,7 +15853,7 @@

C++ defect report implementation status

2668 - DRWP + CD7 co_await in a lambda-expression Unknown @@ -15877,7 +15877,7 @@

C++ defect report implementation status

2672 - DRWP + CD7 Lambda body SFINAE is still required, contrary to intent and note Clang 18 @@ -15943,7 +15943,7 @@

C++ defect report implementation status

2683 - DRWP + CD7 Default arguments for member functions of templated nested classes Unknown @@ -15979,7 +15979,7 @@

C++ defect report implementation status

2689 - DRWP + CD7 Are cv-qualified std::nullptr_t fundamental types? Unknown @@ -16027,25 +16027,25 @@

C++ defect report implementation status

2697 - DRWP + CD7 Deduction guides using abbreviated function syntax Unknown 2698 - DRWP + CD7 Using extended integer types with z suffix Unknown 2699 - DRWP + CD7 Inconsistency of throw-expression specification Unknown 2700 - DRWP + CD7 #error disallows existing implementation practice Unknown @@ -16063,7 +16063,7 @@

C++ defect report implementation status

2703 - DR + CD7 Three-way comparison requiring strong ordering for floating-point types, take 2 Unknown @@ -16087,13 +16087,13 @@

C++ defect report implementation status

2707 - DRWP + CD7 Deduction guides cannot have a trailing requires-clause Clang 20 2708 - DRWP + CD7 Parenthesized initialization of arrays Unknown @@ -16105,97 +16105,97 @@

C++ defect report implementation status

2710 - DRWP + CD7 Loops in constant expressions Unknown 2711 - DRWP + CD7 Source for copy-initializing the exception object Unknown 2712 - DRWP + CD7 Simplify restrictions on built-in assignment operator candidates Unknown 2713 - DRWP + CD7 Initialization of reference-to-aggregate from designated initializer list Unknown 2714 - DRWP + CD7 Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor Unknown 2715 - DRWP + CD7 "calling function" for parameter initialization may not exist Unknown 2716 - DRWP + CD7 Rule about self-or-base conversion is normatively redundant Unknown 2717 - DRWP + CD7 Pack expansion for alignment-specifier Unknown 2718 - DRWP + CD7 Type completeness for derived-to-base conversions Clang 2.7 2719 - DRWP + CD7 Creating objects in misaligned storage Unknown 2720 - DRWP + CD7 Template validity rules for templated entities and alias templates Unknown 2721 - DRWP + CD7 When exactly is storage reused? Unknown 2722 - DRWP + CD7 Temporary materialization conversion for noexcept operator Unknown 2723 - DRWP + CD7 Range of representable values for floating-point types Unknown 2724 - DRWP + CD7 Clarify rounding for arithmetic right shift Unknown 2725 - DRWP + CD7 Overload resolution for non-call of class member access Unknown @@ -16213,13 +16213,13 @@

C++ defect report implementation status

2728 - DRWP + CD7 Evaluation of conversions in a delete-expression Unknown 2729 - DRWP + CD7 Meaning of new-type-id Unknown @@ -16237,13 +16237,13 @@

C++ defect report implementation status

2732 - DRWP + CD7 Can importable headers react to preprocessor state from point of import? Unknown 2733 - DRWP + CD7 Applying [[maybe_unused]] to a label Unknown @@ -16315,37 +16315,37 @@

C++ defect report implementation status

2745 - DRWP + CD7 Dependent odr-use in generic lambdas Unknown 2746 - DRWP + CD7 Checking of default template arguments Unknown 2747 - DRWP + CD7 Cannot depend on an already-deleted splice Unknown 2748 - DRWP + CD7 Accessing static data members via null pointer Unknown 2749 - DRWP + CD7 Treatment of "pointer to void" for relational comparisons Clang 20 2750 - DRWP + CD7 construct_at without constructor call Unknown @@ -16363,19 +16363,19 @@

C++ defect report implementation status

2753 - DRWP + CD7 Storage reuse for string literal objects and backing arrays Unknown 2754 - DRWP + CD7 Using *this in explicit object member functions that are coroutines Unknown 2755 - DRWP + CD7 Incorrect wording applied by P2738R1 Unknown @@ -16393,43 +16393,43 @@

C++ defect report implementation status

2758 - DRWP + CD7 What is "access and ambiguity control"? Unknown 2759 - DRWP + CD7 [[no_unique_address] and common initial sequence Clang 19 2760 - DRWP + CD7 Defaulted constructor that is an immediate function Unknown 2761 - DRWP + CD7 Implicitly invoking the deleted destructor of an anonymous union member Unknown 2762 - DRWP + CD7 Type of implicit object parameter Unknown 2763 - DRWP + CD7 Ignorability of [[noreturn]] during constant evaluation Unknown 2764 - DRWP + CD7 Use of placeholders affecting name mangling Unknown @@ -16454,7 +16454,7 @@

C++ defect report implementation status

2768 - DRWP + CD7 Assignment to enumeration variable with a braced-init-list Unknown @@ -16476,13 +16476,13 @@

C++ defect report implementation status

2771 - DRWP + CD7 Transformation for unqualified-ids in address operator Clang 18 2772 - DRWP + CD7 Missing Annex C entry for linkage effects of linkage-specification Unknown @@ -16500,7 +16500,7 @@

C++ defect report implementation status

2775 - DRWP + CD7 Unclear argument type for copy of exception object Unknown @@ -16512,7 +16512,7 @@

C++ defect report implementation status

2777 - DRWP + CD7 Type of id-expression denoting a template parameter object Unknown @@ -16530,7 +16530,7 @@

C++ defect report implementation status

2780 - DRWP + CD7 reinterpret_cast to reference to function types Unknown @@ -16548,7 +16548,7 @@

C++ defect report implementation status

2783 - DRWP + CD7 Handling of deduction guides in global-module-fragment Unknown @@ -16560,7 +16560,7 @@

C++ defect report implementation status

2785 - DRWP + CD7 Type-dependence of requires-expression Unknown @@ -16584,7 +16584,7 @@

C++ defect report implementation status

2789 - DRWP + CD7 Overload resolution with implicit and explicit object member functions Clang 18 @@ -16596,19 +16596,19 @@

C++ defect report implementation status

2791 - DRWP + CD7 Unclear phrasing about "returning to the caller" Unknown 2792 - DRWP + CD7 Clean up specification of noexcept operator Unknown 2793 - DRWP + CD7 Block-scope declaration conflicting with parameter name Unknown @@ -16620,13 +16620,13 @@

C++ defect report implementation status

2795 - DRWP + CD7 Overlapping empty subobjects with different cv-qualification Unknown 2796 - DRWP + CD7 Function pointer conversions for relational operators Unknown @@ -16638,7 +16638,7 @@

C++ defect report implementation status

2798 - DRWP + CD7 Manifestly constant evaluation of the static_assert message Clang 17 @@ -16656,7 +16656,7 @@

C++ defect report implementation status

2801 - DRWP + CD7 Reference binding with reference-related types Unknown @@ -16668,7 +16668,7 @@

C++ defect report implementation status

2803 - DRWP + CD7 Overload resolution for reference binding of similar types Unknown @@ -16686,13 +16686,13 @@

C++ defect report implementation status

2806 - DRWP + CD7 Make a type-requirement a type-only context Unknown 2807 - DRWP + CD7 Destructors declared consteval Unknown @@ -16704,19 +16704,19 @@

C++ defect report implementation status

2809 - DRWP + CD7 An implicit definition does not redeclare a function Unknown 2810 - DRWP + CD7 Requiring the absence of diagnostics for templates Unknown 2811 - DRWP + CD7 Clarify "use" of main Clang 3.5 @@ -16728,7 +16728,7 @@

C++ defect report implementation status

2813 - DRWP + CD7 Class member access with prvalues Clang 20 @@ -16740,7 +16740,7 @@

C++ defect report implementation status

2815 - DRWP + CD7 Overload resolution for references/pointers to noexcept functions Unknown @@ -16758,19 +16758,19 @@

C++ defect report implementation status

2818 - DRWP + CD7 Use of predefined reserved identifiers Unknown 2819 - WP + CD7 Cast from null pointer value in a constant expression Clang 19 (C++26 onwards) 2820 - DRWP + CD7 Value-initialization and default constructors Unknown @@ -16782,25 +16782,25 @@

C++ defect report implementation status

2822 - DRWP + CD7 Side-effect-free pointer zap Unknown 2823 - DRWP + CD7 Implicit undefined behavior when dereferencing pointers Unknown 2824 - DRWP + CD7 Copy-initialization of arrays Unknown 2825 - DRWP + CD7 Range-based for statement using a braced-init-list Unknown @@ -16818,7 +16818,7 @@

C++ defect report implementation status

2828 - DRWP + CD7 Ambiguous interpretation of C-style cast Unknown @@ -16830,13 +16830,13 @@

C++ defect report implementation status

2830 - DRWP + CD7 Top-level cv-qualification should be ignored for list-initialization Unknown 2831 - DRWP + CD7 Non-templated function definitions and requires-clauses Unknown @@ -16866,7 +16866,7 @@

C++ defect report implementation status

2836 - DRWP + CD7 Conversion rank of long double and extended floating-point types Unknown @@ -16906,11 +16906,11 @@

C++ defect report implementation status

Preferring an initializer_list over a single value Not resolved - + 2843 - drafting + CD7 Undated reference to Unicode makes C++ a moving target - Not resolved + Unknown 2844 @@ -16920,13 +16920,13 @@

C++ defect report implementation status

2845 - DRWP + CD7 Make the closure type of a captureless lambda a structural type Unknown 2846 - DRWP + CD7 Out-of-class definitions of explicit object member functions Unknown @@ -16942,25 +16942,25 @@

C++ defect report implementation status

2848 - DRWP + CD7 Omitting an empty template argument list for explicit instantiation Unknown 2849 - DRWP + CD7 Parameter objects are not temporary objects Unknown 2850 - DRWP + CD7 Unclear storage duration for function parameter objects Unknown 2851 - DRWP + CD7 Allow floating-point conversions in converted constant expressions Unknown @@ -16972,43 +16972,43 @@

C++ defect report implementation status

2853 - DRWP + CD7 Pointer arithmetic with pointer to hypothetical element Unknown 2854 - DRWP + CD7 Storage duration of exception objects Unknown 2855 - DRWP + CD7 Undefined behavior in postfix increment Unknown 2856 - DRWP + CD7 Copy-list-initialization with explicit default constructors Unknown 2857 - DRWP + CD7 Argument-dependent lookup with incomplete class types No 2858 - WP + CD7 Declarative nested-name-specifiers and pack-index-specifiers Clang 19 2859 - DRWP + CD7 Value-initialization with multiple default constructors Unknown @@ -17020,7 +17020,7 @@

C++ defect report implementation status

2861 - DRWP + CD7 dynamic_cast on bad pointer value Unknown @@ -17038,13 +17038,13 @@

C++ defect report implementation status

2864 - DRWP + CD7 Narrowing floating-point conversions Unknown 2865 - DRWP + CD7 Regression on result of conditional operator Unknown @@ -17056,7 +17056,7 @@

C++ defect report implementation status

2867 - DRWP + CD7 Order of initialization for structured bindings Unknown @@ -17068,25 +17068,25 @@

C++ defect report implementation status

2869 - DRWP + CD7 this in local classes Unknown 2870 - DRWP + CD7 Combining absent encoding-prefixes Unknown 2871 - DRWP + CD7 User-declared constructor templates inhibiting default constructors Unknown 2872 - DRWP + CD7 Linkage and unclear "can be referred to" Unknown @@ -17098,7 +17098,7 @@

C++ defect report implementation status

2874 - DRWP + CD7 Qualified declarations of partial specializations Unknown @@ -17110,13 +17110,13 @@

C++ defect report implementation status

2876 - WP + CD7 Disambiguation of T x = delete("text") Unknown 2877 - DRWP + CD7 Type-only lookup for using-enum-declarator Clang 19 @@ -17128,31 +17128,31 @@

C++ defect report implementation status

2879 - DRWP + CD7 Undesired outcomes with const_cast Unknown 2880 - WP + CD7 Accessibility check for destructor of incomplete class type Unknown 2881 - DRWP + CD7 Type restrictions for the explicit object parameter of a lambda Clang 19 2882 - DRWP + CD7 Unclear treatment of conversion to void Clang 2.7 2883 - DRWP + CD7 Definition of "odr-usable" ignores lambda scopes No @@ -17174,13 +17174,13 @@

C++ defect report implementation status

2886 - DRWP + CD7 Temporaries and trivial potentially-throwing special member functions Clang 9 2887 - DRWP + CD7 Missing compatibility entries for xvalues Unknown @@ -17198,19 +17198,19 @@

C++ defect report implementation status

2890 - DRWP + CD7 Defining members of local classes Unknown 2891 - DRWP + CD7 Normative status of implementation limits Unknown 2892 - DRWP + CD7 Unclear usual arithmetic conversions Unknown @@ -17222,13 +17222,13 @@

C++ defect report implementation status

2894 - DRWP + CD7 Functional casts create prvalues of reference type Unknown 2895 - DRWP + CD7 Initialization should ignore the destination type's cv-qualification Unknown @@ -17244,15 +17244,15 @@

C++ defect report implementation status

Copying potentially-overlapping union subobjects Not resolved - + 2898 - tentatively ready + CD7 Clarify implicit conversion sequence from cv T to T - Not resolved + Unknown 2899 - DRWP + CD7 Bad value representations should cause undefined behavior Unknown @@ -17264,7 +17264,7 @@

C++ defect report implementation status

2901 - DRWP + CD7 Unclear semantics for near-match aliased access Unknown @@ -17288,43 +17288,43 @@

C++ defect report implementation status

2905 - DRWP + CD7 Value-dependence of noexcept-expression Unknown 2906 - DRWP + CD7 Lvalue-to-rvalue conversion of class types for conditional operator Unknown 2907 - DRWP + CD7 Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t Unknown 2908 - DRWP + CD7 Counting physical source lines for __LINE__ Unknown 2909 - DRWP + CD7 Subtle difference between constant-initialized and constexpr Unknown 2910 - DRWP + CD7 Effect of requirement-parameter-lists on odr-usability Unknown 2911 - DRWP + CD7 Unclear meaning of expressions "appearing within" subexpressions Unknown @@ -17336,7 +17336,7 @@

C++ defect report implementation status

2913 - DRWP + CD7 Grammar for deduction-guide has requires-clause in the wrong position Clang 20 @@ -17348,7 +17348,7 @@

C++ defect report implementation status

2915 - DRWP + CD7 Explicit object parameters of type void Clang 20 @@ -17370,13 +17370,13 @@

C++ defect report implementation status

2918 - DRWP + CD7 Consideration of constraints for address of overloaded function - Clang 21 + Clang 21 2919 - DRWP + CD7 Conversion function candidates for initialization of const lvalue reference Unknown @@ -17388,13 +17388,13 @@

C++ defect report implementation status

2921 - DRWP + CD7 Exporting redeclarations of entities not attached to a named module Unknown 2922 - DRWP + CD7 constexpr placement-new is too permissive Clang 20 @@ -17406,7 +17406,7 @@

C++ defect report implementation status

2924 - DRWP + CD7 Undefined behavior during constant evaluation Unknown @@ -17424,7 +17424,7 @@

C++ defect report implementation status

2927 - DRWP + CD7 Unclear status of translation unit with module keyword Unknown @@ -17442,13 +17442,13 @@

C++ defect report implementation status

2930 - DRWP + CD7 Unclear term "copy/move operation" in specification of copy elision Unknown 2931 - DRWP + CD7 Restrictions on operator functions that are explicit object member functions Unknown @@ -17460,7 +17460,7 @@

C++ defect report implementation status

2933 - DRWP + CD7 Dangling references Unknown @@ -17478,13 +17478,13 @@

C++ defect report implementation status

2936 - DRWP + CD7 Local classes of templated functions should be part of the current instantiation Unknown 2937 - DRWP + CD7 Grammar for preprocessing-file has no normative effect Unknown @@ -17496,7 +17496,7 @@

C++ defect report implementation status

2939 - DRWP + CD7 Do not allow reinterpret_cast from prvalue to rvalue reference Unknown @@ -17520,13 +17520,13 @@

C++ defect report implementation status

2943 - DR + CD7 Discarding a void return value Unknown 2944 - DRWP + CD7 Unsequenced throw-expressions Unknown @@ -17682,7 +17682,7 @@

C++ defect report implementation status

2970 - DR + CD7 Races with volatile sig_atomic_t bit-fields Unknown @@ -17752,11 +17752,11 @@

C++ defect report implementation status

Usual arithmetic conversions and result types Not resolved - + 2982 - tentatively ready + CD7 Deduction in type-constraints - Not resolved + Unknown 2983 @@ -17770,11 +17770,11 @@

C++ defect report implementation status

Value-dependent structured bindings Not resolved - + 2985 - tentatively ready + CD7 Unclear rules for reference initialization with conversion - Not resolved + Unknown 2986 @@ -17782,11 +17782,11 @@

C++ defect report implementation status

Creating objects within a mutable member of a const object Not resolved - + 2987 - tentatively ready + CD7 Remove dilapidated wording from static_cast - Not resolved + Unknown 2988 @@ -17802,7 +17802,7 @@

C++ defect report implementation status

2990 - DR + CD7 Exporting redeclarations of namespaces Unknown @@ -17862,47 +17862,47 @@

C++ defect report implementation status

3000 - open + review Handling of cv-qualified class types in conditional operator Not resolved 3001 - open + review Inconsistent restrictions for static_cast on pointers to out-of-lifetime objects Not resolved 3002 - open + tentatively ready Template parameter/argument confusion Not resolved 3003 - open + review Naming a deducible template for class template argument deduction Not resolved 3004 - open + tentatively ready Pointer arithmetic on array of unknown bound Not resolved 3005 - open + tentatively ready Function parameters should never be name-independent
Not resolved - Clang 21 implements 2025-03-10 resolution + Clang 21 implements 2025-09-12 resolution
3006 - open + review Vague restrictions for explicit instantiations of class templates Not resolved @@ -17914,7 +17914,7 @@

C++ defect report implementation status

3008 - open + tentatively ready Missing Annex C entry for void object declarations Not resolved @@ -17942,29 +17942,29 @@

C++ defect report implementation status

Deviating constexpr or consteval across translation units Not resolved - + 3013 - open + CD7 Disallowing macros for #embed parameters - Not resolved + Unknown - + 3014 - open + CD7 Comma-delimited vs. comma-separated output for #embed - Not resolved + Unknown - + 3015 - open + CD7 Handling of header-names for #include and #embed - Not resolved + Unknown - + 3016 - open + CD7 Satisfying the syntactic requirements of #include and #embed - Not resolved + Unknown 3017 @@ -17972,11 +17972,11 @@

C++ defect report implementation status

Commas in controlling expression of conditional inclusion Not resolved - + 3018 - open + CD7 Validity of defined in __has_embed - Not resolved + Unknown 3019 @@ -17984,17 +17984,287 @@

C++ defect report implementation status

Restrictions on character sequences in header-names Not resolved - + 3020 - open + CD7 Missing specification for __has_cpp_attribute(indeterminate) - Not resolved + Unknown 3021 open Subsumption rules for fold expanded constraints Not resolved + + + 3022 + review + Redundant specification of explicit destructor calls + Not resolved + + + 3023 + open + Default arguments in list-initialization + Not resolved + + + 3024 + open + Alignment of references + Not resolved + + + 3025 + open + Deallocation functions returning void + Not resolved + + + 3026 + open + Class for pointer-to-member formation + Not resolved + + + 3027 + open + Equivalence of pack-index-specifiers + Not resolved + + + 3028 + open + A using-declarator should bind a name + Not resolved + + + 3029 + drafting + Confusing note about ordinary character types for aligned memory areas + Not resolved + + + 3030 + open + Initializing array prvalues of unknown bound + Not resolved + + + 3031 + open + Finding declarations for conversion operators for access checking + Not resolved + + + 3032 + open + Template argument disambiguation + Not resolved + + + 3033 + open + Scope after declarator-id before determining correspondence + Not resolved + + + 3034 + open + Infinite recursion should hit an implementation limit + Not resolved + + + 3035 + open + Lambda expressions in anonymous unions + Not resolved + + + 3036 + open + Extended floating-point types should not be cv-qualified + Not resolved + + + 3037 + open + Name lookup results for using-declarators + Not resolved + + + 3038 + open + Ignorability of attributes, again + Not resolved + + + 3039 + open + Undefined behavior from implicit object creation ignores observable checkpoints + Not resolved + + + 3040 + open + Mishandling of lambda coroutines + Not resolved + + + 3041 + open + Overly aggressive rule for deleting the destructor of a union + Not resolved + + + 3042 + open + Implicit object creation is insufficient to model effective type rule of C + Not 
resolved + + + 3043 + open + Lifetime extension for temporaries in expansion statements + Not resolved + + + 3044 + tentatively ready + Iterating expansion statements woes + Not resolved + + + 3045 + tentatively ready + Regularizing environment interactions of expansion statement + Not resolved + + + 3046 + open + Enumerations as part of the common initial sequence + Not resolved + + + 3047 + open + Calling destructors on out-of-lifetime objects + Not resolved + + + 3048 + tentatively ready + Empty destructuring expansion statements + Not resolved + + + 3049 + open + Implicitly deleted move operation should not disable trivial relocation + Not resolved + + + 3050 + open + [[deprecated]] for class template partial specializations + Not resolved + + + 3051 + open + Missing specification for types of member subobjects + Not resolved + + + 3052 + open + Unclear handling of checks on discarded return statements + Not resolved + + + 3053 + open + Allowing #undef likely + Not resolved + + + 3054 + open + Use of default arguments depending on shape of postfix-expression in a function call + Not resolved + + + 3055 + open + Misleading body for surrogate call function + Not resolved + + + 3056 + open + Missing semicolons in grammar for type-requirement + Not resolved + + + 3057 + open + Ranking of derived-to-base conversions should ignore reference binding + Not resolved + + + 3058 + open + "Program point" is not defined + Not resolved + + + 3059 + open + throw; in constant expressions + Not resolved + + + 3060 + open + Change in behavior for noexcept main + Not resolved + + + 3061 + tentatively ready + Trailing comma in an expansion-init-list + Not resolved + + + 3062 + open + Overlapping specification of default template arguments + Not resolved + + + 3063 + open + Lifetime extension of temporaries past function return + Not resolved + + + 3064 + open + Mishandling of placement-new in lifetime rules + Not resolved + + + 3065 + open + Reachability and completeness of types 
+ Not resolved + + + 3066 + tentatively ready + Declarative nested-name-specifier in explicit instantiation + Not resolved diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index bb7144b827c3c..25940cc2899c1 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -280,12 +280,7 @@

C++2c implementation status

Trivial Relocatability P2786R13 - -
- Clang 21 (Partial) - The feature test macro (__cpp_trivial_relocatability) has not yet been set. -
- + Clang 21
#embed
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status index 31844c31ede2e..485a9a56267ca 100755 --- a/clang/www/make_cxx_dr_status +++ b/clang/www/make_cxx_dr_status @@ -1,7 +1,7 @@ #! /usr/bin/env python3 import sys, os, re, urllib.request -latest_release = 20 +latest_release = 21 clang_www_dir = os.path.dirname(__file__) default_issue_list_path = os.path.join(clang_www_dir, 'cwg_index.html') diff --git a/compiler-rt/cmake/caches/hexagon-builtins-baremetal.cmake b/compiler-rt/cmake/caches/hexagon-builtins-baremetal.cmake new file mode 100644 index 0000000000000..632e9ea5758a0 --- /dev/null +++ b/compiler-rt/cmake/caches/hexagon-builtins-baremetal.cmake @@ -0,0 +1,26 @@ +set(CMAKE_ASM_FLAGS "-G0 -mlong-calls -fno-pic" CACHE STRING "") +set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") +set(LLVM_TARGET_TRIPLE hexagon-unknown-none-elf CACHE STRING "") +set(COMPILER_RT_DEFAULT_TARGET_TRIPLE hexagon-unknown-none-elf CACHE STRING "") +set(COMPILER_RT_BUILD_BUILTINS ON CACHE BOOL "") +set(COMPILER_RT_BUILD_SANITIZERS OFF CACHE BOOL "") +set(COMPILER_RT_BUILD_XRAY OFF CACHE BOOL "") +set(COMPILER_RT_BUILD_LIBFUZZER OFF CACHE BOOL "") +set(COMPILER_RT_BUILD_PROFILE OFF CACHE BOOL "") +set(COMPILER_RT_BUILD_MEMPROF OFF CACHE BOOL "") +set(COMPILER_RT_BUILD_ORC OFF CACHE BOOL "") +set(COMPILER_RT_BUILD_GWP_ASAN OFF CACHE BOOL "") +set(COMPILER_RT_BUILTINS_ENABLE_PIC OFF CACHE BOOL "") +set(COMPILER_RT_SUPPORTED_ARCH hexagon CACHE STRING "") +# without this, build tries to use pthread which is not supported by hexagon-unknown-none-elf +set(COMPILER_RT_BAREMETAL_BUILD ON CACHE BOOL "" FORCE) + +set(CMAKE_C_FLAGS "-ffreestanding" CACHE STRING "") +set(CMAKE_CXX_FLAGS "-ffreestanding" CACHE STRING "") +set(CMAKE_CROSSCOMPILING ON CACHE BOOL "") +set(CAN_TARGET_hexagon 1 CACHE STRING "") +set(CMAKE_C_COMPILER_FORCED ON CACHE BOOL "") +set(CMAKE_CXX_COMPILER_FORCED ON CACHE BOOL "") + +set(CMAKE_C_COMPILER_TARGET hexagon-unknown-none-elf CACHE STRING "") 
+set(CMAKE_CXX_COMPILER_TARGET hexagon-unknown-none-elf CACHE STRING "") diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 1dadb6a810efb..0d7fc65cfd3e9 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -674,6 +674,11 @@ if (MINGW) ) endif() +# Don't build enable_execute_stack on arm64 darwin. +if (APPLE) + list(REMOVE_ITEM aarch64_SOURCES enable_execute_stack.c) +endif() + set(amdgcn_SOURCES ${GENERIC_SOURCES}) set(armv4t_SOURCES ${arm_min_SOURCES}) diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S index d5510ac0cfa50..1713a5969459a 100644 --- a/compiler-rt/lib/builtins/aarch64/sme-abi.S +++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -280,17 +280,17 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save) mov w16, #1 str x16, [x0] - add x18, x0, #32 + add x16, x0, #32 tbz x17, #FEAT_SME2_BIT, 1f // Store ZT0 - str zt0, [x18] - add x18, x18, #64 + str zt0, [x16] + add x16, x16, #64 1: - // Set up lazy-save (x18 = pointer to buffer) + // Set up lazy-save (x16 = pointer to buffer) rdsvl x17, #1 - str x18, [x0, #16]! + str x16, [x0, #16]! 
strh w17, [x0, #8] strh wzr, [x0, #10] str wzr, [x0, #12] diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index 79705ca0886f8..a40675c071ffc 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -1094,6 +1094,12 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, if (HasExtLeaf8 && ((EBX >> 9) & 1)) setFeature(FEATURE_WBNOINVD); + bool HasExtLeaf21 = MaxExtLevel >= 0x80000021 && + !getX86CpuIDAndInfo(0x80000021, &EAX, &EBX, &ECX, &EDX); + // AMD cpuid bit for prefetchi is different from Intel + if (HasExtLeaf21 && ((EAX >> 20) & 1)) + setFeature(FEATURE_PREFETCHI); + bool HasLeaf14 = MaxLevel >= 0x14 && !getX86CpuIDAndInfoEx(0x14, 0x0, &EAX, &EBX, &ECX, &EDX); if (HasLeaf14 && ((EBX >> 4) & 1)) diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index d09a9a70fd83b..524422a9f9388 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -1128,7 +1128,8 @@ static bool CheckMemoryRangeAvailability(uptr beg, uptr size, bool verbose) { uptr end = beg + size - 1; if (!MemoryRangeIsAvailable(beg, end)) { if (verbose) - Printf("FATAL: Memory range %p - %p is not available.\n", beg, end); + Printf("FATAL: Memory range %p - %p is not available.\n", (void*)beg, + (void*)end); return false; } } @@ -1150,8 +1151,8 @@ static bool ProtectMemoryRange(uptr beg, uptr size, const char *name) { } if ((uptr)addr != beg) { uptr end = beg + size - 1; - Printf("FATAL: Cannot protect memory range %p - %p (%s).\n", beg, end, - name); + Printf("FATAL: Cannot protect memory range %p - %p (%s).\n", (void*)beg, + (void*)end, name); return false; } } @@ -1172,7 +1173,7 @@ static bool InitShadow(bool init_origins, bool dry_run) { if (!MEM_IS_APP(&__dfsan::dfsan_init)) { if (!dry_run) Printf("FATAL: Code %p is out of application range. 
Non-PIE build?\n", - (uptr)&__dfsan::dfsan_init); + (void*)&__dfsan::dfsan_init); return false; } diff --git a/compiler-rt/lib/fuzzer/FuzzerCorpus.h b/compiler-rt/lib/fuzzer/FuzzerCorpus.h index 48b5a2cff02e2..12a75fea24fba 100644 --- a/compiler-rt/lib/fuzzer/FuzzerCorpus.h +++ b/compiler-rt/lib/fuzzer/FuzzerCorpus.h @@ -336,7 +336,8 @@ class InputCorpus { void PrintFeatureSet() { for (size_t i = 0; i < kFeatureSetSize; i++) { if(size_t Sz = GetFeature(i)) - Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); + Printf("[%zd: id %zd sz%zd] ", i, (size_t)SmallestElementPerFeature[i], + Sz); } Printf("\n\t"); for (size_t i = 0; i < Inputs.size(); i++) diff --git a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp index af9c260537e2f..6b25aa9942d2e 100644 --- a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp @@ -163,13 +163,13 @@ static bool ParseOneFlag(const char *Param) { auto Val = MyStol(Str); *FlagDescriptions[F].IntFlag = static_cast(Val); if (Flags.verbosity >= 2) - Printf("Flag: %s %d\n", Name, Val); + Printf("Flag: %s %d\n", Name, (int)Val); return true; } else if (FlagDescriptions[F].UIntFlag) { auto Val = std::stoul(Str); *FlagDescriptions[F].UIntFlag = static_cast(Val); if (Flags.verbosity >= 2) - Printf("Flag: %s %u\n", Name, Val); + Printf("Flag: %s %u\n", Name, (uint32_t)Val); return true; } else if (FlagDescriptions[F].StrFlag) { *FlagDescriptions[F].StrFlag = Str; diff --git a/compiler-rt/lib/fuzzer/FuzzerLoop.cpp b/compiler-rt/lib/fuzzer/FuzzerLoop.cpp index 6f415dd5763ac..75c2fb71eb070 100644 --- a/compiler-rt/lib/fuzzer/FuzzerLoop.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerLoop.cpp @@ -125,8 +125,8 @@ void FreeHook(const volatile void *ptr) { void Fuzzer::HandleMalloc(size_t Size) { if (!Options.MallocLimitMb || (Size >> 20) < (size_t)Options.MallocLimitMb) return; - Printf("==%d== ERROR: libFuzzer: out-of-memory (malloc(%zd))\n", GetPid(), - Size); + Printf("==%d== 
ERROR: libFuzzer: out-of-memory (malloc(%zd))\n", + (int)GetPid(), Size); Printf(" To change the out-of-memory limit use -rss_limit_mb=\n\n"); PrintStackTrace(); DumpCurrentUnit("oom-"); @@ -568,7 +568,7 @@ size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const { void Fuzzer::CrashOnOverwrittenData() { Printf("==%d== ERROR: libFuzzer: fuzz target overwrites its const input\n", - GetPid()); + (int)GetPid()); PrintStackTrace(); Printf("SUMMARY: libFuzzer: overwrites-const-input\n"); DumpCurrentUnit("crash-"); diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 6eafcf9163afa..871aa74f99c4e 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -738,9 +738,9 @@ void BaseReport::PrintHeapOrGlobalCandidate() const { Printf("%s", d.Default()); Printf("%s", d.Location()); Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n", - untagged_addr, offset, whence, + (void*)untagged_addr, offset, whence, candidate.heap.end - candidate.heap.begin, - (void *)candidate.heap.begin, (void *)candidate.heap.end); + (void*)candidate.heap.begin, (void*)candidate.heap.end); Printf("%s", d.Allocation()); Printf("allocated by thread T%u here:\n", candidate.heap.thread_id); Printf("%s", d.Default()); @@ -775,14 +775,14 @@ void BaseReport::PrintHeapOrGlobalCandidate() const { Printf( "%p is located %s a global variable in " "\n #0 0x%x (%s+0x%x)\n", - (void *)untagged_addr, candidate.after ? "after" : "before", - (void *)candidate.untagged_addr, module_name, (u32)module_address); + (void*)untagged_addr, candidate.after ? "after" : "before", + (u32)candidate.untagged_addr, module_name, (u32)module_address); else Printf( "%p is located %s a %zd-byte global variable in " "\n #0 0x%x (%s+0x%x)\n", - (void *)untagged_addr, candidate.after ? 
"after" : "before", size, - (void *)candidate.untagged_addr, module_name, (u32)module_address); + (void*)untagged_addr, candidate.after ? "after" : "before", size, + (u32)candidate.untagged_addr, module_name, (u32)module_address); } Printf("%s", d.Default()); } @@ -843,9 +843,9 @@ void BaseReport::PrintAddressDescription() const { Printf("\nCause: use-after-free\n"); Printf("%s", d.Location()); Printf("%p is located %zd bytes inside a %zd-byte region [%p,%p)\n", - (void *)untagged_addr, untagged_addr - UntagAddr(har.tagged_addr), - (ssize)har.requested_size, UntagAddr(har.tagged_addr), - (void *)(UntagAddr(har.tagged_addr) + har.requested_size)); + (void*)untagged_addr, untagged_addr - UntagAddr(har.tagged_addr), + (ssize)har.requested_size, (void*)UntagAddr(har.tagged_addr), + (void*)(UntagAddr(har.tagged_addr) + har.requested_size)); Printf("%s", d.Allocation()); Printf("freed by thread T%u here:\n", ha.free_thread_id); Printf("%s", d.Default()); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index d4811ff4ed217..18ec1195e8eb0 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -22,6 +22,11 @@ # endif # include +// Start searching for available memory region past PAGEZERO, which is +// 4KB on 32-bit and 4GB on 64-bit. +# define GAP_SEARCH_START_ADDRESS \ + ((SANITIZER_WORDSIZE == 32) ? 0x000000001000 : 0x000100000000) + # include "sanitizer_common.h" # include "sanitizer_file.h" # include "sanitizer_flags.h" @@ -58,9 +63,11 @@ extern char ***_NSGetArgv(void); # include // for dladdr() # include # include +# include # include # include # include +# include # include # include # include @@ -1106,6 +1113,67 @@ static void StripEnv() { } #endif // SANITIZER_GO +// Prints out a consolidated memory map: contiguous regions +// are merged together. 
+static void PrintVmmap() { + const mach_vm_address_t max_vm_address = GetMaxVirtualAddress() + 1; + mach_vm_address_t address = GAP_SEARCH_START_ADDRESS; + kern_return_t kr = KERN_SUCCESS; + + Report("Memory map:\n"); + mach_vm_address_t last = 0; + mach_vm_address_t lastsz = 0; + + while (1) { + mach_vm_size_t vmsize = 0; + natural_t depth = 0; + vm_region_submap_short_info_data_64_t vminfo; + mach_msg_type_number_t count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), &address, &vmsize, &depth, + (vm_region_info_t)&vminfo, &count); + + if (kr == KERN_DENIED) { + Report( + "ERROR: mach_vm_region_recurse got KERN_DENIED when printing memory " + "map.\n"); + Report( + "HINT: Check whether mach_vm_region_recurse is allowed by " + "sandbox.\n"); + } + + if (kr == KERN_SUCCESS && address < max_vm_address) { + if (last + lastsz == address) { + // This region is contiguous with the last; merge together. + lastsz += vmsize; + } else { + if (lastsz) + Printf("|| `[%p, %p]` || size=0x%016" PRIx64 " ||\n", last, + last + lastsz, lastsz); + + last = address; + lastsz = vmsize; + } + address += vmsize; + } else { + // We've reached the end of the memory map. Print the last remaining + // region, if there is one. + if (lastsz) + Printf("|| `[%p, %p]` || size=0x%016" PRIx64 " ||\n", last, + last + lastsz, lastsz); + + break; + } + } +} + +static void ReportShadowAllocFail(uptr shadow_size_bytes, uptr alignment) { + Report( + "FATAL: Failed to allocate shadow memory. 
Tried to allocate %p bytes " + "(alignment=%p).\n", + shadow_size_bytes, alignment); + PrintVmmap(); +} + char **GetArgv() { return *_NSGetArgv(); } @@ -1213,10 +1281,11 @@ uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale, if (new_max_vm < max_occupied_addr) { Report("Unable to find a memory range for dynamic shadow.\n"); Report( - "space_size = %p, largest_gap_found = %p, max_occupied_addr = %p, " - "new_max_vm = %p\n", - (void *)space_size, (void *)largest_gap_found, - (void *)max_occupied_addr, (void *)new_max_vm); + "\tspace_size = %p\n\tlargest_gap_found = %p\n\tmax_occupied_addr " + "= %p\n\tnew_max_vm = %p\n", + (void*)space_size, (void*)largest_gap_found, (void*)max_occupied_addr, + (void*)new_max_vm); + ReportShadowAllocFail(shadow_size_bytes, alignment); CHECK(0 && "cannot place shadow"); } RestrictMemoryToMaxAddress(new_max_vm); @@ -1227,6 +1296,7 @@ uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale, nullptr, nullptr); if (shadow_start == 0) { Report("Unable to find a memory range after restricting VM.\n"); + ReportShadowAllocFail(shadow_size_bytes, alignment); CHECK(0 && "cannot place shadow after restricting vm"); } } @@ -1242,40 +1312,51 @@ uptr MapDynamicShadowAndAliases(uptr shadow_size, uptr alias_size, } uptr FindAvailableMemoryRange(uptr size, uptr alignment, uptr left_padding, - uptr *largest_gap_found, - uptr *max_occupied_addr) { - typedef vm_region_submap_short_info_data_64_t RegionInfo; - enum { kRegionInfoSize = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64 }; - // Start searching for available memory region past PAGEZERO, which is - // 4KB on 32-bit and 4GB on 64-bit. - mach_vm_address_t start_address = - (SANITIZER_WORDSIZE == 32) ? 
0x000000001000 : 0x000100000000; - + uptr* largest_gap_found, + uptr* max_occupied_addr) { const mach_vm_address_t max_vm_address = GetMaxVirtualAddress() + 1; - mach_vm_address_t address = start_address; - mach_vm_address_t free_begin = start_address; + mach_vm_address_t address = GAP_SEARCH_START_ADDRESS; + mach_vm_address_t free_begin = GAP_SEARCH_START_ADDRESS; kern_return_t kr = KERN_SUCCESS; if (largest_gap_found) *largest_gap_found = 0; if (max_occupied_addr) *max_occupied_addr = 0; while (kr == KERN_SUCCESS) { mach_vm_size_t vmsize = 0; natural_t depth = 0; - RegionInfo vminfo; - mach_msg_type_number_t count = kRegionInfoSize; + vm_region_submap_short_info_data_64_t vminfo; + mach_msg_type_number_t count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; kr = mach_vm_region_recurse(mach_task_self(), &address, &vmsize, &depth, (vm_region_info_t)&vminfo, &count); - // There are cases where going beyond the processes' max vm does - // not return KERN_INVALID_ADDRESS so we check for going beyond that - // max address as well. - if (kr == KERN_INVALID_ADDRESS || address > max_vm_address) { + if (kr == KERN_SUCCESS) { + // There are cases where going beyond the processes' max vm does + // not return KERN_INVALID_ADDRESS so we check for going beyond that + // max address as well. + if (address > max_vm_address) { + address = max_vm_address; + kr = -1; // break after this iteration. + } + + if (max_occupied_addr) + *max_occupied_addr = address + vmsize; + } else if (kr == KERN_INVALID_ADDRESS) { // No more regions beyond "address", consider the gap at the end of VM. address = max_vm_address; - vmsize = 0; - kr = -1; // break after this iteration. 
+ + // We will break after this iteration anyway since kr != KERN_SUCCESS + } else if (kr == KERN_DENIED) { + Report("ERROR: Unable to find a memory range for dynamic shadow.\n"); + Report("HINT: Ensure mach_vm_region_recurse is allowed under sandbox.\n"); + Die(); } else { - if (max_occupied_addr) *max_occupied_addr = address + vmsize; + Report( + "WARNING: mach_vm_region_recurse returned unexpected code %d (%s)\n", + kr, mach_error_string(kr)); + DCHECK(false && "mach_vm_region_recurse returned unexpected code"); + break; // address is not valid unless KERN_SUCCESS, therefore we must not + // use it. } + if (free_begin != address) { // We found a free region [free_begin..address-1]. uptr gap_start = RoundUpTo((uptr)free_begin + left_padding, alignment); @@ -1298,6 +1379,29 @@ uptr FindAvailableMemoryRange(uptr size, uptr alignment, uptr left_padding, return 0; } +// Returns true if the address is definitely mapped, and false if it is not +// mapped or could not be determined. +bool IsAddressInMappedRegion(uptr addr) { + mach_vm_size_t vmsize = 0; + natural_t depth = 0; + vm_region_submap_short_info_data_64_t vminfo; + mach_msg_type_number_t count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; + mach_vm_address_t address = addr; + + kern_return_t kr = + mach_vm_region_recurse(mach_task_self(), &address, &vmsize, &depth, + (vm_region_info_t)&vminfo, &count); + + if (kr == KERN_DENIED) { + Report( + "WARN: mach_vm_region_recurse returned KERN_DENIED when checking " + "whether an address is mapped.\n"); + Report("HINT: Is mach_vm_region_recurse allowed by sandbox?\n"); + } + + return (kr == KERN_SUCCESS && addr >= address && addr < address + vmsize); +} + // FIXME implement on this platform. 
void GetMemoryProfile(fill_profile_f cb, uptr *stats) {} diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h index b0e4ac7f40745..789dd8e4d8e9c 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h @@ -76,6 +76,8 @@ struct ThreadEventCallbacks { void InstallPthreadIntrospectionHook(const ThreadEventCallbacks &callbacks); +bool IsAddressInMappedRegion(uptr addr); + } // namespace __sanitizer #endif // SANITIZER_APPLE diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 985bfb49884d1..c9ba28a52f780 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -25,6 +25,7 @@ #include "size_class_allocator.h" #include "stack_depot.h" #include "string_utils.h" +#include "tracing.h" #include "tsd.h" #include "scudo/interface.h" @@ -671,10 +672,11 @@ class Allocator { void releaseToOS(ReleaseToOS ReleaseType) { initThreadMaybe(); + SCUDO_SCOPED_TRACE(GetReleaseToOSTraceName(ReleaseType)); if (ReleaseType == ReleaseToOS::ForceAll) drainCaches(); Primary.releaseToOS(ReleaseType); - Secondary.releaseToOS(); + Secondary.releaseToOS(ReleaseType); } // Iterate over all chunks and call a callback for all busy chunks located diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index e2de50b93adc3..645c3968ce3e1 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -511,6 +511,8 @@ uptr SizeClassAllocator32::tryReleaseToOS(uptr ClassId, template uptr SizeClassAllocator32::releaseToOS(ReleaseToOS ReleaseType) { + SCUDO_SCOPED_TRACE(GetPrimaryReleaseToOSTraceName(ReleaseType)); + uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { if (I == SizeClassMap::BatchClassId) @@ -1092,6 +1094,12 @@ uptr 
SizeClassAllocator32::releaseToOSMaybe(SizeClassInfo *Sci, // 2. Mark the free blocks and we can tell which pages are in-use by // querying `PageReleaseContext`. // ==================================================================== // + + // Only add trace point after the quick returns have occurred to avoid + // incurring performance penalties. Most of the time in this function + // will be the mark free blocks call and the actual release to OS call. + SCUDO_SCOPED_TRACE(GetPrimaryReleaseToOSMaybeTraceName(ReleaseType)); + PageReleaseContext Context = markFreeBlocks(Sci, ClassId, BlockSize, Base, NumberOfRegions, ReleaseType); if (!Context.hasBlockMarked()) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 3cb040c514eda..d08103008ef7c 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -22,6 +22,7 @@ #include "stats.h" #include "string_utils.h" #include "thread_annotations.h" +#include "tracing.h" namespace scudo { @@ -1307,6 +1308,8 @@ uptr SizeClassAllocator64::tryReleaseToOS(uptr ClassId, template uptr SizeClassAllocator64::releaseToOS(ReleaseToOS ReleaseType) { + SCUDO_SCOPED_TRACE(GetPrimaryReleaseToOSTraceName(ReleaseType)); + uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { if (I == SizeClassMap::BatchClassId) @@ -1439,6 +1442,12 @@ uptr SizeClassAllocator64::releaseToOSMaybe(RegionInfo *Region, // Then we can tell which pages are in-use by querying // `PageReleaseContext`. // ==================================================================== // + + // Only add trace point after the quick returns have occurred to avoid + // incurring performance penalties. Most of the time in this function + // will be the mark free blocks call and the actual release to OS call. 
+ SCUDO_SCOPED_TRACE(GetPrimaryReleaseToOSMaybeTraceName(ReleaseType)); + PageReleaseContext Context = markFreeBlocks(Region, BlockSize, AllocatedUserEnd, getCompactPtrBaseByClassId(ClassId), GroupsToRelease); diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 38c9a9e6e2d70..f0b7bceb010f0 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -19,6 +19,7 @@ #include "stats.h" #include "string_utils.h" #include "thread_annotations.h" +#include "tracing.h" #include "vector.h" namespace scudo { @@ -118,7 +119,7 @@ template class MapAllocatorNoCache { bool canCache(UNUSED uptr Size) { return false; } void disable() {} void enable() {} - void releaseToOS() {} + void releaseToOS(ReleaseToOS) {} void disableMemoryTagging() {} void unmapTestOnly() {} bool setOption(Option O, UNUSED sptr Value) { @@ -351,6 +352,9 @@ class MapAllocatorCache { // same time will not actually release any extra elements. Therefore, // let any other thread continue, skipping the release. if (Mutex.tryLock()) { + SCUDO_SCOPED_TRACE( + GetSecondaryReleaseToOSTraceName(ReleaseToOS::Normal)); + // TODO: Add ReleaseToOS logic to LRU algorithm releaseOlderThan(Time - static_cast(Interval) * 1000000); Mutex.unlock(); @@ -499,7 +503,9 @@ class MapAllocatorCache { return true; } - void releaseToOS() EXCLUDES(Mutex) { + void releaseToOS([[maybe_unused]] ReleaseToOS ReleaseType) EXCLUDES(Mutex) { + SCUDO_SCOPED_TRACE(GetSecondaryReleaseToOSTraceName(ReleaseType)); + // Since this is a request to release everything, always wait for the // lock so that we guarantee all entries are released after this call. 
ScopedLock L(Mutex); @@ -574,6 +580,8 @@ class MapAllocatorCache { } void releaseOlderThan(u64 Time) REQUIRES(Mutex) { + SCUDO_SCOPED_TRACE(GetSecondaryReleaseOlderThanTraceName()); + if (!LRUEntries.size() || OldestTime == 0 || OldestTime > Time) return; OldestTime = 0; @@ -669,7 +677,7 @@ template class MapAllocator { bool setOption(Option O, sptr Value) { return Cache.setOption(O, Value); } - void releaseToOS() { Cache.releaseToOS(); } + void releaseToOS(ReleaseToOS ReleaseType) { Cache.releaseToOS(ReleaseType); } void disableMemoryTagging() { Cache.disableMemoryTagging(); } diff --git a/compiler-rt/lib/scudo/standalone/tracing.h b/compiler-rt/lib/scudo/standalone/tracing.h new file mode 100644 index 0000000000000..ac1f746128823 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/tracing.h @@ -0,0 +1,50 @@ +//===-- tracing.h -----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_TRACING_H_ +#define SCUDO_TRACING_H_ + +#if defined(SCUDO_ENABLE_TRACING) + +// This file must include definitions for all of the functions below. +#include "custom_scudo_tracing.h" + +#else + +// Should start a trace in the given scope, and end the trace when going out of +// scope. +#define SCUDO_SCOPED_TRACE(Name) + +// Create a trace name for the call to releaseToOS. +static inline const char *GetReleaseToOSTraceName(scudo::ReleaseToOS) { + return nullptr; +} + +// Create a trace name for the call to releaseToOSMaybe in the primary. 
+static inline const char * +GetPrimaryReleaseToOSMaybeTraceName(scudo::ReleaseToOS) { + return nullptr; +} + +static inline const char *GetPrimaryReleaseToOSTraceName(scudo::ReleaseToOS) { + return nullptr; +} + +// Create a trace name for the call to releaseToOS in the secondary. +static inline const char *GetSecondaryReleaseToOSTraceName(scudo::ReleaseToOS) { + return nullptr; +} + +// Create a trace name for the call to releaseOlderThan in the secondary. +static inline const char *GetSecondaryReleaseOlderThanTraceName() { + return nullptr; +} + +#endif + +#endif // SCUDO_TRACING_H_ diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp index eb344df168ab9..5cc81bab5b911 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp @@ -226,9 +226,19 @@ static void ThreadTerminateCallback(uptr thread) { void InitializePlatformEarly() { # if !SANITIZER_GO && SANITIZER_IOS uptr max_vm = GetMaxUserVirtualAddress() + 1; - if (max_vm != HiAppMemEnd()) { - Printf("ThreadSanitizer: unsupported vm address limit %p, expected %p.\n", - (void *)max_vm, (void *)HiAppMemEnd()); + if (max_vm < HiAppMemEnd()) { + Report( + "ThreadSanitizer: Unsupported virtual memory layout:\n\tVM address " + "limit = %p\n\tExpected %p.\n", + (void*)max_vm, (void*)HiAppMemEnd()); + Die(); + } + // In some configurations, the max_vm is expanded, but much of this space is + // already mapped. TSAN will not work in this configuration. 
+  if (IsAddressInMappedRegion(HiAppMemEnd() - 1)) {
+    Report(
+        "ThreadSanitizer: Unsupported virtual memory layout: Address %p is "
+        "already mapped.\n", (void *)(HiAppMemEnd() - 1));
     Die();
   }
 #endif
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Darwin/atos-symbolized-recover.cpp b/compiler-rt/test/asan/TestCases/Darwin/atos-symbolized-recover.cpp
similarity index 69%
rename from compiler-rt/test/sanitizer_common/TestCases/Darwin/atos-symbolized-recover.cpp
rename to compiler-rt/test/asan/TestCases/Darwin/atos-symbolized-recover.cpp
index 4234e0c9a9af3..08b31af136fa0 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Darwin/atos-symbolized-recover.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/atos-symbolized-recover.cpp
@@ -1,12 +1,9 @@
 // Check that there is a warning when atos fails to symbolize an address
 // and that atos continues symbolicating correctly after.
 
-// RUN: %clangxx -O0 %s -o %t
+// RUN: %clangxx_asan -O0 %s -o %t
 // RUN: not %run %t 2>&1 | FileCheck %s
-
-// This test tests for undefined behavior and is leading to various failures.
-// Going to disable to unblock CI and rethink a test for this. rdar://107846128
-// UNSUPPORTED: darwin
+// REQUIRES: iossim
 
 void bar() {
   void *invalid_addr = reinterpret_cast<void *>(0xDEADBEEF);
@@ -19,4 +16,4 @@ int main() {
   return 0;
   // CHECK: WARNING: atos failed to symbolize address{{.*}}
   // CHECK: {{.*}}atos-symbolized-recover.cpp:[[@LINE-3]]{{.*}}
-}
+}
diff --git a/compiler-rt/test/asan/TestCases/Darwin/sandbox-vm-region-recurse.cpp b/compiler-rt/test/asan/TestCases/Darwin/sandbox-vm-region-recurse.cpp
new file mode 100644
index 0000000000000..c496d822a7fb8
--- /dev/null
+++ b/compiler-rt/test/asan/TestCases/Darwin/sandbox-vm-region-recurse.cpp
@@ -0,0 +1,33 @@
+// Check that if mach_vm_region_recurse is disallowed by sandbox, we report a message saying so.
+ +// RUN: %clangxx_asan -O0 %s -o %t +// RUN: not %run sandbox-exec -p '(version 1)(allow default)(deny syscall-mig (kernel-mig-routine mach_vm_region_recurse))' %t 2>&1 | FileCheck --check-prefix=CHECK-DENY %s +// RUN: not %run %t 2>&1 | FileCheck --check-prefix=CHECK-ALLOW %s +// RUN: %clangxx_asan -O3 %s -o %t +// RUN: not %run sandbox-exec -p '(version 1)(allow default)(deny syscall-mig (kernel-mig-routine mach_vm_region_recurse))' %t 2>&1 | FileCheck --check-prefix=CHECK-DENY %s +// RUN: not %run %t 2>&1 | FileCheck --check-prefix=CHECK-ALLOW %s + +// sandbox-exec isn't available on iOS +// UNSUPPORTED: ios + +// x86_64 does not use ASAN_SHADOW_OFFSET_DYNAMIC +// UNSUPPORTED: x86_64-darwin || x86_64h-darwin + +#include + +int main() { + char *x = (char *)malloc(10 * sizeof(char)); + free(x); + return x[5]; + // CHECK-ALLOW: {{.*ERROR: AddressSanitizer: heap-use-after-free on address}} + // CHECK-DENY-NOT: {{.*ERROR: AddressSanitizer: heap-use-after-free on address}} + // CHECK-ALLOW: {{READ of size 1 at 0x.* thread T0}} + // CHECK-ALLOW: {{ #0 0x.* in main}} + // CHECK-ALLOW: {{freed by thread T0 here:}} + // CHECK-ALLOW: {{ #0 0x.* in free}} + // CHECK-ALLOW: {{ #1 0x.* in main}} + // CHECK-ALLOW: {{previously allocated by thread T0 here:}} + // CHECK-ALLOW: {{ #0 0x.* in malloc}} + // CHECK-ALLOW: {{ #1 0x.* in main}} + // CHECK-DENY: {{.*HINT: Ensure mach_vm_region_recurse is allowed under sandbox}} +} diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 7734491310edf..897f4cd76e21c 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -708,31 +708,7 @@ def get_macos_aligned_version(macos_vers): config.substitutions.append(("%push_to_device", "echo ")) config.substitutions.append(("%adb_shell", "echo ")) -if config.target_os == "Linux": - def add_glibc_versions(ver_string): - if config.android: - return - - from packaging.version import Version - - ver = Version(ver_string) - 
any_glibc = False - for required in [ - "2.19", - "2.27", - "2.30", - "2.33", - "2.34", - "2.37", - "2.38", - "2.40", - ]: - if ver >= Version(required): - config.available_features.add("glibc-" + required) - any_glibc = True - if any_glibc: - config.available_features.add("glibc") - +if config.target_os == "Linux" and not config.android: # detect whether we are using glibc, and which version cmd_args = [ config.clang.strip(), @@ -754,7 +730,27 @@ def add_glibc_versions(ver_string): try: sout, _ = cmd.communicate(b"#include ") m = dict(re.findall(r"#define (__GLIBC__|__GLIBC_MINOR__) (\d+)", str(sout))) - add_glibc_versions(f"{m['__GLIBC__']}.{m['__GLIBC_MINOR__']}") + major = int(m["__GLIBC__"]) + minor = int(m["__GLIBC_MINOR__"]) + any_glibc = False + for required in [ + (2, 19), + (2, 27), + (2, 30), + (2, 33), + (2, 34), + (2, 37), + (2, 38), + (2, 40), + ]: + if (major, minor) >= required: + (required_major, required_minor) = required + config.available_features.add( + f"glibc-{required_major}.{required_minor}" + ) + any_glibc = True + if any_glibc: + config.available_features.add("glibc") except: pass diff --git a/compiler-rt/test/safestack/pthread-cleanup.c b/compiler-rt/test/safestack/pthread-cleanup.c index e177e376f21d2..c2645a6506c74 100644 --- a/compiler-rt/test/safestack/pthread-cleanup.c +++ b/compiler-rt/test/safestack/pthread-cleanup.c @@ -28,7 +28,7 @@ int main(int argc, char **argv) if (pthread_create(&t1, NULL, start, NULL)) abort(); - if (pthread_join(t1, &t1_buffer)) + if (pthread_join(t1, (void **)&t1_buffer)) abort(); // Stack has not yet been deallocated diff --git a/compiler-rt/test/tysan/violation-pr62544.c b/compiler-rt/test/tysan/violation-pr62544.c index 65dd333272116..d779abd668529 100644 --- a/compiler-rt/test/tysan/violation-pr62544.c +++ b/compiler-rt/test/tysan/violation-pr62544.c @@ -7,7 +7,7 @@ int printf(const char *, ...); int a, b, c; long d; int main() { - short *e = &a; + short *e = (short *)&a; int *f = &a; *f = 0; for (; b 
<= 9; b++) { diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py index 4e64f880487f5..a849990678d42 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py @@ -59,10 +59,17 @@ def _custom_enter(self): if self.log_file == "-": self.out_handle = sys.stdout return + if self.log_file == "-e": + self.out_handle = sys.stderr + return self.out_handle = open(self.log_file, "w+", encoding="utf-8") def _custom_exit(self): - if self.out_handle is not None and self.log_file != "-": + if ( + self.out_handle is not None + and self.log_file != "-" + and self.log_file != "-e" + ): self.out_handle.close() self.open = False diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py index ef23dcf6bebbc..0232bdeb64b57 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py @@ -74,7 +74,7 @@ def add_debugger_tool_base_arguments(parser, defaults): type=str, metavar="", default=None, - help="log file for messages between Dexter and the debug adapter; set to '-' to log to stdout", + help="log file for messages between Dexter and the debug adapter; set to '-' to log to stdout, '-e' to log to stderr", ) dap_group.add_argument( "--colorize-dap-log", @@ -186,7 +186,11 @@ def handle_debugger_tool_base_options(context, defaults): # noqa 'could not find "{}"'.format(options.lldb_executable) ) - if options.dap_message_log is not None and options.dap_message_log != "-": + if ( + options.dap_message_log is not None + and options.dap_message_log != "-" + and options.dap_message_log != "-e" + ): options.dap_message_log = os.path.abspath(options.dap_message_log) diff --git a/cross-project-tests/lit.cfg.py 
b/cross-project-tests/lit.cfg.py index f042c27aece9f..e702a7739f511 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -121,7 +121,7 @@ def configure_dexter_substitutions(): tools.append( ToolSubst( "%dexter_lldb_args", - f'--lldb-executable "{lldb_dap_path}" --debugger lldb-dap', + f'--lldb-executable "{lldb_dap_path}" --debugger lldb-dap --dap-message-log=-e', ) ) @@ -148,7 +148,9 @@ def configure_dexter_substitutions(): dexter_regression_test_c_builder = "clang" dexter_regression_test_cxx_builder = "clang++" dexter_regression_test_debugger = "lldb-dap" - dexter_regression_test_additional_flags = f'--lldb-executable "{lldb_dap_path}"' + dexter_regression_test_additional_flags = ( + f'--lldb-executable "{lldb_dap_path}" --dap-message-log=-e' + ) dexter_regression_test_c_flags = "-O0 -glldb -std=gnu11" dexter_regression_test_cxx_flags = "-O0 -glldb -std=gnu++11" diff --git a/flang-rt/lib/runtime/complex-reduction.h b/flang-rt/lib/runtime/complex-reduction.h index 44c52fb02fa43..43854462496df 100644 --- a/flang-rt/lib/runtime/complex-reduction.h +++ b/flang-rt/lib/runtime/complex-reduction.h @@ -17,6 +17,7 @@ #include "flang/Common/float128.h" #include "flang/Runtime/entry-names.h" #include +#include struct CppDescriptor; /* dummy type name for Fortran::runtime::Descriptor */ diff --git a/flang-rt/lib/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp index be0eed6f49dc8..2c42597a56541 100644 --- a/flang-rt/lib/runtime/extensions.cpp +++ b/flang-rt/lib/runtime/extensions.cpp @@ -60,7 +60,7 @@ inline void CtimeBuffer(char *buffer, size_t bufsize, const time_t cur_time, namespace Fortran::runtime { -// Common implementation that could be used for either SECNDS() or SECNDSD(), +// Common implementation that could be used for either SECNDS() or DSECNDS(), // which are defined for float or double. 
template T SecndsImpl(T *refTime) { static_assert(std::is_same::value || std::is_same::value, @@ -381,6 +381,17 @@ float RTNAME(Secnds)(float *refTime, const char *sourceFile, int line) { return FORTRAN_PROCEDURE_NAME(secnds)(refTime); } +// PGI extension function DSECNDS(refTime) +double FORTRAN_PROCEDURE_NAME(dsecnds)(double *refTime) { + return SecndsImpl(refTime); +} + +double RTNAME(Dsecnds)(double *refTime, const char *sourceFile, int line) { + Terminator terminator{sourceFile, line}; + RUNTIME_CHECK(terminator, refTime != nullptr); + return FORTRAN_PROCEDURE_NAME(dsecnds)(refTime); +} + // GNU extension function TIME() std::int64_t RTNAME(time)() { return time(nullptr); } diff --git a/flang-rt/lib/runtime/misc-intrinsic.cpp b/flang-rt/lib/runtime/misc-intrinsic.cpp index 4d1165f25687c..3812c990cd946 100644 --- a/flang-rt/lib/runtime/misc-intrinsic.cpp +++ b/flang-rt/lib/runtime/misc-intrinsic.cpp @@ -59,14 +59,22 @@ RT_EXT_API_GROUP_BEGIN void RTDEF(Rename)(const Descriptor &path1, const Descriptor &path2, const Descriptor *status, const char *sourceFile, int line) { Terminator terminator{sourceFile, line}; + + // Semantics for character strings: A null character (CHAR(0)) can be used to + // mark the end of the names in PATH1 and PATH2; otherwise, trailing blanks in + // the file names are ignored. + // (https://gcc.gnu.org/onlinedocs/gfortran/RENAME.html) #if !defined(RT_DEVICE_COMPILATION) - char *pathSrc{EnsureNullTerminated( - path1.OffsetElement(), path1.ElementBytes(), terminator)}; - char *pathDst{EnsureNullTerminated( - path2.OffsetElement(), path2.ElementBytes(), terminator)}; + // Trim tailing spaces, respect presences of null character when doing so. 
+ auto pathSrc{SaveDefaultCharacter(path1.OffsetElement(), + TrimTrailingSpaces(path1.OffsetElement(), path1.ElementBytes()), + terminator)}; + auto pathDst{SaveDefaultCharacter(path2.OffsetElement(), + TrimTrailingSpaces(path2.OffsetElement(), path2.ElementBytes()), + terminator)}; - // We simply call rename(2) from POSIX - int result{rename(pathSrc, pathDst)}; + // We can now simply call rename(2) from POSIX. + int result{rename(pathSrc.get(), pathDst.get())}; if (status) { // When an error has happened, int errorCode{0}; // Assume success @@ -76,14 +84,6 @@ void RTDEF(Rename)(const Descriptor &path1, const Descriptor &path2, } StoreIntToDescriptor(status, errorCode, terminator); } - - // Deallocate memory if EnsureNullTerminated dynamically allocated memory - if (pathSrc != path1.OffsetElement()) { - FreeMemory(pathSrc); - } - if (pathDst != path2.OffsetElement()) { - FreeMemory(pathDst); - } #else // !defined(RT_DEVICE_COMPILATION) terminator.Crash("RENAME intrinsic is only supported on host devices"); #endif // !defined(RT_DEVICE_COMPILATION) diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp index 79dbe4b822921..e9c0b8ffa2da2 100644 --- a/flang-rt/lib/runtime/namelist.cpp +++ b/flang-rt/lib/runtime/namelist.cpp @@ -258,13 +258,40 @@ static RT_API_ATTRS bool HandleSubscripts(IoStatementState &io, return false; } -static RT_API_ATTRS void StorageSequenceExtension( - Descriptor &desc, const Descriptor &source) { +static RT_API_ATTRS bool HasDefinedIoSubroutine(common::DefinedIo definedIo, + typeInfo::SpecialBinding::Which specialBinding, + const typeInfo::DerivedType *derivedType, + const NonTbpDefinedIoTable *table) { + for (; derivedType; derivedType = derivedType->GetParentType()) { + if ((table && table->Find(*derivedType, definedIo) != nullptr) || + derivedType->FindSpecialBinding(specialBinding)) { + return true; + } + } + return false; +} + +static RT_API_ATTRS bool HasDefinedIoSubroutine(common::DefinedIo definedIo, + 
typeInfo::SpecialBinding::Which specialBinding, + const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { + const DescriptorAddendum *addendum{descriptor.Addendum()}; + return addendum && + HasDefinedIoSubroutine( + definedIo, specialBinding, addendum->derivedType(), table); +} + +static RT_API_ATTRS void StorageSequenceExtension(Descriptor &desc, + const Descriptor &source, const io::NonTbpDefinedIoTable *table) { // Support the near-universal extension of NAMELIST input into a // designatable storage sequence identified by its initial scalar array // element. For example, treat "A(1) = 1. 2. 3." as if it had been // "A(1:) = 1. 2. 3.". - if (desc.rank() == 0 && (source.rank() == 1 || source.IsContiguous())) { + // (But don't do this for derived types with defined formatted READs, + // since they might do non-list-directed input that won't stop at the + // next namelist input item name.) + if (desc.rank() == 0 && (source.rank() == 1 || source.IsContiguous()) && + !HasDefinedIoSubroutine(common::DefinedIo::ReadFormatted, + typeInfo::SpecialBinding::Which::ReadFormatted, desc, table)) { if (auto stride{source.rank() == 1 ? source.GetDimension(0).ByteStride() : static_cast(source.ElementBytes())}; @@ -561,7 +588,8 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) { next = io.GetCurrentChar(byteCount); } while (next && (*next == '(' || *next == '%')); if (lastSubscriptDescriptor) { - StorageSequenceExtension(*lastSubscriptDescriptor, *lastSubscriptBase); + StorageSequenceExtension(*lastSubscriptDescriptor, *lastSubscriptBase, + group.nonTbpDefinedIo); } } // Skip the '=' @@ -596,6 +624,12 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) { } if (next && *next == '/') { io.HandleRelativePosition(byteCount); + if (auto *listInput{ + io.get_if>()}) { + // Don't let the namelist's terminal '/' mess up a parent I/O's + // list-directed input. 
+ listInput->set_hitSlash(false); + } } else if (*next && (*next == '&' || *next == '$')) { // stop at beginning of next group } else { diff --git a/flang-rt/lib/runtime/unit.cpp b/flang-rt/lib/runtime/unit.cpp index da3783417f234..549fbeaca05b3 100644 --- a/flang-rt/lib/runtime/unit.cpp +++ b/flang-rt/lib/runtime/unit.cpp @@ -827,6 +827,7 @@ ChildIo &ExternalFileUnit::PushChildIo(IoStatementState &parent) { Terminator &terminator{parent.GetIoErrorHandler()}; OwningPtr next{New{terminator}(parent, std::move(current))}; child_.reset(next.release()); + leftTabLimit = positionInRecord; return *child_; } diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index cf528b8231d56..c442a9cd6859e 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -930,3 +930,6 @@ print *, [(j,j=1,10)] or contiguous array can be used as the initial element of a storage sequence. For example, "&GRP A(1)=1. 2. 3./" is treated as if had been "&GRP A(1:)=1. 2. 3./". + This extension is necessarily disabled when the type of the array + has an accessible defined formatted READ subroutine. + diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 4b000877e7844..3314d1bcc64a2 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -1149,6 +1149,32 @@ PROGRAM example_secnds PRINT *, "Elapsed seconds:", elapsed END PROGRAM example_secnds ``` +### Non-Standard Intrinsics: DSECNDS +#### Description +`DSECNDS(refTime)` is the double precision variant of `SECNDS`. It returns the number of seconds +since midnight minus a user-supplied reference time `refTime`. Uses `REAL(KIND=8)` for higher precision. 
+ +#### Usage and Info +- **Standard:** PGI extension +- **Class:** function +- **Syntax:** result = `DSECNDS(refTime)` +- **Arguments:** + +| ARGUMENT | INTENT | TYPE | KIND | Description | +|-----------|--------|---------------|-------------------------|------------------------------------------| +| `refTime` | `IN` | `REAL, scalar`| REAL(KIND=8), required | Reference time in seconds since midnight | + +- **Return Value:** REAL(KIND=8), scalar — seconds elapsed since `refTime`. +- **Purity:** Impure + +#### Example +```fortran +PROGRAM example_dsecnds + DOUBLE PRECISION :: refTime + refTime = 0.0D0 + PRINT '(F24.15)', DSECNDS(refTime) +END PROGRAM example_dsecnds +``` ### Non-standard Intrinsics: SECOND This intrinsic is an alias for `CPU_TIME`: supporting both a subroutine and a diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md index c9623ea08c4e6..6a285f829053b 100644 --- a/flang/docs/ReleaseNotes.md +++ b/flang/docs/ReleaseNotes.md @@ -35,6 +35,8 @@ page](https://llvm.org/releases/). ## New Compiler Flags +* -fexperimental-loop-fusion is now recognized by flang. + ## Windows Support ## Fortran Language Changes in Flang diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index cdeea93c9aecb..f09159962883f 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -43,9 +43,11 @@ CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass) CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization. CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization. CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange. +CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fusion. CODEGENOPT(LoopVersioning, 1, 0) ///< Enable loop versioning. 
CODEGENOPT(UnrollLoops, 1, 0) ///< Enable loop unrolling CODEGENOPT(AliasAnalysis, 1, 0) ///< Enable alias analysis pass +CODEGENOPT(DwarfVersion, 3, 0) ///< Dwarf version CODEGENOPT(Underscoring, 1, 1) ENUM_CODEGENOPT(RelocationModel, llvm::Reloc::Model, 3, llvm::Reloc::PIC_) ///< Name of the relocation model to use. diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index 8e9de418e1b7e..0ffe27ea038e8 100644 --- a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -271,6 +271,9 @@ class AbstractConverter { virtual const Fortran::lower::pft::FunctionLikeUnit * getCurrentFunctionUnit() const = 0; + /// Check support of Multi-image features if -fcoarray is provided + virtual void checkCoarrayEnabled() = 0; + //===--------------------------------------------------------------------===// // Types //===--------------------------------------------------------------------===// diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h index e974f3d6eef11..19d759479abaf 100644 --- a/flang/include/flang/Lower/OpenACC.h +++ b/flang/include/flang/Lower/OpenACC.h @@ -66,11 +66,11 @@ struct Evaluation; } // namespace pft static constexpr llvm::StringRef declarePostAllocSuffix = - "_acc_declare_update_desc_post_alloc"; + "_acc_declare_post_alloc"; static constexpr llvm::StringRef declarePreDeallocSuffix = - "_acc_declare_update_desc_pre_dealloc"; + "_acc_declare_pre_dealloc"; static constexpr llvm::StringRef declarePostDeallocSuffix = - "_acc_declare_update_desc_post_dealloc"; + "_acc_declare_post_dealloc"; static constexpr llvm::StringRef privatizationRecipePrefix = "privatization"; diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h index 638846835094c..18e2f209c2d7a 100644 --- a/flang/include/flang/Lower/OpenMP/Clauses.h +++ b/flang/include/flang/Lower/OpenMP/Clauses.h @@ -277,6 +277,7 @@ using 
Read = tomp::clause::ReadT; using Reduction = tomp::clause::ReductionT; using Relaxed = tomp::clause::RelaxedT; using Release = tomp::clause::ReleaseT; +using Replayable = tomp::clause::ReplayableT; using ReverseOffload = tomp::clause::ReverseOffloadT; using Safelen = tomp::clause::SafelenT; using Schedule = tomp::clause::ScheduleT; @@ -290,6 +291,7 @@ using Permutation = tomp::clause::PermutationT; using TaskReduction = tomp::clause::TaskReductionT; using ThreadLimit = tomp::clause::ThreadLimitT; using Threads = tomp::clause::ThreadsT; +using Transparent = tomp::clause::TransparentT; using To = tomp::clause::ToT; using UnifiedAddress = tomp::clause::UnifiedAddressT; using UnifiedSharedMemory = diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 3c020abd59417..320f913858956 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -253,6 +253,8 @@ struct IntrinsicLibrary { mlir::Value genCosd(mlir::Type, llvm::ArrayRef); mlir::Value genCospi(mlir::Type, llvm::ArrayRef); void genDateAndTime(llvm::ArrayRef); + fir::ExtendedValue genDsecnds(mlir::Type resultType, + llvm::ArrayRef args); mlir::Value genDim(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genDotProduct(mlir::Type, llvm::ArrayRef); @@ -573,15 +575,6 @@ struct IntrinsicLibrary { void setResultMustBeFreed() { resultMustBeFreed = true; } - // Check support of coarray features - void checkCoarrayEnabled() { - if (converter && - !converter->getFoldingContext().languageFeatures().IsEnabled( - Fortran::common::LanguageFeature::Coarray)) - fir::emitFatalError(loc, "Coarrays disabled, use '-fcoarray' to enable.", - false); - } - fir::FirOpBuilder &builder; mlir::Location loc; bool resultMustBeFreed = false; diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h b/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h index 10ed503a485a3..20bfb7c124af2 
100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h @@ -71,5 +71,15 @@ void genCoMin(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value A, void genCoSum(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value A, mlir::Value resultImage, mlir::Value stat, mlir::Value errmsg); +/// Generate call to runtime subroutine prif_sync_all +void genSyncAllStatement(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value stat, mlir::Value errmsg); +/// Generate call to runtime subroutine prif_sync_memory +void genSyncMemoryStatement(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value stat, mlir::Value errmsg); +/// Generate call to runtime subroutine prif_sync_images +void genSyncImagesStatement(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value imageSet, mlir::Value stat, + mlir::Value errmsg); } // namespace fir::runtime #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_COARRAY_H diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h index 548ee4bb65818..7a97172cfbb9a 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h @@ -44,6 +44,10 @@ void genDateAndTime(fir::FirOpBuilder &, mlir::Location, std::optional date, std::optional time, std::optional zone, mlir::Value values); + +mlir::Value genDsecnds(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value refTime); + void genEtime(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value values, mlir::Value time); diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h index fd8c43cc88a19..f9c41b382abe5 100644 --- a/flang/include/flang/Optimizer/Passes/Pipelines.h +++ b/flang/include/flang/Optimizer/Passes/Pipelines.h @@ -102,7 +102,7 @@ void 
addCompilerGeneratedNamesConversionPass(mlir::PassManager &pm); void addDebugInfoPass(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel optLevel, - llvm::StringRef inputFilename); + llvm::StringRef inputFilename, int32_t dwarfVersion); void addFIRToLLVMPass(mlir::PassManager &pm, const MLIRToLLVMPassPipelineConfig &config); @@ -158,7 +158,7 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, void createDebugPasses(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel OptLevel, - llvm::StringRef inputFilename); + llvm::StringRef inputFilename, int32_t dwarfVersion); void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig config, diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index e3001454cdf19..b7fa0ca5f5719 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -242,6 +242,10 @@ def AddDebugInfo : Pass<"add-debug-info", "mlir::ModuleOp"> { "std::string", /*default=*/"std::string{}", "name of the input source file">, + Option<"dwarfVersion", "dwarf-version", + "int32_t", + /*default=*/"0", + "dwarf version">, ]; } diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index d2ab7cbd8fe35..1c9fd7673e06d 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -658,6 +658,7 @@ class ParseTreeDumper { NODE(parser, OmpReductionSpecifier) NODE(parser, OmpRefModifier) NODE_ENUM(OmpRefModifier, Value) + NODE(parser, OmpReplayableClause) NODE(parser, OmpScheduleClause) NODE(OmpScheduleClause, Modifier) NODE_ENUM(OmpScheduleClause, Kind) @@ -686,6 +687,7 @@ class ParseTreeDumper { NODE(parser, OmpTraitSetSelector) NODE(parser, OmpTraitSetSelectorName) NODE_ENUM(OmpTraitSetSelectorName, Value) + NODE(parser, 
OmpTransparentClause) NODE(parser, OmpTypeNameList) NODE(parser, OmpTypeSpecifier) NODE(parser, OmpUpdateClause) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index 3d3dfae290d96..032fb8996fe48 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -67,8 +67,7 @@ struct DirectiveNameScope { } static OmpDirectiveName GetOmpDirectiveName(const OmpBeginLoopDirective &x) { - auto &dir{std::get(x.t)}; - return MakeName(dir.source, dir.v); + return x.DirName(); } static OmpDirectiveName GetOmpDirectiveName(const OpenMPSectionConstruct &x) { @@ -155,6 +154,8 @@ template OmpDirectiveName GetOmpDirectiveName(const T &x) { } const OmpObjectList *GetOmpObjectList(const OmpClause &clause); +const BlockConstruct *GetFortranBlockConstruct( + const ExecutionPartConstruct &epc); } // namespace Fortran::parser::omp diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 622b5f90a9fba..7307283eb91ec 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4643,6 +4643,14 @@ struct OmpReductionClause { std::tuple t; }; +// Ref: [6.0:440:441] +// +// replayable-clause -> +// REPLAYABLE[(replayable-expression)] // since 6.0 +struct OmpReplayableClause { + WRAPPER_CLASS_BOILERPLATE(OmpReplayableClause, Scalar>); +}; + // Ref: [4.5:56-63], [5.0:101-109], [5.1:126-133], [5.2:252-254] // // schedule-clause -> @@ -4692,6 +4700,14 @@ struct OmpToClause { std::tuple t; }; +// Ref: [6.0:510-511] +// +// transparent-clause -> +// TRANSPARENT[(impex-type)] // since 6.0 +struct OmpTransparentClause { + WRAPPER_CLASS_BOILERPLATE(OmpTransparentClause, ScalarIntExpr); +}; + // Ref: [5.0:254-255], [5.1:287-288], [5.2:321-322] // // In ATOMIC construct @@ -5142,16 +5158,12 @@ struct OpenMPStandaloneConstruct { u; }; -struct OmpBeginLoopDirective { - TUPLE_CLASS_BOILERPLATE(OmpBeginLoopDirective); - std::tuple 
t; - CharBlock source; +struct OmpBeginLoopDirective : public OmpBeginDirective { + INHERITED_TUPLE_CLASS_BOILERPLATE(OmpBeginLoopDirective, OmpBeginDirective); }; -struct OmpEndLoopDirective { - TUPLE_CLASS_BOILERPLATE(OmpEndLoopDirective); - std::tuple t; - CharBlock source; +struct OmpEndLoopDirective : public OmpEndDirective { + INHERITED_TUPLE_CLASS_BOILERPLATE(OmpEndLoopDirective, OmpEndDirective); }; // OpenMP directives enclosing do loop @@ -5161,6 +5173,13 @@ struct OpenMPLoopConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPLoopConstruct); OpenMPLoopConstruct(OmpBeginLoopDirective &&a) : t({std::move(a), std::nullopt, std::nullopt}) {} + + const OmpBeginLoopDirective &BeginDir() const { + return std::get(t); + } + const std::optional &EndDir() const { + return std::get>(t); + } std::tuple, std::optional> t; diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index 9a100cec9e6b9..7e4201f15171f 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -28,6 +28,10 @@ typedef std::uint32_t gid_t; extern "C" { +// PGI extension function DSECNDS(refTime) +double FORTRAN_PROCEDURE_NAME(dsecnds)(double *refTime); +double RTNAME(Dsecnds)(double *refTime, const char *sourceFile, int line); + // CALL FLUSH(n) antedates the Fortran 2003 FLUSH statement. 
void FORTRAN_PROCEDURE_NAME(flush)(const int &unit); diff --git a/flang/include/flang/Runtime/freestanding-tools.h b/flang/include/flang/Runtime/freestanding-tools.h index 6753b7a24ebe1..7ef7cc74f213b 100644 --- a/flang/include/flang/Runtime/freestanding-tools.h +++ b/flang/include/flang/Runtime/freestanding-tools.h @@ -13,6 +13,7 @@ #include "flang/Runtime/c-or-cpp.h" #include #include +#include #include // The file defines a set of utilities/classes that might be diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h index 1c54124a5738a..68318d6093a1e 100644 --- a/flang/include/flang/Semantics/openmp-utils.h +++ b/flang/include/flang/Semantics/openmp-utils.h @@ -83,6 +83,7 @@ const SomeExpr *HasStorageOverlap( bool IsAssignment(const parser::ActionStmt *x); bool IsPointerAssignment(const evaluate::Assignment &x); const parser::Block &GetInnermostExecPart(const parser::Block &block); +bool IsStrictlyStructuredBlock(const parser::Block &block); } // namespace omp } // namespace Fortran::semantics diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h index 335f0a45531c8..038f388f2ec0b 100644 --- a/flang/include/flang/Tools/CrossToolHelpers.h +++ b/flang/include/flang/Tools/CrossToolHelpers.h @@ -108,6 +108,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { InstrumentFunctionEntry = "__cyg_profile_func_enter"; InstrumentFunctionExit = "__cyg_profile_func_exit"; } + DwarfVersion = opts.DwarfVersion; } llvm::OptimizationLevel OptLevel; ///< optimisation level @@ -143,6 +144,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { Fortran::frontend::CodeGenOptions::ComplexRangeKind ComplexRange = Fortran::frontend::CodeGenOptions::ComplexRangeKind:: CX_Full; ///< Method for calculating complex number division + int32_t DwarfVersion = 0; ///< Version of DWARF debug info to generate }; struct OffloadModuleOpts { diff --git 
a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index abe53c31210d0..c7f174f7989dd 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -462,6 +462,10 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"vector_b", AnyNumeric, Rank::vector}}, ResultNumeric, Rank::scalar, IntrinsicClass::transformationalFunction}, {"dprod", {{"x", DefaultReal}, {"y", DefaultReal}}, DoublePrecision}, + {"dsecnds", + {{"refTime", TypePattern{RealType, KindCode::exactKind, 8}, + Rank::scalar}}, + TypePattern{RealType, KindCode::exactKind, 8}, Rank::scalar}, {"dshiftl", {{"i", SameIntOrUnsigned}, {"j", SameIntOrUnsigned, Rank::elementalOrBOZ}, {"shift", AnyInt}}, diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 6295a58b1bdad..4729f8a7611a2 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -157,6 +157,9 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine::Warning, "Unsupported debug option: %0"); diags.Report(debugWarning) << arg->getValue(); } + opts.DwarfVersion = + getLastArgIntValue(args, clang::driver::options::OPT_dwarf_version_EQ, + /*Default=*/0, diags); } return true; } @@ -276,6 +279,9 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, if (args.getLastArg(clang::driver::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; + if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion)) + opts.FuseLoops = 1; + if (args.getLastArg(clang::driver::options::OPT_vectorize_loops)) opts.VectorizeLoop = 1; diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 3bef6b1c31825..23cc1e63e773d 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -958,6 +958,7 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { 
si.getTimePasses().setOutStream(ci.getTimingStreamLLVM()); pto.LoopUnrolling = opts.UnrollLoops; pto.LoopInterchange = opts.InterchangeLoops; + pto.LoopFusion = opts.FuseLoops; pto.LoopInterleaving = opts.UnrollLoops; pto.LoopVectorization = opts.VectorizeLoop; pto.SLPVectorization = opts.VectorizeSLP; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 6125ea9153662..4a5b9885bb7c4 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1131,6 +1131,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { return currentFunctionUnit; } + void checkCoarrayEnabled() override final { + if (!getFoldingContext().languageFeatures().IsEnabled( + Fortran::common::LanguageFeature::Coarray)) + fir::emitFatalError( + getCurrentLocation(), + "Not yet implemented: Multi-image features are experimental and are " + "disabled by default, use '-fcoarray' to enable.", + false); + } + void registerTypeInfo(mlir::Location loc, Fortran::lower::SymbolRef typeInfoSym, const Fortran::semantics::DerivedTypeSpec &typeSpec, diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index e82d4ea0904f1..a5a954a5ccea5 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -516,16 +516,8 @@ Fortran::lower::genCallOpAndResult( mlir::Value cast; auto *context = builder.getContext(); - // Special handling for %VAL arguments: internal procedures expect - // reference parameters. When %VAL is used, the argument should be - // passed by value. Pass the originally loaded value. 
- if (fir::isa_ref_type(snd) && !fir::isa_ref_type(fst.getType()) && - fir::dyn_cast_ptrEleTy(snd) == fst.getType()) { - auto loadOp = mlir::cast(fst.getDefiningOp()); - mlir::Value originalStorage = loadOp.getMemref(); - cast = originalStorage; - } else if (mlir::isa(snd) && - mlir::isa(fst.getType())) { + if (mlir::isa(snd) && + mlir::isa(fst.getType())) { mlir::FunctionType funcTy = mlir::FunctionType::get(context, {}, {}); fir::BoxProcType boxProcTy = builder.getBoxProcType(funcTy); if (mlir::Value host = argumentHostAssocs(converter, fst)) { @@ -1677,17 +1669,8 @@ void prepareUserCallArguments( break; } // For %VAL arguments, we should pass the value directly without - // conversion to reference types. If argTy is different from value type, - // it might be due to signature mismatch with internal procedures. - if (argTy == value.getType()) - caller.placeInput(arg, value); - else if (fir::isa_ref_type(argTy) && - fir::dyn_cast_ptrEleTy(argTy) == value.getType()) { - auto loadOp = mlir::cast(value.getDefiningOp()); - mlir::Value originalStorage = loadOp.getMemref(); - caller.placeInput(arg, originalStorage); - } else - caller.placeInput(arg, builder.createConvert(loc, argTy, value)); + // conversion to reference types. + caller.placeInput(arg, builder.createConvert(loc, argTy, value)); } break; case PassBy::BaseAddressValueAttribute: diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index ccfde16ce2c32..da964c956dbd0 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -2439,6 +2439,11 @@ void Fortran::lower::mapSymbolAttributes( // Compute array extents and lower bounds. 
if (ba.isArray()) { + // Handle unused entry dummy arrays with BaseBoxType before processing shape + if (isUnusedEntryDummy && + llvm::isa(converter.genType(var))) + if (genUnusedEntryPointBox()) + return; if (ba.isStaticArray()) { if (ba.lboundIsAllOnes()) { for (std::int64_t extent : diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index d8a0e4d8a8fa0..07234663cbef6 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -249,17 +249,29 @@ static void createDeclareAllocFuncWithArg(mlir::OpBuilder &modBuilder, if (unwrapFirBox) asFortranDesc << accFirDescriptorPostfix.str(); - // Updating descriptor must occur before the mapping of the data so that - // attached data pointer is not overwritten. - mlir::acc::UpdateDeviceOp updateDeviceOp = - createDataEntryOp( - builder, loc, registerFuncOp.getArgument(0), asFortranDesc, bounds, - /*structured=*/false, /*implicit=*/true, - mlir::acc::DataClause::acc_update_device, descTy, - /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - llvm::SmallVector operandSegments{0, 0, 0, 1}; - llvm::SmallVector operands{updateDeviceOp.getResult()}; - createSimpleOp(builder, loc, operands, operandSegments); + // For descriptor, preserve old behavior when unwrapping FIR box: update. + if (unwrapFirBox) { + mlir::acc::UpdateDeviceOp updateDeviceOp = + createDataEntryOp( + builder, loc, registerFuncOp.getArgument(0), asFortranDesc, bounds, + /*structured=*/false, /*implicit=*/true, + mlir::acc::DataClause::acc_update_device, descTy, + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + llvm::SmallVector operandSegments{0, 0, 0, 1}; + llvm::SmallVector operands{updateDeviceOp.getResult()}; + createSimpleOp(builder, loc, operands, + operandSegments); + } else { + // New behavior: start a structured region with declare_enter. 
+ EntryOp descEntryOp = createDataEntryOp( + builder, loc, registerFuncOp.getArgument(0), asFortranDesc, bounds, + /*structured=*/false, /*implicit=*/true, clause, descTy, + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + mlir::acc::DeclareEnterOp::create( + builder, loc, + mlir::acc::DeclareTokenType::get(descEntryOp.getContext()), + mlir::ValueRange(descEntryOp.getAccVar())); + } if (unwrapFirBox) { mlir::Value desc = @@ -304,30 +316,58 @@ static void createDeclareDeallocFuncWithArg( } llvm::SmallVector bounds; - mlir::acc::GetDevicePtrOp entryOp = - createDataEntryOp( - builder, loc, var, asFortran, bounds, - /*structured=*/false, /*implicit=*/false, clause, var.getType(), - /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - mlir::acc::DeclareExitOp::create(builder, loc, mlir::Value{}, - mlir::ValueRange(entryOp.getAccVar())); - - if constexpr (std::is_same_v || - std::is_same_v) - ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), - entryOp.getVar(), entryOp.getVarType(), entryOp.getBounds(), - entryOp.getAsyncOperands(), - entryOp.getAsyncOperandsDeviceTypeAttr(), - entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), - /*structured=*/false, /*implicit=*/false, - builder.getStringAttr(*entryOp.getName())); - else - ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), - entryOp.getBounds(), entryOp.getAsyncOperands(), - entryOp.getAsyncOperandsDeviceTypeAttr(), - entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), - /*structured=*/false, /*implicit=*/false, - builder.getStringAttr(*entryOp.getName())); + if (unwrapFirBox) { + // Unwrap: delete device payload using getdeviceptr + declare_exit + ExitOp + mlir::acc::GetDevicePtrOp entryOp = + createDataEntryOp( + builder, loc, var, asFortran, bounds, + /*structured=*/false, /*implicit=*/false, clause, var.getType(), + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + mlir::acc::DeclareExitOp::create(builder, loc, mlir::Value{}, 
+ mlir::ValueRange(entryOp.getAccVar())); + + if constexpr (std::is_same_v || + std::is_same_v) + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getVar(), entryOp.getVarType(), + entryOp.getBounds(), entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); + else + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getBounds(), entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); + } else { + mlir::acc::GetDevicePtrOp entryOp = + createDataEntryOp( + builder, loc, var, asFortran, bounds, + /*structured=*/false, /*implicit=*/false, clause, var.getType(), + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + mlir::acc::DeclareExitOp::create(builder, loc, mlir::Value{}, + mlir::ValueRange(entryOp.getAccVar())); + + if constexpr (std::is_same_v || + std::is_same_v) + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getVar(), entryOp.getVarType(), + entryOp.getBounds(), entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); + else + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getBounds(), entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); + } // Generate the post dealloc function. 
modBuilder.setInsertionPointAfter(preDeallocOp); @@ -343,15 +383,28 @@ static void createDeclareDeallocFuncWithArg( asFortran << accFirDescriptorPostfix.str(); } - mlir::acc::UpdateDeviceOp updateDeviceOp = - createDataEntryOp( - builder, loc, var, asFortran, bounds, - /*structured=*/false, /*implicit=*/true, - mlir::acc::DataClause::acc_update_device, var.getType(), - /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - llvm::SmallVector operandSegments{0, 0, 0, 1}; - llvm::SmallVector operands{updateDeviceOp.getResult()}; - createSimpleOp(builder, loc, operands, operandSegments); + if (unwrapFirBox) { + // Old behavior: update descriptor after deallocation. + mlir::acc::UpdateDeviceOp updateDeviceOp = + createDataEntryOp( + builder, loc, var, asFortran, bounds, + /*structured=*/false, /*implicit=*/true, + mlir::acc::DataClause::acc_update_device, var.getType(), + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + llvm::SmallVector operandSegments{0, 0, 0, 1}; + llvm::SmallVector operands{updateDeviceOp.getResult()}; + createSimpleOp(builder, loc, operands, + operandSegments); + } else { + // New behavior: end structured region with declare_exit. + mlir::acc::GetDevicePtrOp postEntryOp = + createDataEntryOp( + builder, loc, var, asFortran, bounds, + /*structured=*/false, /*implicit=*/true, clause, var.getType(), + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + mlir::acc::DeclareExitOp::create(builder, loc, mlir::Value{}, + mlir::ValueRange(postEntryOp.getAccVar())); + } modBuilder.setInsertionPointAfter(postDeallocOp); builder.restoreInsertionPoint(crtInsPt); } @@ -1253,6 +1306,15 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe( auto right = genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape); hlfir::AssignOp::create(firBuilder, loc, left, right); + } else { + // Copy scalar derived type. 
+ // The temporary_lhs flag allows indicating that user defined assignments + // should not be called while copying components, and that the LHS and RHS + // are known to not alias since the LHS is a created object. + hlfir::AssignOp::create( + builder, loc, recipe.getCopyRegion().getArgument(0), + recipe.getCopyRegion().getArgument(1), /*realloc=*/false, + /*keep_lhs_length_if_realloc=*/false, /*temporary_lhs=*/true); } mlir::acc::TerminatorOp::create(builder, loc); @@ -3994,17 +4056,28 @@ static void createDeclareAllocFunc(mlir::OpBuilder &modBuilder, asFortranDesc << accFirDescriptorPostfix.str(); llvm::SmallVector bounds; - // Updating descriptor must occur before the mapping of the data so that - // attached data pointer is not overwritten. - mlir::acc::UpdateDeviceOp updateDeviceOp = - createDataEntryOp( - builder, loc, addrOp, asFortranDesc, bounds, - /*structured=*/false, /*implicit=*/true, - mlir::acc::DataClause::acc_update_device, addrOp.getType(), - /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - llvm::SmallVector operandSegments{0, 0, 0, 1}; - llvm::SmallVector operands{updateDeviceOp.getResult()}; - createSimpleOp(builder, loc, operands, operandSegments); + // For unwrapFirBox=false this remains declare_enter; for unwrapFirBox=true, + // the descriptor post-alloc remains update behavior. 
+ if (unwrapFirBox) { + mlir::acc::UpdateDeviceOp updDesc = + createDataEntryOp( + builder, loc, addrOp, asFortranDesc, bounds, + /*structured=*/false, /*implicit=*/true, + mlir::acc::DataClause::acc_update_device, addrOp.getType(), + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + llvm::SmallVector seg{0, 0, 0, 1}; + llvm::SmallVector ops{updDesc.getResult()}; + createSimpleOp(builder, loc, ops, seg); + } else { + EntryOp descEntryOp = createDataEntryOp( + builder, loc, addrOp, asFortranDesc, bounds, + /*structured=*/false, /*implicit=*/true, clause, addrOp.getType(), + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + mlir::acc::DeclareEnterOp::create( + builder, loc, + mlir::acc::DeclareTokenType::get(descEntryOp.getContext()), + mlir::ValueRange(descEntryOp.getAccVar())); + } if (unwrapFirBox) { auto loadOp = fir::LoadOp::create(builder, loc, addrOp.getResult()); @@ -4097,15 +4170,27 @@ static void createDeclareDeallocFunc(mlir::OpBuilder &modBuilder, if (unwrapFirBox) asFortran << accFirDescriptorPostfix.str(); llvm::SmallVector bounds; - mlir::acc::UpdateDeviceOp updateDeviceOp = - createDataEntryOp( - builder, loc, addrOp, asFortran, bounds, - /*structured=*/false, /*implicit=*/true, - mlir::acc::DataClause::acc_update_device, addrOp.getType(), - /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - llvm::SmallVector operandSegments{0, 0, 0, 1}; - llvm::SmallVector operands{updateDeviceOp.getResult()}; - createSimpleOp(builder, loc, operands, operandSegments); + if (unwrapFirBox) { + // Unwrap mode: update the descriptor after deallocation (no declare_exit). 
+ mlir::acc::UpdateDeviceOp updDesc = + createDataEntryOp( + builder, loc, addrOp, asFortran, bounds, + /*structured=*/false, /*implicit=*/true, + mlir::acc::DataClause::acc_update_device, addrOp.getType(), + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + llvm::SmallVector seg{0, 0, 0, 1}; + llvm::SmallVector ops{updDesc.getResult()}; + createSimpleOp(builder, loc, ops, seg); + } else { + // Default: end the structured declare region using declare_exit. + mlir::acc::GetDevicePtrOp descEntryOp = + createDataEntryOp( + builder, loc, addrOp, asFortran, bounds, + /*structured=*/false, /*implicit=*/true, clause, addrOp.getType(), + /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); + mlir::acc::DeclareExitOp::create(builder, loc, mlir::Value{}, + mlir::ValueRange(descEntryOp.getAccVar())); + } modBuilder.setInsertionPointAfter(postDeallocOp); } diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index cecc1a9395892..78fe5aa031ba1 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -239,11 +239,11 @@ MAKE_EMPTY_CLASS(Relaxed, Relaxed); MAKE_EMPTY_CLASS(Release, Release); MAKE_EMPTY_CLASS(ReverseOffload, ReverseOffload); MAKE_EMPTY_CLASS(SeqCst, SeqCst); +MAKE_EMPTY_CLASS(SelfMaps, SelfMaps); MAKE_EMPTY_CLASS(Simd, Simd); MAKE_EMPTY_CLASS(Threads, Threads); MAKE_EMPTY_CLASS(UnifiedAddress, UnifiedAddress); MAKE_EMPTY_CLASS(UnifiedSharedMemory, UnifiedSharedMemory); -MAKE_EMPTY_CLASS(SelfMaps, SelfMaps); MAKE_EMPTY_CLASS(Unknown, Unknown); MAKE_EMPTY_CLASS(Untied, Untied); MAKE_EMPTY_CLASS(Weak, Weak); @@ -257,6 +257,8 @@ MAKE_EMPTY_CLASS(Threadprivate, Threadprivate); MAKE_INCOMPLETE_CLASS(AdjustArgs, AdjustArgs); MAKE_INCOMPLETE_CLASS(AppendArgs, AppendArgs); +MAKE_INCOMPLETE_CLASS(Replayable, Replayable); +MAKE_INCOMPLETE_CLASS(Transparent, Transparent); List makeIteratorSpecifiers(const parser::OmpIteratorSpecifier &inp, diff --git 
a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 0ec33e6b24dbf..3a59c0f5f5a90 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -408,26 +408,15 @@ static void processHostEvalClauses(lower::AbstractConverter &converter, const parser::OmpClauseList *beginClauseList = nullptr; const parser::OmpClauseList *endClauseList = nullptr; common::visit( - common::visitors{ - [&](const parser::OmpBlockConstruct &ompConstruct) { - beginClauseList = &ompConstruct.BeginDir().Clauses(); - if (auto &endSpec = ompConstruct.EndDir()) - endClauseList = &endSpec->Clauses(); - }, - [&](const parser::OpenMPLoopConstruct &ompConstruct) { - const auto &beginDirective = - std::get(ompConstruct.t); - beginClauseList = - &std::get(beginDirective.t); - - if (auto &endDirective = - std::get>( - ompConstruct.t)) { - endClauseList = - &std::get(endDirective->t); - } - }, - [&](const auto &) {}}, + [&](const auto &construct) { + using Type = llvm::remove_cvref_t; + if constexpr (std::is_same_v || + std::is_same_v) { + beginClauseList = &construct.BeginDir().Clauses(); + if (auto &endSpec = construct.EndDir()) + endClauseList = &endSpec->Clauses(); + } + }, ompEval->u); assert(beginClauseList && "expected begin directive"); @@ -3820,19 +3809,12 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPLoopConstruct &loopConstruct) { - const auto &beginLoopDirective = - std::get(loopConstruct.t); - List clauses = makeClauses( - std::get(beginLoopDirective.t), semaCtx); - if (auto &endLoopDirective = - std::get>( - loopConstruct.t)) { - clauses.append(makeClauses( - std::get(endLoopDirective->t), semaCtx)); - } + const parser::OmpDirectiveSpecification &beginSpec = loopConstruct.BeginDir(); + List clauses = makeClauses(beginSpec.Clauses(), semaCtx); + if (auto &endSpec = loopConstruct.EndDir()) + 
clauses.append(makeClauses(endSpec->Clauses(), semaCtx)); - mlir::Location currentLocation = - converter.genLocation(beginLoopDirective.source); + mlir::Location currentLocation = converter.genLocation(beginSpec.source); auto &optLoopCons = std::get>(loopConstruct.t); @@ -3858,13 +3840,10 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, } } - llvm::omp::Directive directive = - parser::omp::GetOmpDirectiveName(beginLoopDirective).v; - const parser::CharBlock &source = - std::get(beginLoopDirective.t).source; + const parser::OmpDirectiveName &beginName = beginSpec.DirName(); ConstructQueue queue{ buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx, - eval, source, directive, clauses)}; + eval, beginName.source, beginName.v, clauses)}; genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue, queue.begin()); } @@ -4047,8 +4026,7 @@ bool Fortran::lower::isOpenMPTargetConstruct( dir = block->BeginDir().DirId(); } else if (const auto *loop = std::get_if(&omp.u)) { - const auto &begin = std::get(loop->t); - dir = std::get(begin.t).v; + dir = loop->BeginDir().DirId(); } return llvm::omp::allTargetSet.test(dir); } diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index d1d1cd68a5b44..83b7ccb1ce0ee 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -616,16 +616,11 @@ static void processTileSizesFromOpenMPConstruct( &(nestedOptional.value())); if (innerConstruct) { const auto &innerLoopDirective = innerConstruct->value(); - const auto &innerBegin = - std::get(innerLoopDirective.t); - const auto &innerDirective = - std::get(innerBegin.t).v; - - if (innerDirective == llvm::omp::Directive::OMPD_tile) { + const parser::OmpDirectiveSpecification &innerBeginSpec = + innerLoopDirective.BeginDir(); + if (innerBeginSpec.DirId() == llvm::omp::Directive::OMPD_tile) { // Get the size values from parse tree and convert to a vector. 
- const auto &innerClauseList{ - std::get(innerBegin.t)}; - for (const auto &clause : innerClauseList.v) { + for (const auto &clause : innerBeginSpec.Clauses().v) { if (const auto tclause{ std::get_if(&clause.u)}) { processFun(tclause); diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index 494dd49e961b0..b19ca0182b4b5 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -12,6 +12,7 @@ #include "flang/Lower/OpenMP.h" #include "flang/Lower/StatementContext.h" #include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/Coarray.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" @@ -47,6 +48,42 @@ static void genUnreachable(fir::FirOpBuilder &builder, mlir::Location loc) { builder.setInsertionPointToStart(newBlock); } +/// Initializes values for STAT and ERRMSG +static std::pair getStatAndErrmsg( + Fortran::lower::AbstractConverter &converter, mlir::Location loc, + const std::list &statOrErrList) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + Fortran::lower::StatementContext stmtCtx; + + mlir::Value errMsgExpr, statExpr; + for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { + std::visit(Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + statExpr = fir::getBase(converter.genExprAddr( + loc, Fortran::semantics::GetExpr(statVar), stmtCtx)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const Fortran::semantics::SomeExpr *expr = + Fortran::semantics::GetExpr(errMsgVar); + errMsgExpr = fir::getBase( + converter.genExprBox(loc, *expr, stmtCtx)); + }}, + statOrErr.u); + } + + if (!statExpr) { + statExpr = fir::AbsentOp::create(builder, loc, + builder.getRefType(builder.getI32Type())); + } + if (!errMsgExpr) { + errMsgExpr = fir::AbsentOp::create( + builder, loc, + fir::BoxType::get(fir::CharacterType::get( + 
builder.getContext(), 1, fir::CharacterType::unknownLen()))); + } + return {statExpr, errMsgExpr}; +} + //===----------------------------------------------------------------------===// // Misc. Fortran statements that lower to runtime calls //===----------------------------------------------------------------------===// @@ -169,20 +206,68 @@ void Fortran::lower::genUnlockStatement( void Fortran::lower::genSyncAllStatement( Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncAllStmt &) { - TODO(converter.getCurrentLocation(), "coarray: SYNC ALL runtime"); + const Fortran::parser::SyncAllStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + + // Handle STAT and ERRMSG values + const std::list &statOrErrList = stmt.v; + auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + fir::runtime::genSyncAllStatement(builder, loc, statAddr, errMsgAddr); } void Fortran::lower::genSyncImagesStatement( Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncImagesStmt &) { - TODO(converter.getCurrentLocation(), "coarray: SYNC IMAGES runtime"); + const Fortran::parser::SyncImagesStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + // Handle STAT and ERRMSG values + const std::list &statOrErrList = + std::get>(stmt.t); + auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); + + // SYNC_IMAGES(*) is passed as count == -1 while SYNC IMAGES([]) has count + // == 0. Note further that SYNC IMAGES(*) is not semantically equivalent to + // SYNC ALL. 
+ Fortran::lower::StatementContext stmtCtx; + mlir::Value imageSet; + const Fortran::parser::SyncImagesStmt::ImageSet &imgSet = + std::get(stmt.t); + std::visit(Fortran::common::visitors{ + [&](const Fortran::parser::IntExpr &intExpr) { + const SomeExpr *expr = Fortran::semantics::GetExpr(intExpr); + imageSet = + fir::getBase(converter.genExprBox(loc, *expr, stmtCtx)); + }, + [&](const Fortran::parser::Star &) { + imageSet = fir::AbsentOp::create( + builder, loc, + fir::BoxType::get(fir::SequenceType::get( + {fir::SequenceType::getUnknownExtent()}, + builder.getI32Type()))); + }}, + imgSet.u); + + fir::runtime::genSyncImagesStatement(builder, loc, imageSet, statAddr, + errMsgAddr); } void Fortran::lower::genSyncMemoryStatement( Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncMemoryStmt &) { - TODO(converter.getCurrentLocation(), "coarray: SYNC MEMORY runtime"); + const Fortran::parser::SyncMemoryStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + + // Handle STAT and ERRMSG values + const std::list &statOrErrList = stmt.v; + auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + fir::runtime::genSyncMemoryStatement(builder, loc, statAddr, errMsgAddr); } void Fortran::lower::genSyncTeamStatement( diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 6ae48c1d5d88b..ce1376fd209cc 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -455,6 +455,10 @@ static constexpr IntrinsicHandler handlers[]{ {{{"vector_a", asBox}, {"vector_b", asBox}}}, /*isElemental=*/false}, {"dprod", &I::genDprod}, + {"dsecnds", + &I::genDsecnds, + {{{"refTime", asAddr}}}, + /*isElemental=*/false}, {"dshiftl", &I::genDshiftl}, {"dshiftr", &I::genDshiftr}, {"eoshift", @@ -3716,7 +3720,7 @@ mlir::Value 
IntrinsicLibrary::genCmplx(mlir::Type resultType, // CO_BROADCAST void IntrinsicLibrary::genCoBroadcast(llvm::ArrayRef args) { - checkCoarrayEnabled(); + converter->checkCoarrayEnabled(); assert(args.size() == 4); mlir::Value sourceImage = fir::getBase(args[1]); mlir::Value status = @@ -3735,7 +3739,7 @@ void IntrinsicLibrary::genCoBroadcast(llvm::ArrayRef args) { // CO_MAX void IntrinsicLibrary::genCoMax(llvm::ArrayRef args) { - checkCoarrayEnabled(); + converter->checkCoarrayEnabled(); assert(args.size() == 4); mlir::Value refNone = fir::AbsentOp::create(builder, loc, @@ -3755,7 +3759,7 @@ void IntrinsicLibrary::genCoMax(llvm::ArrayRef args) { // CO_MIN void IntrinsicLibrary::genCoMin(llvm::ArrayRef args) { - checkCoarrayEnabled(); + converter->checkCoarrayEnabled(); assert(args.size() == 4); mlir::Value refNone = fir::AbsentOp::create(builder, loc, @@ -3775,7 +3779,7 @@ void IntrinsicLibrary::genCoMin(llvm::ArrayRef args) { // CO_SUM void IntrinsicLibrary::genCoSum(llvm::ArrayRef args) { - checkCoarrayEnabled(); + converter->checkCoarrayEnabled(); assert(args.size() == 4); mlir::Value absentInt = fir::AbsentOp::create(builder, loc, @@ -4048,6 +4052,23 @@ mlir::Value IntrinsicLibrary::genDprod(mlir::Type resultType, return mlir::arith::MulFOp::create(builder, loc, a, b); } +// DSECNDS +// Double precision variant of SECNDS (PGI extension) +fir::ExtendedValue +IntrinsicLibrary::genDsecnds(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 1 && "DSECNDS expects one argument"); + + mlir::Value refTime = fir::getBase(args[0]); + + if (!refTime) + fir::emitFatalError(loc, "expected REFERENCE TIME parameter"); + + mlir::Value result = fir::runtime::genDsecnds(builder, loc, refTime); + + return builder.createConvert(loc, resultType, result); +} + // DSHIFTL mlir::Value IntrinsicLibrary::genDshiftl(mlir::Type resultType, llvm::ArrayRef args) { @@ -7438,7 +7459,7 @@ IntrinsicLibrary::genNull(mlir::Type, llvm::ArrayRef args) { fir::ExtendedValue 
IntrinsicLibrary::genNumImages(mlir::Type resultType, llvm::ArrayRef args) { - checkCoarrayEnabled(); + converter->checkCoarrayEnabled(); assert(args.size() == 0 || args.size() == 1); if (args.size()) @@ -8519,7 +8540,7 @@ mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType, fir::ExtendedValue IntrinsicLibrary::genThisImage(mlir::Type resultType, llvm::ArrayRef args) { - checkCoarrayEnabled(); + converter->checkCoarrayEnabled(); assert(args.size() >= 1 && args.size() <= 3); const bool coarrayIsAbsent = args.size() == 1; mlir::Value team = diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp index d4cdfecd0b088..bcec49b3e3c8e 100644 --- a/flang/lib/Optimizer/Builder/MutableBox.cpp +++ b/flang/lib/Optimizer/Builder/MutableBox.cpp @@ -67,7 +67,7 @@ createNewFirBox(fir::FirOpBuilder &builder, mlir::Location loc, cleanedLengths.append(lengths.begin(), lengths.end()); } else if (fir::isUnlimitedPolymorphicType(box.getBoxTy())) { if (auto charTy = mlir::dyn_cast( - fir::dyn_cast_ptrEleTy(addr.getType()))) { + fir::getFortranElementType(addr.getType()))) { if (charTy.getLen() == fir::CharacterType::unknownLen()) cleanedLengths.append(lengths.begin(), lengths.end()); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp b/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp index 9a893d61122ac..364e7b753c6ee 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp @@ -165,3 +165,64 @@ void fir::runtime::genCoSum(fir::FirOpBuilder &builder, mlir::Location loc, genCollectiveSubroutine(builder, loc, A, resultImage, stat, errmsg, PRIFNAME_SUB("co_sum")); } + +/// Generate call to runtime subroutine prif_sync_all +void fir::runtime::genSyncAllStatement(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value stat, + mlir::Value errmsg) { + mlir::FunctionType ftype = + PRIF_FUNCTYPE(PRIF_STAT_TYPE, PRIF_ERRMSG_TYPE, PRIF_ERRMSG_TYPE); + mlir::func::FuncOp 
funcOp = + builder.createFunction(loc, PRIFNAME_SUB("sync_all"), ftype); + + auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, errmsg); + llvm::SmallVector args = fir::runtime::createArguments( + builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); + fir::CallOp::create(builder, loc, funcOp, args); +} + +/// Generate call to runtime subroutine prif_sync_memory +void fir::runtime::genSyncMemoryStatement(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value stat, + mlir::Value errmsg) { + mlir::FunctionType ftype = + PRIF_FUNCTYPE(PRIF_STAT_TYPE, PRIF_ERRMSG_TYPE, PRIF_ERRMSG_TYPE); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, PRIFNAME_SUB("sync_memory"), ftype); + + auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, errmsg); + llvm::SmallVector args = fir::runtime::createArguments( + builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); + fir::CallOp::create(builder, loc, funcOp, args); +} + +/// Generate call to runtime subroutine prif_sync_images +void fir::runtime::genSyncImagesStatement(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value imageSet, + mlir::Value stat, + mlir::Value errmsg) { + mlir::Type imgSetTy = fir::BoxType::get(fir::SequenceType::get( + {fir::SequenceType::getUnknownExtent()}, builder.getI32Type())); + mlir::FunctionType ftype = PRIF_FUNCTYPE(imgSetTy, PRIF_STAT_TYPE, + PRIF_ERRMSG_TYPE, PRIF_ERRMSG_TYPE); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, PRIFNAME_SUB("sync_images"), ftype); + + // If imageSet is scalar, PRIF require to pass an array of size 1. 
+ if (auto boxTy = mlir::dyn_cast(imageSet.getType())) { + if (!mlir::isa(boxTy.getEleTy())) { + mlir::Value one = + builder.createIntegerConstant(loc, builder.getI32Type(), 1); + mlir::Value shape = fir::ShapeOp::create(builder, loc, one); + imageSet = fir::ReboxOp::create( + builder, loc, + fir::BoxType::get(fir::SequenceType::get({1}, builder.getI32Type())), + imageSet, shape, mlir::Value{}); + } + } + auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, errmsg); + llvm::SmallVector args = fir::runtime::createArguments( + builder, loc, ftype, imageSet, stat, errmsgArg, errmsgAllocArg); + fir::CallOp::create(builder, loc, funcOp, args); +} diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index dc61903ddd369..110b1b20898c7 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -106,6 +106,23 @@ void fir::runtime::genDateAndTime(fir::FirOpBuilder &builder, fir::CallOp::create(builder, loc, callee, args); } +mlir::Value fir::runtime::genDsecnds(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value refTime) { + auto runtimeFunc = + fir::runtime::getRuntimeFunc(loc, builder); + + mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType(); + + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(2)); + + llvm::SmallVector args = {refTime, sourceFile, sourceLine}; + args = fir::runtime::createArguments(builder, loc, runtimeFuncTy, args); + + return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0); +} + void fir::runtime::genEtime(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value values, mlir::Value time) { auto runtimeFunc = fir::runtime::getRuntimeFunc(loc, builder); diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 
0800ed4db8c31..9d707250d11d9 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -132,6 +132,8 @@ addLLVMOpBundleAttrs(mlir::ConversionPatternRewriter &rewriter, namespace { +// Replaces an existing operation with an AddressOfOp or an AddrSpaceCastOp +// depending on the existing address spaces of the type. mlir::Value replaceWithAddrOfOrASCast(mlir::ConversionPatternRewriter &rewriter, mlir::Location loc, std::uint64_t globalAS, @@ -3211,7 +3213,8 @@ struct GlobalOpConversion : public fir::FIROpConversion { if (global.getDataAttr() && *global.getDataAttr() == cuf::DataAttribute::Shared) - g.setAddrSpace(mlir::NVVM::NVVMMemorySpace::kSharedMemorySpace); + g.setAddrSpace( + static_cast(mlir::NVVM::NVVMMemorySpace::Shared)); rewriter.eraseOp(global); return mlir::success(); diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp index c52be5601bbfe..96e3caa481f51 100644 --- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp +++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp @@ -349,7 +349,10 @@ unsigned ConvertFIRToLLVMPattern::getAllocaAddressSpace( mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp(); assert(parentOp != nullptr && "expected insertion block to have parent operation"); - if (auto module = parentOp->getParentOfType()) + auto module = mlir::isa(parentOp) + ? mlir::cast(parentOp) + : parentOp->getParentOfType(); + if (module) if (mlir::Attribute addrSpace = mlir::DataLayout(module).getAllocaMemorySpace()) return llvm::cast(addrSpace).getUInt(); @@ -361,7 +364,10 @@ unsigned ConvertFIRToLLVMPattern::getProgramAddressSpace( mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp(); assert(parentOp != nullptr && "expected insertion block to have parent operation"); - if (auto module = parentOp->getParentOfType()) + auto module = mlir::isa(parentOp) + ? 
mlir::cast(parentOp) + : parentOp->getParentOfType(); + if (module) if (mlir::Attribute addrSpace = mlir::DataLayout(module).getProgramMemorySpace()) return llvm::cast(addrSpace).getUInt(); @@ -373,8 +379,14 @@ unsigned ConvertFIRToLLVMPattern::getGlobalAddressSpace( mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp(); assert(parentOp != nullptr && "expected insertion block to have parent operation"); - auto dataLayout = mlir::DataLayout::closest(parentOp); - return fir::factory::getGlobalAddressSpace(&dataLayout); + auto module = mlir::isa(parentOp) + ? mlir::cast(parentOp) + : parentOp->getParentOfType(); + if (module) + if (mlir::Attribute addrSpace = + mlir::DataLayout(module).getGlobalMemorySpace()) + return llvm::cast(addrSpace).getUInt(); + return defaultAddressSpace; } } // namespace fir diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 87f9899aa7879..1712af1d1eba7 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1521,8 +1521,8 @@ bool fir::ConvertOp::canBeConverted(mlir::Type inType, mlir::Type outType) { (isInteger(inType) && isFloatCompatible(outType)) || (isFloatCompatible(inType) && isInteger(outType)) || (isFloatCompatible(inType) && isFloatCompatible(outType)) || - (isIntegerCompatible(inType) && isPointerCompatible(outType)) || - (isPointerCompatible(inType) && isIntegerCompatible(outType)) || + (isInteger(inType) && isPointerCompatible(outType)) || + (isPointerCompatible(inType) && isInteger(outType)) || (mlir::isa(inType) && mlir::isa(outType)) || (mlir::isa(inType) && @@ -1774,12 +1774,13 @@ llvm::LogicalResult fir::CoordinateOp::verify() { return emitOpError("too many operands for len_param_index case"); } if (eleTy != index.getOnType()) - emitOpError( + return emitOpError( "len_param_index type not compatible with reference type"); return mlir::success(); } else if (auto index = mlir::dyn_cast(defOp)) { if (eleTy != 
index.getOnType()) - emitOpError("field_index type not compatible with reference type"); + return emitOpError( + "field_index type not compatible with reference type"); if (auto recTy = mlir::dyn_cast(eleTy)) { eleTy = recTy.getType(index.getFieldName()); continue; @@ -3406,26 +3407,30 @@ llvm::LogicalResult fir::SaveResultOp::verify() { auto eleTy = resultType; if (auto seqTy = mlir::dyn_cast(resultType)) { if (seqTy.getDimension() != shapeTyRank) - emitOpError("shape operand must be provided and have the value rank " - "when the value is a fir.array"); + return emitOpError( + "shape operand must be provided and have the value rank " + "when the value is a fir.array"); eleTy = seqTy.getEleTy(); } else { if (shapeTyRank != 0) - emitOpError( + return emitOpError( "shape operand should only be provided if the value is a fir.array"); } if (auto recTy = mlir::dyn_cast(eleTy)) { if (recTy.getNumLenParams() != getTypeparams().size()) - emitOpError("length parameters number must match with the value type " - "length parameters"); + return emitOpError( + "length parameters number must match with the value type " + "length parameters"); } else if (auto charTy = mlir::dyn_cast(eleTy)) { if (getTypeparams().size() > 1) - emitOpError("no more than one length parameter must be provided for " - "character value"); + return emitOpError( + "no more than one length parameter must be provided for " + "character value"); } else { if (!getTypeparams().empty()) - emitOpError("length parameters must not be provided for this value type"); + return emitOpError( + "length parameters must not be provided for this value type"); } return mlir::success(); diff --git a/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp b/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp index f16072a90dfae..925276e1f86c4 100644 --- a/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp +++ b/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp @@ -53,8 +53,9 @@ 
fir::FortranVariableOpInterface::verifyDeclareLikeOpImpl(mlir::Value memref) { shapeRank = shapeShiftType.getRank(); } else { if (!sourceIsBoxValue) - emitOpError("of array entity with a raw address base must have a " - "shape operand that is a shape or shapeshift"); + return emitOpError( + "of array entity with a raw address base must have a " + "shape operand that is a shape or shapeshift"); shapeRank = mlir::cast(shape.getType()).getRank(); } @@ -62,8 +63,9 @@ fir::FortranVariableOpInterface::verifyDeclareLikeOpImpl(mlir::Value memref) { if (!rank || *rank != shapeRank) return emitOpError("has conflicting shape and base operand ranks"); } else if (!sourceIsBox) { - emitOpError("of array entity with a raw address base must have a shape " - "operand that is a shape or shapeshift"); + return emitOpError( + "of array entity with a raw address base must have a shape " + "operand that is a shape or shapeshift"); } } return mlir::success(); diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 1a63b1bea3177..0cc65f939723e 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -1618,12 +1618,15 @@ static llvm::LogicalResult verifyArrayShift(Op op) { if (mlir::Value boundary = op.getBoundary()) { mlir::Type boundaryTy = hlfir::getFortranElementOrSequenceType(boundary.getType()); - if (auto match = areMatchingTypes( - op, eleTy, hlfir::getFortranElementType(boundaryTy), - /*allowCharacterLenMismatch=*/!useStrictIntrinsicVerifier); - match.failed()) - return op.emitOpError( - "ARRAY and BOUNDARY operands must have the same element type"); + // In case of polymorphic ARRAY type, the BOUNDARY's element type + // may not match the ARRAY's element type. 
+ if (!hlfir::isPolymorphicType(array.getType())) + if (auto match = areMatchingTypes( + op, eleTy, hlfir::getFortranElementType(boundaryTy), + /*allowCharacterLenMismatch=*/!useStrictIntrinsicVerifier); + match.failed()) + return op.emitOpError( + "ARRAY and BOUNDARY operands must have the same element type"); if (failed(verifyOperandTypeShape(boundaryTy, "BOUNDARY"))) return mlir::failure(); } diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 7c2777baebef1..58f60d43b1d49 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -95,11 +95,12 @@ getEmissionKind(llvm::codegenoptions::DebugInfoKind kind) { void addDebugInfoPass(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel optLevel, - llvm::StringRef inputFilename) { + llvm::StringRef inputFilename, int32_t dwarfVersion) { fir::AddDebugInfoOptions options; options.debugLevel = getEmissionKind(debugLevel); options.isOptimized = optLevel != llvm::OptimizationLevel::O0; options.inputFilename = inputFilename; + options.dwarfVersion = dwarfVersion; addPassConditionally(pm, disableDebugInfo, [&]() { return fir::createAddDebugInfoPass(options); }); } @@ -333,9 +334,9 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, void createDebugPasses(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel OptLevel, - llvm::StringRef inputFilename) { + llvm::StringRef inputFilename, int32_t dwarfVersion) { if (debugLevel != llvm::codegenoptions::NoDebugInfo) - addDebugInfoPass(pm, debugLevel, OptLevel, inputFilename); + addDebugInfoPass(pm, debugLevel, OptLevel, inputFilename, dwarfVersion); } void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, @@ -352,7 +353,8 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, fir::addCodeGenRewritePass( pm, (config.DebugInfo != llvm::codegenoptions::NoDebugInfo)); 
fir::addExternalNameConversionPass(pm, config.Underscoring); - fir::createDebugPasses(pm, config.DebugInfo, config.OptLevel, inputFilename); + fir::createDebugPasses(pm, config.DebugInfo, config.OptLevel, inputFilename, + config.DwarfVersion); fir::addTargetRewritePass(pm); fir::addCompilerGeneratedNamesConversionPass(pm); diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 6eb914e67fd54..af96c0be6fae9 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -649,6 +649,19 @@ void AddDebugInfoPass::runOnOperation() { signalPassFailure(); return; } + mlir::OpBuilder builder(context); + if (dwarfVersion > 0) { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToEnd(module.getBody()); + llvm::SmallVector moduleFlags; + mlir::IntegerType int32Ty = mlir::IntegerType::get(context, 32); + moduleFlags.push_back(builder.getAttr( + mlir::LLVM::ModFlagBehavior::Max, + mlir::StringAttr::get(context, "Dwarf Version"), + mlir::IntegerAttr::get(int32Ty, dwarfVersion))); + mlir::LLVM::ModuleFlagsOp::create(builder, module.getLoc(), + builder.getArrayAttr(moduleFlags)); + } fir::DebugTypeGenerator typeGen(module, &symbolTable, *dl); // We need 2 type of file paths here. // 1. Name of the file as was presented to compiler. This can be absolute @@ -686,7 +699,6 @@ void AddDebugInfoPass::runOnOperation() { module.walk([&](mlir::func::FuncOp funcOp) { handleFuncOp(funcOp, fileAttr, cuAttr, typeGen, &symbolTable); }); - mlir::OpBuilder builder(context); // We have processed all function. Attach common block variables to the // global that represent the storage. 
for (auto [global, exprs] : globalToGlobalExprsMap) { diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index a40ed95391c3a..40f180a8c1657 100644 --- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -221,7 +221,8 @@ static mlir::Value createAddressOfOp(mlir::ConversionPatternRewriter &rewriter, gpu::GPUModuleOp gpuMod, std::string &sharedGlobalName) { auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get( - rewriter.getContext(), mlir::NVVM::NVVMMemorySpace::kSharedMemorySpace); + rewriter.getContext(), + static_cast(mlir::NVVM::NVVMMemorySpace::Shared)); if (auto g = gpuMod.lookupSymbol(sharedGlobalName)) return mlir::LLVM::AddressOfOp::create(rewriter, loc, llvmPtrTy, g.getSymName()); diff --git a/flang/lib/Parser/characters.cpp b/flang/lib/Parser/characters.cpp index 1a00b16eefe9d..69b6d2ed5fafb 100644 --- a/flang/lib/Parser/characters.cpp +++ b/flang/lib/Parser/characters.cpp @@ -158,21 +158,24 @@ DecodedCharacter DecodeRawCharacter( const char *cp, std::size_t bytes) { auto p{reinterpret_cast(cp)}; char32_t ch{*p}; - if (ch <= 0x7f) { + // Valid UTF-8 encodings must be minimal. 
+ if (ch <= 0x7f) { // 1 byte: 7 bits of payload return {ch, 1}; - } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 && - ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) { + } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && + ((p[1] | p[2] | p[3]) & 0xc0) == 0x80 && (ch > 0xf0 || p[1] > 0x8f)) { + // 4 bytes: 3+6+6+6=21 bits of payload ch = ((ch & 7) << 6) | (p[1] & 0x3f); ch = (ch << 6) | (p[2] & 0x3f); ch = (ch << 6) | (p[3] & 0x3f); return {ch, 4}; - } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 && - ((p[1] | p[2]) & 0xc0) == 0x80) { + } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && + ((p[1] | p[2]) & 0xc0) == 0x80 && (ch > 0xe0 || p[1] > 0x9f)) { + // 3 bytes: 4+6+6=16 bits of payload ch = ((ch & 0xf) << 6) | (p[1] & 0x3f); ch = (ch << 6) | (p[2] & 0x3f); return {ch, 3}; } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 && - (p[1] & 0xc0) == 0x80) { + (p[1] & 0xc0) == 0x80) { // 2 bytes: 5+6=11 bits of payload ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f); return {ch, 2}; } else { diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 68e0acdf91fe2..c6d4de108fb59 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -15,15 +15,34 @@ #include "stmt-parser.h" #include "token-parsers.h" #include "type-parser-implementation.h" +#include "flang/Parser/openmp-utils.h" #include "flang/Parser/parse-tree.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Bitset.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/Frontend/OpenMP/OMP.h" +#include "llvm/Support/MathExtras.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // OpenMP Directives and Clauses namespace Fortran::parser { +using namespace Fortran::parser::omp; + +using DirectiveSet = + llvm::Bitset; // Helper function to print the buffer contents starting at the current point. 
[[maybe_unused]] static std::string ahead(const ParseState &state) { @@ -34,6 +53,9 @@ namespace Fortran::parser { constexpr auto startOmpLine = skipStuffBeforeStatement >> "!$OMP "_sptok; constexpr auto endOmpLine = space >> endOfLine; +constexpr auto logicalConstantExpr{logical(constantExpr)}; +constexpr auto scalarLogicalConstantExpr{scalar(logicalConstantExpr)}; + // Given a parser for a single element, and a parser for a list of elements // of the same type, create a parser that constructs the entire list by having // the single element be the head of the list, and the rest be the tail. @@ -868,6 +890,8 @@ TYPE_PARSER(construct( maybe(nonemptyList(Parser{}) / ":"), Parser{})) +TYPE_PARSER(construct(scalarLogicalConstantExpr)) + // OMP 5.0 2.19.5.6 IN_REDUCTION (reduction-identifier: variable-name-list) TYPE_PARSER(construct( maybe(nonemptyList(Parser{}) / ":"), @@ -877,6 +901,8 @@ TYPE_PARSER(construct( maybe(nonemptyList(Parser{}) / ":"), Parser{})) +TYPE_PARSER(construct(scalarIntExpr)) + // OMP 5.0 2.11.4 allocate-clause -> ALLOCATE ([allocator:] variable-name-list) // OMP 5.2 2.13.4 allocate-clause -> ALLOCATE ([allocate-modifier // [, allocate-modifier] :] @@ -1192,6 +1218,8 @@ TYPE_PARSER( // "READ" >> construct(construct()) || "RELAXED" >> construct(construct()) || "RELEASE" >> construct(construct()) || + "REPLAYABLE" >> construct(construct( + maybe(parenthesized(Parser{})))) || "REVERSE_OFFLOAD" >> construct(construct()) || "SAFELEN" >> construct(construct( @@ -1215,6 +1243,9 @@ TYPE_PARSER( // parenthesized(scalarIntExpr))) || "TO" >> construct(construct( parenthesized(Parser{}))) || + "TRANSPARENT" >> + construct(construct( + maybe(parenthesized(Parser{})))) || "USE" >> construct(construct( parenthesized(Parser{}))) || "USE_DEVICE_PTR" >> construct(construct( @@ -1280,16 +1311,6 @@ TYPE_PARSER(sourced( maybe(Parser{}), pure(OmpDirectiveSpecification::Flags::None)))) -static bool IsFortranBlockConstruct(const ExecutionPartConstruct &epc) { - // 
ExecutionPartConstruct -> ExecutableConstruct - // -> Indirection - if (auto *ec{std::get_if(&epc.u)}) { - return std::holds_alternative>(ec->u); - } else { - return false; - } -} - static bool IsStandaloneOrdered(const OmpDirectiveSpecification &dirSpec) { // An ORDERED construct is standalone if it has DOACROSS or DEPEND clause. return dirSpec.DirId() == llvm::omp::Directive::OMPD_ordered && @@ -1307,7 +1328,7 @@ struct StrictlyStructuredBlockParser { // Detect BLOCK construct without parsing the entire thing. if (lookAhead(skipStuffBeforeStatement >> "BLOCK"_tok).Parse(state)) { if (auto epc{Parser{}.Parse(state)}) { - if (IsFortranBlockConstruct(*epc)) { + if (GetFortranBlockConstruct(*epc) != nullptr) { Block body; body.emplace_back(std::move(*epc)); return std::move(body); @@ -1345,95 +1366,46 @@ TYPE_PARSER(sourced(construct( TYPE_PARSER(sourced(construct( verbatim("METADIRECTIVE"_tok), Parser{}))) -// Omp directives enclosing do loop -TYPE_PARSER(sourced(construct(first( - "DISTRIBUTE PARALLEL DO SIMD" >> - pure(llvm::omp::Directive::OMPD_distribute_parallel_do_simd), - "DISTRIBUTE PARALLEL DO" >> - pure(llvm::omp::Directive::OMPD_distribute_parallel_do), - "DISTRIBUTE SIMD" >> pure(llvm::omp::Directive::OMPD_distribute_simd), - "DISTRIBUTE" >> pure(llvm::omp::Directive::OMPD_distribute), - "DO SIMD" >> pure(llvm::omp::Directive::OMPD_do_simd), - "DO" >> pure(llvm::omp::Directive::OMPD_do), - "LOOP" >> pure(llvm::omp::Directive::OMPD_loop), - "MASKED TASKLOOP SIMD" >> - pure(llvm::omp::Directive::OMPD_masked_taskloop_simd), - "MASKED TASKLOOP" >> pure(llvm::omp::Directive::OMPD_masked_taskloop), - "MASTER TASKLOOP SIMD" >> - pure(llvm::omp::Directive::OMPD_master_taskloop_simd), - "MASTER TASKLOOP" >> pure(llvm::omp::Directive::OMPD_master_taskloop), - "PARALLEL DO SIMD" >> pure(llvm::omp::Directive::OMPD_parallel_do_simd), - "PARALLEL DO" >> pure(llvm::omp::Directive::OMPD_parallel_do), - "PARALLEL MASKED TASKLOOP SIMD" >> - 
pure(llvm::omp::Directive::OMPD_parallel_masked_taskloop_simd), - "PARALLEL MASKED TASKLOOP" >> - pure(llvm::omp::Directive::OMPD_parallel_masked_taskloop), - "PARALLEL MASTER TASKLOOP SIMD" >> - pure(llvm::omp::Directive::OMPD_parallel_master_taskloop_simd), - "PARALLEL MASTER TASKLOOP" >> - pure(llvm::omp::Directive::OMPD_parallel_master_taskloop), - "SIMD" >> pure(llvm::omp::Directive::OMPD_simd), - "TARGET LOOP" >> pure(llvm::omp::Directive::OMPD_target_loop), - "TARGET PARALLEL DO SIMD" >> - pure(llvm::omp::Directive::OMPD_target_parallel_do_simd), - "TARGET PARALLEL DO" >> pure(llvm::omp::Directive::OMPD_target_parallel_do), - "TARGET PARALLEL LOOP" >> - pure(llvm::omp::Directive::OMPD_target_parallel_loop), - "TARGET SIMD" >> pure(llvm::omp::Directive::OMPD_target_simd), - "TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD" >> - pure(llvm::omp::Directive:: - OMPD_target_teams_distribute_parallel_do_simd), - "TARGET TEAMS DISTRIBUTE PARALLEL DO" >> - pure(llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do), - "TARGET TEAMS DISTRIBUTE SIMD" >> - pure(llvm::omp::Directive::OMPD_target_teams_distribute_simd), - "TARGET TEAMS DISTRIBUTE" >> - pure(llvm::omp::Directive::OMPD_target_teams_distribute), - "TARGET TEAMS LOOP" >> pure(llvm::omp::Directive::OMPD_target_teams_loop), - "TASKLOOP SIMD" >> pure(llvm::omp::Directive::OMPD_taskloop_simd), - "TASKLOOP" >> pure(llvm::omp::Directive::OMPD_taskloop), - "TEAMS DISTRIBUTE PARALLEL DO SIMD" >> - pure(llvm::omp::Directive::OMPD_teams_distribute_parallel_do_simd), - "TEAMS DISTRIBUTE PARALLEL DO" >> - pure(llvm::omp::Directive::OMPD_teams_distribute_parallel_do), - "TEAMS DISTRIBUTE SIMD" >> - pure(llvm::omp::Directive::OMPD_teams_distribute_simd), - "TEAMS DISTRIBUTE" >> pure(llvm::omp::Directive::OMPD_teams_distribute), - "TEAMS LOOP" >> pure(llvm::omp::Directive::OMPD_teams_loop), - "TILE" >> pure(llvm::omp::Directive::OMPD_tile), - "UNROLL" >> pure(llvm::omp::Directive::OMPD_unroll))))) - 
-TYPE_PARSER(sourced(construct( - sourced(Parser{}), Parser{}))) - static inline constexpr auto IsDirective(llvm::omp::Directive dir) { return [dir](const OmpDirectiveName &name) -> bool { return dir == name.v; }; } +static inline constexpr auto IsMemberOf(const DirectiveSet &dirs) { + return [&dirs](const OmpDirectiveName &name) -> bool { + return dirs.test(llvm::to_underlying(name.v)); + }; +} + struct OmpBeginDirectiveParser { using resultType = OmpDirectiveSpecification; - constexpr OmpBeginDirectiveParser(llvm::omp::Directive dir) : dir_(dir) {} + constexpr OmpBeginDirectiveParser(DirectiveSet dirs) : dirs_(dirs) {} + constexpr OmpBeginDirectiveParser(llvm::omp::Directive dir) { + dirs_.set(llvm::to_underlying(dir)); + } std::optional Parse(ParseState &state) const { - auto &&p{predicated(Parser{}, IsDirective(dir_)) >= + auto &&p{predicated(Parser{}, IsMemberOf(dirs_)) >= Parser{}}; return p.Parse(state); } private: - llvm::omp::Directive dir_; + DirectiveSet dirs_; }; struct OmpEndDirectiveParser { using resultType = OmpDirectiveSpecification; - constexpr OmpEndDirectiveParser(llvm::omp::Directive dir) : dir_(dir) {} + constexpr OmpEndDirectiveParser(DirectiveSet dirs) : dirs_(dirs) {} + constexpr OmpEndDirectiveParser(llvm::omp::Directive dir) { + dirs_.set(llvm::to_underlying(dir)); + } std::optional Parse(ParseState &state) const { if (startOmpLine.Parse(state)) { if (auto endToken{verbatim("END"_sptok).Parse(state)}) { - if (auto &&dirSpec{OmpBeginDirectiveParser(dir_).Parse(state)}) { + if (auto &&dirSpec{OmpBeginDirectiveParser(dirs_).Parse(state)}) { // Extend the "source" on both the OmpDirectiveName and the // OmpDirectiveNameSpecification. 
CharBlock &nameSource{std::get(dirSpec->t).source}; @@ -1447,7 +1419,7 @@ struct OmpEndDirectiveParser { } private: - llvm::omp::Directive dir_; + DirectiveSet dirs_; }; struct OmpStatementConstructParser { @@ -1942,11 +1914,56 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, construct(Parser{}), construct(Parser{})))) +static constexpr DirectiveSet GetLoopDirectives() { + using Directive = llvm::omp::Directive; + constexpr DirectiveSet loopDirectives{ + unsigned(Directive::OMPD_distribute), + unsigned(Directive::OMPD_distribute_parallel_do), + unsigned(Directive::OMPD_distribute_parallel_do_simd), + unsigned(Directive::OMPD_distribute_simd), + unsigned(Directive::OMPD_do), + unsigned(Directive::OMPD_do_simd), + unsigned(Directive::OMPD_loop), + unsigned(Directive::OMPD_masked_taskloop), + unsigned(Directive::OMPD_masked_taskloop_simd), + unsigned(Directive::OMPD_master_taskloop), + unsigned(Directive::OMPD_master_taskloop_simd), + unsigned(Directive::OMPD_parallel_do), + unsigned(Directive::OMPD_parallel_do_simd), + unsigned(Directive::OMPD_parallel_masked_taskloop), + unsigned(Directive::OMPD_parallel_masked_taskloop_simd), + unsigned(Directive::OMPD_parallel_master_taskloop), + unsigned(Directive::OMPD_parallel_master_taskloop_simd), + unsigned(Directive::OMPD_simd), + unsigned(Directive::OMPD_target_loop), + unsigned(Directive::OMPD_target_parallel_do), + unsigned(Directive::OMPD_target_parallel_do_simd), + unsigned(Directive::OMPD_target_parallel_loop), + unsigned(Directive::OMPD_target_simd), + unsigned(Directive::OMPD_target_teams_distribute), + unsigned(Directive::OMPD_target_teams_distribute_parallel_do), + unsigned(Directive::OMPD_target_teams_distribute_parallel_do_simd), + unsigned(Directive::OMPD_target_teams_distribute_simd), + unsigned(Directive::OMPD_target_teams_loop), + unsigned(Directive::OMPD_taskloop), + unsigned(Directive::OMPD_taskloop_simd), + unsigned(Directive::OMPD_teams_distribute), + 
unsigned(Directive::OMPD_teams_distribute_parallel_do), + unsigned(Directive::OMPD_teams_distribute_parallel_do_simd), + unsigned(Directive::OMPD_teams_distribute_simd), + unsigned(Directive::OMPD_teams_loop), + unsigned(Directive::OMPD_tile), + unsigned(Directive::OMPD_unroll), + }; + return loopDirectives; +} + +TYPE_PARSER(sourced(construct( + sourced(OmpBeginDirectiveParser(GetLoopDirectives()))))) + // END OMP Loop directives -TYPE_PARSER( - startOmpLine >> sourced(construct( - sourced("END"_tok >> Parser{}), - Parser{}))) +TYPE_PARSER(sourced(construct( + sourced(OmpEndDirectiveParser(GetLoopDirectives()))))) TYPE_PARSER(construct( Parser{} / endOmpLine)) diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp index ef7e4fcdbbd07..937a17f29f221 100644 --- a/flang/lib/Parser/openmp-utils.cpp +++ b/flang/lib/Parser/openmp-utils.cpp @@ -12,6 +12,7 @@ #include "flang/Parser/openmp-utils.h" +#include "flang/Common/indirection.h" #include "flang/Common/template.h" #include "flang/Common/visit.h" @@ -61,4 +62,16 @@ const OmpObjectList *GetOmpObjectList(const OmpClause &clause) { clause.u); } +const BlockConstruct *GetFortranBlockConstruct( + const ExecutionPartConstruct &epc) { + // ExecutionPartConstruct -> ExecutableConstruct + // -> Indirection + if (auto *ec{std::get_if(&epc.u)}) { + if (auto *ind{std::get_if>(&ec->u)}) { + return &ind->value(); + } + } + return nullptr; +} + } // namespace Fortran::parser::omp diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index dc6d33607146b..73bbbc04f46b1 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2403,120 +2403,6 @@ class UnparseVisitor { } #define GEN_FLANG_CLAUSE_UNPARSE #include "llvm/Frontend/OpenMP/OMP.inc" - void Unparse(const OmpLoopDirective &x) { - switch (x.v) { - case llvm::omp::Directive::OMPD_distribute: - Word("DISTRIBUTE "); - break; - case llvm::omp::Directive::OMPD_distribute_parallel_do: - Word("DISTRIBUTE PARALLEL DO 
"); - break; - case llvm::omp::Directive::OMPD_distribute_parallel_do_simd: - Word("DISTRIBUTE PARALLEL DO SIMD "); - break; - case llvm::omp::Directive::OMPD_distribute_simd: - Word("DISTRIBUTE SIMD "); - break; - case llvm::omp::Directive::OMPD_do: - Word("DO "); - break; - case llvm::omp::Directive::OMPD_do_simd: - Word("DO SIMD "); - break; - case llvm::omp::Directive::OMPD_loop: - Word("LOOP "); - break; - case llvm::omp::Directive::OMPD_masked_taskloop_simd: - Word("MASKED TASKLOOP SIMD"); - break; - case llvm::omp::Directive::OMPD_masked_taskloop: - Word("MASKED TASKLOOP"); - break; - case llvm::omp::Directive::OMPD_master_taskloop_simd: - Word("MASTER TASKLOOP SIMD"); - break; - case llvm::omp::Directive::OMPD_master_taskloop: - Word("MASTER TASKLOOP"); - break; - case llvm::omp::Directive::OMPD_parallel_do: - Word("PARALLEL DO "); - break; - case llvm::omp::Directive::OMPD_parallel_do_simd: - Word("PARALLEL DO SIMD "); - break; - case llvm::omp::Directive::OMPD_parallel_masked_taskloop_simd: - Word("PARALLEL MASKED TASKLOOP SIMD"); - break; - case llvm::omp::Directive::OMPD_parallel_masked_taskloop: - Word("PARALLEL MASKED TASKLOOP"); - break; - case llvm::omp::Directive::OMPD_parallel_master_taskloop_simd: - Word("PARALLEL MASTER TASKLOOP SIMD"); - break; - case llvm::omp::Directive::OMPD_parallel_master_taskloop: - Word("PARALLEL MASTER TASKLOOP"); - break; - case llvm::omp::Directive::OMPD_simd: - Word("SIMD "); - break; - case llvm::omp::Directive::OMPD_target_loop: - Word("TARGET LOOP "); - break; - case llvm::omp::Directive::OMPD_target_parallel_do: - Word("TARGET PARALLEL DO "); - break; - case llvm::omp::Directive::OMPD_target_parallel_do_simd: - Word("TARGET PARALLEL DO SIMD "); - break; - case llvm::omp::Directive::OMPD_target_parallel_loop: - Word("TARGET PARALLEL LOOP "); - break; - case llvm::omp::Directive::OMPD_target_teams_distribute: - Word("TARGET TEAMS DISTRIBUTE "); - break; - case 
llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do: - Word("TARGET TEAMS DISTRIBUTE PARALLEL DO "); - break; - case llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do_simd: - Word("TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD "); - break; - case llvm::omp::Directive::OMPD_target_teams_distribute_simd: - Word("TARGET TEAMS DISTRIBUTE SIMD "); - break; - case llvm::omp::Directive::OMPD_target_teams_loop: - Word("TARGET TEAMS LOOP "); - break; - case llvm::omp::Directive::OMPD_target_simd: - Word("TARGET SIMD "); - break; - case llvm::omp::Directive::OMPD_taskloop: - Word("TASKLOOP "); - break; - case llvm::omp::Directive::OMPD_taskloop_simd: - Word("TASKLOOP SIMD "); - break; - case llvm::omp::Directive::OMPD_teams_distribute: - Word("TEAMS DISTRIBUTE "); - break; - case llvm::omp::Directive::OMPD_teams_distribute_parallel_do: - Word("TEAMS DISTRIBUTE PARALLEL DO "); - break; - case llvm::omp::Directive::OMPD_teams_distribute_parallel_do_simd: - Word("TEAMS DISTRIBUTE PARALLEL DO SIMD "); - break; - case llvm::omp::Directive::OMPD_teams_distribute_simd: - Word("TEAMS DISTRIBUTE SIMD "); - break; - case llvm::omp::Directive::OMPD_tile: - Word("TILE "); - break; - case llvm::omp::Directive::OMPD_unroll: - Word("UNROLL "); - break; - default: - break; - } - } void Unparse(const OmpObjectList &x) { Walk(x.v, ","); } void Unparse(const common::OmpMemoryOrderType &x) { @@ -2815,13 +2701,11 @@ class UnparseVisitor { Put("\n"); EndOpenMP(); } + void Unparse(const OmpBeginLoopDirective &x) { + Unparse(static_cast(x)); + } void Unparse(const OmpEndLoopDirective &x) { - BeginOpenMP(); - Word("!$OMP END "); - Walk(std::get(x.t)); - Walk(std::get(x.t)); - Put("\n"); - EndOpenMP(); + Unparse(static_cast(x)); } void Unparse(const OmpClauseList &x, const char *sep = " ") { Walk(" ", x.v, sep); @@ -2834,11 +2718,7 @@ class UnparseVisitor { EndOpenMP(); } void Unparse(const OpenMPLoopConstruct &x) { - BeginOpenMP(); - Word("!$OMP "); Walk(std::get(x.t)); - 
Put("\n"); - EndOpenMP(); Walk(std::get>>>(x.t)); Walk(std::get>(x.t)); diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index 9722eca19447d..c884658bf464a 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -42,11 +42,11 @@ class CanonicalizationOfOmp { } else if (auto *endDir{ GetConstructIf(*it)}) { // Unmatched OmpEndLoopDirective - auto &dir{std::get(endDir->t)}; - messages_.Say(dir.source, + const parser::OmpDirectiveName &endName{endDir->DirName()}; + messages_.Say(endName.source, "The %s directive must follow the DO loop associated with the " "loop construct"_err_en_US, - parser::ToUpperCaseLetters(dir.source.ToString())); + parser::ToUpperCaseLetters(endName.source.ToString())); } } // Block list } @@ -128,17 +128,20 @@ class CanonicalizationOfOmp { // DoConstruct // OmpEndLoopDirective (if available) parser::Block::iterator nextIt; - auto &beginDir{std::get(x.t)}; - auto &dir{std::get(beginDir.t)}; - auto missingDoConstruct = [](auto &dir, auto &messages) { - messages.Say(dir.source, + const parser::OmpDirectiveSpecification &beginDir{x.BeginDir()}; + const parser::OmpDirectiveName &beginName{beginDir.DirName()}; + + auto missingDoConstruct = [](const parser::OmpDirectiveName &dirName, + parser::Messages &messages) { + messages.Say(dirName.source, "A DO loop must follow the %s directive"_err_en_US, - parser::ToUpperCaseLetters(dir.source.ToString())); + parser::ToUpperCaseLetters(dirName.source.ToString())); }; - auto tileUnrollError = [](auto &dir, auto &messages) { - messages.Say(dir.source, + auto tileUnrollError = [](const parser::OmpDirectiveName &dirName, + parser::Messages &messages) { + messages.Say(dirName.source, "If a loop construct has been fully unrolled, it cannot then be tiled"_err_en_US, - parser::ToUpperCaseLetters(dir.source.ToString())); + parser::ToUpperCaseLetters(dirName.source.ToString())); }; nextIt = it; @@ -164,23 +167,20 @@ class 
CanonicalizationOfOmp { } } } else { - messages_.Say(dir.source, + messages_.Say(beginName.source, "DO loop after the %s directive must have loop control"_err_en_US, - parser::ToUpperCaseLetters(dir.source.ToString())); + parser::ToUpperCaseLetters(beginName.source.ToString())); } } else if (auto *ompLoopCons{ GetOmpIf(*nextIt)}) { // We should allow UNROLL and TILE constructs to be inserted between an // OpenMP Loop Construct and the DO loop itself - auto &nestedBeginDirective = - std::get(ompLoopCons->t); - auto &nestedBeginLoopDirective = - std::get(nestedBeginDirective.t); - if ((nestedBeginLoopDirective.v == llvm::omp::Directive::OMPD_unroll || - nestedBeginLoopDirective.v == - llvm::omp::Directive::OMPD_tile) && - !(nestedBeginLoopDirective.v == llvm::omp::Directive::OMPD_unroll && - dir.v == llvm::omp::Directive::OMPD_tile)) { + auto &nestedBeginDirective = ompLoopCons->BeginDir(); + auto &nestedBeginName = nestedBeginDirective.DirName(); + if ((nestedBeginName.v == llvm::omp::Directive::OMPD_unroll || + nestedBeginName.v == llvm::omp::Directive::OMPD_tile) && + !(nestedBeginName.v == llvm::omp::Directive::OMPD_unroll && + beginName.v == llvm::omp::Directive::OMPD_tile)) { // iterate through the remaining block items to find the end directive // for the unroll/tile directive. 
parser::Block::iterator endIt; @@ -188,9 +188,8 @@ class CanonicalizationOfOmp { while (endIt != block.end()) { if (auto *endDir{ GetConstructIf(*endIt)}) { - auto &endLoopDirective = - std::get(endDir->t); - if (endLoopDirective.v == dir.v) { + auto &endDirName = endDir->DirName(); + if (endDirName.v == beginName.v) { std::get>(x.t) = std::move(*endDir); endIt = block.erase(endIt); @@ -205,41 +204,38 @@ class CanonicalizationOfOmp { std::optional{parser::NestedConstruct{ common::Indirection{std::move(*ompLoopCons)}}}; nextIt = block.erase(nextIt); - } else if (nestedBeginLoopDirective.v == - llvm::omp::Directive::OMPD_unroll && - dir.v == llvm::omp::Directive::OMPD_tile) { + } else if (nestedBeginName.v == llvm::omp::Directive::OMPD_unroll && + beginName.v == llvm::omp::Directive::OMPD_tile) { // if a loop has been unrolled, the user can not then tile that loop // as it has been unrolled - parser::OmpClauseList &unrollClauseList{ - std::get(nestedBeginDirective.t)}; + const parser::OmpClauseList &unrollClauseList{ + nestedBeginDirective.Clauses()}; if (unrollClauseList.v.empty()) { // if the clause list is empty for an unroll construct, we assume // the loop is being fully unrolled - tileUnrollError(dir, messages_); + tileUnrollError(beginName, messages_); } else { // parse the clauses for the unroll directive to find the full // clause - for (auto clause{unrollClauseList.v.begin()}; - clause != unrollClauseList.v.end(); ++clause) { - if (clause->Id() == llvm::omp::OMPC_full) { - tileUnrollError(dir, messages_); + for (auto &clause : unrollClauseList.v) { + if (clause.Id() == llvm::omp::OMPC_full) { + tileUnrollError(beginName, messages_); } } } } else { - messages_.Say(nestedBeginLoopDirective.source, + messages_.Say(nestedBeginName.source, "Only Loop Transformation Constructs or Loop Nests can be nested within Loop Constructs"_err_en_US, - parser::ToUpperCaseLetters( - nestedBeginLoopDirective.source.ToString())); + 
parser::ToUpperCaseLetters(nestedBeginName.source.ToString())); } } else { - missingDoConstruct(dir, messages_); + missingDoConstruct(beginName, messages_); } // If we get here, we either found a loop, or issued an error message. return; } if (nextIt == block.end()) { - missingDoConstruct(dir, messages_); + missingDoConstruct(beginName, messages_); } } diff --git a/flang/lib/Semantics/check-acc-structure.cpp b/flang/lib/Semantics/check-acc-structure.cpp index 6cb7e5e9e6e25..3cd6d6ba7689a 100644 --- a/flang/lib/Semantics/check-acc-structure.cpp +++ b/flang/lib/Semantics/check-acc-structure.cpp @@ -136,10 +136,10 @@ void AccStructureChecker::CheckNotInComputeConstruct() { } } -bool AccStructureChecker::IsInsideParallelConstruct() const { +bool AccStructureChecker::IsInsideKernelsConstruct() const { if (auto directive = getParentComputeConstruct()) - if (*directive == llvm::acc::ACCD_parallel || - *directive == llvm::acc::ACCD_parallel_loop) + if (*directive == llvm::acc::ACCD_kernels || + *directive == llvm::acc::ACCD_kernels_loop) return true; return false; } @@ -293,7 +293,10 @@ void AccStructureChecker::CheckNotInSameOrSubLevelLoopConstruct() { bool invalid{false}; if (parentClause == llvm::acc::Clause::ACCC_gang && cl == llvm::acc::Clause::ACCC_gang) { - if (IsInsideParallelConstruct()) { + if (IsInsideKernelsConstruct()) { + context_.Say(GetContext().clauseSource, + "Nested GANG loops are not allowed in the region of a KERNELS construct"_err_en_US); + } else { auto parentDim = getGangDimensionSize(parent); auto currentDim = getGangDimensionSize(GetContext()); std::int64_t parentDimNum = 1, currentDimNum = 1; @@ -317,8 +320,6 @@ void AccStructureChecker::CheckNotInSameOrSubLevelLoopConstruct() { parentDimStr); continue; } - } else { - invalid = true; } } else if (parentClause == llvm::acc::Clause::ACCC_worker && (cl == llvm::acc::Clause::ACCC_gang || diff --git a/flang/lib/Semantics/check-acc-structure.h b/flang/lib/Semantics/check-acc-structure.h index 
711d0326349a4..09399297ca4be 100644 --- a/flang/lib/Semantics/check-acc-structure.h +++ b/flang/lib/Semantics/check-acc-structure.h @@ -101,7 +101,7 @@ class AccStructureChecker bool IsLoopConstruct(llvm::acc::Directive directive) const; std::optional getParentComputeConstruct() const; bool IsInsideComputeConstruct() const; - bool IsInsideParallelConstruct() const; + bool IsInsideKernelsConstruct() const; void CheckNotInComputeConstruct(); std::optional getGangDimensionSize( DirectiveContext &dirContext); diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 84edcebc64973..1049a6d2c1b2e 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -4164,13 +4164,17 @@ void DistinguishabilityHelper::SayNotDistinguishable(const Scope &scope, // comes from a different module but is not necessarily use-associated. void DistinguishabilityHelper::AttachDeclaration( parser::Message &msg, const Scope &scope, const Symbol &proc) { - const Scope &unit{GetTopLevelUnitContaining(proc)}; - if (unit == scope) { + if (proc.owner().IsTopLevel()) { evaluate::AttachDeclaration(msg, proc); } else { - msg.Attach(unit.GetName().value(), - "'%s' is USE-associated from module '%s'"_en_US, proc.name(), - unit.GetName().value()); + const Scope &unit{GetTopLevelUnitContaining(proc)}; + if (unit == scope) { + evaluate::AttachDeclaration(msg, proc); + } else { + msg.Attach(unit.GetName().value(), + "'%s' is USE-associated from module '%s'"_en_US, proc.name(), + unit.GetName().value()); + } } } diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp index e258df86a4b1c..a2f3685950c1c 100644 --- a/flang/lib/Semantics/check-do-forall.cpp +++ b/flang/lib/Semantics/check-do-forall.cpp @@ -1177,14 +1177,27 @@ void DoForallChecker::Leave(const parser::IoControlSpec &ioControlSpec) { } } -void DoForallChecker::Leave(const parser::OutputImpliedDo &outputImpliedDo) { 
- const auto &control{std::get(outputImpliedDo.t)}; - const parser::Name &name{control.name.thing.thing}; +static void CheckIoImpliedDoIndex( + SemanticsContext &context, const parser::Name &name) { if (name.symbol) { - context_.CheckIndexVarRedefine(name.source, *name.symbol); + context.CheckIndexVarRedefine(name.source, *name.symbol); + if (auto why{WhyNotDefinable(name.source, name.symbol->owner(), + DefinabilityFlags{}, *name.symbol)}) { + context.Say(std::move(*why)); + } } } +void DoForallChecker::Leave(const parser::OutputImpliedDo &outputImpliedDo) { + CheckIoImpliedDoIndex(context_, + std::get(outputImpliedDo.t).name.thing.thing); +} + +void DoForallChecker::Leave(const parser::InputImpliedDo &inputImpliedDo) { + CheckIoImpliedDoIndex(context_, + std::get(inputImpliedDo.t).name.thing.thing); +} + void DoForallChecker::Leave(const parser::StatVariable &statVariable) { context_.CheckIndexVarRedefine(statVariable.v.thing.thing); } diff --git a/flang/lib/Semantics/check-do-forall.h b/flang/lib/Semantics/check-do-forall.h index 4a65818364d76..c32d4a26a4f45 100644 --- a/flang/lib/Semantics/check-do-forall.h +++ b/flang/lib/Semantics/check-do-forall.h @@ -26,6 +26,7 @@ struct ForallStmt; struct InquireSpec; struct IoControlSpec; struct OutputImpliedDo; +struct InputImpliedDo; struct StatVariable; } // namespace Fortran::parser @@ -55,6 +56,7 @@ class DoForallChecker : public virtual BaseChecker { void Leave(const parser::InquireSpec &); void Leave(const parser::IoControlSpec &); void Leave(const parser::OutputImpliedDo &); + void Leave(const parser::InputImpliedDo &); void Leave(const parser::StatVariable &); private: diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp index 9384e039cf3f3..562bd1b4e79a4 100644 --- a/flang/lib/Semantics/check-omp-loop.cpp +++ b/flang/lib/Semantics/check-omp-loop.cpp @@ -128,9 +128,8 @@ using namespace Fortran::semantics::omp; void OmpStructureChecker::HasInvalidDistributeNesting( const 
parser::OpenMPLoopConstruct &x) { bool violation{false}; - const auto &beginLoopDir{std::get(x.t)}; - const auto &beginDir{std::get(beginLoopDir.t)}; - if (llvm::omp::topDistributeSet.test(beginDir.v)) { + const parser::OmpDirectiveName &beginName{x.BeginDir().DirName()}; + if (llvm::omp::topDistributeSet.test(beginName.v)) { // `distribute` region has to be nested if (!CurrentDirectiveIsNested()) { violation = true; @@ -142,29 +141,28 @@ void OmpStructureChecker::HasInvalidDistributeNesting( } } if (violation) { - context_.Say(beginDir.source, + context_.Say(beginName.source, "`DISTRIBUTE` region has to be strictly nested inside `TEAMS` " "region."_err_en_US); } } void OmpStructureChecker::HasInvalidLoopBinding( const parser::OpenMPLoopConstruct &x) { - const auto &beginLoopDir{std::get(x.t)}; - const auto &beginDir{std::get(beginLoopDir.t)}; + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &beginName{beginSpec.DirName()}; auto teamsBindingChecker = [&](parser::MessageFixedText msg) { - const auto &clauseList{std::get(beginLoopDir.t)}; - for (const auto &clause : clauseList.v) { + for (const auto &clause : beginSpec.Clauses().v) { if (const auto *bindClause{ std::get_if(&clause.u)}) { if (bindClause->v.v != parser::OmpBindClause::Binding::Teams) { - context_.Say(beginDir.source, msg); + context_.Say(beginName.source, msg); } } } }; - if (llvm::omp::Directive::OMPD_loop == beginDir.v && + if (llvm::omp::Directive::OMPD_loop == beginName.v && CurrentDirectiveIsNested() && llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { teamsBindingChecker( @@ -174,7 +172,7 @@ void OmpStructureChecker::HasInvalidLoopBinding( if (OmpDirectiveSet{ llvm::omp::OMPD_teams_loop, llvm::omp::OMPD_target_teams_loop} - .test(beginDir.v)) { + .test(beginName.v)) { teamsBindingChecker( "`BIND(TEAMS)` must be specified since the `LOOP` directive is " "combined with a `TEAMS` construct."_err_en_US); @@ -225,13 +223,10 @@ void 
OmpStructureChecker::CheckSIMDNest(const parser::OpenMPConstruct &c) { }, // Allowing SIMD and loop construct [&](const parser::OpenMPLoopConstruct &c) { - const auto &beginLoopDir{ - std::get(c.t)}; - const auto &beginDir{ - std::get(beginLoopDir.t)}; - if ((beginDir.v == llvm::omp::Directive::OMPD_simd) || - (beginDir.v == llvm::omp::Directive::OMPD_do_simd) || - (beginDir.v == llvm::omp::Directive::OMPD_loop)) { + const auto &beginName{c.BeginDir().DirName()}; + if (beginName.v == llvm::omp::Directive::OMPD_simd || + beginName.v == llvm::omp::Directive::OMPD_do_simd || + beginName.v == llvm::omp::Directive::OMPD_loop) { eligibleSIMD = true; } }, @@ -253,20 +248,15 @@ void OmpStructureChecker::CheckSIMDNest(const parser::OpenMPConstruct &c) { void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { loopStack_.push_back(&x); - const auto &beginLoopDir{std::get(x.t)}; - const auto &beginDir{std::get(beginLoopDir.t)}; - PushContextAndClauseSets(beginDir.source, beginDir.v); + const parser::OmpDirectiveName &beginName{x.BeginDir().DirName()}; + PushContextAndClauseSets(beginName.source, beginName.v); - // check matching, End directive is optional - if (const auto &endLoopDir{ - std::get>(x.t)}) { - const auto &endDir{ - std::get(endLoopDir.value().t)}; + // Check matching, end directive is optional + if (auto &endSpec{x.EndDir()}) { + CheckMatching(beginName, endSpec->DirName()); - CheckMatching(beginDir, endDir); - - AddEndDirectiveClauses(std::get(endLoopDir->t)); + AddEndDirectiveClauses(endSpec->Clauses()); } if (llvm::omp::allSimdSet.test(GetContext().directive)) { @@ -276,11 +266,11 @@ void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { // Combined target loop constructs are target device constructs. Keep track of // whether any such construct has been visited to later check that REQUIRES // directives for target-related options don't appear after them. 
- if (llvm::omp::allTargetSet.test(beginDir.v)) { + if (llvm::omp::allTargetSet.test(beginName.v)) { deviceConstructFound_ = true; } - if (beginDir.v == llvm::omp::Directive::OMPD_do) { + if (beginName.v == llvm::omp::Directive::OMPD_do) { // 2.7.1 do-clause -> private-clause | // firstprivate-clause | // lastprivate-clause | @@ -292,7 +282,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { // nesting check HasInvalidWorksharingNesting( - beginDir.source, llvm::omp::nestedWorkshareErrSet); + beginName.source, llvm::omp::nestedWorkshareErrSet); } SetLoopInfo(x); @@ -301,7 +291,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { if (const auto &doConstruct{ std::get_if(&*optLoopCons)}) { const auto &doBlock{std::get(doConstruct->t)}; - CheckNoBranching(doBlock, beginDir.v, beginDir.source); + CheckNoBranching(doBlock, beginName.v, beginName.source); } } CheckLoopItrVariableIsInt(x); @@ -310,10 +300,10 @@ void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { HasInvalidLoopBinding(x); if (CurrentDirectiveIsNested() && llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { - HasInvalidTeamsNesting(beginDir.v, beginDir.source); + HasInvalidTeamsNesting(beginName.v, beginName.source); } - if ((beginDir.v == llvm::omp::Directive::OMPD_distribute_parallel_do_simd) || - (beginDir.v == llvm::omp::Directive::OMPD_distribute_simd)) { + if (beginName.v == llvm::omp::Directive::OMPD_distribute_parallel_do_simd || + beginName.v == llvm::omp::Directive::OMPD_distribute_simd) { CheckDistLinear(x); } } @@ -370,13 +360,12 @@ void OmpStructureChecker::CheckLoopItrVariableIsInt( std::int64_t OmpStructureChecker::GetOrdCollapseLevel( const parser::OpenMPLoopConstruct &x) { - const auto &beginLoopDir{std::get(x.t)}; - const auto &clauseList{std::get(beginLoopDir.t)}; + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; std::int64_t orderedCollapseLevel{1}; std::int64_t orderedLevel{1}; 
std::int64_t collapseLevel{1}; - for (const auto &clause : clauseList.v) { + for (const auto &clause : beginSpec.Clauses().v) { if (const auto *collapseClause{ std::get_if(&clause.u)}) { if (const auto v{GetIntValue(collapseClause->v)}) { @@ -407,9 +396,7 @@ void OmpStructureChecker::CheckAssociatedLoopConstraints( void OmpStructureChecker::CheckDistLinear( const parser::OpenMPLoopConstruct &x) { - - const auto &beginLoopDir{std::get(x.t)}; - const auto &clauses{std::get(beginLoopDir.t)}; + const parser::OmpClauseList &clauses{x.BeginDir().Clauses()}; SymbolSourceMap indexVars; @@ -467,8 +454,7 @@ void OmpStructureChecker::CheckDistLinear( } void OmpStructureChecker::Leave(const parser::OpenMPLoopConstruct &x) { - const auto &beginLoopDir{std::get(x.t)}; - const auto &clauseList{std::get(beginLoopDir.t)}; + const parser::OmpClauseList &clauseList{x.BeginDir().Clauses()}; // A few semantic checks for InScan reduction are performed below as SCAN // constructs inside LOOP may add the relevant information. 
Scan reduction is @@ -527,7 +513,7 @@ void OmpStructureChecker::Leave(const parser::OpenMPLoopConstruct &x) { } void OmpStructureChecker::Enter(const parser::OmpEndLoopDirective &x) { - const auto &dir{std::get(x.t)}; + const parser::OmpDirectiveName &dir{x.DirName()}; ResetPartialContext(dir.source); switch (dir.v) { // 2.7.1 end-do -> END DO [nowait-clause] diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index d1654a3adcc9c..7d3fd7a699ff5 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -815,11 +815,11 @@ void OmpStructureChecker::CheckTargetNest(const parser::OpenMPConstruct &c) { common::visit( common::visitors{ [&](const parser::OmpBlockConstruct &c) { - const parser::OmpDirectiveSpecification &beginSpec{c.BeginDir()}; - source = beginSpec.DirName().source; - if (beginSpec.DirId() == llvm::omp::Directive::OMPD_target_data) { + const parser::OmpDirectiveName &beginName{c.BeginDir().DirName()}; + source = beginName.source; + if (beginName.v == llvm::omp::Directive::OMPD_target_data) { eligibleTarget = false; - ineligibleTargetDir = beginSpec.DirId(); + ineligibleTargetDir = beginName.v; } }, [&](const parser::OpenMPStandaloneConstruct &c) { @@ -843,14 +843,11 @@ void OmpStructureChecker::CheckTargetNest(const parser::OpenMPConstruct &c) { c.u); }, [&](const parser::OpenMPLoopConstruct &c) { - const auto &beginLoopDir{ - std::get(c.t)}; - const auto &beginDir{ - std::get(beginLoopDir.t)}; - source = beginLoopDir.source; - if (llvm::omp::allTargetSet.test(beginDir.v)) { + const parser::OmpDirectiveName &beginName{c.BeginDir().DirName()}; + source = beginName.source; + if (llvm::omp::allTargetSet.test(beginName.v)) { eligibleTarget = false; - ineligibleTargetDir = beginDir.v; + ineligibleTargetDir = beginName.v; } }, [&](const auto &c) {}, @@ -874,22 +871,8 @@ void OmpStructureChecker::Enter(const parser::OmpBlockConstruct &x) { // Missing mandatory 
end block: this is checked in semantics because that // makes it easier to control the error messages. // The end block is mandatory when the construct is not applied to a strictly - // structured block (aka it is applied to a loosely structured block). In - // other words, the body doesn't contain exactly one parser::BlockConstruct. - auto isStrictlyStructuredBlock{[](const parser::Block &block) -> bool { - if (block.size() != 1) { - return false; - } - const parser::ExecutionPartConstruct &contents{block.front()}; - auto *executableConstruct{ - std::get_if(&contents.u)}; - if (!executableConstruct) { - return false; - } - return std::holds_alternative>( - executableConstruct->u); - }}; - if (!endSpec && !isStrictlyStructuredBlock(block)) { + // structured block (aka it is applied to a loosely structured block). + if (!endSpec && !IsStrictlyStructuredBlock(block)) { llvm::omp::Directive dirId{beginSpec.DirId()}; auto &msg{context_.Say(beginSpec.source, "Expected OpenMP END %s directive"_err_en_US, @@ -2845,6 +2828,8 @@ CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) CHECK_SIMPLE_CLAUSE(Release, OMPC_release) +CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) +CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) @@ -3980,16 +3965,6 @@ void OmpStructureChecker::CheckDoacross(const parser::OmpDoacross &doa) { // Ignore any mismatch between the size of the iteration vector and the // number of DO constructs on the stack. This is checked elsewhere. 
- auto GetLoopDirective{[](const parser::OpenMPLoopConstruct &x) { - auto &begin{std::get(x.t)}; - return std::get(begin.t).v; - }}; - auto GetLoopClauses{[](const parser::OpenMPLoopConstruct &x) - -> const std::list & { - auto &begin{std::get(x.t)}; - return std::get(begin.t).v; - }}; - std::set inductionVars; for (const LoopConstruct &loop : llvm::reverse(loopStack_)) { if (auto *doc{std::get_if(&loop)}) { @@ -4003,13 +3978,14 @@ void OmpStructureChecker::CheckDoacross(const parser::OmpDoacross &doa) { // Omp-loop-construct, check if it's do/simd with an ORDERED clause. auto *loopc{std::get_if(&loop)}; assert(loopc && "Expecting OpenMPLoopConstruct"); - llvm::omp::Directive loopDir{GetLoopDirective(**loopc)}; + const parser::OmpDirectiveSpecification &beginSpec{(*loopc)->BeginDir()}; + llvm::omp::Directive loopDir{beginSpec.DirId()}; if (loopDir == llvm::omp::OMPD_do || loopDir == llvm::omp::OMPD_simd) { auto IsOrdered{[](const parser::OmpClause &c) { return c.Id() == llvm::omp::OMPC_ordered; }}; // If it has ORDERED clause, stop the traversal. 
- if (llvm::any_of(GetLoopClauses(**loopc), IsOrdered)) { + if (llvm::any_of(beginSpec.Clauses().v, IsOrdered)) { break; } } @@ -4667,11 +4643,7 @@ void OmpStructureChecker::CheckWorkshareBlockStmts( } else if (const auto *ompLoopConstruct{ std::get_if( &ompConstruct->u)}) { - const auto &beginLoopDir{ - std::get(ompLoopConstruct->t)}; - const auto &beginDir{ - std::get(beginLoopDir.t)}; - currentDir = beginDir.v; + currentDir = ompLoopConstruct->BeginDir().DirId(); } else if (const auto *ompSectionsConstruct{ std::get_if( &ompConstruct->u)}) { diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp index e8df346ccdc3e..2980f827d3ef3 100644 --- a/flang/lib/Semantics/openmp-utils.cpp +++ b/flang/lib/Semantics/openmp-utils.cpp @@ -21,6 +21,7 @@ #include "flang/Evaluate/traverse.h" #include "flang/Evaluate/type.h" #include "flang/Evaluate/variable.h" +#include "flang/Parser/openmp-utils.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/expression.h" #include "flang/Semantics/semantics.h" @@ -37,6 +38,7 @@ #include namespace Fortran::semantics::omp { +using namespace Fortran::parser::omp; SourcedActionStmt GetActionStmt(const parser::ExecutionPartConstruct *x) { if (x == nullptr) { @@ -397,16 +399,21 @@ const parser::Block &GetInnermostExecPart(const parser::Block &block) { const parser::Block *iter{&block}; while (iter->size() == 1) { const parser::ExecutionPartConstruct &ep{iter->front()}; - if (auto *exec{std::get_if(&ep.u)}) { - using BlockConstruct = common::Indirection; - if (auto *bc{std::get_if(&exec->u)}) { - iter = &std::get(bc->value().t); - continue; - } + if (auto *bc{GetFortranBlockConstruct(ep)}) { + iter = &std::get(bc->t); + } else { + break; } - break; } return *iter; } +bool IsStrictlyStructuredBlock(const parser::Block &block) { + if (block.size() == 1) { + return GetFortranBlockConstruct(block.front()) != nullptr; + } else { + return false; + } +} + } // namespace Fortran::semantics::omp diff --git 
a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 16b895d8259dd..abb8f6430b29b 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -1890,9 +1890,9 @@ bool OmpAttributeVisitor::Pre( } bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { - const auto &beginLoopDir{std::get(x.t)}; - const auto &beginDir{std::get(beginLoopDir.t)}; - switch (beginDir.v) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &beginName{beginSpec.DirName()}; + switch (beginName.v) { case llvm::omp::Directive::OMPD_distribute: case llvm::omp::Directive::OMPD_distribute_parallel_do: case llvm::omp::Directive::OMPD_distribute_parallel_do_simd: @@ -1930,21 +1930,23 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { case llvm::omp::Directive::OMPD_teams_loop: case llvm::omp::Directive::OMPD_tile: case llvm::omp::Directive::OMPD_unroll: - PushContext(beginDir.source, beginDir.v); + PushContext(beginName.source, beginName.v); break; default: break; } - if (beginDir.v == llvm::omp::OMPD_master_taskloop || - beginDir.v == llvm::omp::OMPD_master_taskloop_simd || - beginDir.v == llvm::omp::OMPD_parallel_master_taskloop || - beginDir.v == llvm::omp::OMPD_parallel_master_taskloop_simd || - beginDir.v == llvm::omp::Directive::OMPD_target_loop) - IssueNonConformanceWarning(beginDir.v, beginDir.source, 52); + if (beginName.v == llvm::omp::OMPD_master_taskloop || + beginName.v == llvm::omp::OMPD_master_taskloop_simd || + beginName.v == llvm::omp::OMPD_parallel_master_taskloop || + beginName.v == llvm::omp::OMPD_parallel_master_taskloop_simd || + beginName.v == llvm::omp::Directive::OMPD_target_loop) { + unsigned version{context_.langOptions().OpenMPVersion}; + IssueNonConformanceWarning(beginName.v, beginName.source, version); + } ClearDataSharingAttributeObjects(); 
SetContextAssociatedLoopLevel(GetNumAffectedLoopsFromLoopConstruct(x)); - if (beginDir.v == llvm::omp::Directive::OMPD_do) { + if (beginName.v == llvm::omp::Directive::OMPD_do) { auto &optLoopCons = std::get>(x.t); if (optLoopCons.has_value()) { if (const auto &doConstruct{ @@ -2094,8 +2096,7 @@ void OmpAttributeVisitor::CollectNumAffectedLoopsFromLoopConstruct( const parser::OpenMPLoopConstruct &x, llvm::SmallVector &levels, llvm::SmallVector &clauses) { - const auto &beginLoopDir{std::get(x.t)}; - const auto &clauseList{std::get(beginLoopDir.t)}; + const auto &clauseList{x.BeginDir().Clauses()}; CollectNumAffectedLoopsFromClauses(clauseList, levels, clauses); CollectNumAffectedLoopsFromInnerLoopContruct(x, levels, clauses); @@ -2228,15 +2229,14 @@ void OmpAttributeVisitor::PrivatizeAssociatedLoopIndexAndCheckLoopLevel( } } CheckAssocLoopLevel(level, GetAssociatedClause()); - } else if (const auto &loop{std::get_if< + } else if (const auto *loop{std::get_if< common::Indirection>( innerMostNest)}) { - auto &beginDirective = - std::get(loop->value().t); - auto &beginLoopDirective = - std::get(beginDirective.t); - if (beginLoopDirective.v != llvm::omp::Directive::OMPD_unroll && - beginLoopDirective.v != llvm::omp::Directive::OMPD_tile) { + const parser::OmpDirectiveSpecification &beginSpec{ + loop->value().BeginDir()}; + const parser::OmpDirectiveName &beginName{beginSpec.DirName()}; + if (beginName.v != llvm::omp::Directive::OMPD_unroll && + beginName.v != llvm::omp::Directive::OMPD_tile) { context_.Say(GetContext().directiveSource, "Only UNROLL or TILE constructs are allowed between an OpenMP Loop Construct and a DO construct"_err_en_US, parser::ToUpperCaseLetters(llvm::omp::getOpenMPDirectiveName( diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 077bee930675e..d0d3b0e1caa5a 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -2994,12 +2994,20 @@ Symbol 
*ScopeHandler::FindSymbol(const Scope &scope, const parser::Name &name) { } } return FindSymbol(scope.parent(), name); - } else { + } else if (scope.kind() == Scope::Kind::ImpliedDos) { + if (Symbol * symbol{FindInScope(scope, name)}) { + return Resolve(name, symbol); + } else { + // Don't use scope.FindSymbol() as below, since implied DO scopes + // can be parts of initializers in derived type components. + return FindSymbol(scope.parent(), name); + } + } else if (inEquivalenceStmt_) { // In EQUIVALENCE statements only resolve names in the local scope, see // 19.5.1.4, paragraph 2, item (10) - return Resolve(name, - inEquivalenceStmt_ ? FindInScope(scope, name) - : scope.FindSymbol(name.source)); + return Resolve(name, FindInScope(scope, name)); + } else { + return Resolve(name, scope.FindSymbol(name.source)); } } @@ -8722,7 +8730,6 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) { return &name; } } - if (CheckForHostAssociatedImplicit(name)) { NotePossibleBadForwardRef(name); return &name; diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index eae22dc257fa7..5b7dab309eda7 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -117,9 +117,7 @@ static bool ReturnsDataPointer(const Symbol &symbol) { } static bool LoopConstructIsSIMD(parser::OpenMPLoopConstruct *ompLoop) { - auto &begin = std::get(ompLoop->t); - auto directive = std::get(begin.t).v; - return llvm::omp::allSimdSet.test(directive); + return llvm::omp::allSimdSet.test(ompLoop->BeginDir().DirName().v); } // Remove non-SIMD OpenMPConstructs once they are parsed. 
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index b8c3db8723964..bbaded36c62e3 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -1385,12 +1385,31 @@ CollectNonTbpDefinedIoGenericInterfaces( if (const DeclTypeSpec * declType{GetDefinedIoSpecificArgType(*specific)}) { const DerivedTypeSpec &derived{DEREF(declType->AsDerived())}; - if (const Symbol * - dtDesc{derived.scope() - ? derived.scope()->runtimeDerivedTypeDescription() + const Scope *derivedScope{derived.scope()}; + if (!declType->IsPolymorphic()) { + // A defined I/O subroutine with a monomorphic "dtv" dummy + // argument implies a non-extensible sequence or BIND(C) derived + // type. Such types may be defined more than once in the program + // so long as they are structurally equivalent. If the current + // scope has an equivalent type, use it for the table rather + // than the "dtv" argument's type. + if (const Symbol *inScope{scope.FindSymbol(derived.name())}) { + const Symbol &ultimate{inScope->GetUltimate()}; + DerivedTypeSpec localDerivedType{inScope->name(), ultimate}; + if (ultimate.has() && + evaluate::DynamicType{derived, /*isPolymorphic=*/false} + .IsTkCompatibleWith(evaluate::DynamicType{ + localDerivedType, /*iP=*/false})) { + derivedScope = ultimate.scope(); + } + } + } + if (const Symbol *dtDesc{derivedScope + ? 
derivedScope->runtimeDerivedTypeDescription() : nullptr}) { if (useRuntimeTypeInfoEntries && - &derived.scope()->parent() == &generic->owner()) { + derivedScope == derived.scope() && + &derivedScope->parent() == &generic->owner()) { // This non-TBP defined I/O generic was defined in the // same scope as the derived type, and it will be // included in the derived type's special bindings @@ -1454,7 +1473,8 @@ static const Symbol *FindSpecificDefinedIo(const Scope &scope, const Symbol &specific{*ref}; if (const DeclTypeSpec * thisType{GetDefinedIoSpecificArgType(specific)}) { - if (evaluate::DynamicType{DEREF(thisType->AsDerived()), true} + if (evaluate::DynamicType{ + DEREF(thisType->AsDerived()), thisType->IsPolymorphic()} .IsTkCompatibleWith(derived)) { return &specific.GetUltimate(); } diff --git a/flang/test/Driver/flang-dwarf-version.f90 b/flang/test/Driver/flang-dwarf-version.f90 new file mode 100644 index 0000000000000..dc69140a7eda1 --- /dev/null +++ b/flang/test/Driver/flang-dwarf-version.f90 @@ -0,0 +1,24 @@ +// RUN: %flang -### -S %s -g -gdwarf-5 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF5 %s +// RUN: %flang -### -S %s -gdwarf-5 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF5 %s +// RUN: %flang -### -S %s -g1 -gdwarf-5 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-WITH-G1-DWARF5 %s +// RUN: %flang -### -S %s -gdwarf-4 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF4 %s +// RUN: %flang -### -S %s -gdwarf-3 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF3 %s +// RUN: %flang -### -S %s -gdwarf-2 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF2 %s + +// CHECK-DWARF5: -debug-info-kind=standalone +// CHECK-DWARF5-SAME: -dwarf-version=5 + +// CHECK-WITH-G1-DWARF5: -debug-info-kind=line-tables-only +// CHECK-WITH-G1-DWARF5-SAME: -dwarf-version=5 + +// CHECK-DWARF4: -dwarf-version=4 + +// CHECK-DWARF3: -dwarf-version=3 + +// CHECK-DWARF2: -dwarf-version=2 diff --git a/flang/test/Driver/loop-fuse.f90 
b/flang/test/Driver/loop-fuse.f90 new file mode 100644 index 0000000000000..ddfd9065e0fd4 --- /dev/null +++ b/flang/test/Driver/loop-fuse.f90 @@ -0,0 +1,17 @@ +! RUN: %flang -### -S -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE %s +! RUN: %flang -### -S -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -O0 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -O1 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -Oz %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! CHECK-LOOP-FUSE: "-fexperimental-loop-fusion" +! CHECK-NO-LOOP-FUSE-NOT: "-fexperimental-loop-fusion" +! RUN: %flang_fc1 -emit-llvm -O2 -fexperimental-loop-fusion -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE-PASS %s +! RUN: %flang_fc1 -emit-llvm -O2 -fno-experimental-loop-fusion -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE-PASS %s +! CHECK-LOOP-FUSE-PASS: loop-fusion +! 
CHECK-NO-LOOP-FUSE-PASS-NOT: loop-fusion + +program test +end program diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index cd87bf8d28ed5..864368740be02 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -3,8 +3,8 @@ // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=i386-unknown-linux-gnu" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=powerpc64le-unknown-linux-gnu" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=x86_64-pc-win32" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC -// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=aarch64-apple-darwin" %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-COMDAT,GENERIC -// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=amdgcn-amd-amdhsa, datalayout=e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-P0" %s | FileCheck -check-prefixes=CHECK,AMDGPU %s +// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=aarch64-apple-darwin" %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-COMDAT,GENERIC +// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=amdgcn-amd-amdhsa, datalayout=e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" %s | FileCheck -check-prefixes=CHECK,AMDGPU %s //=================================================== // SUMMARY: Tests for FIR --> LLVM MLIR conversion @@ -17,7 +17,10 @@ fir.global @g_i0 : i32 { fir.has_value %1 : i32 } -// CHECK: llvm.mlir.global external @g_i0() {addr_space = 0 : i32} : i32 { +// CHECK: llvm.mlir.global external 
@g_i0() +// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: i32 { // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: llvm.return %[[C0]] : i32 // CHECK: } @@ -29,7 +32,10 @@ fir.global @g_ci5 constant : i32 { fir.has_value %c : i32 } -// CHECK: llvm.mlir.global external constant @g_ci5() {addr_space = 0 : i32} : i32 { +// CHECK: llvm.mlir.global external constant @g_ci5() +// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: i32 { // CHECK: %[[C5:.*]] = llvm.mlir.constant(5 : i32) : i32 // CHECK: llvm.return %[[C5]] : i32 // CHECK: } @@ -37,17 +43,26 @@ fir.global @g_ci5 constant : i32 { // ----- fir.global internal @i_i515 (515:i32) : i32 -// CHECK: llvm.mlir.global internal @i_i515(515 : i32) {addr_space = 0 : i32} : i32 +// CHECK: llvm.mlir.global internal @i_i515(515 : i32) +// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: : i32 // ----- fir.global common @C_i511 (0:i32) : i32 -// CHECK: llvm.mlir.global common @C_i511(0 : i32) {addr_space = 0 : i32} : i32 +// CHECK: llvm.mlir.global common @C_i511(0 : i32) +// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: : i32 // ----- fir.global weak @w_i86 (86:i32) : i32 -// CHECK: llvm.mlir.global weak @w_i86(86 : i32) {addr_space = 0 : i32} : i32 +// CHECK: llvm.mlir.global weak @w_i86(86 : i32) +// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: : i32 // ----- @@ -69,9 +84,13 @@ fir.global @symbol : i64 { fir.has_value %0 : i64 } -// CHECK: %{{.*}} = llvm.mlir.addressof @[[SYMBOL:.*]] : !llvm.ptr +// CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @[[SYMBOL:.*]] : !llvm.ptr +// AMDGPU: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<1> to !llvm.ptr -// CHECK: llvm.mlir.global external @[[SYMBOL]]() {addr_space = 0 : i32} : i64 { +// CHECK: llvm.mlir.global external @[[SYMBOL]]() 
+// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: i64 { // CHECK: %{{.*}} = llvm.mlir.constant(1 : i64) : i64 // CHECK: llvm.return %{{.*}} : i64 // CHECK: } @@ -88,7 +107,10 @@ fir.global internal @_QEmultiarray : !fir.array<32x32xi32> { fir.has_value %2 : !fir.array<32x32xi32> } -// CHECK: llvm.mlir.global internal @_QEmultiarray() {addr_space = 0 : i32} : !llvm.array<32 x array<32 x i32>> { +// CHECK: llvm.mlir.global internal @_QEmultiarray() +// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: : !llvm.array<32 x array<32 x i32>> { // CHECK: %[[CST:.*]] = llvm.mlir.constant(dense<1> : vector<32x32xi32>) : !llvm.array<32 x array<32 x i32>> // CHECK: llvm.return %[[CST]] : !llvm.array<32 x array<32 x i32>> // CHECK: } @@ -105,7 +127,10 @@ fir.global internal @_QEmultiarray : !fir.array<32xi32> { fir.has_value %2 : !fir.array<32xi32> } -// CHECK: llvm.mlir.global internal @_QEmultiarray() {addr_space = 0 : i32} : !llvm.array<32 x i32> { +// CHECK: llvm.mlir.global internal @_QEmultiarray() +// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: : !llvm.array<32 x i32> { // CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %{{.*}} = llvm.mlir.undef : !llvm.array<32 x i32> // CHECK: %{{.*}} = llvm.insertvalue %[[CST]], %{{.*}}[5] : !llvm.array<32 x i32> @@ -1801,7 +1826,9 @@ func.func @embox1(%arg0: !fir.ref>) { // CHECK: %{{.*}} = llvm.insertvalue %[[VERSION]], %{{.*}}[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)> // CHECK: %[[TYPE_CODE_I8:.*]] = llvm.trunc %[[TYPE_CODE]] : i32 to i8 // CHECK: %{{.*}} = llvm.insertvalue %[[TYPE_CODE_I8]], %{{.*}}[4] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)> -// CHECK: %[[TDESC:.*]] = llvm.mlir.addressof @_QMtest_dinitE.dt.tseq : !llvm.ptr +// GENERIC: %[[TDESC:.*]] = llvm.mlir.addressof 
@_QMtest_dinitE.dt.tseq : !llvm.ptr +// AMDGPU: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMtest_dinitE.dt.tseq : !llvm.ptr<1> +// AMDGPU: %[[TDESC:.*]] = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<1> to !llvm.ptr // CHECK: %{{.*}} = llvm.insertvalue %[[TDESC]], %{{.*}}[7] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)> // ----- @@ -2824,7 +2851,10 @@ func.func @coordinate_array_unknown_size_1d(%arg0: !fir.ptr> fir.global common @c_(dense<0> : vector<4294967296xi8>) : !fir.array<4294967296xi8> -// CHECK: llvm.mlir.global common @c_(dense<0> : vector<4294967296xi8>) {addr_space = 0 : i32} : !llvm.array<4294967296 x i8> +// CHECK: llvm.mlir.global common @c_(dense<0> : vector<4294967296xi8>) +// GENERIC-SAME: {addr_space = 0 : i32} +// AMDGPU-SAME: {addr_space = 1 : i32} +// CHECK-SAME: !llvm.array<4294967296 x i8> // ----- diff --git a/flang/test/Integration/debug-dwarf-flags.f90 b/flang/test/Integration/debug-dwarf-flags.f90 new file mode 100644 index 0000000000000..ac5b1c0d8d4b2 --- /dev/null +++ b/flang/test/Integration/debug-dwarf-flags.f90 @@ -0,0 +1,23 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone -dwarf-version=5 %s \ +! RUN: -o - | FileCheck --check-prefix=CHECK-DWARF5 %s +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only -dwarf-version=5 \ +! RUN: %s -o - | FileCheck --check-prefix=CHECK-DWARF5 %s +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone -dwarf-version=4 %s \ +! RUN: -o - | FileCheck --check-prefix=CHECK-DWARF4 %s +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone -dwarf-version=3 %s \ +! RUN: -o - | FileCheck --check-prefix=CHECK-DWARF3 %s +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone -dwarf-version=2 %s \ +! RUN: -o - | FileCheck --check-prefix=CHECK-DWARF2 %s +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o -\ +! RUN: | FileCheck --check-prefix=CHECK-WITHOUT-VERSION %s +! RUN: %flang_fc1 -emit-llvm -dwarf-version=5 %s -o - \ +! 
RUN: | FileCheck --check-prefix=CHECK-WITHOUT-VERSION %s + +program test +end program test + +! CHECK-DWARF5: !{i32 7, !"Dwarf Version", i32 5} +! CHECK-DWARF4: !{i32 7, !"Dwarf Version", i32 4} +! CHECK-DWARF3: !{i32 7, !"Dwarf Version", i32 3} +! CHECK-DWARF2: !{i32 7, !"Dwarf Version", i32 2} +! CHECK-WITHOUT-VERSION-NOT: "Dwarf Version" diff --git a/flang/test/Lower/Coarray/sync_all.f90 b/flang/test/Lower/Coarray/sync_all.f90 new file mode 100644 index 0000000000000..c2c12d8cdf237 --- /dev/null +++ b/flang/test/Lower/Coarray/sync_all.f90 @@ -0,0 +1,37 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_sync_all + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer sync_status + character(len=128) :: error_message + + ! COARRAY: %[[VAL_3:.*]] = fir.absent !fir.ref + ! COARRAY: %[[VAL_4:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_5:.*]] = fir.absent !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_all(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) fastmath : (!fir.ref, !fir.box>, !fir.box>) -> () + sync all + + ! COARRAY: %[[VAL_6:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_7:.*]] = fir.absent !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_all(%[[STAT]]#0, %[[VAL_6]], %[[VAL_7]]) fastmath : (!fir.ref, !fir.box>, !fir.box>) -> () + sync all(stat=sync_status) + + ! COARRAY: %[[VAL_8:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> + ! COARRAY: %[[VAL_9:.*]] = fir.absent !fir.ref + ! 
COARRAY: %[[VAL_10:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_11:.*]] = fir.convert %[[VAL_8]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_all(%[[VAL_9]], %[[VAL_11]], %[[VAL_10]]) fastmath : (!fir.ref, !fir.box>, !fir.box>) -> () + sync all( errmsg=error_message) + + ! COARRAY: %[[VAL_12:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> + ! COARRAY: %[[VAL_13:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_14:.*]] = fir.convert %[[VAL_12]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_all(%[[STAT]]#0, %[[VAL_14]], %[[VAL_13]]) fastmath : (!fir.ref, !fir.box>, !fir.box>) -> () + sync all(stat=sync_status, errmsg=error_message) + +end program test_sync_all diff --git a/flang/test/Lower/Coarray/sync_images.f90 b/flang/test/Lower/Coarray/sync_images.f90 new file mode 100644 index 0000000000000..0224bf235c36c --- /dev/null +++ b/flang/test/Lower/Coarray/sync_images.f90 @@ -0,0 +1,62 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_sync_images + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + ! COARRAY: %[[ME:.*]]:2 = hlfir.declare %[[VAL_3:.*]] {uniq_name = "_QFEme"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer sync_status, me + character(len=128) :: error_message + + ! COARRAY: %[[VAL_1:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> + ! COARRAY: %[[VAL_2:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_3:.*]] = fir.absent !fir.box> + ! 
COARRAY: %[[VAL_4:.*]] = fir.convert %[[VAL_1]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_images(%[[VAL_2]], %[[STAT]]#0, %[[VAL_4]], %[[VAL_3]]) fastmath : (!fir.box>, !fir.ref, !fir.box>, !fir.box>) -> () + sync images(*, stat=sync_status, errmsg=error_message) + + ! COARRAY: %[[VAL_5:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> + ! COARRAY: %[[VAL_6:.*]] = fir.embox %[[ME]]#0 : (!fir.ref) -> !fir.box + ! COARRAY: %[[VAL_7:.*]] = fir.rebox %[[VAL_6]](%[[SHAPE:.*]]) : (!fir.box, !fir.shape<1>) -> !fir.box> + ! COARRAY: %[[VAL_8:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_9:.*]] = fir.convert %[[VAL_7]] : (!fir.box>) -> !fir.box> + ! COARRAY: %[[VAL_10:.*]] = fir.convert %[[VAL_5]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_images(%[[VAL_9]], %[[STAT]]#0, %[[VAL_10]], %[[VAL_8]]) fastmath : (!fir.box>, !fir.ref, !fir.box>, !fir.box>) -> () + sync images(me, stat=sync_status, errmsg=error_message) + + ! COARRAY: %[[VAL_11:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> + ! COARRAY: %[[VAL_12:.*]] = fir.embox %[[IMG_SET:.*]]#0(%[[SHAPE_1:.*]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> + ! COARRAY: %[[VAL_13:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_14:.*]] = fir.convert %[[VAL_12]] : (!fir.box>) -> !fir.box> + ! COARRAY: %[[VAL_15:.*]] = fir.convert %[[VAL_11]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_images(%[[VAL_14]], %[[STAT]]#0, %[[VAL_15]], %[[VAL_13]]) fastmath : (!fir.box>, !fir.ref, !fir.box>, !fir.box>) -> () + sync images([1], stat=sync_status, errmsg=error_message) + + ! COARRAY: %[[VAL_17:.*]] = fir.absent !fir.ref + ! COARRAY: %[[VAL_18:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_19:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_20:.*]] = fir.absent !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_images(%[[VAL_19]], %[[VAL_17]], %[[VAL_18]], %[[VAL_20]]) fastmath : (!fir.box>, !fir.ref, !fir.box>, !fir.box>) -> () + sync images(*) + + ! 
COARRAY: %[[VAL_23:.*]] = fir.absent !fir.ref + ! COARRAY: %[[VAL_24:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_21:.*]] = fir.embox %[[ME]]#0 : (!fir.ref) -> !fir.box + ! COARRAY: %[[VAL_22:.*]] = fir.rebox %[[VAL_21]](%[[SHAPE_2:.*]]) : (!fir.box, !fir.shape<1>) -> !fir.box> + ! COARRAY: %[[VAL_25:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_26:.*]] = fir.convert %[[VAL_22]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_images(%[[VAL_26]], %[[VAL_23]], %[[VAL_24]], %[[VAL_25]]) fastmath : (!fir.box>, !fir.ref, !fir.box>, !fir.box>) -> () + sync images(me) + + ! COARRAY: %[[VAL_28:.*]] = fir.absent !fir.ref + ! COARRAY: %[[VAL_29:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_27:.*]] = fir.embox %[[IMG_SET:.*]]#0(%[[SHAPE_3:.*]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> + ! COARRAY: %[[VAL_30:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_31:.*]] = fir.convert %[[VAL_27]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_images(%[[VAL_31]], %[[VAL_28]], %[[VAL_29]], %[[VAL_30]]) fastmath : (!fir.box>, !fir.ref, !fir.box>, !fir.box>) -> () + sync images([1]) + +end program test_sync_images diff --git a/flang/test/Lower/Coarray/sync_memory.f90 b/flang/test/Lower/Coarray/sync_memory.f90 new file mode 100644 index 0000000000000..773cb6fe4efb7 --- /dev/null +++ b/flang/test/Lower/Coarray/sync_memory.f90 @@ -0,0 +1,37 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_sync_memory + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + ! 
COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer sync_status + character(len=128) :: error_message + + ! COARRAY: %[[VAL_3:.*]] = fir.absent !fir.ref + ! COARRAY: %[[VAL_4:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_5:.*]] = fir.absent !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_memory(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) fastmath : (!fir.ref, !fir.box>, !fir.box>) -> () + sync memory + + ! COARRAY: %[[VAL_6:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_7:.*]] = fir.absent !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_memory(%[[STAT]]#0, %[[VAL_6]], %[[VAL_7]]) fastmath : (!fir.ref, !fir.box>, !fir.box>) -> () + sync memory(stat=sync_status) + + ! COARRAY: %[[VAL_8:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> + ! COARRAY: %[[VAL_9:.*]] = fir.absent !fir.ref + ! COARRAY: %[[VAL_10:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_11:.*]] = fir.convert %[[VAL_8]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_memory(%[[VAL_9]], %[[VAL_11]], %[[VAL_10]]) fastmath : (!fir.ref, !fir.box>, !fir.box>) -> () + sync memory( errmsg=error_message) + + ! COARRAY: %[[VAL_12:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> + ! COARRAY: %[[VAL_13:.*]] = fir.absent !fir.box> + ! COARRAY: %[[VAL_14:.*]] = fir.convert %[[VAL_12]] : (!fir.box>) -> !fir.box> + ! COARRAY: fir.call @_QMprifPprif_sync_memory(%[[STAT]]#0, %[[VAL_14]], %[[VAL_13]]) fastmath : (!fir.ref, !fir.box>, !fir.box>) -> () + sync memory(stat=sync_status, errmsg=error_message) + +end program test_sync_memory diff --git a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 index cd0398c1850fb..efe9e6dd190c0 100644 --- a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 +++ b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 @@ -246,7 +246,7 @@ end subroutine char_explicit_shape_array ! 
CHECK: fir.store %[[VAL_30]] to %[[VAL_2]] : !fir.ref>>>> ! CHECK: fir.call @_QPchar_explicit_shape_array_assumed_len_callee(%[[VAL_2]]) fastmath : (!fir.ref>>>>) -> () ! CHECK: %[[VAL_31:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_32:.*]] = fir.embox %[[VAL_16]]#1(%[[VAL_31]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.class>> +! CHECK: %[[VAL_32:.*]] = fir.embox %[[VAL_16]]#1(%[[VAL_31]]) typeparams %[[VAL_12]]#1 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.class>> ! CHECK: fir.store %[[VAL_32]] to %[[VAL_1]] : !fir.ref>>> ! CHECK: fir.call @_QPchar_explicit_shape_array_uclass_callee(%[[VAL_1]]) fastmath : (!fir.ref>>>) -> () ! CHECK: return diff --git a/flang/test/Lower/HLFIR/eoshift.f90 b/flang/test/Lower/HLFIR/eoshift.f90 index e7fb98c2b0401..8d541779a2569 100644 --- a/flang/test/Lower/HLFIR/eoshift.f90 +++ b/flang/test/Lower/HLFIR/eoshift.f90 @@ -4,6 +4,8 @@ module eoshift_types type t end type t + type, extends(t) :: t2 + end type t2 end module eoshift_types ! 1d shift by scalar @@ -269,3 +271,12 @@ subroutine eoshift14(array) ! CHECK-DAG: %[[VAL_3]] = arith.constant 1 : i32 ! CHECK: %[[VAL_5:.*]] = hlfir.eoshift{{.*}}boundary %[[VAL_4]] : (!fir.box>, i32, ui32) -> !hlfir.expr end subroutine eoshift14 + +! CHECK-LABEL: func.func @_QPeoshift15( +subroutine eoshift15(array, boundary) + use eoshift_types + class(t), allocatable :: array(:,:) + type(t) :: boundary(:) + array = eoshift(array, shift=1, boundary=boundary) +! CHECK: hlfir.eoshift %{{.*}} %{{.*}} boundary %{{.*}}#0 : (!fir.class>>>, i32, !fir.box>>) -> !hlfir.expr?> +end subroutine eoshift15 diff --git a/flang/test/Lower/Intrinsics/dsecnds.f90 b/flang/test/Lower/Intrinsics/dsecnds.f90 new file mode 100644 index 0000000000000..03814ff60bd80 --- /dev/null +++ b/flang/test/Lower/Intrinsics/dsecnds.f90 @@ -0,0 +1,33 @@ +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s + +! CHECK-LABEL: func.func @_QPuse_dsecnds( +! 
CHECK-SAME: %[[arg0:.*]]: !fir.ref +function use_dsecnds(refTime) result(elapsed) + double precision :: refTime, elapsed + elapsed = dsecnds(refTime) +end function + +! The argument is lowered with hlfir.declare, which returns two results. +! Capture it here to check that the correct SSA value (%...#0) +! is passed to the runtime call later +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[arg0]] dummy_scope + +! The file name and source line are also lowered and passed as runtime arguments +! Capture the constant line number and convert the file name to i8*. +! CHECK: %[[STRADDR:.*]] = fir.address_of( +! CHECK: %[[LINE:.*]] = arith.constant {{.*}} : i32 +! CHECK: %[[FNAME8:.*]] = fir.convert %[[STRADDR]] : (!fir.ref>) -> !fir.ref + +! Verify the runtime call is made with: +! - the declared refTime value (%[[DECL]]#0) +! - the converted filename +! - the source line constant +! CHECK: %[[CALL:.*]] = fir.call @_FortranADsecnds(%[[DECL]]#0, %[[FNAME8]], %[[LINE]]) {{.*}} : (!fir.ref, !fir.ref, i32) -> f64 + +! Ensure there is no illegal conversion of a value result into a reference +! CHECK-NOT: fir.convert {{.*}} : (f64) -> !fir.ref + +! Confirm the function result is returned as a plain f64 +! CHECK: return {{.*}} : f64 + + diff --git a/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90 index 6869af863644d..4b181f8a26987 100644 --- a/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90 @@ -240,23 +240,23 @@ subroutine acc_declare_allocate() allocate(a(100)) ! CHECK: %{{.*}} = fir.allocmem !fir.array, %{{.*}} {fir.must_be_heap = true, uniq_name = "_QMacc_declareFacc_declare_allocateEa.alloc"} -! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> deallocate(a) -! 
CHECK: %{{.*}} = fir.box_addr %{{.*}} {acc.declare_action = #acc.declare_action} : (!fir.box>>) -> !fir.heap> +! CHECK: %{{.*}} = fir.box_addr %{{.*}} {acc.declare_action = #acc.declare_action} : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %{{.*}} : !fir.heap> -! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> ! CHECK: fir.if ! CHECK: fir.freemem %{{.*}} : !fir.heap> -! CHECK: fir.store %{{.*}} to %{{.*}}#0 {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: fir.store %{{.*}} to %{{.*}}#0 {acc.declare_action = #acc.declare_action} : !fir.ref>>> ! CHECK: } end subroutine -! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_alloc( +! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_post_alloc( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>>) { ! CHECK: %[[UPDATE:.*]] = acc.update_device varPtr(%[[ARG0]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "a_desc", structured = false} ! CHECK: acc.update dataOperands(%[[UPDATE]] : !fir.ref>>>) @@ -267,7 +267,7 @@ subroutine acc_declare_allocate() ! CHECK: return ! CHECK: } -! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_pre_dealloc( +! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_pre_dealloc( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>>) { ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0]] : !fir.ref>>> ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[LOAD]] {acc.declare = #acc.declare} : (!fir.box>>) -> !fir.heap> @@ -277,7 +277,7 @@ subroutine acc_declare_allocate() ! CHECK: return ! CHECK: } -! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_dealloc( +! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_post_dealloc( ! 
CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>>) { ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0]] : !fir.ref>>> ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[LOAD]] : (!fir.box>>) -> !fir.heap> @@ -337,8 +337,8 @@ subroutine acc_declare_allocate_with_stat() end subroutine ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_allocate_with_stat() -! CHECK: fir.call @_FortranAPointerAllocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} {acc.declare_action = #acc.declare_action} -! CHECK: fir.call @_FortranAPointerDeallocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} {acc.declare_action = #acc.declare_action} +! CHECK: fir.call @_FortranAPointerAllocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} {acc.declare_action = #acc.declare_action} +! CHECK: fir.call @_FortranAPointerDeallocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} {acc.declare_action = #acc.declare_action} end module module acc_declare_allocatable_test @@ -353,7 +353,7 @@ module acc_declare_allocatable_test ! CHECK: acc.terminator ! CHECK: } -! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_update_desc_post_alloc() { +! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_post_alloc() { ! CHECK: %[[GLOBAL_ADDR:.*]] = fir.address_of(@_QMacc_declare_allocatable_testEdata1) : !fir.ref>>> ! CHECK: %[[UPDATE:.*]] = acc.update_device varPtr(%[[GLOBAL_ADDR]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "data1_desc", structured = false} ! CHECK: acc.update dataOperands(%[[UPDATE]] : !fir.ref>>>) @@ -364,7 +364,7 @@ module acc_declare_allocatable_test ! CHECK: return ! CHECK: } -! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_update_desc_pre_dealloc() { +! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_pre_dealloc() { ! CHECK: %[[GLOBAL_ADDR:.*]] = fir.address_of(@_QMacc_declare_allocatable_testEdata1) : !fir.ref>>> ! 
CHECK: %[[LOAD:.*]] = fir.load %[[GLOBAL_ADDR]] : !fir.ref>>> ! CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[LOAD]] {acc.declare = #acc.declare} : (!fir.box>>) -> !fir.heap> @@ -374,7 +374,7 @@ module acc_declare_allocatable_test ! CHECK: return ! CHECK: } -! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_update_desc_post_dealloc() { +! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_post_dealloc() { ! CHECK: %[[GLOBAL_ADDR:.*]] = fir.address_of(@_QMacc_declare_allocatable_testEdata1) : !fir.ref>>> ! CHECK: %[[UPDATE:.*]] = acc.update_device varPtr(%[[GLOBAL_ADDR]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "data1_desc", structured = false} ! CHECK: acc.update dataOperands(%[[UPDATE]] : !fir.ref>>>) @@ -440,14 +440,14 @@ module acc_declare_allocatable_test2 subroutine init() use acc_declare_allocatable_test allocate(data1(100)) -! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> end subroutine subroutine finalize() use acc_declare_allocatable_test deallocate(data1) -! CHECK: %{{.*}} = fir.box_addr %{{.*}} {acc.declare_action = #acc.declare_action} : (!fir.box>>) -> !fir.heap> -! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: %{{.*}} = fir.box_addr %{{.*}} {acc.declare_action = #acc.declare_action} : (!fir.box>>) -> !fir.heap> +! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> end subroutine end module @@ -473,6 +473,6 @@ subroutine init() end module ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit() -! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 +! 
CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: fir.if -! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90 index 4d95ffa10edaf..edae0e6a4d37e 100644 --- a/flang/test/Lower/OpenACC/acc-declare.f90 +++ b/flang/test/Lower/OpenACC/acc-declare.f90 @@ -232,33 +232,33 @@ subroutine acc_declare_allocate() allocate(a(100)) ! CHECK: %{{.*}} = fir.allocmem !fir.array, %{{.*}} {fir.must_be_heap = true, uniq_name = "_QMacc_declareFacc_declare_allocateEa.alloc"} -! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> deallocate(a) -! CHECK: %{{.*}} = fir.box_addr %{{.*}} {acc.declare_action = #acc.declare_action} : (!fir.box>>) -> !fir.heap> +! CHECK: %{{.*}} = fir.box_addr %{{.*}} {acc.declare_action = #acc.declare_action} : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %{{.*}} : !fir.heap> -! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> ! CHECK: fir.if ! CHECK: fir.freemem %{{.*}} : !fir.heap> -! CHECK: fir.store %{{.*}} to %{{.*}}#0 {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: fir.store %{{.*}} to %{{.*}}#0 {acc.declare_action = #acc.declare_action} : !fir.ref>>> ! CHECK: } end subroutine -! 
CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_alloc( +! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_post_alloc( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>>) { -! CHECK: %[[UPDATE:.*]] = acc.update_device varPtr(%[[ARG0]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "a", structured = false} -! CHECK: acc.update dataOperands(%[[UPDATE]] : !fir.ref>>>) +! CHECK: %[[CREATE_DESC:.*]] = acc.create varPtr(%[[ARG0]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "a", structured = false} +! CHECK: acc.declare_enter dataOperands(%[[CREATE_DESC]] : !fir.ref>>>) ! CHECK: return ! CHECK: } -! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_dealloc( +! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_post_dealloc( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>>) { -! CHECK: %[[UPDATE:.*]] = acc.update_device varPtr(%[[ARG0]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "a", structured = false} -! CHECK: acc.update dataOperands(%[[UPDATE]] : !fir.ref>>>) +! CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[ARG0]] : !fir.ref>>>) -> !fir.ref>>> {dataClause = #acc, implicit = true, name = "a", structured = false} +! CHECK: acc.declare_exit dataOperands(%[[DEVPTR]] : !fir.ref>>>) ! CHECK: return ! CHECK: } @@ -312,8 +312,8 @@ subroutine acc_declare_allocate_with_stat() end subroutine ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_allocate_with_stat() -! CHECK: fir.call @_FortranAPointerAllocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} {acc.declare_action = #acc.declare_action} -! CHECK: fir.call @_FortranAPointerDeallocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} {acc.declare_action = #acc.declare_action} +! CHECK: fir.call @_FortranAPointerAllocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} {acc.declare_action = #acc.declare_action} +! 
CHECK: fir.call @_FortranAPointerDeallocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} {acc.declare_action = #acc.declare_action} end module module acc_declare_allocatable_test @@ -328,17 +328,17 @@ module acc_declare_allocatable_test ! CHECK: acc.terminator ! CHECK: } -! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_update_desc_post_alloc() { +! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_post_alloc() { ! CHECK: %[[GLOBAL_ADDR:.*]] = fir.address_of(@_QMacc_declare_allocatable_testEdata1) : !fir.ref>>> -! CHECK: %[[UPDATE:.*]] = acc.update_device varPtr(%[[GLOBAL_ADDR]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "data1", structured = false} -! CHECK: acc.update dataOperands(%[[UPDATE]] : !fir.ref>>>) +! CHECK: %[[CREATE_DESC:.*]] = acc.create varPtr(%[[GLOBAL_ADDR]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "data1", structured = false} +! CHECK: acc.declare_enter dataOperands(%[[CREATE_DESC]] : !fir.ref>>>) ! CHECK: return ! CHECK: } -! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_update_desc_post_dealloc() { +! CHECK-LABEL: func.func private @_QMacc_declare_allocatable_testEdata1_acc_declare_post_dealloc() { ! CHECK: %[[GLOBAL_ADDR:.*]] = fir.address_of(@_QMacc_declare_allocatable_testEdata1) : !fir.ref>>> -! CHECK: %[[UPDATE:.*]] = acc.update_device varPtr(%[[GLOBAL_ADDR]] : !fir.ref>>>) -> !fir.ref>>> {implicit = true, name = "data1", structured = false} -! CHECK: acc.update dataOperands(%[[UPDATE]] : !fir.ref>>>) +! CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[GLOBAL_ADDR]] : !fir.ref>>>) -> !fir.ref>>> {dataClause = #acc, implicit = true, name = "data1", structured = false} +! CHECK: acc.declare_exit dataOperands(%[[DEVPTR]] : !fir.ref>>>) ! CHECK: return ! CHECK: } @@ -401,14 +401,14 @@ module acc_declare_allocatable_test2 subroutine init() use acc_declare_allocatable_test allocate(data1(100)) -! 
CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> end subroutine subroutine finalize() use acc_declare_allocatable_test deallocate(data1) -! CHECK: %{{.*}} = fir.box_addr %{{.*}} {acc.declare_action = #acc.declare_action} : (!fir.box>>) -> !fir.heap> -! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> +! CHECK: %{{.*}} = fir.box_addr %{{.*}} {acc.declare_action = #acc.declare_action} : (!fir.box>>) -> !fir.heap> +! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action} : !fir.ref>>> end subroutine end module @@ -434,6 +434,6 @@ subroutine init() end module ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit() -! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: fir.if -! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived-allocatable-component.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived-allocatable-component.f90 new file mode 100644 index 0000000000000..b07ca21a43aa7 --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived-allocatable-component.f90 @@ -0,0 +1,67 @@ +! Test lowering of firstprivate on derived type with allocatable components. +! 
The runtime is called to handle the deep copy of the allocatable components.
CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derived_alloc_compTpoint -> %[[VAL_6]] : !fir.ref>>}>>) { +! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_10:.*]] = acc.private varPtr(%[[VAL_3]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QMm_firstprivate_derived_alloc_compFtestEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_10]] : !fir.ref) control(%[[VAL_12:.*]] : i32) = (%[[VAL_7]] : i32) to (%[[VAL_8]] : i32) step (%[[VAL_9]] : i32) { +! CHECK: fir.store %[[VAL_12]] to %[[VAL_11]]#0 : !fir.ref +! CHECK: %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_1]]#0{"x"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>>) -> !fir.ref>>> +! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref>>> +! CHECK: %[[VAL_16:.*]] = arith.constant 10 : index +! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_15]] (%[[VAL_16]]) : (!fir.box>>, index) -> !fir.ref +! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_17]] : f32, !fir.ref +! CHECK: acc.yield +! CHECK: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} +! CHECK: acc.yield +! CHECK: } +! CHECK: return +! CHECK: } + + +! FIR-CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_rec__QMm_firstprivate_derived_alloc_compTpoint : !fir.ref>>}>> init { +! FIR-CHECK: } copy { +! FIR-CHECK: fir.call @_FortranAAssignTemporary( +! FIR-CHECK: acc.terminator +! 
FIR-CHECK: } diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 new file mode 100644 index 0000000000000..8973b4b0085c2 --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 @@ -0,0 +1,76 @@ +! Test lowering of firstprivate on derived type with pointer components. +! No deep copy must be done. + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s +! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s --check-prefix=FIR-CHECK + +module m_firstprivate_derived_ptr_comp + type point + real, pointer :: x(:) + end type point + contains + subroutine test(a) + type(point) :: a + + !$acc parallel loop firstprivate(a) + do i = 1, n + a%x(10) = 1 + enddo + end + end module + +! CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_rec__QMm_firstprivate_derived_ptr_compTpoint : !fir.ref>>}>> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>}>>): +! CHECK: acc.yield %[[VAL_0]] : !fir.ref>>}>> +! +! CHECK: } copy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>}>>, %[[VAL_1:.*]]: !fir.ref>>}>>): +! CHECK: hlfir.assign %[[VAL_0]] to %[[VAL_1]] temporary_lhs : !fir.ref>>}>>, !fir.ref>>}>> +! CHECK: acc.terminator +! CHECK: } +! +! CHECK-LABEL: func.func @_QMm_firstprivate_derived_ptr_compPtest( +! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>}>> {fir.bindc_name = "a"}) { +! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref>>}>>, !fir.dscope) -> (!fir.ref>>}>>, !fir.ref>>}>>) +! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! 
CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEn"} +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_6:.*]] = acc.firstprivate varPtr(%[[VAL_1]]#0 : !fir.ref>>}>>) -> !fir.ref>>}>> {name = "a"} +! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derived_ptr_compTpoint -> %[[VAL_6]] : !fir.ref>>}>>) { +! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_10:.*]] = acc.private varPtr(%[[VAL_3]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_10]] : !fir.ref) control(%[[VAL_12:.*]] : i32) = (%[[VAL_7]] : i32) to (%[[VAL_8]] : i32) step (%[[VAL_9]] : i32) { +! CHECK: fir.store %[[VAL_12]] to %[[VAL_11]]#0 : !fir.ref +! CHECK: %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_1]]#0{"x"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>>) -> !fir.ref>>> +! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref>>> +! CHECK: %[[VAL_16:.*]] = arith.constant 10 : index +! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_15]] (%[[VAL_16]]) : (!fir.box>>, index) -> !fir.ref +! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_17]] : f32, !fir.ref +! CHECK: acc.yield +! CHECK: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} +! CHECK: acc.yield +! CHECK: } +! CHECK: return +! CHECK: } + + +! FIR-CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_rec__QMm_firstprivate_derived_ptr_compTpoint : !fir.ref>>}>> init { +! 
! Test lowering of firstprivate on derived type with user defined assignments.
! The user defined assignments should not be called when making firstprivate
CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_rec__QMm_firstprivate_derived_user_defTpoint : !fir.ref> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>): +! CHECK: acc.yield %[[VAL_0]] : !fir.ref> +! +! CHECK: } copy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>, %[[VAL_1:.*]]: !fir.ref>): +! CHECK: hlfir.assign %[[VAL_0]] to %[[VAL_1]] temporary_lhs : !fir.ref>, !fir.ref> +! CHECK: acc.terminator +! CHECK: } +! +! CHECK-LABEL: func.func @_QMm_firstprivate_derived_user_defPtest() { +! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}> {bindc_name = "a", uniq_name = "_QMm_firstprivate_derived_user_defFtestEa"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMm_firstprivate_derived_user_defFtestEa"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMm_firstprivate_derived_user_defFtestEi"} +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QMm_firstprivate_derived_user_defFtestEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QMm_firstprivate_derived_user_defFtestEn"} +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QMm_firstprivate_derived_user_defFtestEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_7:.*]] = acc.firstprivate varPtr(%[[VAL_2]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derived_user_defTpoint -> %[[VAL_7]] : !fir.ref>) { +! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref +! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_11:.*]] = acc.private varPtr(%[[VAL_4]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} +! 
CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QMm_firstprivate_derived_user_defFtestEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_11]] : !fir.ref) control(%[[VAL_13:.*]] : i32) = (%[[VAL_8]] : i32) to (%[[VAL_9]] : i32) step (%[[VAL_10]] : i32) { +! CHECK: fir.store %[[VAL_13]] to %[[VAL_12]]#0 : !fir.ref +! CHECK: %[[VAL_14:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[VAL_15:.*]] = hlfir.designate %[[VAL_2]]#0{"x"} : (!fir.ref>) -> !fir.ref +! CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_15]] : f32, !fir.ref +! CHECK: acc.yield +! CHECK: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} +! CHECK: acc.yield +! CHECK: } +! CHECK: return +! CHECK: } + + +! FIR-CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_rec__QMm_firstprivate_derived_user_defTpoint : !fir.ref> init { +! FIR-CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>): +! FIR-CHECK: acc.yield %[[VAL_0]] : !fir.ref> +! FIR- +! FIR-CHECK: } copy { +! FIR-CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>, %[[VAL_1:.*]]: !fir.ref>): +! FIR-CHECK: %[[VAL_2:.*]] = fir.field_index x, !fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_0]], x : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_4:.*]] = fir.field_index x, !fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_1]], x : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]] : !fir.ref +! FIR-CHECK: fir.store %[[VAL_6]] to %[[VAL_5]] : !fir.ref +! FIR-CHECK: %[[VAL_7:.*]] = fir.field_index y, !fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_8:.*]] = fir.coordinate_of %[[VAL_0]], y : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_9:.*]] = fir.field_index y, !fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}> +! 
FIR-CHECK: %[[VAL_10:.*]] = fir.coordinate_of %[[VAL_1]], y : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_8]] : !fir.ref +! FIR-CHECK: fir.store %[[VAL_11]] to %[[VAL_10]] : !fir.ref +! FIR-CHECK: %[[VAL_12:.*]] = fir.field_index z, !fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_13:.*]] = fir.coordinate_of %[[VAL_0]], z : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_14:.*]] = fir.field_index z, !fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_1]], z : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_13]] : !fir.ref +! FIR-CHECK: fir.store %[[VAL_16]] to %[[VAL_15]] : !fir.ref +! FIR-CHECK: acc.terminator +! FIR-CHECK: } diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived.f90 new file mode 100644 index 0000000000000..b4580355159ea --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived.f90 @@ -0,0 +1,85 @@ +! Test lowering of firstprivate on simple derived types (no allocatable/pointer +! components, no user defined procedures). + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s +! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s --check-prefix=FIR-CHECK + +module m_firstprivate_derived + type point + real :: x, y, z + end type point + contains + subroutine test() + type(point) :: a + + !$acc parallel loop firstprivate(a) + do i = 1, n + a%x = 1 + enddo + end + end module + +! CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_rec__QMm_firstprivate_derivedTpoint : !fir.ref> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>): +! CHECK: acc.yield %[[VAL_0]] : !fir.ref> +! +! CHECK: } copy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>, %[[VAL_1:.*]]: !fir.ref>): +! CHECK: hlfir.assign %[[VAL_0]] to %[[VAL_1]] temporary_lhs : !fir.ref>, !fir.ref> +! CHECK: acc.terminator +! CHECK: } +! +! 
CHECK-LABEL: func.func @_QMm_firstprivate_derivedPtest() { +! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}> {bindc_name = "a", uniq_name = "_QMm_firstprivate_derivedFtestEa"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMm_firstprivate_derivedFtestEa"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMm_firstprivate_derivedFtestEi"} +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QMm_firstprivate_derivedFtestEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QMm_firstprivate_derivedFtestEn"} +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QMm_firstprivate_derivedFtestEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_7:.*]] = acc.firstprivate varPtr(%[[VAL_2]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derivedTpoint -> %[[VAL_7]] : !fir.ref>) { +! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref +! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_11:.*]] = acc.private varPtr(%[[VAL_4]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QMm_firstprivate_derivedFtestEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_11]] : !fir.ref) control(%[[VAL_13:.*]] : i32) = (%[[VAL_8]] : i32) to (%[[VAL_9]] : i32) step (%[[VAL_10]] : i32) { +! CHECK: fir.store %[[VAL_13]] to %[[VAL_12]]#0 : !fir.ref +! CHECK: %[[VAL_14:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[VAL_15:.*]] = hlfir.designate %[[VAL_2]]#0{"x"} : (!fir.ref>) -> !fir.ref +! 
CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_15]] : f32, !fir.ref +! CHECK: acc.yield +! CHECK: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} +! CHECK: acc.yield +! CHECK: } +! CHECK: return +! CHECK: } + + +! FIR-CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_rec__QMm_firstprivate_derivedTpoint : !fir.ref> init { +! FIR-CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>): +! FIR-CHECK: acc.yield %[[VAL_0]] : !fir.ref> +! FIR- +! FIR-CHECK: } copy { +! FIR-CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>, %[[VAL_1:.*]]: !fir.ref>): +! FIR-CHECK: %[[VAL_2:.*]] = fir.field_index x, !fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_0]], x : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_4:.*]] = fir.field_index x, !fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_1]], x : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]] : !fir.ref +! FIR-CHECK: fir.store %[[VAL_6]] to %[[VAL_5]] : !fir.ref +! FIR-CHECK: %[[VAL_7:.*]] = fir.field_index y, !fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_8:.*]] = fir.coordinate_of %[[VAL_0]], y : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_9:.*]] = fir.field_index y, !fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_10:.*]] = fir.coordinate_of %[[VAL_1]], y : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_8]] : !fir.ref +! FIR-CHECK: fir.store %[[VAL_11]] to %[[VAL_10]] : !fir.ref +! FIR-CHECK: %[[VAL_12:.*]] = fir.field_index z, !fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}> +! FIR-CHECK: %[[VAL_13:.*]] = fir.coordinate_of %[[VAL_0]], z : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_14:.*]] = fir.field_index z, !fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}> +! 
FIR-CHECK: %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_1]], z : (!fir.ref>) -> !fir.ref +! FIR-CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_13]] : !fir.ref +! FIR-CHECK: fir.store %[[VAL_16]] to %[[VAL_15]] : !fir.ref +! FIR-CHECK: acc.terminator +! FIR-CHECK: } diff --git a/flang/test/Lower/OpenMP/lastprivate-alloc-scope.f90 b/flang/test/Lower/OpenMP/lastprivate-alloc-scope.f90 new file mode 100644 index 0000000000000..67d885ed5fb7a --- /dev/null +++ b/flang/test/Lower/OpenMP/lastprivate-alloc-scope.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s + +program p + type y3; integer, allocatable :: x; end type + type(y3) :: v + integer :: s, n, i + s = 1; n = 10 + allocate(v%x); v%x = 0 +!$omp parallel + if (.not. allocated(v%x)) print *, '101', allocated(v%x) +!$omp do schedule(dynamic) lastprivate(v) + do i = s, n + v%x = i + end do +!$omp end do +!$omp end parallel +end program + +! CHECK: omp.parallel { +! CHECK-NOT: private( +! CHECK: omp.wsloop +! CHECK-SAME: private( diff --git a/flang/test/Lower/box-address.f90 b/flang/test/Lower/box-address.f90 new file mode 100644 index 0000000000000..04f14188a7bec --- /dev/null +++ b/flang/test/Lower/box-address.f90 @@ -0,0 +1,34 @@ +! RUN: flang -fc1 -emit-hlfir %s -o - | FileCheck %s + +module m3 + type x1 + integer::ix1 + end type x1 + type,extends(x1)::x2 + end type x2 + type,extends(x2)::x3 + end type x3 + class(x1),pointer,dimension(:)::cy1 +contains + subroutine dummy() + entry chk(c1) + class(x1),dimension(3)::c1 + end subroutine dummy +end module m3 +! CHECK-LABEL: func.func @_QMm3Pchk( +! CHECK-SAME: %[[ARG0:.*]]: !fir.class>> {fir.bindc_name = "c1"}) { +! CHECK: %[[DUMMY_SCOPE:.*]] = fir.dummy_scope : !fir.dscope +! 
CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE]] {uniq_name = "_QMm3FdummyEc1"} : (!fir.class>>, !fir.dscope) -> (!fir.class>>, !fir.class>>) + +subroutine s1 + use m3 + type(x1),target::ty1(3) + ty1%ix1=[1,2,3] + cy1=>ty1 + call chk(cy1) +end subroutine s1 + +program main + call s1 + print *,'pass' +end program main diff --git a/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 b/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 new file mode 100644 index 0000000000000..8644a4a3faf7f --- /dev/null +++ b/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 @@ -0,0 +1,29 @@ +! Test passing character array to unlimited polymorphic array pointer. +! Regression test from https://github.com/llvm/llvm-project/issues/150749 + +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s + +subroutine char_explicit_shape_array(a2) +interface +subroutine char_explicit_shape_array_uclass_callee(p) + class(*), pointer, intent(in) :: p(:) +end subroutine char_explicit_shape_array_uclass_callee +end interface +character(*), target :: a2(100) +call char_explicit_shape_array_uclass_callee(a2) +end subroutine char_explicit_shape_array +! CHECK-LABEL: func.func @_QPchar_explicit_shape_array( +! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "a2", fir.target}) { +! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.class>> +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref>) -> !fir.ref>> +! CHECK: %[[VAL_4:.*]] = arith.constant 100 : index +! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> +! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_6]]#1(%[[VAL_7]]) typeparams %[[VAL_2]]#1 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.class>> +! CHECK: fir.store %[[VAL_8]] to %[[VAL_0]] : !fir.ref>>> +! CHECK: fir.call @_QPchar_explicit_shape_array_uclass_callee(%[[VAL_0]]) fastmath : (!fir.ref>>>) -> () +! CHECK: return +! CHECK: } diff --git a/flang/test/Lower/percent-val-actual-argument.f90 b/flang/test/Lower/percent-val-actual-argument.f90 index 890b1972e80bb..b4e635bef2887 100644 --- a/flang/test/Lower/percent-val-actual-argument.f90 +++ b/flang/test/Lower/percent-val-actual-argument.f90 @@ -6,7 +6,10 @@ program main call sa(%val(a1)) ! CHECK: %[[A1_ADDR:.*]] = fir.address_of(@_QFEa1) : !fir.ref> ! CHECK: %[[A1_DECL:.*]]:2 = hlfir.declare %[[A1_ADDR]] {uniq_name = "_QFEa1"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) -! CHECK: fir.call @_QPsa(%[[A1_DECL]]#0) fastmath : (!fir.ref>) -> () +! CHECK: %[[A1_LOADED:.*]] = fir.load %[[A1_DECL]]#0 : !fir.ref> +! CHECK: %[[SA_ADDR:.*]] = fir.address_of(@_QPsa) : (!fir.ref>) -> () +! CHECK: %[[SA_CONVERT:.*]] = fir.convert %[[SA_ADDR]] : ((!fir.ref>) -> ()) -> ((!fir.logical<4>) -> ()) +! CHECK: fir.call %[[SA_CONVERT]](%[[A1_LOADED]]) fastmath : (!fir.logical<4>) -> () ! 
CHECK: func.func @_QPsa(%[[SA_ARG:.*]]: !fir.ref> {fir.bindc_name = "x1"}) { write(6,*) "a1 = ", a1 end program main diff --git a/flang/test/Parser/OpenMP/bind-clause.f90 b/flang/test/Parser/OpenMP/bind-clause.f90 index 5f1e6b47f1c8d..a4fb3aa66c1c8 100644 --- a/flang/test/Parser/OpenMP/bind-clause.f90 +++ b/flang/test/Parser/OpenMP/bind-clause.f90 @@ -19,7 +19,8 @@ subroutine f00 !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: | OmpBeginLoopDirective -!PARSE-TREE: | | OmpLoopDirective -> llvm::omp::Directive = loop +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = loop !PARSE-TREE: | | OmpClauseList -> OmpClause -> Bind -> OmpBindClause -> Binding = Parallel +!PARSE-TREE: | | Flags = None !PARSE-TREE: | DoConstruct diff --git a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 b/flang/test/Parser/OpenMP/declare-reduction-multi.f90 index 0e1adcc9958d7..693e69d8896be 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-multi.f90 @@ -76,10 +76,11 @@ program omp_examples !CHECK: !$OMP PARALLEL DO REDUCTION(+: sum) !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = parallel do !PARSE-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause !PARSE-TREE: Modifier -> OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'sum +!PARSE-TREE: Flags = None !PARSE-TREE: DoConstruct do i = 1, n sum%r = sum%r + values(i)%r @@ -90,10 +91,11 @@ program omp_examples !CHECK: !$OMP PARALLEL DO REDUCTION(*: prod) !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: 
OmpBeginLoopDirective -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = parallel do !PARSE-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause !PARSE-TREE: Modifier -> OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply !PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'prod' +!PARSE-TREE: Flags = None !PARSE-TREE: DoConstruct do i = 1, n prod%r = prod%r * (values(i)%r+0.6) @@ -104,10 +106,11 @@ program omp_examples !CHECK: $OMP PARALLEL DO REDUCTION(max: big) !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = parallel do !PARSE-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause !PARSE-TREE: Modifier -> OmpReductionIdentifier -> ProcedureDesignator -> Name = 'max' !PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'big' +!PARSE-TREE: Flags = None !PARSE-TREE: DoConstruct do i = 1, n big = mymax(values(i), big) @@ -118,10 +121,11 @@ program omp_examples !CHECK: !$OMP PARALLEL DO REDUCTION(min: small) !CHECK-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !CHECK-TREE: OmpBeginLoopDirective -!CHECK-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do +!CHECK-TREE: OmpDirectiveName -> llvm::omp::Directive = parallel do !CHECK-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause !CHECK-TREE: Modifier -> OmpReductionIdentifier -> ProcedureDesignator -> Name = 'min' !CHECK-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'small' +!PARSE-TREE: Flags = None !CHECK-TREE: DoConstruct do i = 1, n small%r = min(values(i)%r, small%r) diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 
b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 index 7b1b569d87f78..0ed693e5821d6 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 @@ -30,7 +30,7 @@ end subroutine initme !CHECK: !$OMP SIMD REDUCTION(red_add: res) !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = simd +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = simd !PARSE-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause !PARSE-TREE: Modifier -> OmpReductionIdentifier -> ProcedureDesignator -> Name = 'red_add do i=1,n diff --git a/flang/test/Parser/OpenMP/do-tile-size.f90 b/flang/test/Parser/OpenMP/do-tile-size.f90 index 886ee4a2a680c..9ba6a3a6c2c41 100644 --- a/flang/test/Parser/OpenMP/do-tile-size.f90 +++ b/flang/test/Parser/OpenMP/do-tile-size.f90 @@ -23,7 +23,7 @@ subroutine openmp_do_tiles(x) !PARSE-TREE:| | | OmpBeginLoopDirective !PARSE-TREE:| | | OpenMPLoopConstruct !PARSE-TREE:| | | | OmpBeginLoopDirective -!PARSE-TREE:| | | | | OmpLoopDirective -> llvm::omp::Directive = tile +!PARSE-TREE:| | | | | OmpDirectiveName -> llvm::omp::Directive = tile !PARSE-TREE:| | | | | OmpClauseList -> OmpClause -> Sizes -> Scalar -> Integer -> Expr = '2_4' !PARSE-TREE: | | | | DoConstruct END subroutine openmp_do_tiles diff --git a/flang/test/Parser/OpenMP/doacross-clause.f90 b/flang/test/Parser/OpenMP/doacross-clause.f90 index 8686e1f13a7ab..d2a52c59cc1a0 100644 --- a/flang/test/Parser/OpenMP/doacross-clause.f90 +++ b/flang/test/Parser/OpenMP/doacross-clause.f90 @@ -27,7 +27,7 @@ subroutine f00(x) !PARSE-TREE-LABEL: ProgramUnit -> SubroutineSubprogram !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = do !PARSE-TREE: | OmpClauseList -> OmpClause -> Ordered -> 
Scalar -> Integer -> Constant -> Expr = '2_4' !PARSE-TREE: | | LiteralConstant -> IntLiteralConstant = '2' ![...] @@ -61,7 +61,7 @@ subroutine f01(x) !PARSE-TREE-LABEL: ProgramUnit -> SubroutineSubprogram !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = do !PARSE-TREE: | OmpClauseList -> OmpClause -> Ordered -> Scalar -> Integer -> Constant -> Expr = '2_4' !PARSE-TREE: | | LiteralConstant -> IntLiteralConstant = '2' ![...] diff --git a/flang/test/Parser/OpenMP/if-clause.f90 b/flang/test/Parser/OpenMP/if-clause.f90 index 2bf80cb99f919..3c422ef15918c 100644 --- a/flang/test/Parser/OpenMP/if-clause.f90 +++ b/flang/test/Parser/OpenMP/if-clause.f90 @@ -30,7 +30,7 @@ program openmp_parse_if !$omp target data map(tofrom: i) if(target data: cond) !$omp end target data - ! CHECK: OmpLoopDirective -> llvm::omp::Directive = target teams distribute parallel do simd + ! CHECK: OmpDirectiveName -> llvm::omp::Directive = target teams distribute parallel do simd ! CHECK: OmpClause -> If -> OmpIfClause ! CHECK-NEXT: OmpDirectiveName -> llvm::omp::Directive = target ! CHECK: OmpClause -> If -> OmpIfClause @@ -51,7 +51,7 @@ program openmp_parse_if !$omp task if(task: cond) !$omp end task - ! CHECK: OmpLoopDirective -> llvm::omp::Directive = taskloop + ! CHECK: OmpDirectiveName -> llvm::omp::Directive = taskloop ! CHECK-NEXT: OmpClause -> If -> OmpIfClause ! 
CHECK-NEXT: OmpDirectiveName -> llvm::omp::Directive = taskloop !$omp taskloop if(taskloop: cond) diff --git a/flang/test/Parser/OpenMP/in-reduction-clause.f90 b/flang/test/Parser/OpenMP/in-reduction-clause.f90 index 611068e83900e..6059fb27d5be3 100644 --- a/flang/test/Parser/OpenMP/in-reduction-clause.f90 +++ b/flang/test/Parser/OpenMP/in-reduction-clause.f90 @@ -42,10 +42,11 @@ end subroutine omp_in_reduction_taskgroup !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = taskloop +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = taskloop !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> InReduction -> OmpInReductionClause !PARSE-TREE-NEXT: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'z' +!PARSE-TREE-NEXT: Flags = None subroutine omp_in_reduction_parallel() integer :: z @@ -72,8 +73,9 @@ end subroutine omp_in_reduction_parallel !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = taskloop simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = taskloop simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> InReduction -> OmpInReductionClause !PARSE-TREE-NEXT: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'z' +!PARSE-TREE-NEXT: Flags = None diff --git a/flang/test/Parser/OpenMP/lastprivate-clause.f90 b/flang/test/Parser/OpenMP/lastprivate-clause.f90 index ac25174f3cc42..6364b74255521 100644 --- a/flang/test/Parser/OpenMP/lastprivate-clause.f90 +++ b/flang/test/Parser/OpenMP/lastprivate-clause.f90 @@ -21,7 +21,7 @@ subroutine foo1() !PARSE-TREE: SubroutineStmt !PARSE-TREE: Name = 'foo1' -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = 
parallel do +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = parallel do !PARSE-TREE: OmpClauseList -> OmpClause -> Lastprivate -> OmpLastprivateClause !PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' !PARSE-TREE: EndSubroutineStmt @@ -47,7 +47,7 @@ subroutine foo2() !PARSE-TREE: SubroutineStmt !PARSE-TREE: Name = 'foo2' -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = parallel do !PARSE-TREE: OmpClauseList -> OmpClause -> Lastprivate -> OmpLastprivateClause !PARSE-TREE: Modifier -> OmpLastprivateModifier -> Value = Conditional !PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' diff --git a/flang/test/Parser/OpenMP/linear-clause.f90 b/flang/test/Parser/OpenMP/linear-clause.f90 index 5f031b0694149..5ea31ce58fc5a 100644 --- a/flang/test/Parser/OpenMP/linear-clause.f90 +++ b/flang/test/Parser/OpenMP/linear-clause.f90 @@ -18,10 +18,11 @@ subroutine f00(x) !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = do !PARSE-TREE: | OmpClauseList -> OmpClause -> Linear -> OmpLinearClause !PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' !PARSE-TREE: | | bool = 'true' +!PARSE-TREE: | Flags = None !PARSE-TREE: DoConstruct subroutine f01(x) @@ -41,12 +42,13 @@ subroutine f01(x) !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = do !PARSE-TREE: | OmpClauseList -> OmpClause -> Linear -> OmpLinearClause !PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' !PARSE-TREE: | | Modifier -> OmpStepSimpleModifier -> Scalar -> Integer -> Expr = '2_4' !PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '2' !PARSE-TREE: | | bool = 
'true' +!PARSE-TREE: | Flags = None !PARSE-TREE: DoConstruct subroutine f02(x) @@ -66,12 +68,13 @@ subroutine f02(x) !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = do !PARSE-TREE: | OmpClauseList -> OmpClause -> Linear -> OmpLinearClause !PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' !PARSE-TREE: | | Modifier -> OmpStepComplexModifier -> Scalar -> Integer -> Expr = '3_4' !PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '3' !PARSE-TREE: | | bool = 'true' +!PARSE-TREE: | Flags = None !PARSE-TREE: DoConstruct subroutine f03(x) diff --git a/flang/test/Parser/OpenMP/loop-transformation-construct01.f90 b/flang/test/Parser/OpenMP/loop-transformation-construct01.f90 index baffc2f6e2f1e..9595889b1bf98 100644 --- a/flang/test/Parser/OpenMP/loop-transformation-construct01.f90 +++ b/flang/test/Parser/OpenMP/loop-transformation-construct01.f90 @@ -21,12 +21,14 @@ subroutine loop_transformation_construct !CHECK-PARSE: | ExecutionPart -> Block !CHECK-PARSE-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !CHECK-PARSE-NEXT: | | | OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | OmpLoopDirective -> llvm::omp::Directive = do +!CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | Flags = None !CHECK-PARSE-NEXT: | | | OpenMPLoopConstruct !CHECK-PARSE-NEXT: | | | | OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | | OmpLoopDirective -> llvm::omp::Directive = unroll +!CHECK-PARSE-NEXT: | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll !CHECK-PARSE-NEXT: | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | Flags = None !CHECK-PARSE-NEXT: | | | | DoConstruct !CHECK-PARSE-NEXT: | | | | | NonLabelDoStmt !CHECK-PARSE-NEXT: | | | | | | LoopControl -> LoopBounds @@ 
-53,11 +55,13 @@ subroutine loop_transformation_construct !CHECK-PARSE-NEXT: | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '5' !CHECK-PARSE-NEXT: | | | | | EndDoStmt -> !CHECK-PARSE-NEXT: | | | | OmpEndLoopDirective -!CHECK-PARSE-NEXT: | | | | | OmpLoopDirective -> llvm::omp::Directive = unroll +!CHECK-PARSE-NEXT: | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll !CHECK-PARSE-NEXT: | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | Flags = None !CHECK-PARSE-NEXT: | | | OmpEndLoopDirective -!CHECK-PARSE-NEXT: | | | | OmpLoopDirective -> llvm::omp::Directive = do +!CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | Flags = None !CHECK-UNPARSE: SUBROUTINE loop_transformation_construct !CHECK-UNPARSE-NEXT: IMPLICIT NONE diff --git a/flang/test/Parser/OpenMP/loop-transformation-construct02.f90 b/flang/test/Parser/OpenMP/loop-transformation-construct02.f90 index b50e7183841cc..a6af35a0111a3 100644 --- a/flang/test/Parser/OpenMP/loop-transformation-construct02.f90 +++ b/flang/test/Parser/OpenMP/loop-transformation-construct02.f90 @@ -23,16 +23,19 @@ subroutine loop_transformation_construct !CHECK-PARSE: | ExecutionPart -> Block !CHECK-PARSE-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !CHECK-PARSE-NEXT: | | | OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | OmpLoopDirective -> llvm::omp::Directive = do +!CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | Flags = None !CHECK-PARSE-NEXT: | | | OpenMPLoopConstruct !CHECK-PARSE-NEXT: | | | | OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | | OmpLoopDirective -> llvm::omp::Directive = unroll +!CHECK-PARSE-NEXT: | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll !CHECK-PARSE-NEXT: | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | Flags = None 
!CHECK-PARSE-NEXT: | | | | OpenMPLoopConstruct !CHECK-PARSE-NEXT: | | | | | OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | | | OmpLoopDirective -> llvm::omp::Directive = tile +!CHECK-PARSE-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = tile !CHECK-PARSE-NEXT: | | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | | Flags = None !CHECK-PARSE-NEXT: | | | | | DoConstruct !CHECK-PARSE-NEXT: | | | | | | NonLabelDoStmt !CHECK-PARSE-NEXT: | | | | | | | LoopControl -> LoopBounds @@ -59,14 +62,17 @@ subroutine loop_transformation_construct !CHECK-PARSE-NEXT: | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '5' !CHECK-PARSE-NEXT: | | | | | | EndDoStmt -> !CHECK-PARSE-NEXT: | | | | | OmpEndLoopDirective -!CHECK-PARSE-NEXT: | | | | | | OmpLoopDirective -> llvm::omp::Directive = tile +!CHECK-PARSE-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = tile !CHECK-PARSE-NEXT: | | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | | Flags = None !CHECK-PARSE-NEXT: | | | | OmpEndLoopDirective -!CHECK-PARSE-NEXT: | | | | | OmpLoopDirective -> llvm::omp::Directive = unroll +!CHECK-PARSE-NEXT: | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll !CHECK-PARSE-NEXT: | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | Flags = None !CHECK-PARSE-NEXT: | | | OmpEndLoopDirective -!CHECK-PARSE-NEXT: | | | | OmpLoopDirective -> llvm::omp::Directive = do +!CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | Flags = None !CHECK-UNPARSE: SUBROUTINE loop_transformation_construct !CHECK-UNPARSE-NEXT: IMPLICIT NONE diff --git a/flang/test/Parser/OpenMP/loop-transformation-construct03.f90 b/flang/test/Parser/OpenMP/loop-transformation-construct03.f90 index 81f0de1b76263..8725025a51321 100644 --- a/flang/test/Parser/OpenMP/loop-transformation-construct03.f90 +++ b/flang/test/Parser/OpenMP/loop-transformation-construct03.f90 @@ -21,10 +21,11 @@ subroutine 
loop_transformation_construct7 !CHECK-PARSE: | ExecutionPart -> Block !CHECK-PARSE-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !CHECK-PARSE-NEXT: | | | OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | OmpLoopDirective -> llvm::omp::Directive = target teams distribute parallel do +!CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = target teams distribute parallel do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> OmpClause -> Collapse -> Scalar -> Integer -> Constant -> Expr = '2_4' !CHECK-PARSE-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '2' !CHECK-PARSE-NEXT: | | | | OmpClause -> Private -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'b' +!CHECK-PARSE-NEXT: | | | | Flags = None !CHECK-PARSE-NEXT: | | | DoConstruct !CHECK-PARSE-NEXT: | | | | NonLabelDoStmt !CHECK-PARSE-NEXT: | | | | | LoopControl -> LoopBounds diff --git a/flang/test/Parser/OpenMP/masked-unparse.f90 b/flang/test/Parser/OpenMP/masked-unparse.f90 index 46ddd3722216a..786b60416846f 100644 --- a/flang/test/Parser/OpenMP/masked-unparse.f90 +++ b/flang/test/Parser/OpenMP/masked-unparse.f90 @@ -25,7 +25,7 @@ subroutine test_masked() subroutine test_masked_taskloop_simd() integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = masked taskloop simd + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = masked taskloop simd !CHECK: !$omp masked taskloop simd !$omp masked taskloop simd do i=1,10 @@ -37,7 +37,7 @@ subroutine test_masked_taskloop_simd() subroutine test_masked_taskloop integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = masked taskloop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = masked taskloop !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Filter -> Scalar -> Integer -> Expr = '2_4' !PARSE-TREE-NEXT: LiteralConstant -> IntLiteralConstant = '2' 
!CHECK: !$omp masked taskloop filter(2_4) @@ -68,7 +68,7 @@ subroutine test_parallel_masked subroutine test_parallel_masked_taskloop_simd integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = parallel masked taskloop simd + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = parallel masked taskloop simd !CHECK: !$omp parallel masked taskloop simd !$omp parallel masked taskloop simd do i=1,10 @@ -80,7 +80,7 @@ subroutine test_parallel_masked_taskloop_simd subroutine test_parallel_masked_taskloop integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = parallel masked taskloop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = parallel masked taskloop !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Filter -> Scalar -> Integer -> Expr = '2_4' !PARSE-TREE-NEXT: LiteralConstant -> IntLiteralConstant = '2' !CHECK: !$omp parallel masked taskloop filter(2_4) diff --git a/flang/test/Parser/OpenMP/master-unparse.f90 b/flang/test/Parser/OpenMP/master-unparse.f90 index ec7a7d3845014..36935d4fe1a7d 100644 --- a/flang/test/Parser/OpenMP/master-unparse.f90 +++ b/flang/test/Parser/OpenMP/master-unparse.f90 @@ -17,7 +17,7 @@ subroutine test_master() subroutine test_master_taskloop_simd() integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = master taskloop simd + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = master taskloop simd !CHECK: !$omp master taskloop simd !$omp master taskloop simd do i=1,10 @@ -29,7 +29,7 @@ subroutine test_master_taskloop_simd() subroutine test_master_taskloop integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = master taskloop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = master taskloop !CHECK: !$omp master taskloop !$omp master taskloop do i=1,10 @@ 
-51,7 +51,7 @@ subroutine test_parallel_master subroutine test_parallel_master_taskloop_simd integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = parallel master taskloop simd + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = parallel master taskloop simd !CHECK: !$omp parallel master taskloop simd !$omp parallel master taskloop simd do i=1,10 @@ -63,7 +63,7 @@ subroutine test_parallel_master_taskloop_simd subroutine test_parallel_master_taskloop integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = parallel master taskloop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = parallel master taskloop !CHECK: !$omp parallel master taskloop !$omp parallel master taskloop do i=1,10 diff --git a/flang/test/Parser/OpenMP/order-clause01.f90 b/flang/test/Parser/OpenMP/order-clause01.f90 index f810eb74ee29d..087e400934de5 100644 --- a/flang/test/Parser/OpenMP/order-clause01.f90 +++ b/flang/test/Parser/OpenMP/order-clause01.f90 @@ -15,9 +15,10 @@ subroutine test_do_order() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = do !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_simd_order_reproducible() integer :: i, j = 1 @@ -31,10 +32,11 @@ subroutine test_simd_order_reproducible() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Reproducible !PARSE-TREE-NEXT: 
Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_do_simd_order_unconstrained() integer :: i, j = 1 @@ -48,10 +50,11 @@ subroutine test_do_simd_order_unconstrained() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = do simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = do simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Unconstrained !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_parallel_do_order() integer :: i, j = 1 @@ -65,9 +68,10 @@ subroutine test_parallel_do_order() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = parallel do +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = parallel do !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_parallel_do_simd_order_reproducible() integer :: i, j = 1 @@ -81,10 +85,11 @@ subroutine test_parallel_do_simd_order_reproducible() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = parallel do simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = parallel do simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Reproducible !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_target_simd_order_unconstrained() integer :: i, j = 1 @@ -98,10 +103,11 @@ subroutine test_target_simd_order_unconstrained() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> 
llvm::omp::Directive = target simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Unconstrained !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_target_parallel_do_order() integer :: i, j = 1 @@ -115,9 +121,10 @@ subroutine test_target_parallel_do_order() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = target parallel do +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target parallel do !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_target_parallel_do_simd_order_reproducible() integer :: i, j = 1 @@ -131,10 +138,11 @@ subroutine test_target_parallel_do_simd_order_reproducible() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = target parallel do simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target parallel do simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Reproducible !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_teams_distribute_simd_order_unconstrained() integer :: i, j = 1 @@ -148,10 +156,11 @@ subroutine test_teams_distribute_simd_order_unconstrained() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = teams distribute simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = teams distribute simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier 
-> Value = Unconstrained !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_teams_distribute_parallel_do_order() integer :: i, j = 1 @@ -165,9 +174,10 @@ subroutine test_teams_distribute_parallel_do_order() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = teams distribute parallel do +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = teams distribute parallel do !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_teams_distribute_parallel_do_simd_order_reproducible() integer :: i, j = 1 @@ -181,10 +191,11 @@ subroutine test_teams_distribute_parallel_do_simd_order_reproducible() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = teams distribute parallel do simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = teams distribute parallel do simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Reproducible !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_target_teams_distribute_simd_order_unconstrained() integer :: i, j = 1 @@ -198,10 +209,11 @@ subroutine test_target_teams_distribute_simd_order_unconstrained() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = target teams distribute simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target teams distribute simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Unconstrained !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine 
test_target_teams_distribute_parallel_do_order() integer :: i, j = 1 @@ -215,9 +227,10 @@ subroutine test_target_teams_distribute_parallel_do_order() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = target teams distribute parallel do +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target teams distribute parallel do !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_target_teams_distribute_parallel_do_simd_order_reproducible() integer :: i, j = 1 @@ -231,10 +244,11 @@ subroutine test_target_teams_distribute_parallel_do_simd_order_reproducible() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = target teams distribute parallel do simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target teams distribute parallel do simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Reproducible !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None subroutine test_taskloop_simd_order_unconstrained() integer :: i, j = 1 @@ -248,7 +262,8 @@ subroutine test_taskloop_simd_order_unconstrained() !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE-NEXT: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = taskloop simd +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = taskloop simd !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Order -> OmpOrderClause !PARSE-TREE-NEXT: OmpOrderModifier -> Value = Unconstrained !PARSE-TREE-NEXT: Ordering = Concurrent +!PARSE-TREE-NEXT: Flags = None diff --git a/flang/test/Parser/OpenMP/ordered-depend.f90 b/flang/test/Parser/OpenMP/ordered-depend.f90 index 
71eff105e03c6..4826d134362c8 100644 --- a/flang/test/Parser/OpenMP/ordered-depend.f90 +++ b/flang/test/Parser/OpenMP/ordered-depend.f90 @@ -27,7 +27,7 @@ subroutine f00(x) !PARSE-TREE-LABEL: ProgramUnit -> SubroutineSubprogram !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = do !PARSE-TREE: | OmpClauseList -> OmpClause -> Ordered -> Scalar -> Integer -> Constant -> Expr = '2_4' !PARSE-TREE: | | LiteralConstant -> IntLiteralConstant = '2' ![...] @@ -61,7 +61,7 @@ subroutine f01(x) !PARSE-TREE-LABEL: ProgramUnit -> SubroutineSubprogram !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = do !PARSE-TREE: | OmpClauseList -> OmpClause -> Ordered -> Scalar -> Integer -> Constant -> Expr = '2_4' !PARSE-TREE: | | LiteralConstant -> IntLiteralConstant = '2' ![...] diff --git a/flang/test/Parser/OpenMP/reduction-modifier.f90 b/flang/test/Parser/OpenMP/reduction-modifier.f90 index 56303af66395e..8c7d5b1472018 100644 --- a/flang/test/Parser/OpenMP/reduction-modifier.f90 +++ b/flang/test/Parser/OpenMP/reduction-modifier.f90 @@ -7,7 +7,7 @@ subroutine foo() ! CHECK: !$OMP DO REDUCTION(TASK, *: j) ! PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct ! PARSE-TREE: | | | OmpBeginLoopDirective -! PARSE-TREE: | | | | OmpLoopDirective -> llvm::omp::Directive = do +! PARSE-TREE: | | | | OmpDirectiveName -> llvm::omp::Directive = do ! PARSE-TREE: | | | | OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause ! PARSE-TREE: | | | | | Modifier -> OmpReductionModifier -> Value = Task ! 
PARSE-TREE: | | | | | Modifier -> OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply diff --git a/flang/test/Parser/OpenMP/replayable-clause.f90 b/flang/test/Parser/OpenMP/replayable-clause.f90 new file mode 100644 index 0000000000000..c1733449fcb70 --- /dev/null +++ b/flang/test/Parser/OpenMP/replayable-clause.f90 @@ -0,0 +1,60 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00 + !$omp task replayable + block + end block +end + +!UNPARSE: SUBROUTINE f00 +!UNPARSE: !$OMP TASK REPLAYABLE +!UNPARSE: BLOCK +!UNPARSE: END BLOCK +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct +!PARSE-TREE: | OmpBeginDirective +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = task +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Replayable -> +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block + + +subroutine f01(x) + implicit none + integer :: x + !$omp target_update to(x) replayable(.true.) 
+end + +!UNPARSE: SUBROUTINE f01 (x) +!UNPARSE: IMPLICIT NONE +!UNPARSE: INTEGER x +!UNPARSE: !$OMP TARGET_UPDATE TO(x) REPLAYABLE(.true._4) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target update +!PARSE-TREE: | OmpClauseList -> OmpClause -> To -> OmpToClause +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | bool = 'true' +!PARSE-TREE: | OmpClause -> Replayable -> OmpReplayableClause -> Scalar -> Logical -> Constant -> Expr = '.true._4' +!PARSE-TREE: | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | bool = 'true' +!PARSE-TREE: | Flags = None + + +subroutine f02 + !$omp taskwait replayable(.false.) +end + +!UNPARSE: SUBROUTINE f02 +!UNPARSE: !$OMP TASKWAIT REPLAYABLE(.false._4) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = taskwait +!PARSE-TREE: | OmpClauseList -> OmpClause -> Replayable -> OmpReplayableClause -> Scalar -> Logical -> Constant -> Expr = '.false._4' +!PARSE-TREE: | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | bool = 'false' +!PARSE-TREE: | Flags = None diff --git a/flang/test/Parser/OpenMP/target-loop-unparse.f90 b/flang/test/Parser/OpenMP/target-loop-unparse.f90 index ee0013f613403..142bcabf0894b 100644 --- a/flang/test/Parser/OpenMP/target-loop-unparse.f90 +++ b/flang/test/Parser/OpenMP/target-loop-unparse.f90 @@ -9,7 +9,7 @@ subroutine test_loop integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = loop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = loop !CHECK: 
!$omp loop !$omp loop do i=1,10 @@ -18,7 +18,7 @@ subroutine test_loop !$omp end loop !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = loop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = loop !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Bind -> OmpBindClause -> Binding = Thread !CHECK: !$omp loop !$omp loop bind(thread) @@ -31,7 +31,7 @@ subroutine test_loop subroutine test_target_loop integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = target loop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target loop !CHECK: !$omp target loop !$omp target loop do i=1,10 @@ -43,7 +43,7 @@ subroutine test_target_loop subroutine test_target_teams_loop integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = target teams loop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target teams loop !CHECK: !$omp target teams loop !$omp target teams loop do i=1,10 @@ -55,7 +55,7 @@ subroutine test_target_teams_loop subroutine test_target_parallel_loop integer :: i, j = 1 !PARSE-TREE: OmpBeginLoopDirective - !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = target parallel loop + !PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = target parallel loop !CHECK: !$omp target parallel loop !$omp target parallel loop do i=1,10 diff --git a/flang/test/Parser/OpenMP/taskloop.f90 b/flang/test/Parser/OpenMP/taskloop.f90 index f053aa7f0cff3..3ea91daae160e 100644 --- a/flang/test/Parser/OpenMP/taskloop.f90 +++ b/flang/test/Parser/OpenMP/taskloop.f90 @@ -6,7 +6,7 @@ subroutine parallel_work !CHECK: !$OMP TASKLOOP GRAINSIZE(STRICT: 500_4) !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = taskloop +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = taskloop !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> 
Grainsize -> OmpGrainsizeClause !PARSE-TREE-NEXT: Modifier -> OmpPrescriptiveness -> Value = Strict !PARSE-TREE-NEXT: Scalar -> Integer -> Expr = '500_4' @@ -18,7 +18,7 @@ subroutine parallel_work !CHECK: !$OMP TASKLOOP GRAINSIZE(500_4) !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = taskloop +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = taskloop !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Grainsize -> OmpGrainsizeClause !PARSE-TREE-NEXT: Scalar -> Integer -> Expr = '500_4' !$omp taskloop grainsize(500) @@ -29,7 +29,7 @@ subroutine parallel_work !CHECK: !$OMP TASKLOOP NUM_TASKS(STRICT: 500_4) !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = taskloop +!PARSE-TREE-NEXT: OmpDirectiveName -> llvm::omp::Directive = taskloop !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> NumTasks -> OmpNumTasksClause !PARSE-TREE-NEXT: Modifier -> OmpPrescriptiveness -> Value = Strict !PARSE-TREE-NEXT: Scalar -> Integer -> Expr = '500_4' diff --git a/flang/test/Parser/OpenMP/tile-size.f90 b/flang/test/Parser/OpenMP/tile-size.f90 index 64bc3c5319e88..5110493de4a0c 100644 --- a/flang/test/Parser/OpenMP/tile-size.f90 +++ b/flang/test/Parser/OpenMP/tile-size.f90 @@ -18,6 +18,6 @@ subroutine openmp_tiles(x) !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = tile +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = tile !PARSE-TREE: OmpClauseList -> OmpClause -> Sizes -> Scalar -> Integer -> Expr = '2_4' END subroutine openmp_tiles diff --git a/flang/test/Parser/OpenMP/tile.f90 b/flang/test/Parser/OpenMP/tile.f90 index ee9b6aa5c84ca..2ea17471866a4 100644 --- a/flang/test/Parser/OpenMP/tile.f90 +++ b/flang/test/Parser/OpenMP/tile.f90 @@ -17,7 +17,7 @@ subroutine openmp_tiles(x) !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: 
OmpLoopDirective -> llvm::omp::Directive = tile +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = tile END subroutine openmp_tiles diff --git a/flang/test/Parser/OpenMP/transparent-clause.f90 b/flang/test/Parser/OpenMP/transparent-clause.f90 new file mode 100644 index 0000000000000..8f669546f2dea --- /dev/null +++ b/flang/test/Parser/OpenMP/transparent-clause.f90 @@ -0,0 +1,77 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00(x) + implicit none + integer :: x + !$omp target_data map(to: x) transparent + block + end block +end + +!UNPARSE: SUBROUTINE f00 (x) +!UNPARSE: IMPLICIT NONE +!UNPARSE: INTEGER x +!UNPARSE: !$OMP TARGET_DATA MAP(TO: x) TRANSPARENT +!UNPARSE: BLOCK +!UNPARSE: END BLOCK +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct +!PARSE-TREE: | OmpBeginDirective +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = target data +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Map -> OmpMapClause +!PARSE-TREE: | | | Modifier -> OmpMapType -> Value = To +!PARSE-TREE: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | bool = 'true' +!PARSE-TREE: | | OmpClause -> Transparent -> +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block + + +subroutine f01 + !$omp task transparent(0) + !$omp end task +end + +!UNPARSE: SUBROUTINE f01 +!UNPARSE: !$OMP TASK TRANSPARENT(0_4) +!UNPARSE: !$OMP END TASK +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct +!PARSE-TREE: | OmpBeginDirective +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = task +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Transparent -> OmpTransparentClause -> Scalar -> Integer -> Expr = 
'0_4' +!PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | OmpEndDirective +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = task +!PARSE-TREE: | | OmpClauseList -> +!PARSE-TREE: | | Flags = None + + +subroutine f02 + implicit none + integer :: i + !$omp taskloop transparent(2) + do i = 1, 10 + end do +end + +!UNPARSE: SUBROUTINE f02 +!UNPARSE: IMPLICIT NONE +!UNPARSE: INTEGER i +!UNPARSE: !$OMP TASKLOOP TRANSPARENT(2_4) +!UNPARSE: DO i=1_4,10_4 +!UNPARSE: END DO +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct +!PARSE-TREE: | OmpBeginLoopDirective +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = taskloop +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Transparent -> OmpTransparentClause -> Scalar -> Integer -> Expr = '2_4' +!PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '2' +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | DoConstruct diff --git a/flang/test/Parser/OpenMP/unroll-full.f90 b/flang/test/Parser/OpenMP/unroll-full.f90 index 30d2f46624991..80b2cac296fee 100644 --- a/flang/test/Parser/OpenMP/unroll-full.f90 +++ b/flang/test/Parser/OpenMP/unroll-full.f90 @@ -17,6 +17,6 @@ subroutine openmp_parse_unroll(x) !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = unroll +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = unroll !PARSE-TREE: OmpClauseList -> OmpClause -> Full END subroutine openmp_parse_unroll diff --git a/flang/test/Parser/OpenMP/unroll-heuristic.f90 b/flang/test/Parser/OpenMP/unroll-heuristic.f90 index 2f589af0c83ca..bbc2df3b57df6 100644 --- a/flang/test/Parser/OpenMP/unroll-heuristic.f90 +++ b/flang/test/Parser/OpenMP/unroll-heuristic.f90 @@ -20,8 +20,9 @@ END subroutine openmp_parse_unroll_heuristic !PTREE: OpenMPConstruct -> OpenMPLoopConstruct !PTREE-NEXT: 
| OmpBeginLoopDirective -!PTREE-NEXT: | | OmpLoopDirective -> llvm::omp::Directive = unroll +!PTREE-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = unroll !PTREE-NEXT: | | OmpClauseList -> +!PTREE-NEXT: | | Flags = None !PTREE-NEXT: | DoConstruct !PTREE-NEXT: | | NonLabelDoStmt !PTREE-NEXT: | | | LoopControl -> LoopBounds @@ -39,5 +40,6 @@ END subroutine openmp_parse_unroll_heuristic !PTREE-NEXT: | | | | | | | Designator -> DataRef -> Name = 'i' !PTREE-NEXT: | | EndDoStmt -> !PTREE-NEXT: | OmpEndLoopDirective -!PTREE-NEXT: | | OmpLoopDirective -> llvm::omp::Directive = unroll +!PTREE-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = unroll !PTREE-NEXT: | | OmpClauseList -> +!PTREE-NEXT: | | Flags = None diff --git a/flang/test/Parser/OpenMP/unroll-partial.f90 b/flang/test/Parser/OpenMP/unroll-partial.f90 index 8ac2a74166773..59dffb63cee6c 100644 --- a/flang/test/Parser/OpenMP/unroll-partial.f90 +++ b/flang/test/Parser/OpenMP/unroll-partial.f90 @@ -17,7 +17,7 @@ subroutine openmp_parse_unroll(x) !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE: OmpBeginLoopDirective -!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = unroll +!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = unroll !PARSE-TREE: OmpClauseList -> OmpClause -> Partial -> Scalar -> Integer -> Constant -> Expr = '3_4' !PARSE-TREE: LiteralConstant -> IntLiteralConstant = '3' diff --git a/flang/test/Parser/utf8-01.f90 b/flang/test/Parser/utf8-01.f90 new file mode 100644 index 0000000000000..3a3745524d807 --- /dev/null +++ b/flang/test/Parser/utf8-01.f90 @@ -0,0 +1,15 @@ +!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s + +character(kind=4), parameter :: c(2) = [character(kind=4) :: & +4_'🍌', 4_'水' ] +print *, '🍌' +print *, 4_'🍌' +print *, '水' +print *, 4_'水' +end + +!CHECK: CHARACTER(KIND=4_4), PARAMETER :: c(2_4) = [CHARACTER(KIND=4,LEN=1)::4_"\360\237\215\214",4_"\346\260\264"] +!CHECK: PRINT *, "\360\237\215\214" +!CHECK: PRINT *, 4_"\360\237\215\214" +!CHECK: 
PRINT *, "\346\260\264" +!CHECK: PRINT *, 4_"\346\260\264" diff --git a/flang/test/Semantics/OpenACC/acc-loop.f90 b/flang/test/Semantics/OpenACC/acc-loop.f90 index 77c427e0a85ae..635dbb04cd666 100644 --- a/flang/test/Semantics/OpenACC/acc-loop.f90 +++ b/flang/test/Semantics/OpenACC/acc-loop.f90 @@ -340,10 +340,10 @@ program openacc_loop_validity !$acc kernels loop gang(dim:3) do i = 1, n - !ERROR: GANG clause is not allowed in the region of a loop with the GANG clause + !ERROR: Nested GANG loops are not allowed in the region of a KERNELS construct !$acc loop gang(dim:2) do j = 1, n - !ERROR: GANG clause is not allowed in the region of a loop with the GANG clause + !ERROR: Nested GANG loops are not allowed in the region of a KERNELS construct !$acc loop gang(dim:1) worker vector do k = 1, i end do @@ -447,4 +447,35 @@ program openacc_loop_validity END DO END DO +contains + + subroutine sub1() + !$acc routine gang(dim:2) + implicit none + integer, parameter :: N = 256 + integer :: i, j + + !$acc loop gang(dim:2) + DO j = 1, N + !$acc loop gang(dim:1) vector + DO i = 1, N + END DO + END DO + end subroutine sub1 + + subroutine sub2() + !$acc routine gang(dim:2) + implicit none + integer, parameter :: N = 256 + integer :: i, j + + !$acc loop gang(dim:2) + DO j = 1, N + !ERROR: GANG(dim:2) clause is not allowed in the region of a loop with the GANG(dim:2) clause + !$acc loop gang(dim:2) vector + DO i = 1, N + END DO + END DO + end subroutine sub2 + end program openacc_loop_validity diff --git a/flang/test/Semantics/OpenMP/replayable-clause.f90 b/flang/test/Semantics/OpenMP/replayable-clause.f90 new file mode 100644 index 0000000000000..b8fe6cea23a6f --- /dev/null +++ b/flang/test/Semantics/OpenMP/replayable-clause.f90 @@ -0,0 +1,22 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60 + +subroutine f00(x) + implicit none + logical :: x + !ERROR: Must be a constant value + !$omp task replayable(x) + !$omp end task +end + +subroutine f01 + !ERROR: 
Must have LOGICAL type, but is INTEGER(4) + !$omp task replayable(7) + !$omp end task +end + +subroutine f02 + !No diagnostic expected + !$omp task replayable + !$omp end task +end + diff --git a/flang/test/Semantics/OpenMP/simd-only.f90 b/flang/test/Semantics/OpenMP/simd-only.f90 index 33ab3d62c98e9..e137ef7d82929 100644 --- a/flang/test/Semantics/OpenMP/simd-only.f90 +++ b/flang/test/Semantics/OpenMP/simd-only.f90 @@ -9,7 +9,7 @@ subroutine test_simd() integer :: i ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd + ! CHECK: OmpDirectiveName -> llvm::omp::Directive = simd ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp simd do i = 1, 100 @@ -21,7 +21,7 @@ subroutine test_do_simd() integer :: i ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK: OmpLoopDirective -> llvm::omp::Directive = do simd + ! CHECK: OmpDirectiveName -> llvm::omp::Directive = do simd ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp do simd do i = 1, 100 @@ -34,7 +34,7 @@ subroutine test_parallel_do_simd() integer :: i ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK: OmpLoopDirective -> llvm::omp::Directive = parallel do simd + ! CHECK: OmpDirectiveName -> llvm::omp::Directive = parallel do simd ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp parallel do simd do i = 1, 100 @@ -47,7 +47,7 @@ subroutine test_simd_scan() real :: sum ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd + ! CHECK: OmpDirectiveName -> llvm::omp::Directive = simd !$omp simd reduction(inscan,+:sum) do i = 1, N sum = sum + a(i) @@ -64,7 +64,7 @@ subroutine test_simd_atomic() integer :: i, x ! 
CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd + ! CHECK: OmpDirectiveName -> llvm::omp::Directive = simd ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp simd do i = 1, 100 @@ -80,7 +80,7 @@ subroutine test_do() integer :: i ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = do + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = do ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp parallel do do i = 1, 100 @@ -92,7 +92,7 @@ subroutine test_do_nested() integer :: i ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel do ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp parallel do @@ -107,7 +107,7 @@ subroutine test_target() integer :: i ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp target do i = 1, 100 @@ -120,7 +120,7 @@ subroutine test_target_teams_distribute() integer :: i ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target teams distribute + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target teams distribute ! 
CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp target teams distribute do i = 1, 100 @@ -132,7 +132,7 @@ subroutine test_target_teams_distribute() ! CHECK-LABEL: Name = 'test_target_data' subroutine test_target_data() ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target data + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target data ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp target data map(to: A) map(tofrom: B) do i = 1, 100 @@ -145,7 +145,7 @@ subroutine test_loop() integer :: i ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = loop + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = loop ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp loop bind(thread) do i = 1, 100 @@ -157,7 +157,7 @@ subroutine test_unroll() integer :: i ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = unroll + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = unroll ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp unroll do i = 1, 100 @@ -170,12 +170,12 @@ subroutine test_do_ordered() x = 0 ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = do + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = do ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp do ordered do i = 1, 100 ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = ordered + ! 
CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = ordered !$omp ordered x = x + 1 !$omp end ordered @@ -188,17 +188,17 @@ subroutine test_cancel() x = 0 ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel do ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp parallel do do i = 1, 100 if (i == 10) then ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPCancelConstruct -> OmpDirectiveSpecification - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = cancel + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = cancel !$omp cancel do end if ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPCancellationPointConstruct -> OmpDirectiveSpecification - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = cancellation point + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = cancellation point !$omp cancellation point do end do end subroutine @@ -208,7 +208,7 @@ subroutine test_scan() integer :: i, sum ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do + ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel do ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp parallel do reduction(inscan, +: sum) do i = 1, n @@ -225,7 +225,7 @@ subroutine test_target_map() integer :: array(10) ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct - ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target + ! 
CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target !$omp target map(tofrom: array(2:10)) array(2) = array(2) * 2 !$omp end target diff --git a/flang/test/Semantics/OpenMP/transparent-clause.f90 b/flang/test/Semantics/OpenMP/transparent-clause.f90 new file mode 100644 index 0000000000000..4831ba0f7cef6 --- /dev/null +++ b/flang/test/Semantics/OpenMP/transparent-clause.f90 @@ -0,0 +1,19 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60 + +subroutine f00(x) + integer :: x(10) + !ERROR: Must be a scalar value, but is a rank-1 array + !$omp task transparent(x) + !$omp end task +end + +subroutine f01 + implicit none + integer :: i + !ERROR: Must have INTEGER type, but is CHARACTER(KIND=1,LEN=5_8) + !$omp taskloop transparent("hello") + do i = 1, 10 + end do + !$omp end taskloop +end + diff --git a/flang/test/Semantics/bug158405.f90 b/flang/test/Semantics/bug158405.f90 new file mode 100644 index 0000000000000..3c334675dfd64 --- /dev/null +++ b/flang/test/Semantics/bug158405.f90 @@ -0,0 +1,9 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +subroutine s() + ! ERROR: Generic 'f' may not have specific procedures 's' and 'ss' as their interfaces are not distinguishable + interface f + procedure s + procedure ss + end interface + entry ss() +end diff --git a/flang/test/Semantics/definable07.f90 b/flang/test/Semantics/definable07.f90 new file mode 100644 index 0000000000000..6da90e762e162 --- /dev/null +++ b/flang/test/Semantics/definable07.f90 @@ -0,0 +1,8 @@ +! 
RUN: %python %S/test_errors.py %s %flang_fc1 +integer, parameter :: j = 5 +real a(5) +!ERROR: 'j' is not a variable +read *, (a(j), j=1, 5) +!ERROR: 'j' is not a variable +print *, (a(j), j=1, 5) +end diff --git a/flang/test/Semantics/resolve127.f90 b/flang/test/Semantics/resolve127.f90 new file mode 100644 index 0000000000000..e1dacf628426e --- /dev/null +++ b/flang/test/Semantics/resolve127.f90 @@ -0,0 +1,7 @@ +!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +double precision, parameter :: x = 1.0d0 +type t + !CHECK: REAL :: x(1_4) = [INTEGER(4)::8_4] + real :: x(1) = [(kind(x),j=1,1)] +end type +end diff --git a/flang/test/Transforms/DoConcurrent/allocatable.f90 b/flang/test/Transforms/DoConcurrent/allocatable.f90 new file mode 100644 index 0000000000000..03962f150eb95 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/allocatable.f90 @@ -0,0 +1,29 @@ +! Verifies that proper `omp.map.bounds` ops are emitted when an allocatable is +! implicitly mapped by a `do concurrent` loop. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s +program main + implicit none + + integer,parameter :: n = 1000000 + real, allocatable, dimension(:) :: y + integer :: i + + allocate(y(1:n)) + + do concurrent(i=1:n) + y(i) = 42 + end do + + deallocate(y) +end program main + +! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEy"} +! CHECK: %[[Y_VAL:.*]] = fir.load %[[Y_DECL]]#0 +! CHECK: %[[Y_DIM0:.*]]:3 = fir.box_dims %[[Y_VAL]], %{{c0_.*}} +! CHECK: %[[Y_LB:.*]] = arith.constant 0 : index +! CHECK: %[[Y_UB:.*]] = arith.subi %[[Y_DIM0]]#1, %{{c1_.*}} : index +! CHECK: %[[Y_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[Y_LB]] : index) upper_bound(%[[Y_UB]] : index) extent(%[[Y_DIM0]]#1 : index) +! CHECK: %[[MEM_MAP:.*]] = omp.map.info {{.*}} bounds(%[[Y_BOUNDS]]) +! 
CHECK: omp.map.info var_ptr(%[[Y_DECL]]#1 : {{.*}}) {{.*}} members(%[[MEM_MAP]] : {{.*}}) diff --git a/flang/test/Transforms/DoConcurrent/host_eval.f90 b/flang/test/Transforms/DoConcurrent/host_eval.f90 new file mode 100644 index 0000000000000..4eacdd64c6e5f --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/host_eval.f90 @@ -0,0 +1,65 @@ +! REQUIRES: amdgpu-registered-target + +! Tests `host_eval` clause code-gen and loop nest bounds on host vs. device. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \ +! RUN: -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s --check-prefix=HOST -vv + +! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp \ +! RUN: -fopenmp-is-target-device -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s --check-prefix=DEVICE + +program do_concurrent_host_eval + implicit none + integer :: i, j + + do concurrent (i=1:10, j=1:20) + end do +end program do_concurrent_host_eval + +! HOST: omp.target host_eval( +! HOST-SAME: %{{[^[:space:]]+}} -> %[[I_LB:[^,]+]], +! HOST-SAME: %{{[^[:space:]]+}} -> %[[I_UB:[^,]+]], +! HOST-SAME: %{{[^[:space:]]+}} -> %[[I_ST:[^,]+]], +! HOST-SAME: %{{[^[:space:]]+}} -> %[[J_LB:[^,]+]], +! HOST-SAME: %{{[^[:space:]]+}} -> %[[J_UB:[^,]+]], +! HOST-SAME: %{{[^[:space:]]+}} -> %[[J_ST:[^,]+]] : {{.*}}) map_entries + +! HOST: omp.loop_nest ({{.*}}, {{.*}}) : index = (%[[I_LB]], %[[J_LB]]) to +! HOST-SAME: (%[[I_UB]], %[[J_UB]]) inclusive step +! HOST-SAME: (%[[I_ST]], %[[J_ST]]) + +! DEVICE: omp.target map_entries( +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[I_LB_MAP:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[I_UB_MAP:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[I_ST_MAP:[^,]+]], + +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[J_LB_MAP:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[J_UB_MAP:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[J_ST_MAP:[^,]+]], + +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! 
DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}} : {{.*}}) + +! DEVICE: %[[I_LB_DECL:.*]]:2 = hlfir.declare %[[I_LB_MAP]] +! DEVICE: %[[I_LB:.*]] = fir.load %[[I_LB_DECL]]#1 : !fir.ref + +! DEVICE: %[[I_UB_DECL:.*]]:2 = hlfir.declare %[[I_UB_MAP]] +! DEVICE: %[[I_UB:.*]] = fir.load %[[I_UB_DECL]]#1 : !fir.ref + +! DEVICE: %[[I_ST_DECL:.*]]:2 = hlfir.declare %[[I_ST_MAP]] +! DEVICE: %[[I_ST:.*]] = fir.load %[[I_ST_DECL]]#1 : !fir.ref + +! DEVICE: %[[J_LB_DECL:.*]]:2 = hlfir.declare %[[J_LB_MAP]] +! DEVICE: %[[J_LB:.*]] = fir.load %[[J_LB_DECL]]#1 : !fir.ref + +! DEVICE: %[[J_UB_DECL:.*]]:2 = hlfir.declare %[[J_UB_MAP]] +! DEVICE: %[[J_UB:.*]] = fir.load %[[J_UB_DECL]]#1 : !fir.ref + +! DEVICE: %[[J_ST_DECL:.*]]:2 = hlfir.declare %[[J_ST_MAP]] +! DEVICE: %[[J_ST:.*]] = fir.load %[[J_ST_DECL]]#1 : !fir.ref + +! DEVICE: omp.loop_nest ({{.*}}, {{.*}}) : index = (%[[I_LB]], %[[J_LB]]) to +! DEVICE-SAME: (%[[I_UB]], %[[J_UB]]) inclusive step +! DEVICE-SAME: (%[[I_ST]], %[[J_ST]]) diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 index f82696669eca6..6a29b57a90d00 100644 --- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 +++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 @@ -3,7 +3,10 @@ ! for a definition of "loop-local values" and how they are handled. ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ -! RUN: | FileCheck %s +! RUN: | FileCheck %s --check-prefixes=COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s --check-prefixes=COMMON,DEVICE module struct_mod type test_struct integer, allocatable :: x_ @@ -46,17 +49,25 @@ program main print *, "total =", total end program main -! CHECK: omp.parallel { -! CHECK: %[[LOCAL_TEMP:.*]] = fir.alloca !fir.type<_QMstruct_modTtest_struct{x_:!fir.box>}> {bindc_name = ".result"} -! CHECK: omp.wsloop { -! 
CHECK: omp.loop_nest {{.*}} { -! CHECK: %[[TEMP_VAL:.*]] = fir.call @_QMstruct_modPconstruct_from_components -! CHECK: fir.save_result %[[TEMP_VAL]] to %[[LOCAL_TEMP]] -! CHECK: %[[EMBOXED_LOCAL:.*]] = fir.embox %[[LOCAL_TEMP]] -! CHECK: %[[CONVERTED_LOCAL:.*]] = fir.convert %[[EMBOXED_LOCAL]] -! CHECK: fir.call @_FortranADestroy(%[[CONVERTED_LOCAL]]) -! CHECK: omp.yield -! CHECK: } -! CHECK: } -! CHECK: omp.terminator -! CHECK: } +! DEVICE: omp.target {{.*}} { +! DEVICE: omp.teams { +! COMMON: omp.parallel { +! COMMON: %[[LOCAL_TEMP:.*]] = fir.alloca !fir.type<_QMstruct_modTtest_struct{x_:!fir.box>}> {bindc_name = ".result"} +! DEVICE: omp.distribute { +! COMMON: omp.wsloop { +! COMMON: omp.loop_nest {{.*}} { +! COMMON: %[[TEMP_VAL:.*]] = fir.call @_QMstruct_modPconstruct_from_components +! COMMON: fir.save_result %[[TEMP_VAL]] to %[[LOCAL_TEMP]] +! COMMON: %[[EMBOXED_LOCAL:.*]] = fir.embox %[[LOCAL_TEMP]] +! COMMON: %[[CONVERTED_LOCAL:.*]] = fir.convert %[[EMBOXED_LOCAL]] +! COMMON: fir.call @_FortranADestroy(%[[CONVERTED_LOCAL]]) +! COMMON: omp.yield +! COMMON: } +! COMMON: } +! DEVICE: } +! COMMON: omp.terminator +! COMMON: } +! DEVICE: omp.terminator +! DEVICE: } +! DEVICE: omp.terminator +! DEVICE: } diff --git a/flang/test/Transforms/DoConcurrent/map_shape_info.f90 b/flang/test/Transforms/DoConcurrent/map_shape_info.f90 new file mode 100644 index 0000000000000..3dca1340ae6b9 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/map_shape_info.f90 @@ -0,0 +1,104 @@ +! Tests mapping of a basic `do concurrent` loop to +! `!$omp target teams distribute parallel do`. +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s + +program do_concurrent_shape + implicit none + integer :: a(10, 20) + integer :: i, j + + do concurrent (i=1:10, j=1:20) + a(i, j) = i * j + end do +end program do_concurrent_shape + +! CHECK: fir.store %{{c10.*}} to %[[DIM0_EXT:.*]] : !fir.ref +! 
CHECK: fir.store %{{c20.*}} to %[[DIM1_EXT:.*]] : !fir.ref + +! CHECK: omp.map.info +! CHECK: omp.map.info +! CHECK: omp.map.info + +! CHECK: omp.map.info +! CHECK: omp.map.info +! CHECK: omp.map.info + +! CHECK: omp.map.info +! CHECK: omp.map.info +! CHECK: omp.map.info + +! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info +! CHECK-SAME: var_ptr(%[[DIM0_EXT]] : !fir.ref, index) +! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = "_QFEa.extent.dim0"} + +! CHECK: %[[DIM1_EXT_MAP:.*]] = omp.map.info +! CHECK-SAME: var_ptr(%[[DIM1_EXT]] : !fir.ref, index) +! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = "_QFEa.extent.dim1"} + +! CHECK: omp.target host_eval({{.*}}) map_entries( +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %[[DIM0_EXT_MAP]] -> %[[DIM0_EXT_ARG:[^,]+]], +! CHECK-SAME: %[[DIM1_EXT_MAP]] -> %[[DIM1_EXT_ARG:[^,]+]] : {{.*}}) + +! CHECK-DAG: %[[DIM0_EXT_DEV:.*]] = fir.load %[[DIM0_EXT_ARG]] +! CHECK-DAG: %[[DIM1_EXT_DEV:.*]] = fir.load %[[DIM1_EXT_ARG]] + +! CHECK: %[[SHAPE:.*]] = fir.shape %[[DIM0_EXT_DEV]], %[[DIM1_EXT_DEV]] +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}}(%[[SHAPE]]) {uniq_name = "_QFEa"} + +subroutine do_concurrent_shape_shift + implicit none + integer :: a(2:10) + integer :: i + + do concurrent (i=1:10) + a(i) = i + end do +end subroutine do_concurrent_shape_shift + +! CHECK: fir.store %{{c2.*}} to %[[DIM0_STRT:.*]] : !fir.ref +! CHECK: fir.store %{{c9.*}} to %[[DIM0_EXT:.*]] : !fir.ref + +! CHECK: omp.map.info +! 
CHECK: omp.map.info +! CHECK: omp.map.info + +! CHECK: omp.map.info +! CHECK: omp.map.info + +! CHECK: %[[DIM0_STRT_MAP:.*]] = omp.map.info +! CHECK-SAME: var_ptr(%[[DIM0_STRT]] : !fir.ref, index) +! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = "_QF{{.*}}Ea.start_idx.dim0"} + +! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info +! CHECK-SAME: var_ptr(%[[DIM0_EXT]] : !fir.ref, index) +! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = "_QF{{.*}}Ea.extent.dim0"} + +! CHECK: omp.target host_eval({{.*}}) map_entries( +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %[[DIM0_STRT_MAP]] -> %[[DIM0_STRT_ARG:[^,]+]], +! CHECK-SAME: %[[DIM0_EXT_MAP]] -> %[[DIM0_EXT_ARG:[^,]+]] : {{.*}}) + +! CHECK-DAG: %[[DIM0_STRT_DEV:.*]] = fir.load %[[DIM0_STRT_ARG]] +! CHECK-DAG: %[[DIM0_EXT_DEV:.*]] = fir.load %[[DIM0_EXT_ARG]] + +! CHECK: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIM0_STRT_DEV]], %[[DIM0_EXT_DEV]] +! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}}(%[[SHAPE_SHIFT]]) {uniq_name = "_QF{{.*}}Ea"} + diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 index d0210726de83e..015a9104942e3 100644 --- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 +++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -3,7 +3,10 @@ ! RUN: split-file %s %t ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %t/multi_range.f90 -o - \ -! RUN: | FileCheck %s +! RUN: | FileCheck %s --check-prefixes=HOST,COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %t/multi_range.f90 -o - \ +! 
RUN: | FileCheck %s --check-prefixes=DEVICE,COMMON !--- multi_range.f90 program main @@ -17,56 +20,75 @@ program main end do end -! CHECK: func.func @_QQmain +! COMMON: func.func @_QQmain + +! COMMON: %[[C3:.*]] = arith.constant 3 : i32 +! COMMON: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index +! COMMON: %[[C20:.*]] = arith.constant 20 : i32 +! COMMON: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index +! COMMON: %[[STEP_I:.*]] = arith.constant 1 : index + +! COMMON: %[[C5:.*]] = arith.constant 5 : i32 +! COMMON: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index +! COMMON: %[[C40:.*]] = arith.constant 40 : i32 +! COMMON: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index +! COMMON: %[[STEP_J:.*]] = arith.constant 1 : index + +! COMMON: %[[C7:.*]] = arith.constant 7 : i32 +! COMMON: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index +! COMMON: %[[C60:.*]] = arith.constant 60 : i32 +! COMMON: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index +! COMMON: %[[STEP_K:.*]] = arith.constant 1 : index + +! DEVICE: omp.target host_eval( +! DEVICE-SAME: %[[LB_I]] -> %[[LB_I:[[:alnum:]]+]], +! DEVICE-SAME: %[[UB_I]] -> %[[UB_I:[[:alnum:]]+]], +! DEVICE-SAME: %[[STEP_I]] -> %[[STEP_I:[[:alnum:]]+]], +! DEVICE-SAME: %[[LB_J]] -> %[[LB_J:[[:alnum:]]+]], +! DEVICE-SAME: %[[UB_J]] -> %[[UB_J:[[:alnum:]]+]], +! DEVICE-SAME: %[[STEP_J]] -> %[[STEP_J:[[:alnum:]]+]], +! DEVICE-SAME: %[[LB_K]] -> %[[LB_K:[[:alnum:]]+]], +! DEVICE-SAME: %[[UB_K]] -> %[[UB_K:[[:alnum:]]+]], +! DEVICE-SAME: %[[STEP_K]] -> %[[STEP_K:[[:alnum:]]+]] : +! DEVICE-SAME: index, index, index, index, index, index, index, index, index) -! CHECK: %[[C3:.*]] = arith.constant 3 : i32 -! CHECK: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index -! CHECK: %[[C20:.*]] = arith.constant 20 : i32 -! CHECK: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index -! CHECK: %[[STEP_I:.*]] = arith.constant 1 : index +! DEVICE: omp.teams -! CHECK: %[[C5:.*]] = arith.constant 5 : i32 -! 
CHECK: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index -! CHECK: %[[C40:.*]] = arith.constant 40 : i32 -! CHECK: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index -! CHECK: %[[STEP_J:.*]] = arith.constant 1 : index +! HOST-NOT: omp.target +! HOST-NOT: omp.teams -! CHECK: %[[C7:.*]] = arith.constant 7 : i32 -! CHECK: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index -! CHECK: %[[C60:.*]] = arith.constant 60 : i32 -! CHECK: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index -! CHECK: %[[STEP_K:.*]] = arith.constant 1 : index +! COMMON: omp.parallel { -! CHECK: omp.parallel { +! COMMON-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"} +! COMMON-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"} -! CHECK-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"} -! CHECK-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"} +! COMMON-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"} +! COMMON-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"} -! CHECK-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"} -! CHECK-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"} +! COMMON-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"} +! COMMON-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"} -! CHECK-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"} -! CHECK-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"} +! DEVICE: omp.distribute -! CHECK: omp.wsloop { -! CHECK-NEXT: omp.loop_nest -! CHECK-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]]) -! CHECK-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]]) -! CHECK-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive -! CHECK-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) { +! COMMON: omp.wsloop { +! COMMON-NEXT: omp.loop_nest +! 
COMMON-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]]) +! COMMON-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]]) +! COMMON-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive +! COMMON-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) { -! CHECK-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]] -! CHECK-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#0 +! COMMON-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]] +! COMMON-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#0 -! CHECK-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]] -! CHECK-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#0 +! COMMON-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]] +! COMMON-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#0 -! CHECK-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]] -! CHECK-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#0 +! COMMON-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]] +! COMMON-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#0 -! CHECK: omp.yield -! CHECK-NEXT: } -! CHECK-NEXT: } +! COMMON: omp.yield +! COMMON-NEXT: } +! COMMON-NEXT: } -! CHECK-NEXT: omp.terminator -! CHECK-NEXT: } +! HOST-NEXT: omp.terminator +! HOST-NEXT: } diff --git a/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 b/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 new file mode 100644 index 0000000000000..b6b2136e2d405 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 @@ -0,0 +1,34 @@ +! Tests that we can map "unnamed" and non-reference/non-box values to device; for +! example, values that result from `fix.box_dims` ops. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s +! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! 
RUN: | FileCheck %s + +subroutine test_non_refernece + integer i + real, allocatable :: arr(:) + + associate(a => arr) + do concurrent (i = 1:10) + block + real z(size(a,1)) + end block + end do + end associate +end subroutine test_non_refernece + +! CHECK: omp.map.info var_ptr(%{{.*}} : !fir.ref, index) +! CHECK: omp.map.info var_ptr(%{{.*}} : !fir.ref, index) +! CHECK: omp.map.info var_ptr(%{{.*}} : !fir.ref, index) + +! CHECK: %[[DIM_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref, index) +! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = ""} + + +! CHECK: omp.target host_eval({{.*}} : index, index, index) +! CHECK-SAME: map_entries({{.*}}, %[[DIM_MAP]] -> %{{.*}} : +! CHECK-SAME: !fir.ref, !fir.ref) + diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 index 74799359e0476..d00e1610c2b5e 100644 --- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 +++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 @@ -1,8 +1,12 @@ ! Tests that if `do concurrent` is not perfectly nested in its parent loop, that ! we skip converting the not-perfectly nested `do concurrent` loop. + ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ -! RUN: | FileCheck %s +! RUN: | FileCheck %s --check-prefixes=COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s --check-prefixes=DEVICE,COMMON program main integer, parameter :: n = 10 @@ -19,28 +23,46 @@ program main end do end -! CHECK: omp.parallel { -! CHECK: omp.wsloop { -! CHECK: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} { -! CHECK: fir.do_concurrent { -! CHECK: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"} -! CHECK: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]] +! DEVICE: omp.target {{.*}}map_entries( +! DEVICE-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! 
DEVICE-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[I_ARG:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[X_ARG:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[A_ARG:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^:]+}} : +! DEVICE-SAME: {{.*}}) { + +! DEVICE: omp.teams + +! COMMON: omp.parallel { + +! DEVICE: omp.distribute + +! COMMON: omp.wsloop { +! COMMON: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} { +! COMMON: fir.do_concurrent { + +! COMMON: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"} +! COMMON: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]] -! CHECK: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"} -! CHECK: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]] +! COMMON: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"} +! COMMON: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]] -! CHECK: fir.do_concurrent.loop (%[[J_IV:.*]], %[[K_IV:.*]]) = {{.*}} { -! CHECK: %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32 -! CHECK: fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#0 +! COMMON: fir.do_concurrent.loop (%[[J_IV:.*]], %[[K_IV:.*]]) = {{.*}} { +! COMMON: %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32 +! COMMON: fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#0 -! CHECK: %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32 -! CHECK: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0 -! CHECK: } -! CHECK: } -! CHECK: omp.yield -! CHECK: } -! CHECK: } -! CHECK: omp.terminator -! CHECK: } +! COMMON: %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32 +! COMMON: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0 +! COMMON: } +! COMMON: } +! COMMON: omp.yield +! COMMON: } +! COMMON: } +! COMMON: omp.terminator +! 
COMMON: } diff --git a/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 new file mode 100644 index 0000000000000..e38474a68747f --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 @@ -0,0 +1,42 @@ +! Tests `do concurrent` mapping when mapped value(s) depend on values defined +! outside the target region; e.g. the size of the array is dynamic. This needs +! to be handled by localizing these region outsiders by either cloning them in +! the region or in case we cannot do that, map them and use the mapped values. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s + +subroutine foo(n) + implicit none + integer :: n + integer :: i + integer, dimension(n) :: a + + do concurrent(i=1:10) + a(i) = i + end do +end subroutine + +! CHECK-DAG: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFfooEi"} +! CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFfooEa"} + +! CHECK-DAG: %[[I_MAP:.*]] = omp.map.info var_ptr(%[[I_DECL]]#1 : {{.*}}) {{.*}} {name = "_QFfooEi"} +! CHECK-DAG: %[[A_MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#1 : {{.*}}) {{.*}} {name = "_QFfooEa"} +! CHECK-DAG: %[[N_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}) {{.*}} {name = "_QFfooEa.extent.dim0"} + +! CHECK: omp.target +! CHECK-SAME: map_entries( +! CHECK-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! CHECK-SAME: %[[I_MAP]] -> %[[I_ARG:arg[0-9]*]], +! CHECK-SAME: %[[A_MAP]] -> %[[A_ARG:arg[0-9]*]], +! CHECK-SAME: %[[N_MAP]] -> %[[N_ARG:arg[0-9]*]] : {{.*}}) +! CHECK-SAME: {{.*}} { + +! CHECK-DAG: %{{.*}} = hlfir.declare %[[I_ARG]] +! CHECK-DAG: %{{.*}} = hlfir.declare %[[A_ARG]] +! CHECK-DAG: %{{.*}} = fir.load %[[N_ARG]] + +! CHECK: omp.terminator +! 
CHECK: } diff --git a/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 b/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 new file mode 100644 index 0000000000000..2dada05396ad6 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 @@ -0,0 +1,68 @@ +! Tests that if `do concurrent` is indirectly nested in its parent loop, that we +! skip converting the indirectly nested `do concurrent` loop. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ +! RUN: | FileCheck %s --check-prefixes=HOST,COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s --check-prefixes=DEVICE,COMMON + +program main + integer, parameter :: n = 10 + integer, parameter :: m = 20 + integer, parameter :: l = 30 + integer x; + integer :: a(n, m, l) + + do concurrent(i=1:n) + do j=1,m + do concurrent(k=1:l) + a(i,j,k) = i * j + k + end do + end do + end do +end + +! HOST: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j", {{.*}}} +! HOST: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]] + +! DEVICE: omp.target {{.*}}map_entries( +! DEVICE-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[I_ARG:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[J_ARG:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[A_ARG:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^:]+}} : +! DEVICE-SAME: {{.*}}) { + +! DEVICE: %[[TARGET_J_DECL:.*]]:2 = hlfir.declare %[[J_ARG]] {uniq_name = "_QFEj"} + +! DEVICE: omp.teams + +! COMMON: omp.parallel { + +! DEVICE: omp.distribute + +! COMMON: omp.wsloop { +! COMMON: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} { +! COMMON: fir.do_loop {{.*}} iter_args(%[[J_IV:.*]] = {{.*}}) -> {{.*}} { +! 
HOST: fir.store %[[J_IV]] to %[[ORIG_J_DECL]]#0 +! DEVICE: fir.store %[[J_IV]] to %[[TARGET_J_DECL]]#0 + +! COMMON: fir.do_concurrent { +! COMMON: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"} +! COMMON: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]] +! COMMON: fir.do_concurrent.loop (%[[K_IV:.*]]) = {{.*}} { +! COMMON: %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32 +! COMMON: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0 +! COMMON: } +! COMMON: } +! COMMON: } +! COMMON: omp.yield +! COMMON: } +! COMMON: } +! COMMON: omp.terminator +! COMMON: } diff --git a/flang/test/Transforms/debug-dwarf-version.fir b/flang/test/Transforms/debug-dwarf-version.fir new file mode 100644 index 0000000000000..fe2700274ab87 --- /dev/null +++ b/flang/test/Transforms/debug-dwarf-version.fir @@ -0,0 +1,21 @@ +// RUN: fir-opt --add-debug-info="dwarf-version=5" --mlir-print-debuginfo %s \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF5 %s +// RUN: fir-opt --add-debug-info="dwarf-version=4" --mlir-print-debuginfo %s \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF4 %s +// RUN: fir-opt --add-debug-info="dwarf-version=3" --mlir-print-debuginfo %s \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF3 %s +// RUN: fir-opt --add-debug-info="dwarf-version=2" --mlir-print-debuginfo %s \ +// RUN: | FileCheck --check-prefix=CHECK-DWARF2 %s +// RUN: fir-opt --add-debug-info= --mlir-print-debuginfo %s \ +// RUN: | FileCheck --check-prefix=CHECK-WITHOUT-VERSION %s +// REQUIRES: system-linux + +module { +} loc(#loc) +#loc = loc("simple.f90":0:0) + +// CHECK-DWARF5: llvm.module_flags [#llvm.mlir.module_flag] +// CHECK-DWARF4: llvm.module_flags [#llvm.mlir.module_flag] +// CHECK-DWARF3: llvm.module_flags [#llvm.mlir.module_flag] +// CHECK-DWARF2: llvm.module_flags [#llvm.mlir.module_flag] +// CHECK-WITHOUT-VERSION-NOT: llvm.module_flags diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 840d4cec14bbf..00c4b2ec0f828 100644 
--- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -735,6 +735,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.rintf16 libc.src.math.roundevenf16 libc.src.math.roundf16 + libc.src.math.rsqrtf16 libc.src.math.scalblnf16 libc.src.math.scalbnf16 libc.src.math.setpayloadf16 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 653282c7d3935..89e3653186d13 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -749,6 +749,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.rintf16 libc.src.math.roundevenf16 libc.src.math.roundf16 + libc.src.math.rsqrtf16 libc.src.math.scalblnf16 libc.src.math.scalbnf16 libc.src.math.setpayloadf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 1fef16f190af6..0bb8a683c5b01 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -784,6 +784,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.rintf16 libc.src.math.roundevenf16 libc.src.math.roundf16 + libc.src.math.rsqrtf16 libc.src.math.scalblnf16 libc.src.math.scalbnf16 libc.src.math.setpayloadf16 diff --git a/libc/docs/headers/math/index.rst b/libc/docs/headers/math/index.rst index 6c0e2190808df..7d5b341ba674a 100644 --- a/libc/docs/headers/math/index.rst +++ b/libc/docs/headers/math/index.rst @@ -255,6 +255,7 @@ Basic Operations Higher Math Functions ===================== + +-----------+------------------+-----------------+------------------------+----------------------+------------------------+----------++------------+------------------------+----------------------------+ | | (float) | (double) | (long double) | (float16) | (float128) | (bfloat16) | C23 Definition Section | C23 Error Handling Section | 
+===========+==================+=================+========================+======================+========================+========================+========================+============================+ @@ -342,7 +343,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+------------------------+----------------------------+ | rootn | | | | | | | 7.12.7.8 | F.10.4.8 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+------------------------+----------------------------+ -| rsqrt | | | | | | | 7.12.7.9 | F.10.4.9 | +| rsqrt | | | | |check| | | | 7.12.7.9 | F.10.4.9 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+------------------------+----------------------------+ | sin | |check| | |check| | | |check| | | | 7.12.4.6 | F.10.1.6 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+------------------------+----------------------------+ @@ -363,6 +364,7 @@ Higher Math Functions | tgamma | | | | | | | 7.12.8.4 | F.10.5.4 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+------------------------+----------------------------+ + Legends: * |check| : correctly rounded for all 4 rounding modes. 
diff --git a/libc/include/math.yaml b/libc/include/math.yaml index 17f26fcfcb308..6c800a0e2aa28 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -2349,6 +2349,13 @@ functions: return_type: long double arguments: - type: long double + - name: rsqrtf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: scalbln standards: - stdc diff --git a/libc/shared/math.h b/libc/shared/math.h index 69d785b3e0291..4f20095912bf1 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -53,4 +53,6 @@ #include "math/ldexpf128.h" #include "math/ldexpf16.h" +#include "math/rsqrtf16.h" + #endif // LLVM_LIBC_SHARED_MATH_H diff --git a/libc/shared/math/rsqrtf16.h b/libc/shared/math/rsqrtf16.h new file mode 100644 index 0000000000000..54c7499214636 --- /dev/null +++ b/libc/shared/math/rsqrtf16.h @@ -0,0 +1,29 @@ +//===-- Shared rsqrtf16 function -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_RSQRTF16_H +#define LLVM_LIBC_SHARED_MATH_RSQRTF16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "shared/libc_common.h" +#include "src/__support/math/rsqrtf16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::rsqrtf16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_RSQRTF16_H diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index b6e87ac336fb2..0ef09a9b8c9d0 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -302,6 +302,7 @@ add_header_library( DEPENDS libc.hdr.stdint_proxy libc.src.__support.common + libc.src.string.memory_utils.inline_memcpy ) add_header_library( diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h index 5a997ef555702..8dbb30047faec 100644 --- a/libc/src/__support/CPP/bit.h +++ b/libc/src/__support/CPP/bit.h @@ -39,7 +39,7 @@ bit_cast(const From &from) { #if __has_builtin(__builtin_bit_cast) return __builtin_bit_cast(To, from); #else - To to; + To to{}; char *dst = reinterpret_cast(&to); const char *src = reinterpret_cast(&from); #if __has_builtin(__builtin_memcpy_inline) diff --git a/libc/src/__support/CPP/simd.h b/libc/src/__support/CPP/simd.h index 3c7e65acc3c0a..d2a5b17fa4b9f 100644 --- a/libc/src/__support/CPP/simd.h +++ b/libc/src/__support/CPP/simd.h @@ -57,6 +57,29 @@ using simd = T [[clang::ext_vector_type(N)]]; template using simd_mask = simd>; +// Type trait helpers. 
+template +struct simd_size : cpp::integral_constant { +}; +template constexpr size_t simd_size_v = simd_size::value; + +template struct is_simd : cpp::integral_constant {}; +template +struct is_simd> : cpp::integral_constant {}; +template constexpr bool is_simd_v = is_simd::value; + +template +struct is_simd_mask : cpp::integral_constant {}; +template +struct is_simd_mask> : cpp::integral_constant {}; +template constexpr bool is_simd_mask_v = is_simd_mask::value; + +template struct simd_element_type; +template struct simd_element_type> { + using type = T; +}; +template +using simd_element_type_t = typename simd_element_type::type; namespace internal { template @@ -123,34 +146,14 @@ LIBC_INLINE constexpr static auto split(cpp::simd x) { return result; } -} // namespace internal - -// Type trait helpers. +// Helper trait template -struct simd_size : cpp::integral_constant { -}; -template constexpr size_t simd_size_v = simd_size::value; - -template struct is_simd : cpp::integral_constant {}; -template -struct is_simd> : cpp::integral_constant {}; -template constexpr bool is_simd_v = is_simd::value; - -template -struct is_simd_mask : cpp::integral_constant {}; -template -struct is_simd_mask> : cpp::integral_constant {}; -template constexpr bool is_simd_mask_v = is_simd_mask::value; +using enable_if_integral_t = cpp::enable_if_t, T>; -template struct simd_element_type; -template struct simd_element_type> { - using type = T; -}; template -using simd_element_type_t = typename simd_element_type::type; +using enable_if_simd_t = cpp::enable_if_t, bool>; -template -using enable_if_simd_t = cpp::enable_if_t, T>; +} // namespace internal // Casting. template @@ -159,29 +162,34 @@ LIBC_INLINE constexpr static simd simd_cast(simd v) { } // SIMD mask operations. 
-template LIBC_INLINE constexpr static bool all_of(simd m) { - return __builtin_reduce_and(m); +template = 0> +LIBC_INLINE constexpr static bool all_of(simd v) { + return __builtin_reduce_and(simd_cast(v)); } -template LIBC_INLINE constexpr static bool any_of(simd m) { - return __builtin_reduce_or(m); +template = 0> +LIBC_INLINE constexpr static bool any_of(simd v) { + return __builtin_reduce_or(simd_cast(v)); } -template LIBC_INLINE constexpr static bool none_of(simd m) { - return !any_of(m); +template = 0> +LIBC_INLINE constexpr static bool none_of(simd v) { + return !any_of(v); } -template LIBC_INLINE constexpr static bool some_of(simd m) { - return any_of(m) && !all_of(m); +template = 0> +LIBC_INLINE constexpr static bool some_of(simd v) { + return any_of(v) && !all_of(v); } -template LIBC_INLINE constexpr static int popcount(simd m) { - return __builtin_popcountg(m); +template = 0> +LIBC_INLINE constexpr static int popcount(simd v) { + return __builtin_popcountg(v); } -template -LIBC_INLINE constexpr static int find_first_set(simd m) { - return __builtin_ctzg(m); +template = 0> +LIBC_INLINE constexpr static int find_first_set(simd v) { + return __builtin_ctzg(simd_cast(v)); } -template -LIBC_INLINE constexpr static int find_last_set(simd m) { - constexpr size_t size = simd_size_v>; - return size - 1 - __builtin_clzg(m); +template = 0> +LIBC_INLINE constexpr static int find_last_set(simd v) { + constexpr size_t size = simd_size_v>; + return size - 1 - __builtin_clzg(simd_cast(v)); } // Elementwise operations. @@ -279,33 +287,32 @@ LIBC_INLINE constexpr static T hmax(simd v) { } // Accessor helpers. 
-template -LIBC_INLINE enable_if_simd_t load_unaligned(const void *ptr) { +template = 0> +LIBC_INLINE T load_unaligned(const void *ptr) { T tmp; __builtin_memcpy(&tmp, ptr, sizeof(T)); return tmp; } -template -LIBC_INLINE enable_if_simd_t load_aligned(const void *ptr) { +template = 0> +LIBC_INLINE T load_aligned(const void *ptr) { return load_unaligned(__builtin_assume_aligned(ptr, alignof(T))); } -template -LIBC_INLINE enable_if_simd_t store_unaligned(T v, void *ptr) { +template = 0> +LIBC_INLINE T store_unaligned(T v, void *ptr) { __builtin_memcpy(ptr, &v, sizeof(T)); } -template -LIBC_INLINE enable_if_simd_t store_aligned(T v, void *ptr) { +template = 0> +LIBC_INLINE T store_aligned(T v, void *ptr) { store_unaligned(v, __builtin_assume_aligned(ptr, alignof(T))); } -template -LIBC_INLINE enable_if_simd_t +template = 0> +LIBC_INLINE T masked_load(simd> m, void *ptr, T passthru = internal::poison>()) { return __builtin_masked_load(m, ptr, passthru); } -template -LIBC_INLINE enable_if_simd_t masked_store(simd> m, T v, - void *ptr) { +template = 0> +LIBC_INLINE T masked_store(simd> m, T v, void *ptr) { __builtin_masked_store( m, v, static_cast(__builtin_assume_aligned(ptr, alignof(T)))); } diff --git a/libc/src/__support/CPP/type_traits/is_destructible.h b/libc/src/__support/CPP/type_traits/is_destructible.h index 830f22efafa52..7ada2235b4e73 100644 --- a/libc/src/__support/CPP/type_traits/is_destructible.h +++ b/libc/src/__support/CPP/type_traits/is_destructible.h @@ -22,7 +22,7 @@ namespace LIBC_NAMESPACE_DECL { namespace cpp { // is_destructible -#if __has_builtin(__is_destructible) +#if __has_builtin(__is_destructible) || defined(LIBC_COMPILER_IS_MSVC) template struct is_destructible : bool_constant<__is_destructible(T)> {}; #else diff --git a/libc/src/__support/CPP/type_traits/is_unsigned.h b/libc/src/__support/CPP/type_traits/is_unsigned.h index 3ae6337ceb50a..b4267eedd19fc 100644 --- a/libc/src/__support/CPP/type_traits/is_unsigned.h +++ 
b/libc/src/__support/CPP/type_traits/is_unsigned.h @@ -16,6 +16,8 @@ #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" +#include + namespace LIBC_NAMESPACE_DECL { namespace cpp { @@ -46,6 +48,10 @@ template struct is_unsigned { LIBC_INLINE constexpr bool operator()() const { return is_unsigned::value; } }; #endif // LIBC_COMPILER_HAS_FIXED_POINT +#if LIBC_HAS_VECTOR_TYPE +template +struct is_unsigned : bool_constant {}; +#endif template LIBC_INLINE_VAR constexpr bool is_unsigned_v = is_unsigned::value; diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h index 7bd56434e58fe..ef3f60a5b3d7f 100644 --- a/libc/src/__support/FPUtil/FEnvImpl.h +++ b/libc/src/__support/FPUtil/FEnvImpl.h @@ -18,6 +18,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/macros/properties/architectures.h" +#include "src/__support/macros/properties/compiler.h" #if defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP) #if defined(__APPLE__) @@ -29,9 +30,12 @@ // The extra !defined(APPLE) condition is to cause x86_64 MacOS builds to use // the dummy implementations below. Once a proper x86_64 darwin fenv is set up, // the apple condition here should be removed. -#elif defined(LIBC_TARGET_ARCH_IS_X86) && !defined(__APPLE__) +// TODO: fully support fenv for MSVC. 
+#elif defined(LIBC_TARGET_ARCH_IS_X86) && !defined(__APPLE__) && \ + !defined(LIBC_COMPILER_IS_MSVC) #include "x86_64/FEnvImpl.h" -#elif defined(LIBC_TARGET_ARCH_IS_ARM) && defined(__ARM_FP) +#elif defined(LIBC_TARGET_ARCH_IS_ARM) && defined(__ARM_FP) && \ + !defined(LIBC_COMPILER_IS_MSVC) #include "arm/FEnvImpl.h" #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV) && defined(__riscv_flen) #include "riscv/FEnvImpl.h" diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 2f695c1583755..ce4925bae125a 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -789,16 +789,16 @@ struct FPRep : public FPRepImpl> { // Returns the FPType corresponding to C++ type T on the host. template LIBC_INLINE static constexpr FPType get_fp_type() { using UnqualT = cpp::remove_cv_t; - if constexpr (cpp::is_same_v && __FLT_MANT_DIG__ == 24) + if constexpr (cpp::is_same_v && FLT_MANT_DIG == 24) return FPType::IEEE754_Binary32; - else if constexpr (cpp::is_same_v && __DBL_MANT_DIG__ == 53) + else if constexpr (cpp::is_same_v && DBL_MANT_DIG == 53) return FPType::IEEE754_Binary64; else if constexpr (cpp::is_same_v) { - if constexpr (__LDBL_MANT_DIG__ == 53) + if constexpr (LDBL_MANT_DIG == 53) return FPType::IEEE754_Binary64; - else if constexpr (__LDBL_MANT_DIG__ == 64) + else if constexpr (LDBL_MANT_DIG == 64) return FPType::X86_Binary80; - else if constexpr (__LDBL_MANT_DIG__ == 113) + else if constexpr (LDBL_MANT_DIG == 113) return FPType::IEEE754_Binary128; } #if defined(LIBC_TYPES_HAS_FLOAT16) diff --git a/libc/src/__support/arg_list.h b/libc/src/__support/arg_list.h index 1e26a5e8ef9c7..7b78a9c0fe619 100644 --- a/libc/src/__support/arg_list.h +++ b/libc/src/__support/arg_list.h @@ -12,6 +12,7 @@ #include "hdr/stdint_proxy.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memcpy.h" #include #include @@ -126,7 +127,7 @@ template class StructArgList { 
// Memcpy because pointer alignment may be illegal given a packed struct. T val; - __builtin_memcpy(&val, ptr, sizeof(T)); + inline_memcpy(&val, ptr, sizeof(T)); ptr = reinterpret_cast(reinterpret_cast(ptr) + sizeof(T)); diff --git a/libc/src/__support/endian_internal.h b/libc/src/__support/endian_internal.h index c78090ad85e05..07cde7b905c4d 100644 --- a/libc/src/__support/endian_internal.h +++ b/libc/src/__support/endian_internal.h @@ -16,13 +16,51 @@ namespace LIBC_NAMESPACE_DECL { // We rely on compiler preprocessor defines to allow for cross compilation. +#ifdef LIBC_COMPILER_IS_MSVC +#define __BYTE_ORDER__ 0 +#define __ORDER_LITTLE_ENDIAN__ 0 +#define __ORDER_BIG_ENDIAN__ 1 +#else // !LIBC_COMPILER_IS_MSVC #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ !defined(__ORDER_BIG_ENDIAN__) #error "Missing preprocessor definitions for endianness detection." #endif +#endif // LIBC_COMPILER_IS_MSVC namespace internal { +template LIBC_INLINE T byte_swap(T value); + +template <> LIBC_INLINE uint16_t byte_swap(uint16_t value) { +#if __has_builtin(__builtin_bswap16) + return __builtin_bswap16(value); +#else + return (value << 8) | (value >> 8); +#endif // __builtin_bswap16 +} + +template <> LIBC_INLINE uint32_t byte_swap(uint32_t value) { +#if __has_builtin(__builtin_bswap32) + return __builtin_bswap32(value); +#else + return byte_swap(static_cast(value >> 16)) || + (static_cast( + byte_swap(static_cast(value))) + << 16); +#endif // __builtin_bswap64 +} + +template <> LIBC_INLINE uint64_t byte_swap(uint64_t value) { +#if __has_builtin(__builtin_bswap64) + return __builtin_bswap64(value); +#else + return byte_swap(static_cast(value >> 32)) || + (static_cast( + byte_swap(static_cast(value))) + << 32); +#endif // __builtin_bswap64 +} + // Converts uint8_t, uint16_t, uint32_t, uint64_t to its big or little endian // counterpart. 
// We use explicit template specialization: @@ -53,7 +91,7 @@ template <> template <> LIBC_INLINE uint16_t Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian(uint16_t v) { - return __builtin_bswap16(v); + return byte_swap(v); } template <> template <> @@ -65,7 +103,7 @@ template <> template <> LIBC_INLINE uint32_t Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian(uint32_t v) { - return __builtin_bswap32(v); + return byte_swap(v); } template <> template <> @@ -77,7 +115,7 @@ template <> template <> LIBC_INLINE uint64_t Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian(uint64_t v) { - return __builtin_bswap64(v); + return byte_swap(v); } template <> template <> @@ -109,7 +147,7 @@ template <> template <> LIBC_INLINE uint16_t Endian<__ORDER_BIG_ENDIAN__>::to_little_endian(uint16_t v) { - return __builtin_bswap16(v); + return byte_swap(v); } template <> template <> @@ -121,7 +159,7 @@ template <> template <> LIBC_INLINE uint32_t Endian<__ORDER_BIG_ENDIAN__>::to_little_endian(uint32_t v) { - return __builtin_bswap32(v); + return byte_swap(v); } template <> template <> @@ -133,7 +171,7 @@ template <> template <> LIBC_INLINE uint64_t Endian<__ORDER_BIG_ENDIAN__>::to_little_endian(uint64_t v) { - return __builtin_bswap64(v); + return byte_swap(v); } } // namespace internal diff --git a/libc/src/__support/macros/config.h b/libc/src/__support/macros/config.h index 685188893e7b7..b06a890c9c13c 100644 --- a/libc/src/__support/macros/config.h +++ b/libc/src/__support/macros/config.h @@ -44,6 +44,9 @@ #endif #define __builtin_expect(value, expectation) (value) +#define __builtin_unreachable() __assume(0) + +#define __builtin_prefetch(X, Y, Z) #endif // LIBC_COMPILER_IS_MSVC diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 39dc0e57f4472..ed5f314b0a9b5 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -109,6 +109,22 @@ add_header_library( libc.src.__support.macros.properties.types ) + 
+add_header_library( + rsqrtf16 + HDRS + rsqrtf16.h + DEPENDS + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.manipulation_functions + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types +) + add_header_library( asin_utils HDRS diff --git a/libc/src/__support/math/rsqrtf16.h b/libc/src/__support/math/rsqrtf16.h new file mode 100644 index 0000000000000..30ab58f8a5798 --- /dev/null +++ b/libc/src/__support/math/rsqrtf16.h @@ -0,0 +1,86 @@ +//===-- Implementation header for rsqrtf16 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_RSQRTF16_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_RSQRTF16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { +namespace math { + +LIBC_INLINE static constexpr float16 rsqrtf16(float16 x) { + using FPBits = fputil::FPBits; + FPBits xbits(x); + + uint16_t x_u = xbits.uintval(); + uint16_t x_abs = x_u & 0x7fff; + + constexpr uint16_t INF_BIT = FPBits::inf().uintval(); + + // x is 0, inf/nan, or negative. 
+ if (LIBC_UNLIKELY(x_u == 0 || x_u >= INF_BIT)) { + // x is NaN + if (x_abs > INF_BIT) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + return x; + } + + // |x| = 0 + if (x_abs == 0) { + fputil::raise_except_if_required(FE_DIVBYZERO); + fputil::set_errno_if_required(ERANGE); + return FPBits::inf(xbits.sign()).get_val(); + } + + // -inf <= x < 0 + if (x_u > 0x7fff) { + fputil::raise_except_if_required(FE_INVALID); + fputil::set_errno_if_required(EDOM); + return FPBits::quiet_nan().get_val(); + } + + // x = +inf => rsqrt(x) = 0 + return FPBits::zero().get_val(); + } + + // TODO: add integer based implementation when LIBC_TARGET_CPU_HAS_FPU_FLOAT + // is not defined + float result = 1.0f / fputil::sqrt(fputil::cast(x)); + + // Targeted post-corrections to ensure correct rounding in half for specific + // mantissa patterns + const uint16_t half_mantissa = x_abs & 0x3ff; + if (LIBC_UNLIKELY(half_mantissa == 0x011F)) { + result = fputil::multiply_add(result, 0x1.0p-21f, result); + } else if (LIBC_UNLIKELY(half_mantissa == 0x0313)) { + result = fputil::multiply_add(result, -0x1.0p-21f, result); + } + + return fputil::cast(result); +} + +} // namespace math +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_RSQRTF16_H diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h index 954bcb1b6ef89..d4dc6dcb4acf6 100644 --- a/libc/src/__support/math_extras.h +++ b/libc/src/__support/math_extras.h @@ -55,13 +55,23 @@ mask_leading_zeros() { // Returns whether 'a + b' overflows, the result is stored in 'res'. 
template [[nodiscard]] LIBC_INLINE constexpr bool add_overflow(T a, T b, T &res) { +#if __has_builtin(__builtin_add_overflow) return __builtin_add_overflow(a, b, &res); +#else + res = a + b; + return (res < a) || (res < b); +#endif // __builtin_add_overflow } // Returns whether 'a - b' overflows, the result is stored in 'res'. template [[nodiscard]] LIBC_INLINE constexpr bool sub_overflow(T a, T b, T &res) { +#if __has_builtin(__builtin_sub_overflow) return __builtin_sub_overflow(a, b, &res); +#else + res = a - b; + return (res > a); +#endif // __builtin_sub_overflow } #define RETURN_IF(TYPE, BUILTIN) \ diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index e418a8b0e24b9..a6f400c873b7e 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -516,6 +516,8 @@ add_math_entrypoint_object(roundevenf16) add_math_entrypoint_object(roundevenf128) add_math_entrypoint_object(roundevenbf16) +add_math_entrypoint_object(rsqrtf16) + add_math_entrypoint_object(scalbln) add_math_entrypoint_object(scalblnf) add_math_entrypoint_object(scalblnl) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 263c5dfd0832b..ca7baeccae01a 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -973,7 +973,7 @@ add_entrypoint_object( ) add_entrypoint_object( - roundevenbf16 + roundevenbf16 SRCS roundevenbf16.cpp HDRS @@ -988,6 +988,17 @@ add_entrypoint_object( ROUND_OPT ) +add_entrypoint_object( + rsqrtf16 + SRCS + rsqrtf16.cpp + HDRS + ../rsqrtf16.h + DEPENDS + libc.src.__support.math.rsqrtf16 + libc.src.errno.errno +) + add_entrypoint_object( lround SRCS diff --git a/libc/src/math/generic/rsqrtf16.cpp b/libc/src/math/generic/rsqrtf16.cpp new file mode 100644 index 0000000000000..fb166b131d673 --- /dev/null +++ b/libc/src/math/generic/rsqrtf16.cpp @@ -0,0 +1,15 @@ +//===-- Half-precision rsqrt function -------------------------------------===// +// +// Part of the LLVM 
Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. +// +//===----------------------------------------------------------------------===// + +#include "src/math/rsqrtf16.h" +#include "src/__support/math/rsqrtf16.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float16, rsqrtf16, (float16 x)) { return math::rsqrtf16(x); } +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/rsqrtf16.h b/libc/src/math/rsqrtf16.h new file mode 100644 index 0000000000000..c88ab5256ce88 --- /dev/null +++ b/libc/src/math/rsqrtf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for rsqrtf16 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_RSQRTF16_H +#define LLVM_LIBC_SRC_MATH_RSQRTF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 rsqrtf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_RSQRTF16_H diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index 76eb0a2fdaaa5..ee66145e60156 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -112,6 +112,7 @@ add_header_library( libc.src.__support.libc_assert libc.src.__support.uint128 libc.src.__support.StringUtil.error_to_string + libc.src.string.memory_utils.inline_memcpy ) add_header_library( diff --git a/libc/src/stdio/printf_core/float_dec_converter_limited.h b/libc/src/stdio/printf_core/float_dec_converter_limited.h index f468dbc8e2ae8..9cdc13573d320 
100644 --- a/libc/src/stdio/printf_core/float_dec_converter_limited.h +++ b/libc/src/stdio/printf_core/float_dec_converter_limited.h @@ -53,6 +53,7 @@ #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/float_inf_nan_converter.h" #include "src/stdio/printf_core/writer.h" +#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { namespace printf_core { @@ -250,7 +251,7 @@ DigitsOutput decimal_digits(DigitsInput input, int precision, bool e_mode) { // there's space for it in the DigitsOutput buffer). DigitsOutput output; output.ndigits = view.size(); - __builtin_memcpy(output.digits, view.data(), output.ndigits); + inline_memcpy(output.digits, view.data(), output.ndigits); // Set up the output exponent, which is done differently depending on mode. // Also, figure out whether we have one digit too many, and if so, set the @@ -551,7 +552,7 @@ convert_float_inner(Writer *writer, const FormatSection &to_conv, cpp::string_view expview = expcvt.view(); expbuf[0] = internal::islower(to_conv.conv_name) ? 
'e' : 'E'; explen = expview.size() + 1; - __builtin_memcpy(expbuf + 1, expview.data(), expview.size()); + inline_memcpy(expbuf + 1, expview.data(), expview.size()); } // Now we know enough to work out the length of the unpadded output: diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index aa653c38a8c3f..c464f82dcbda7 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -292,6 +292,7 @@ add_header_library( libc.hdr.stdint_proxy libc.include.stdlib libc.src.__support.CPP.cstddef + libc.src.string.memory_utils.inline_memcpy ) add_entrypoint_object( diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index 739fce88ab75d..4f9774088fbd3 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -12,6 +12,7 @@ #include "hdr/stdint_proxy.h" #include "src/__support/CPP/cstddef.h" #include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -54,9 +55,9 @@ class ArrayGenericSize { const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem); while (elem_i != elem_i_block_end) { - __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE); - __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE); - __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE); + inline_memcpy(tmp_block, elem_i, BLOCK_SIZE); + inline_memcpy(elem_i, elem_j, BLOCK_SIZE); + inline_memcpy(elem_j, tmp_block, BLOCK_SIZE); elem_i += BLOCK_SIZE; elem_j += BLOCK_SIZE; @@ -112,9 +113,9 @@ template class ArrayFixedSize { cpp::byte *elem_i = get_internal(i); cpp::byte *elem_j = get_internal(j); - __builtin_memcpy(tmp, elem_i, ELEM_SIZE); + inline_memcpy(tmp, elem_i, ELEM_SIZE); __builtin_memmove(elem_i, elem_j, ELEM_SIZE); - __builtin_memcpy(elem_j, tmp, ELEM_SIZE); + inline_memcpy(elem_j, tmp, ELEM_SIZE); } LIBC_INLINE size_t len() const { return array_len; } diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 
5c9f622d44397..b8cdb2a7d3538 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -22,6 +22,7 @@ add_header_library( libc.src.__support.CPP.type_traits libc.src.__support.CPP.simd libc.src.__support.common + libc.src.string.memory_utils.inline_memcpy ${string_config_options} ) diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt index 670db30129572..9cabfb9318012 100644 --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -42,6 +42,7 @@ add_header_library( libc.src.__support.macros.config libc.src.__support.macros.optimization libc.src.__support.macros.properties.architectures + libc.src.__support.macros.properties.compiler ) add_header_library( diff --git a/libc/src/string/memory_utils/generic/inline_strlen.h b/libc/src/string/memory_utils/generic/inline_strlen.h index 68fba2afb3a5c..5e553e301d4da 100644 --- a/libc/src/string/memory_utils/generic/inline_strlen.h +++ b/libc/src/string/memory_utils/generic/inline_strlen.h @@ -33,14 +33,14 @@ string_length(const char *src) { __builtin_align_down(src, alignment)); cpp::simd chars = cpp::load_aligned>(aligned); - cpp::simd_mask mask = cpp::simd_cast(chars == null_byte); + cpp::simd_mask mask = chars == null_byte; size_t offset = src - reinterpret_cast(aligned); if (cpp::any_of(shift_mask(mask, offset))) return cpp::find_first_set(shift_mask(mask, offset)); for (;;) { cpp::simd chars = cpp::load_aligned>(++aligned); - cpp::simd_mask mask = cpp::simd_cast(chars == null_byte); + cpp::simd_mask mask = chars == null_byte; if (cpp::any_of(mask)) return (reinterpret_cast(aligned) - src) + cpp::find_first_set(mask); diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 37603410e3a51..010f2187a4ffd 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -31,6 +31,7 @@ #include 
"src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL #include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/compiler.h" #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT64 #include "src/string/memory_utils/op_builtin.h" #include "src/string/memory_utils/utils.h" @@ -39,12 +40,22 @@ static_assert((UINTPTR_MAX == 4294967295U) || (UINTPTR_MAX == 18446744073709551615UL), "We currently only support 32- or 64-bit platforms"); +#ifdef LIBC_COMPILER_IS_MSVC + +namespace LIBC_NAMESPACE_DECL { +using generic_v128 = __m128i; +using generic_v256 = __m256i; +using generic_v512 = __m512i; +} // namespace LIBC_NAMESPACE_DECL + +#else namespace LIBC_NAMESPACE_DECL { // Compiler types using the vector attributes. using generic_v128 = uint8_t __attribute__((__vector_size__(16))); using generic_v256 = uint8_t __attribute__((__vector_size__(32))); using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace LIBC_NAMESPACE_DECL +#endif // LIBC_COMPILER_IS_MSVC namespace LIBC_NAMESPACE_DECL { namespace generic { diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h index 8bd84120c4ffa..1b4052747552d 100644 --- a/libc/src/string/memory_utils/op_x86.h +++ b/libc/src/string/memory_utils/op_x86.h @@ -15,6 +15,7 @@ #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL #include "src/__support/macros/properties/architectures.h" +#include "src/__support/macros/properties/compiler.h" #if defined(LIBC_TARGET_ARCH_IS_X86) @@ -57,7 +58,12 @@ LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__); // Memcpy repmovsb implementation struct Memcpy { LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) { +#ifdef LIBC_COMPILER_IS_MSVC + __movsb(static_cast(dst), + static_cast(src), count); +#else asm volatile("rep 
movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); +#endif // LIBC_COMPILER_IS_MSVC } }; @@ -138,8 +144,10 @@ LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, // When we use these SIMD types in template specialization GCC complains: // "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]" // Therefore, we disable this warning in this file. +#ifndef LIBC_COMPILER_IS_MSVC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-attributes" +#endif // !LIBC_COMPILER_IS_MSVC /////////////////////////////////////////////////////////////////////////////// // Specializations for __m128i @@ -366,7 +374,9 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) { } #endif // __AVX512BW__ +#ifndef LIBC_COMPILER_IS_MSVC #pragma GCC diagnostic pop +#endif // !LIBC_COMPILER_IS_MSVC } // namespace generic } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 0f9c9e36a3dcd..86ff4f12e8c26 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -17,6 +17,7 @@ #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL #include "src/__support/macros/properties/architectures.h" +#include "src/__support/macros/properties/compiler.h" #include // size_t @@ -90,13 +91,17 @@ LIBC_INLINE void memcpy_inline(void *__restrict dst, // different value of the Size parameter. This doesn't play well with GCC's // Value Range Analysis that wrongly detects out of bounds accesses. We // disable these warnings for the purpose of this function. 
+#ifndef LIBC_COMPILER_IS_MSVC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #pragma GCC diagnostic ignored "-Wstringop-overread" #pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif // !LIBC_COMPILER_IS_MSVC for (size_t i = 0; i < Size; ++i) static_cast(dst)[i] = static_cast(src)[i]; +#ifndef LIBC_COMPILER_IS_MSVC #pragma GCC diagnostic pop +#endif // !LIBC_COMPILER_IS_MSVC #endif } diff --git a/libc/src/string/stpcpy.cpp b/libc/src/string/stpcpy.cpp index 48c0db950ace0..fefae81172585 100644 --- a/libc/src/string/stpcpy.cpp +++ b/libc/src/string/stpcpy.cpp @@ -8,6 +8,7 @@ #include "src/string/stpcpy.h" #include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memcpy.h" #include "src/string/string_utils.h" #include "src/__support/common.h" @@ -17,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(char *, stpcpy, (char *__restrict dest, const char *__restrict src)) { size_t size = internal::string_length(src) + 1; - __builtin_memcpy(dest, src, size); + inline_memcpy(dest, src, size); char *result = dest + size; if (result != nullptr) diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h index 10803488b6cf5..9d636d02f4756 100644 --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -21,6 +21,7 @@ #include "src/__support/CPP/type_traits.h" // cpp::is_same_v #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/string/memory_utils/inline_memcpy.h" #if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ) #if LIBC_HAS_VECTOR_TYPE @@ -242,7 +243,7 @@ LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src, if (!size) return len; size_t n = len < size - 1 ? 
len : size - 1; - __builtin_memcpy(dst, src, n); + inline_memcpy(dst, src, n); dst[n] = '\0'; return len; } diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 9ba0a06c57b7f..adde382bf0950 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -452,6 +452,7 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros + libc.src.string.memory_utils.inline_memcpy ) add_entrypoint_object( diff --git a/libc/src/wchar/wcpcpy.cpp b/libc/src/wchar/wcpcpy.cpp index 9e2b12f09eb05..b6d80d4d671d9 100644 --- a/libc/src/wchar/wcpcpy.cpp +++ b/libc/src/wchar/wcpcpy.cpp @@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wcpcpy, (wchar_t *__restrict s1, const wchar_t *__restrict s2)) { size_t size = internal::string_length(s2); - __builtin_memcpy(s1, s2, (size + 1) * sizeof(wchar_t)); + inline_memcpy(s1, s2, (size + 1) * sizeof(wchar_t)); wchar_t *result = s1 + size; return result; } diff --git a/libc/src/wchar/wcscpy.cpp b/libc/src/wchar/wcscpy.cpp index 01ba994cecbb2..703706e6a7be8 100644 --- a/libc/src/wchar/wcscpy.cpp +++ b/libc/src/wchar/wcscpy.cpp @@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wcscpy, (wchar_t *__restrict s1, const wchar_t *__restrict s2)) { size_t size = internal::string_length(s2) + 1; - __builtin_memcpy(s1, s2, size * sizeof(wchar_t)); + inline_memcpy(s1, s2, size * sizeof(wchar_t)); return s1; } diff --git a/libc/src/wchar/wmemcpy.cpp b/libc/src/wchar/wmemcpy.cpp index bf92309b20944..56708d6cee496 100644 --- a/libc/src/wchar/wmemcpy.cpp +++ b/libc/src/wchar/wmemcpy.cpp @@ -12,13 +12,14 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wmemcpy, (wchar_t *__restrict s1, const wchar_t *__restrict s2, size_t n)) { - __builtin_memcpy(s1, s2, n * 
sizeof(wchar_t)); + inline_memcpy(s1, s2, n * sizeof(wchar_t)); return s1; } diff --git a/libc/src/wchar/wmempcpy.cpp b/libc/src/wchar/wmempcpy.cpp index 21e16210a757a..d8b89c0a88d05 100644 --- a/libc/src/wchar/wmempcpy.cpp +++ b/libc/src/wchar/wmempcpy.cpp @@ -11,13 +11,14 @@ #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" #include "src/__support/common.h" +#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wmempcpy, (wchar_t *__restrict to, const wchar_t *__restrict from, size_t size)) { - __builtin_memcpy(to, from, size * sizeof(wchar_t)); + inline_memcpy(to, from, size * sizeof(wchar_t)); return reinterpret_cast(to) + size; } diff --git a/libc/startup/baremetal/fini.h b/libc/startup/baremetal/fini.h index 74e9601983a33..605d4920f9704 100644 --- a/libc/startup/baremetal/fini.h +++ b/libc/startup/baremetal/fini.h @@ -7,6 +7,10 @@ //===----------------------------------------------------------------------===// #include "hdr/stdint_proxy.h" +#include "src/__support/macros/config.h" + +// NOTE: The namespace is necessary here to set the correct symbol visibility. +namespace LIBC_NAMESPACE_DECL { extern "C" { extern uintptr_t __fini_array_start[]; @@ -14,3 +18,5 @@ extern uintptr_t __fini_array_end[]; void __libc_fini_array(void); } // extern "C" + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/startup/baremetal/init.h b/libc/startup/baremetal/init.h index 6b545db3976da..31497aefa170a 100644 --- a/libc/startup/baremetal/init.h +++ b/libc/startup/baremetal/init.h @@ -7,6 +7,10 @@ //===----------------------------------------------------------------------===// #include "hdr/stdint_proxy.h" +#include "src/__support/macros/config.h" + +// NOTE: The namespace is necessary here to set the correct symbol visibility. 
+namespace LIBC_NAMESPACE_DECL { extern "C" { extern uintptr_t __preinit_array_start[]; @@ -16,3 +20,5 @@ extern uintptr_t __init_array_end[]; void __libc_init_array(void); } // extern "C" + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h index 24c007d2e12e6..4a03f7aa6318b 100644 --- a/libc/test/IntegrationTest/test.h +++ b/libc/test/IntegrationTest/test.h @@ -68,9 +68,9 @@ //////////////////////////////////////////////////////////////////////////////// // Errno checks. -#define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast(libc_errno)) -#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast(libc_errno)) -#define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast(libc_errno)) +#define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast(errno)) +#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast(errno)) +#define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast(errno)) // Integration tests are compiled with -ffreestanding which stops treating // the main function as a non-overloadable special function. 
Hence, we use a diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt index f1a83fc601e5e..31d1e9dce8204 100644 --- a/libc/test/UnitTest/CMakeLists.txt +++ b/libc/test/UnitTest/CMakeLists.txt @@ -76,6 +76,7 @@ add_unittest_framework_library( libc.src.__support.CPP.string_view libc.src.__support.CPP.type_traits libc.src.__support.fixed_point.fx_rep + libc.src.__support.macros.properties.compiler libc.src.__support.macros.properties.types libc.src.__support.OSUtil.osutil libc.src.__support.uint128 diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h index 4b7ff452f409c..5b1bc9441d830 100644 --- a/libc/test/UnitTest/ErrnoCheckingTest.h +++ b/libc/test/UnitTest/ErrnoCheckingTest.h @@ -13,6 +13,21 @@ #include "src/__support/macros/config.h" #include "test/UnitTest/Test.h" +// Define macro to validate the value stored in the errno and restore it +// to zero. + +#define ASSERT_ERRNO_EQ(VAL) \ + do { \ + ASSERT_EQ(VAL, static_cast(libc_errno)); \ + libc_errno = 0; \ + } while (0) +#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast(libc_errno)) +#define ASSERT_ERRNO_FAILURE() \ + do { \ + ASSERT_NE(0, static_cast(libc_errno)); \ + libc_errno = 0; \ + } while (0) + namespace LIBC_NAMESPACE_DECL { namespace testing { diff --git a/libc/test/UnitTest/LibcTest.h b/libc/test/UnitTest/LibcTest.h index fbeafd0bacb75..cf098cdd7a49a 100644 --- a/libc/test/UnitTest/LibcTest.h +++ b/libc/test/UnitTest/LibcTest.h @@ -30,6 +30,7 @@ #include "src/__support/CPP/string_view.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/c_string.h" +#include "src/__support/macros/properties/compiler.h" #include "test/UnitTest/ExecuteFunction.h" #include "test/UnitTest/TestLogger.h" @@ -260,7 +261,11 @@ constexpr char const *GetPrettyFunctionParamType(char const *str) { // This function recovers ParamType at compile time by using __PRETTY_FUNCTION__ // It can be customized by using the REGISTER_TYPE_NAME macro 
below. template static constexpr const char *GetTypeName() { +#ifdef LIBC_COMPILER_IS_MSVC + return GetPrettyFunctionParamType(__FUNCSIG__); +#else return GetPrettyFunctionParamType(__PRETTY_FUNCTION__); +#endif // LIBC_COMPILER_IS_MSVC } template diff --git a/libc/test/UnitTest/Test.h b/libc/test/UnitTest/Test.h index e70fc51869624..6643e3882fd2b 100644 --- a/libc/test/UnitTest/Test.h +++ b/libc/test/UnitTest/Test.h @@ -37,21 +37,6 @@ #include "LibcTest.h" #endif -// These are defined the same way for each framework, in terms of the macros -// they all provide. - -#define ASSERT_ERRNO_EQ(VAL) \ - do { \ - ASSERT_EQ(VAL, static_cast(libc_errno)); \ - libc_errno = 0; \ - } while (0) -#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast(libc_errno)) -#define ASSERT_ERRNO_FAILURE() \ - do { \ - ASSERT_NE(0, static_cast(libc_errno)); \ - libc_errno = 0; \ - } while (0) - // Some macro utility to append file names with LIBC_TEST macro's value to be // used in stdio tests. #undef STR diff --git a/libc/test/integration/src/pthread/pthread_create_test.cpp b/libc/test/integration/src/pthread/pthread_create_test.cpp index aecbad6514aaa..abd348e707c09 100644 --- a/libc/test/integration/src/pthread/pthread_create_test.cpp +++ b/libc/test/integration/src/pthread/pthread_create_test.cpp @@ -29,10 +29,9 @@ #include "src/__support/CPP/new.h" #include "src/__support/threads/thread.h" -#include "src/__support/libc_errno.h" - #include "test/IntegrationTest/test.h" +#include #include // For EXEC_PAGESIZE. 
#include @@ -332,7 +331,7 @@ static void run_failure_tests() { } TEST_MAIN() { - libc_errno = 0; + errno = 0; run_success_tests(); run_failure_tests(); return 0; diff --git a/libc/test/integration/src/pthread/pthread_join_test.cpp b/libc/test/integration/src/pthread/pthread_join_test.cpp index 5d0bcd8e23658..6dea99de1a64f 100644 --- a/libc/test/integration/src/pthread/pthread_join_test.cpp +++ b/libc/test/integration/src/pthread/pthread_join_test.cpp @@ -9,9 +9,9 @@ #include "src/pthread/pthread_create.h" #include "src/pthread/pthread_join.h" -#include "src/__support/libc_errno.h" - #include "test/IntegrationTest/test.h" + +#include #include static void *simpleFunc(void *) { return nullptr; } @@ -25,7 +25,7 @@ static void nullJoinTest() { } TEST_MAIN() { - libc_errno = 0; + errno = 0; nullJoinTest(); return 0; } diff --git a/libc/test/integration/src/pthread/pthread_name_test.cpp b/libc/test/integration/src/pthread/pthread_name_test.cpp index 343a22356593a..d2a5ffc544ec9 100644 --- a/libc/test/integration/src/pthread/pthread_name_test.cpp +++ b/libc/test/integration/src/pthread/pthread_name_test.cpp @@ -8,7 +8,6 @@ #include "hdr/stdint_proxy.h" // uintptr_t #include "src/__support/CPP/string_view.h" -#include "src/__support/libc_errno.h" #include "src/pthread/pthread_create.h" #include "src/pthread/pthread_getname_np.h" #include "src/pthread/pthread_join.h" @@ -20,6 +19,7 @@ #include "src/pthread/pthread_setname_np.h" #include "test/IntegrationTest/test.h" +#include #include using string_view = LIBC_NAMESPACE::cpp::string_view; diff --git a/libc/test/integration/src/unistd/getcwd_test.cpp b/libc/test/integration/src/unistd/getcwd_test.cpp index 1b321b01e9315..7b87a8f0ed41c 100644 --- a/libc/test/integration/src/unistd/getcwd_test.cpp +++ b/libc/test/integration/src/unistd/getcwd_test.cpp @@ -7,12 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" -#include "src/__support/libc_errno.h" 
#include "src/stdlib/getenv.h" #include "src/unistd/getcwd.h" #include "test/IntegrationTest/test.h" +#include #include // For malloc and free using LIBC_NAMESPACE::cpp::string_view; @@ -31,13 +31,12 @@ TEST_MAIN(int argc, char **argv, char **envp) { cwd = LIBC_NAMESPACE::getcwd(buffer, 0); ASSERT_TRUE(cwd == nullptr); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; // Insufficient size + errno = 0; cwd = LIBC_NAMESPACE::getcwd(buffer, 2); ASSERT_TRUE(cwd == nullptr); - int err = libc_errno; - ASSERT_EQ(err, ERANGE); + ASSERT_ERRNO_EQ(ERANGE); return 0; } diff --git a/libc/test/integration/startup/linux/tls_test.cpp b/libc/test/integration/startup/linux/tls_test.cpp index de3bd06c39cf6..688a94bdeb6fb 100644 --- a/libc/test/integration/startup/linux/tls_test.cpp +++ b/libc/test/integration/startup/linux/tls_test.cpp @@ -6,10 +6,10 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/sys/mman/mmap.h" #include "test/IntegrationTest/test.h" +#include #include constexpr int threadLocalDataSize = 101; diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt index 48241d3f55287..495d6f0a81a4c 100644 --- a/libc/test/shared/CMakeLists.txt +++ b/libc/test/shared/CMakeLists.txt @@ -48,4 +48,6 @@ add_fp_unittest( libc.src.__support.math.ldexpf libc.src.__support.math.ldexpf128 libc.src.__support.math.ldexpf16 + libc.src.__support.math.rsqrtf16 + ) diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index 2e5a2d51146d4..aa459f88c29f5 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -17,6 +17,8 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) { EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::acoshf16(1.0f16)); EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::acospif16(1.0f16)); + EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::rsqrtf16(1.0f16)); + EXPECT_FP_EQ(0x0p+0f16, 
LIBC_NAMESPACE::shared::asinf16(0.0f16)); EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::asinhf16(0.0f16)); EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::atanf16(0.0f16)); diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 5d1d0e0e5316b..a02514106a307 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -123,6 +123,8 @@ add_libc_test( str_to_float_test.cpp str_to_double_test.cpp str_to_long_double_test.cpp + HDRS + str_to_fp_test.h DEPENDS libc.src.__support.integer_literals libc.src.__support.str_to_float diff --git a/libc/test/src/__support/CPP/simd_test.cpp b/libc/test/src/__support/CPP/simd_test.cpp index b4f5685e3b1d1..c8f34df8ab028 100644 --- a/libc/test/src/__support/CPP/simd_test.cpp +++ b/libc/test/src/__support/CPP/simd_test.cpp @@ -64,23 +64,25 @@ TEST(LlvmLibcSIMDTest, MaskOperations) { EXPECT_TRUE(cpp::any_of(mask)); EXPECT_FALSE(cpp::all_of(mask)); + EXPECT_FALSE(cpp::none_of(mask)); EXPECT_TRUE(cpp::some_of(mask)); EXPECT_EQ(cpp::find_first_set(mask), 0); EXPECT_EQ(cpp::find_last_set(mask), 2); + EXPECT_EQ(cpp::popcount(mask), 2); } TEST(LlvmLibcSIMDTest, SplitConcat) { cpp::simd v{1, 1, 2, 2, 3, 3, 4, 4}; auto [v1, v2, v3, v4] = cpp::split<2, 2, 2, 2>(v); - EXPECT_TRUE(cpp::all_of(cpp::simd_cast(v1 == 1))); - EXPECT_TRUE(cpp::all_of(cpp::simd_cast(v2 == 2))); - EXPECT_TRUE(cpp::all_of(cpp::simd_cast(v3 == 3))); - EXPECT_TRUE(cpp::all_of(cpp::simd_cast(v4 == 4))); + EXPECT_TRUE(cpp::all_of(v1 == 1)); + EXPECT_TRUE(cpp::all_of(v2 == 2)); + EXPECT_TRUE(cpp::all_of(v3 == 3)); + EXPECT_TRUE(cpp::all_of(v4 == 4)); cpp::simd m = cpp::concat(v1, v2, v3, v4); - EXPECT_TRUE(cpp::all_of(cpp::simd_cast(m == v))); + EXPECT_TRUE(cpp::all_of(m == v)); cpp::simd c(~0); cpp::simd n = cpp::concat(c, c, c, c, c, c, c, c); - EXPECT_TRUE(cpp::all_of(cpp::simd_cast(n == ~0))); + EXPECT_TRUE(cpp::all_of(n == ~0)); } diff --git a/libc/test/src/__support/str_to_fp_test.h 
b/libc/test/src/__support/str_to_fp_test.h index 9b4844d410db2..d349192f107c0 100644 --- a/libc/test/src/__support/str_to_fp_test.h +++ b/libc/test/src/__support/str_to_fp_test.h @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" #include "src/__support/uint128.h" diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp index 40cb76a8bd6a2..1ec882b212b8a 100644 --- a/libc/test/src/__support/str_to_integer_test.cpp +++ b/libc/test/src/__support/str_to_integer_test.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/__support/str_to_integer.h" #include diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp index e4107929c15fc..4554968be67ce 100644 --- a/libc/test/src/__support/wcs_to_integer_test.cpp +++ b/libc/test/src/__support/wcs_to_integer_test.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/__support/wcs_to_integer.h" #include diff --git a/libc/test/src/dirent/CMakeLists.txt b/libc/test/src/dirent/CMakeLists.txt index b8ae813141c0d..8db512129f893 100644 --- a/libc/test/src/dirent/CMakeLists.txt +++ b/libc/test/src/dirent/CMakeLists.txt @@ -14,5 +14,6 @@ add_libc_unittest( libc.src.dirent.opendir libc.src.dirent.readdir libc.src.errno.errno + libc.test.UnitTest.ErrnoCheckingTest ) diff --git a/libc/test/src/dirent/dirent_test.cpp b/libc/test/src/dirent/dirent_test.cpp index 3f0095ca5ebe8..2862b140ba8ed 100644 --- a/libc/test/src/dirent/dirent_test.cpp +++ b/libc/test/src/dirent/dirent_test.cpp @@ -7,19 +7,20 @@ 
//===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" -#include "src/__support/libc_errno.h" #include "src/dirent/closedir.h" #include "src/dirent/dirfd.h" #include "src/dirent/opendir.h" #include "src/dirent/readdir.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include +using LlvmLibcDirentTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using string_view = LIBC_NAMESPACE::cpp::string_view; -TEST(LlvmLibcDirentTest, SimpleOpenAndRead) { +TEST_F(LlvmLibcDirentTest, SimpleOpenAndRead) { ::DIR *dir = LIBC_NAMESPACE::opendir("testdata"); ASSERT_TRUE(dir != nullptr); // The file descriptors 0, 1 and 2 are reserved for standard streams. @@ -54,18 +55,14 @@ TEST(LlvmLibcDirentTest, SimpleOpenAndRead) { ASSERT_EQ(LIBC_NAMESPACE::closedir(dir), 0); } -TEST(LlvmLibcDirentTest, OpenNonExistentDir) { - libc_errno = 0; +TEST_F(LlvmLibcDirentTest, OpenNonExistentDir) { ::DIR *dir = LIBC_NAMESPACE::opendir("___xyz123__.non_existent__"); ASSERT_TRUE(dir == nullptr); ASSERT_ERRNO_EQ(ENOENT); - libc_errno = 0; } -TEST(LlvmLibcDirentTest, OpenFile) { - libc_errno = 0; +TEST_F(LlvmLibcDirentTest, OpenFile) { ::DIR *dir = LIBC_NAMESPACE::opendir("testdata/file1.txt"); ASSERT_TRUE(dir == nullptr); ASSERT_ERRNO_EQ(ENOTDIR); - libc_errno = 0; } diff --git a/libc/test/src/errno/CMakeLists.txt b/libc/test/src/errno/CMakeLists.txt index b73962fb4de4d..264574204e6cb 100644 --- a/libc/test/src/errno/CMakeLists.txt +++ b/libc/test/src/errno/CMakeLists.txt @@ -12,4 +12,5 @@ add_libc_unittest( errno_test.cpp DEPENDS libc.src.errno.errno + libc.test.UnitTest.ErrnoCheckingTest ) diff --git a/libc/test/src/errno/errno_test.cpp b/libc/test/src/errno/errno_test.cpp index de82b0077f177..32fb3ec764063 100644 --- a/libc/test/src/errno/errno_test.cpp +++ b/libc/test/src/errno/errno_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include 
"src/__support/libc_errno.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" TEST(LlvmLibcErrnoTest, Basic) { diff --git a/libc/test/src/fcntl/CMakeLists.txt b/libc/test/src/fcntl/CMakeLists.txt index b522fef7439df..ff62210c13f43 100644 --- a/libc/test/src/fcntl/CMakeLists.txt +++ b/libc/test/src/fcntl/CMakeLists.txt @@ -14,6 +14,7 @@ add_libc_unittest( libc.src.fcntl.creat libc.src.fcntl.open libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -32,6 +33,7 @@ add_libc_unittest( libc.src.unistd.getpid libc.hdr.types.struct_flock libc.hdr.fcntl_macros + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -48,5 +50,6 @@ add_libc_unittest( libc.src.fcntl.openat libc.src.unistd.close libc.src.unistd.read + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) diff --git a/libc/test/src/fcntl/creat_test.cpp b/libc/test/src/fcntl/creat_test.cpp index d60c984934703..c578cf289689b 100644 --- a/libc/test/src/fcntl/creat_test.cpp +++ b/libc/test/src/fcntl/creat_test.cpp @@ -6,16 +6,18 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/fcntl/creat.h" #include "src/fcntl/open.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include -TEST(LlvmLibcCreatTest, CreatAndOpen) { +using LlvmLibcCreatTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcCreatTest, CreatAndOpen) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *TEST_FILE = "testdata/creat.test"; int fd = LIBC_NAMESPACE::creat(TEST_FILE, S_IRWXU); diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index 082c42481777b..84feb34e537a0 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ 
b/libc/test/src/fcntl/fcntl_test.cpp @@ -9,17 +9,19 @@ #include "hdr/fcntl_macros.h" #include "hdr/stdio_macros.h" #include "hdr/types/struct_flock.h" -#include "src/__support/libc_errno.h" #include "src/fcntl/fcntl.h" #include "src/fcntl/open.h" #include "src/unistd/close.h" #include "src/unistd/getpid.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include // For S_IRWXU -TEST(LlvmLibcFcntlTest, FcntlDupfd) { +using LlvmLibcFcntlTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcFcntlTest, FcntlDupfd) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *TEST_FILE_NAME = "testdata/fcntl_dup.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); @@ -41,7 +43,7 @@ TEST(LlvmLibcFcntlTest, FcntlDupfd) { ASSERT_THAT(LIBC_NAMESPACE::close(fd3), Succeeds(0)); } -TEST(LlvmLibcFcntlTest, FcntlGetFl) { +TEST_F(LlvmLibcFcntlTest, FcntlGetFl) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getfl.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); @@ -57,7 +59,7 @@ TEST(LlvmLibcFcntlTest, FcntlGetFl) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST(LlvmLibcFcntlTest, FcntlSetFl) { +TEST_F(LlvmLibcFcntlTest, FcntlSetFl) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *TEST_FILE_NAME = "testdata/fcntl_setfl.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); @@ -92,7 +94,7 @@ TEST(LlvmLibcFcntlTest, FcntlSetFl) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST(LlvmLibcFcntlTest, FcntlGetLkRead) { +TEST_F(LlvmLibcFcntlTest, FcntlGetLkRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); @@ -124,7 +126,7 @@ TEST(LlvmLibcFcntlTest, 
FcntlGetLkRead) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST(LlvmLibcFcntlTest, FcntlGetLkWrite) { +TEST_F(LlvmLibcFcntlTest, FcntlGetLkWrite) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); @@ -155,7 +157,7 @@ TEST(LlvmLibcFcntlTest, FcntlGetLkWrite) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST(LlvmLibcFcntlTest, UseAfterClose) { +TEST_F(LlvmLibcFcntlTest, UseAfterClose) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *TEST_FILE_NAME = "testdata/fcntl_use_after_close.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); @@ -165,8 +167,7 @@ TEST(LlvmLibcFcntlTest, UseAfterClose) { ASSERT_ERRNO_EQ(EBADF); } -TEST(LlvmLibcFcntlTest, SetGetOwnerTest) { - libc_errno = 0; +TEST_F(LlvmLibcFcntlTest, SetGetOwnerTest) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t pid = LIBC_NAMESPACE::getpid(); ASSERT_GT(pid, -1); diff --git a/libc/test/src/fcntl/openat_test.cpp b/libc/test/src/fcntl/openat_test.cpp index 1997476f16a60..e40260ad1f205 100644 --- a/libc/test/src/fcntl/openat_test.cpp +++ b/libc/test/src/fcntl/openat_test.cpp @@ -6,17 +6,19 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/fcntl/openat.h" #include "src/unistd/close.h" #include "src/unistd/read.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/fcntl_macros.h" -TEST(LlvmLibcUniStd, OpenAndReadTest) { +using LlvmLibcOpenAtTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcOpenAtTest, OpenAndReadTest) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *TEST_DIR = "testdata"; constexpr const char *TEST_FILE = 
"openat.test"; @@ -36,7 +38,7 @@ TEST(LlvmLibcUniStd, OpenAndReadTest) { ASSERT_THAT(LIBC_NAMESPACE::close(dir_fd), Succeeds(0)); } -TEST(LlvmLibcUniStd, FailTest) { +TEST_F(LlvmLibcOpenAtTest, FailTest) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; EXPECT_THAT(LIBC_NAMESPACE::openat(AT_FDCWD, "openat.test", O_RDONLY), Fails(ENOENT)); diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 378eadcf9e70b..9d644703a61ae 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1678,6 +1678,17 @@ add_fp_unittest( libc.src.math.sqrtl ) +add_fp_unittest( + rsqrtf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + rsqrtf16_test.cpp + DEPENDS + libc.src.math.rsqrtf16 +) + add_fp_unittest( sqrtf16_test NEED_MPFR diff --git a/libc/test/src/math/rsqrtf16_test.cpp b/libc/test/src/math/rsqrtf16_test.cpp new file mode 100644 index 0000000000000..d01c3f94f08cc --- /dev/null +++ b/libc/test/src/math/rsqrtf16_test.cpp @@ -0,0 +1,43 @@ +//===-- Exhaustive test for rsqrtf16 --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/rsqrtf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcRsqrtf16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf] +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0) +// rsqrt(-0.0) is -inf, not the same for mpfr. 
+static constexpr uint16_t NEG_START = 0x8001U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcRsqrtf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Rsqrt, x, + LIBC_NAMESPACE::rsqrtf16(x), 0.5); + } +} + +TEST_F(LlvmLibcRsqrtf16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Rsqrt, x, + LIBC_NAMESPACE::rsqrtf16(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index b8d5ecf4d77e5..eadd5f0970722 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3502,6 +3502,17 @@ add_fp_unittest( libc.src.math.sqrtl ) +add_fp_unittest( + rsqrtf16_test + SUITE + libc-math-smoke-tests + SRCS + rsqrtf16_test.cpp + DEPENDS + libc.src.math.rsqrtf16 + libc.hdr.errno_macros +) + add_fp_unittest( sqrtf16_test SUITE diff --git a/libc/test/src/math/smoke/rsqrtf16_test.cpp b/libc/test/src/math/smoke/rsqrtf16_test.cpp new file mode 100644 index 0000000000000..5eb3e2fd6692c --- /dev/null +++ b/libc/test/src/math/smoke/rsqrtf16_test.cpp @@ -0,0 +1,45 @@ +//===-- Unittests for rsqrtf16 --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. 
+// +//===----------------------------------------------------------------------===// + +#include "hdr/errno_macros.h" +#include "src/__support/FPUtil/cast.h" +#include "src/math/rsqrtf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcRsqrtf16Test = LIBC_NAMESPACE::testing::FPTest; +using LIBC_NAMESPACE::fputil::cast; + +TEST_F(LlvmLibcRsqrtf16Test, SpecialNumbers) { + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::rsqrtf16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::rsqrtf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(inf, LIBC_NAMESPACE::rsqrtf16(zero)); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ(neg_inf, LIBC_NAMESPACE::rsqrtf16(neg_zero)); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ( + LIBC_NAMESPACE::fputil::cast(1.0f), + LIBC_NAMESPACE::rsqrtf16(LIBC_NAMESPACE::fputil::cast(1.0f))); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(zero, LIBC_NAMESPACE::rsqrtf16(inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::rsqrtf16(neg_inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::rsqrtf16( + LIBC_NAMESPACE::fputil::cast(-2.0f))); + EXPECT_MATH_ERRNO(EDOM); +} diff --git a/libc/test/src/poll/CMakeLists.txt b/libc/test/src/poll/CMakeLists.txt index c4af14168b906..54e00330f2bff 100644 --- a/libc/test/src/poll/CMakeLists.txt +++ b/libc/test/src/poll/CMakeLists.txt @@ -10,5 +10,5 @@ add_libc_unittest( libc.hdr.limits_macros libc.src.errno.errno libc.src.poll.poll - libc.test.UnitTest.ErrnoSetterMatcher + libc.test.UnitTest.ErrnoCheckingTest ) diff --git a/libc/test/src/poll/poll_test.cpp b/libc/test/src/poll/poll_test.cpp index 97b7b02718172..5bf2d5e4353f6 100644 --- a/libc/test/src/poll/poll_test.cpp +++ b/libc/test/src/poll/poll_test.cpp @@ -7,18 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/limits_macros.h" // UINT_MAX -#include "src/__support/libc_errno.h" #include "src/poll/poll.h" 
+#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcPollTest, SmokeTest) { - libc_errno = 0; +using LlvmLibcPollTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcPollTest, SmokeTest) { int ret = LIBC_NAMESPACE::poll(nullptr, 0, 0); ASSERT_ERRNO_SUCCESS(); ASSERT_EQ(0, ret); } -TEST(LlvmLibcPollTest, SmokeFailureTest) { - libc_errno = 0; + +TEST_F(LlvmLibcPollTest, SmokeFailureTest) { int ret = LIBC_NAMESPACE::poll(nullptr, UINT_MAX, 0); ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(-1, ret); diff --git a/libc/test/src/sched/CMakeLists.txt b/libc/test/src/sched/CMakeLists.txt index 362c526312d42..93752ed26108d 100644 --- a/libc/test/src/sched/CMakeLists.txt +++ b/libc/test/src/sched/CMakeLists.txt @@ -14,6 +14,7 @@ add_libc_unittest( libc.src.errno.errno libc.src.sched.sched_getaffinity libc.src.sched.sched_setaffinity + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -26,6 +27,7 @@ add_libc_unittest( DEPENDS libc.src.errno.errno libc.src.sched.sched_yield + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_unittest( @@ -39,6 +41,7 @@ add_libc_unittest( libc.src.errno.errno libc.src.sched.sched_get_priority_min libc.src.sched.sched_get_priority_max + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_unittest( @@ -70,6 +73,7 @@ add_libc_unittest( libc.src.sched.sched_get_priority_min libc.src.sched.sched_get_priority_max libc.src.unistd.getuid + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_unittest( @@ -87,6 +91,7 @@ add_libc_unittest( libc.src.sched.sched_get_priority_min libc.src.sched.sched_rr_get_interval libc.src.unistd.getuid + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_unittest( @@ -104,5 +109,6 @@ add_libc_unittest( libc.src.errno.errno libc.src.sched.sched_getaffinity libc.src.sched.__sched_getcpucount + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) diff --git a/libc/test/src/sched/affinity_test.cpp b/libc/test/src/sched/affinity_test.cpp index 
1c8599bb67d4c..51159bae7907c 100644 --- a/libc/test/src/sched/affinity_test.cpp +++ b/libc/test/src/sched/affinity_test.cpp @@ -7,18 +7,19 @@ //===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" -#include "src/__support/libc_errno.h" #include "src/sched/sched_getaffinity.h" #include "src/sched/sched_setaffinity.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "hdr/types/cpu_set_t.h" #include "hdr/types/pid_t.h" #include -TEST(LlvmLibcSchedAffinityTest, SmokeTest) { +using LlvmLibcSchedAffinityTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcSchedAffinityTest, SmokeTest) { cpu_set_t mask; - libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); ASSERT_GT(tid, pid_t(0)); @@ -29,19 +30,15 @@ TEST(LlvmLibcSchedAffinityTest, SmokeTest) { Succeeds(0)); } -TEST(LlvmLibcSchedAffinityTest, BadMask) { +TEST_F(LlvmLibcSchedAffinityTest, BadMask) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); - libc_errno = 0; ASSERT_THAT( LIBC_NAMESPACE::sched_getaffinity(tid, sizeof(cpu_set_t), nullptr), Fails(EFAULT)); - libc_errno = 0; ASSERT_THAT( LIBC_NAMESPACE::sched_setaffinity(tid, sizeof(cpu_set_t), nullptr), Fails(EFAULT)); - - libc_errno = 0; } diff --git a/libc/test/src/sched/cpu_count_test.cpp b/libc/test/src/sched/cpu_count_test.cpp index 06e4fff98bd21..217324e3e4766 100644 --- a/libc/test/src/sched/cpu_count_test.cpp +++ b/libc/test/src/sched/cpu_count_test.cpp @@ -7,18 +7,19 @@ //===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" -#include "src/__support/libc_errno.h" #include "src/sched/sched_getaffinity.h" #include "src/sched/sched_getcpucount.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include 
"test/UnitTest/ErrnoSetterMatcher.h" #include "hdr/sched_macros.h" #include "hdr/types/cpu_set_t.h" #include "hdr/types/pid_t.h" -TEST(LlvmLibcSchedCpuCountTest, SmokeTest) { +using LlvmLibcSchedCpuCountTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcSchedCpuCountTest, SmokeTest) { cpu_set_t mask; - libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); ASSERT_GT(tid, pid_t(0)); diff --git a/libc/test/src/sched/get_priority_test.cpp b/libc/test/src/sched/get_priority_test.cpp index bf4fca8ece092..fb168c2e96430 100644 --- a/libc/test/src/sched/get_priority_test.cpp +++ b/libc/test/src/sched/get_priority_test.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/sched/sched_get_priority_max.h" #include "src/sched/sched_get_priority_min.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/sched_macros.h" -TEST(LlvmLibcSchedGetPriorityTest, HandleBadPolicyTest) { +using LlvmLibcSchedGetPriorityTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcSchedGetPriorityTest, HandleBadPolicyTest) { // Test arbitrary values for which there is no policy. 
{ @@ -57,9 +59,7 @@ TEST(LlvmLibcSchedGetPriorityTest, HandleBadPolicyTest) { } } -TEST(LlvmLibcSchedGetPriorityTest, SmokeTest) { - libc_errno = 0; - +TEST_F(LlvmLibcSchedGetPriorityTest, SmokeTest) { // We Test: // SCHED_OTHER, SCHED_FIFO, SCHED_RR // Linux specific test could also include: diff --git a/libc/test/src/sched/getcpu_test.cpp b/libc/test/src/sched/getcpu_test.cpp index fc4ada8a722f5..cf19d25f816df 100644 --- a/libc/test/src/sched/getcpu_test.cpp +++ b/libc/test/src/sched/getcpu_test.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" -#include "src/__support/libc_errno.h" #include "src/sched/getcpu.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" diff --git a/libc/test/src/sched/param_and_scheduler_test.cpp b/libc/test/src/sched/param_and_scheduler_test.cpp index b8ee1233dfb86..57eb59865b1aa 100644 --- a/libc/test/src/sched/param_and_scheduler_test.cpp +++ b/libc/test/src/sched/param_and_scheduler_test.cpp @@ -14,6 +14,7 @@ #include "src/sched/sched_setparam.h" #include "src/sched/sched_setscheduler.h" #include "src/unistd/getuid.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/sched_macros.h" @@ -35,11 +36,9 @@ // Linux specific test could also include: // SCHED_ISO, SCHED_DEADLINE -class SchedTest : public LIBC_NAMESPACE::testing::Test { +class SchedTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: void testSched(int policy, bool is_mandatory) { - libc_errno = 0; - int init_policy = LIBC_NAMESPACE::sched_getscheduler(0); ASSERT_GE(init_policy, 0); ASSERT_ERRNO_SUCCESS(); @@ -56,22 +55,18 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { // Negative pid ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(-1, policy, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(-1), -1); ASSERT_ERRNO_EQ(EINVAL); - 
libc_errno = 0; // Invalid Policy ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy | 128, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; // Out of bounds priority param.sched_priority = min_priority - 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; param.sched_priority = max_priority + 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, ¶m), -1); @@ -99,12 +94,10 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { param.sched_priority = -1; ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; param.sched_priority = max_priority + 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; for (int priority = min_priority; priority <= max_priority; ++priority) { ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, ¶m), 0); @@ -116,11 +109,9 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { // Negative pid ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(-1, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(-1, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; // Success/unsupported policy/missing permissions int setparam_result = LIBC_NAMESPACE::sched_setparam(0, ¶m); @@ -141,7 +132,6 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { // Null test ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; } }; @@ -159,13 +149,9 @@ LIST_SCHED_TESTS(SCHED_BATCH, true) LIST_SCHED_TESTS(SCHED_IDLE, true) TEST(LlvmLibcSchedParamAndSchedulerTest, NullParamTest) { - libc_errno = 0; - ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; } diff --git a/libc/test/src/sched/sched_rr_get_interval_test.cpp 
b/libc/test/src/sched/sched_rr_get_interval_test.cpp index e5dc4e31d1c9d..d5eecb5bee5b5 100644 --- a/libc/test/src/sched/sched_rr_get_interval_test.cpp +++ b/libc/test/src/sched/sched_rr_get_interval_test.cpp @@ -6,19 +6,21 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/sched/sched_get_priority_min.h" #include "src/sched/sched_getscheduler.h" #include "src/sched/sched_rr_get_interval.h" #include "src/sched/sched_setscheduler.h" #include "src/unistd/getuid.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/sched_macros.h" #include "hdr/types/struct_timespec.h" -TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) { - libc_errno = 0; +using LlvmLibcSchedRRGetIntervalTest = + LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcSchedRRGetIntervalTest, SmokeTest) { auto SetSched = [&](int policy) { int min_priority = LIBC_NAMESPACE::sched_get_priority_min(policy); ASSERT_GE(min_priority, 0); @@ -59,19 +61,16 @@ TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) { // Null timespec ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, nullptr), -1); ASSERT_ERRNO_EQ(EFAULT); - libc_errno = 0; // Negative pid ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(-1, &ts), -1); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; } // Negative tests don't have SCHED_RR set SetSched(SCHED_OTHER); ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, &ts), 0); ASSERT_ERRNO_SUCCESS(); - libc_errno = 0; // TODO: Missing unkown pid -> ESRCH. This is read only so safe to try a few // unlikely values. 
diff --git a/libc/test/src/sched/yield_test.cpp b/libc/test/src/sched/yield_test.cpp index 4d13d50e25eb2..1cd30abb88513 100644 --- a/libc/test/src/sched/yield_test.cpp +++ b/libc/test/src/sched/yield_test.cpp @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/sched/sched_yield.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcSchedYieldTest, SmokeTest) { - libc_errno = 0; +using LlvmLibcSchedYieldTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcSchedYieldTest, SmokeTest) { // sched_yield() always succeeds, just do a basic test that errno/ret are // properly 0. ASSERT_EQ(LIBC_NAMESPACE::sched_yield(), 0); diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index 6b5041d1dedd6..2135164e7d088 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -51,6 +51,7 @@ add_libc_unittest( libc.src.signal.sigaddset libc.src.signal.sigemptyset libc.src.signal.sigprocmask + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -77,6 +78,7 @@ add_libc_unittest( libc.src.errno.errno libc.src.signal.raise libc.src.signal.signal + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -124,5 +126,6 @@ add_libc_unittest( libc.src.signal.raise libc.src.signal.sigaltstack libc.src.signal.sigaction + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp index a9c5cd939581e..8c252c47452df 100644 --- a/libc/test/src/signal/sigaltstack_test.cpp +++ b/libc/test/src/signal/sigaltstack_test.cpp @@ -9,11 +9,11 @@ #include "hdr/signal_macros.h" #include "hdr/stdint_proxy.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
-#include "src/__support/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include "src/signal/raise.h" #include "src/signal/sigaction.h" #include "src/signal/sigaltstack.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" @@ -23,6 +23,7 @@ constexpr int LOCAL_VAR_SIZE = 512; constexpr int ALT_STACK_SIZE = SIGSTKSZ + LOCAL_VAR_SIZE * 2; static uint8_t alt_stack[ALT_STACK_SIZE]; +using LlvmLibcSigaltstackTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -44,9 +45,8 @@ static void handler(int) { good_stack = true; } -TEST(LlvmLibcSignalTest, SigaltstackRunOnAltStack) { +TEST_F(LlvmLibcSigaltstackTest, SigaltstackRunOnAltStack) { struct sigaction action; - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGUSR1, nullptr, &action), Succeeds(0)); action.sa_handler = handler; @@ -68,7 +68,7 @@ TEST(LlvmLibcSignalTest, SigaltstackRunOnAltStack) { } // This tests for invalid input. 
-TEST(LlvmLibcSignalTest, SigaltstackInvalidStack) { +TEST_F(LlvmLibcSigaltstackTest, SigaltstackInvalidStack) { stack_t ss; ss.ss_sp = alt_stack; ss.ss_size = 0; diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp index 62b86bf440291..cfcab0c7676b0 100644 --- a/libc/test/src/signal/signal_test.cpp +++ b/libc/test/src/signal/signal_test.cpp @@ -6,18 +6,17 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/signal/raise.h" #include "src/signal/signal.h" - +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +using LlvmLibcSignalTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; -TEST(LlvmLibcSignal, Invalid) { - libc_errno = 0; +TEST(LlvmLibcSignalTest, Invalid) { auto *valid = +[](int) {}; EXPECT_THAT((void *)LIBC_NAMESPACE::signal(0, valid), Fails(EINVAL, (void *)SIG_ERR)); @@ -26,7 +25,7 @@ TEST(LlvmLibcSignal, Invalid) { } static int sum; -TEST(LlvmLibcSignal, Basic) { +TEST(LlvmLibcSignalTest, Basic) { // In case test get run multiple times. 
sum = 0; ASSERT_NE(LIBC_NAMESPACE::signal( diff --git a/libc/test/src/signal/sigprocmask_test.cpp b/libc/test/src/signal/sigprocmask_test.cpp index 891eac0f5bf75..54de8f7a8683f 100644 --- a/libc/test/src/signal/sigprocmask_test.cpp +++ b/libc/test/src/signal/sigprocmask_test.cpp @@ -6,25 +6,29 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/signal/raise.h" #include "src/signal/sigaddset.h" #include "src/signal/sigemptyset.h" #include "src/signal/sigprocmask.h" - +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include -class LlvmLibcSignalTest : public LIBC_NAMESPACE::testing::Test { +class LlvmLibcSigprocmaskTest + : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { sigset_t oldSet; public: - void SetUp() override { LIBC_NAMESPACE::sigprocmask(0, nullptr, &oldSet); } + void SetUp() override { + ErrnoCheckingTest::SetUp(); + LIBC_NAMESPACE::sigprocmask(0, nullptr, &oldSet); + } void TearDown() override { LIBC_NAMESPACE::sigprocmask(SIG_SETMASK, &oldSet, nullptr); + ErrnoCheckingTest::TearDown(); } }; @@ -32,9 +36,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; // This tests for invalid input. -TEST_F(LlvmLibcSignalTest, SigprocmaskInvalid) { - libc_errno = 0; - +TEST_F(LlvmLibcSigprocmaskTest, SigprocmaskInvalid) { sigset_t valid; // 17 and -4 are out of the range for sigprocmask's how paramater. EXPECT_THAT(LIBC_NAMESPACE::sigprocmask(17, &valid, nullptr), Fails(EINVAL)); @@ -49,7 +51,7 @@ TEST_F(LlvmLibcSignalTest, SigprocmaskInvalid) { // This tests that when nothing is blocked, a process gets killed and alse tests // that when signals are blocked they are not delivered to the process. 
-TEST_F(LlvmLibcSignalTest, BlockUnblock) { +TEST_F(LlvmLibcSigprocmaskTest, BlockUnblock) { sigset_t sigset; EXPECT_EQ(LIBC_NAMESPACE::sigemptyset(&sigset), 0); EXPECT_EQ(LIBC_NAMESPACE::sigprocmask(SIG_SETMASK, &sigset, nullptr), 0); diff --git a/libc/test/src/spawn/CMakeLists.txt b/libc/test/src/spawn/CMakeLists.txt index 04814db46dca2..103925cf3a22d 100644 --- a/libc/test/src/spawn/CMakeLists.txt +++ b/libc/test/src/spawn/CMakeLists.txt @@ -7,6 +7,7 @@ add_libc_unittest( SRCS posix_spawn_file_actions_test.cpp DEPENDS + libc.hdr.errno_macros libc.hdr.stdint_proxy libc.include.spawn libc.src.spawn.file_actions @@ -15,5 +16,4 @@ add_libc_unittest( libc.src.spawn.posix_spawn_file_actions_addopen libc.src.spawn.posix_spawn_file_actions_destroy libc.src.spawn.posix_spawn_file_actions_init - libc.src.errno.errno ) diff --git a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp index 935a3540d9a58..20ab312f1f999 100644 --- a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp +++ b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +#include "hdr/errno_macros.h" #include "hdr/stdint_proxy.h" -#include "src/__support/libc_errno.h" #include "src/spawn/file_actions.h" #include "src/spawn/posix_spawn_file_actions_addclose.h" #include "src/spawn/posix_spawn_file_actions_adddup2.h" diff --git a/libc/test/src/sys/ioctl/linux/CMakeLists.txt b/libc/test/src/sys/ioctl/linux/CMakeLists.txt index 2df67e9d9cbde..2ccef25f4264f 100644 --- a/libc/test/src/sys/ioctl/linux/CMakeLists.txt +++ b/libc/test/src/sys/ioctl/linux/CMakeLists.txt @@ -14,5 +14,7 @@ add_libc_unittest( libc.src.unistd.close libc.src.unistd.read libc.src.unistd.write + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher ) diff --git a/libc/test/src/sys/ioctl/linux/ioctl_test.cpp b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp index 
b76dc14824c95..4560bcf6e2e96 100644 --- a/libc/test/src/sys/ioctl/linux/ioctl_test.cpp +++ b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp @@ -6,13 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/sys/ioctl/ioctl.h" #include "src/unistd/close.h" #include "src/unistd/read.h" #include "src/unistd/write.h" - +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" @@ -20,11 +19,10 @@ #include "hdr/sys_ioctl_macros.h" +using LlvmLibcSysIoctlTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; -TEST(LlvmLibcSysIoctlTest, InvalidCommandAndFIONREAD) { - LIBC_NAMESPACE::libc_errno = 0; - +TEST_F(LlvmLibcSysIoctlTest, InvalidCommandAndFIONREAD) { // Setup the test file constexpr const char *TEST_FILE_NAME = "ioctl.test"; constexpr const char TEST_MSG[] = "ioctl test"; diff --git a/libc/test/src/termios/CMakeLists.txt b/libc/test/src/termios/CMakeLists.txt index 302dd300fb59f..059c272c105c4 100644 --- a/libc/test/src/termios/CMakeLists.txt +++ b/libc/test/src/termios/CMakeLists.txt @@ -18,5 +18,6 @@ add_libc_unittest( libc.src.termios.tcgetsid libc.src.termios.tcsetattr libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) diff --git a/libc/test/src/termios/termios_test.cpp b/libc/test/src/termios/termios_test.cpp index 5ec169a886b1e..7a8075997a4a8 100644 --- a/libc/test/src/termios/termios_test.cpp +++ b/libc/test/src/termios/termios_test.cpp @@ -16,49 +16,52 @@ #include "src/termios/tcgetsid.h" #include "src/termios/tcsetattr.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include -using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; -using 
LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; +using LlvmLibcTermiosTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; // We just list a bunch of smoke tests here as it is not possible to // test functionality at the least because we want to run the tests // from ninja/make which change the terminal behavior. -TEST(LlvmLibcTermiosTest, SpeedSmokeTest) { +TEST_F(LlvmLibcTermiosTest, SpeedSmokeTest) { struct termios t; - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, B50), Succeeds(0)); ASSERT_EQ(LIBC_NAMESPACE::cfgetispeed(&t), speed_t(B50)); ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, B75), Succeeds(0)); ASSERT_EQ(LIBC_NAMESPACE::cfgetospeed(&t), speed_t(B75)); - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, ~CBAUD), Fails(EINVAL)); - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, ~CBAUD), Fails(EINVAL)); } -TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) { +TEST_F(LlvmLibcTermiosTest, GetAttrSmokeTest) { struct termios t; - libc_errno = 0; int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY); - if (fd < 0) - return; // When /dev/tty is not available, no point continuing. + if (fd < 0) { + // When /dev/tty is not available, no point continuing + libc_errno = 0; + return; + } ASSERT_ERRNO_SUCCESS(); ASSERT_THAT(LIBC_NAMESPACE::tcgetattr(fd, &t), Succeeds(0)); - ASSERT_EQ(LIBC_NAMESPACE::close(fd), 0); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST(LlvmLibcTermiosTest, TcGetSidSmokeTest) { - libc_errno = 0; +TEST_F(LlvmLibcTermiosTest, TcGetSidSmokeTest) { int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY); - if (fd < 0) - return; // When /dev/tty is not available, no point continuing. 
+ if (fd < 0) { + // When /dev/tty is not available, no point continuing + libc_errno = 0; + return; + } ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(LIBC_NAMESPACE::tcgetsid(fd), pid_t(0)); - ASSERT_EQ(LIBC_NAMESPACE::close(fd), 0); + ASSERT_THAT(LIBC_NAMESPACE::tcgetsid(fd), + returns(GT(pid_t(0))).with_errno(EQ(0))); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } diff --git a/libc/utils/MPFRWrapper/MPCommon.cpp b/libc/utils/MPFRWrapper/MPCommon.cpp index c255220774110..6b78bee6e7cae 100644 --- a/libc/utils/MPFRWrapper/MPCommon.cpp +++ b/libc/utils/MPFRWrapper/MPCommon.cpp @@ -393,6 +393,12 @@ MPFRNumber MPFRNumber::rint(mpfr_rnd_t rnd) const { return result; } +MPFRNumber MPFRNumber::rsqrt() const { + MPFRNumber result(*this); + mpfr_rec_sqrt(result.value, value, mpfr_rounding); + return result; +} + MPFRNumber MPFRNumber::mod_2pi() const { MPFRNumber result(0.0, 1280); MPFRNumber _2pi(0.0, 1280); diff --git a/libc/utils/MPFRWrapper/MPCommon.h b/libc/utils/MPFRWrapper/MPCommon.h index 25bdc9bc00250..9f4107a7961d2 100644 --- a/libc/utils/MPFRWrapper/MPCommon.h +++ b/libc/utils/MPFRWrapper/MPCommon.h @@ -222,6 +222,7 @@ class MPFRNumber { bool round_to_long(long &result) const; bool round_to_long(mpfr_rnd_t rnd, long &result) const; MPFRNumber rint(mpfr_rnd_t rnd) const; + MPFRNumber rsqrt() const; MPFRNumber mod_2pi() const; MPFRNumber mod_pi_over_2() const; MPFRNumber mod_pi_over_4() const; diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 144a4ec25d213..a7d307b47c3e8 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -91,6 +91,8 @@ unary_operation(Operation op, InputType input, unsigned int precision, return mpfrInput.round(); case Operation::RoundEven: return mpfrInput.roundeven(); + case Operation::Rsqrt: + return mpfrInput.rsqrt(); case Operation::Sin: return mpfrInput.sin(); case Operation::Sinpi: diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h 
b/libc/utils/MPFRWrapper/MPFRUtils.h index 35d7942a2620e..a33fcd21789f9 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -56,6 +56,7 @@ enum class Operation : int { ModPIOver4, Round, RoundEven, + Rsqrt, Sin, Sinpi, Sinh, diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index c75f450d8d3ad..7960f3494770e 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -104,6 +104,7 @@ else() # Note we do not adhere to LLVM_ENABLE_PER_TARGET_RUNTIME_DIR. set( LIBCLC_OUTPUT_LIBRARY_DIR ${LIBCLC_OUTPUT_DIR}/lib/libclc ) + file( MAKE_DIRECTORY ${LIBCLC_OUTPUT_LIBRARY_DIR} ) endif() if( EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} ) diff --git a/libclc/README.md b/libclc/README.md index 34f329d861199..4f14066425d2d 100644 --- a/libclc/README.md +++ b/libclc/README.md @@ -31,8 +31,8 @@ more targets is welcome. For an in-tree build, Clang must also be built at the same time: ``` -$ cmake /llvm-project/llvm/CMakeLists.txt -DLLVM_ENABLE_PROJECTS="libclc;clang" \ - -DCMAKE_BUILD_TYPE=Release -G Ninja +$ cmake /llvm-project/llvm/CMakeLists.txt -DLLVM_ENABLE_PROJECTS="clang" \ + -DLLVM_ENABLE_RUNTIMES="libclc" -DCMAKE_BUILD_TYPE=Release -G Ninja $ ninja ``` Then install: diff --git a/libclc/utils/CMakeLists.txt b/libclc/utils/CMakeLists.txt index 6851ae16bda07..a14d133985a64 100644 --- a/libclc/utils/CMakeLists.txt +++ b/libclc/utils/CMakeLists.txt @@ -1,6 +1,3 @@ -# Construct LLVM version define -set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}" ) - # Setup prepare_builtins tools set( LLVM_LINK_COMPONENTS BitReader @@ -19,6 +16,5 @@ else() setup_host_tool( prepare_builtins PREPARE_BUILTINS prepare_builtins_exe prepare_builtins_target ) endif() -target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} ) # These were not properly reported in early LLVM and we don't need them target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions ) diff --git 
a/libclc/utils/prepare-builtins.cpp b/libclc/utils/prepare-builtins.cpp index b10dfccc6d88c..40a5445ef507f 100644 --- a/libclc/utils/prepare-builtins.cpp +++ b/libclc/utils/prepare-builtins.cpp @@ -6,12 +6,8 @@ // //===----------------------------------------------------------------------===// -#if HAVE_LLVM > 0x0390 #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" -#else -#include "llvm/Bitcode/ReaderWriter.h" -#endif #include "llvm/Config/llvm-config.h" #include "llvm/IR/Function.h" @@ -62,12 +58,8 @@ int main(int argc, char **argv) { std::unique_ptr &BufferPtr = BufferOrErr.get(); SMDiagnostic Err; std::unique_ptr MPtr = -#if HAVE_LLVM > 0x0390 ExitOnErr(Expected>( parseIR(BufferPtr.get()->getMemBufferRef(), Err, Context))); -#else - parseIR(BufferPtr.get()->getMemBufferRef(), Err, Context); -#endif M = MPtr.release(); } } @@ -106,13 +98,8 @@ int main(int argc, char **argv) { } std::error_code EC; -#if HAVE_LLVM >= 0x0600 std::unique_ptr Out( new ToolOutputFile(OutputFilename, EC, sys::fs::OF_None)); -#else - std::unique_ptr Out( - new tool_output_file(OutputFilename, EC, sys::fs::OF_None)); -#endif if (EC) { errs() << EC.message() << '\n'; exit(1); @@ -121,11 +108,7 @@ int main(int argc, char **argv) { if (TextualOut) M->print(Out->os(), nullptr, true); else -#if HAVE_LLVM >= 0x0700 WriteBitcodeToFile(*M, Out->os()); -#else - WriteBitcodeToFile(M, Out->os()); -#endif // Declare success. 
Out->keep(); diff --git a/libcxx/cmake/caches/Armv7M-picolibc.cmake b/libcxx/cmake/caches/Armv7M-picolibc.cmake index 0f8189b457285..9df71fba2cadd 100644 --- a/libcxx/cmake/caches/Armv7M-picolibc.cmake +++ b/libcxx/cmake/caches/Armv7M-picolibc.cmake @@ -5,6 +5,7 @@ set(CMAKE_C_COMPILER_TARGET "armv7m-none-eabi" CACHE STRING "") set(CMAKE_C_FLAGS "-mfloat-abi=soft" CACHE STRING "") set(CMAKE_SYSTEM_NAME Generic CACHE STRING "") set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY CACHE STRING "") +set(LLVM_USE_LINKER "lld" CACHE STRING "") set(COMPILER_RT_BAREMETAL_BUILD ON CACHE BOOL "") set(COMPILER_RT_BUILD_LIBFUZZER OFF CACHE BOOL "") set(COMPILER_RT_BUILD_PROFILE OFF CACHE BOOL "") diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index e56f0a88db138..2b928a3a26913 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -57,7 +57,6 @@ Improvements and New Features has been improved by up to 3x - The performance of ``insert(iterator, iterator)`` of ``map``, ``set``, ``multimap`` and ``multiset`` has been improved by up to 2.5x -- The performance of ``erase(iterator, iterator)`` in the unordered containers has been improved by up to 1.9x - The performance of ``map::insert_or_assign`` has been improved by up to 2x - ``ofstream::write`` has been optimized to pass through large strings to system calls directly instead of copying them diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index 44463385b81a7..227791031bab0 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -531,6 +531,16 @@ Finally, use ``compare-benchmarks`` to compare both: The ``compare-benchmarks`` script provides some useful options like creating a chart to easily visualize differences in a browser window. Use ``compare-benchmarks --help`` for details. 
+Additionally, adding a comment of the following form to a libc++ PR will cause the specified benchmarks to be run +on our pre-commit CI infrastructure and the results to be reported in the PR by our CI system: + +.. code-block:: + + /libcxx-bot benchmark ... + +Note that this is currently experimental and the results should not be relied upon too strongly, since +we do not have dedicated hardware to run the benchmarks on. + .. _`Google Benchmark`: https://github.com/google/benchmark .. _testing-hardening-assertions: diff --git a/libcxx/include/__cxx03/fstream b/libcxx/include/__cxx03/fstream index 44bdabc4602b5..65c2c3e975032 100644 --- a/libcxx/include/__cxx03/fstream +++ b/libcxx/include/__cxx03/fstream @@ -191,6 +191,7 @@ typedef basic_fstream wfstream; #include <__cxx03/__config> #include <__cxx03/__fwd/fstream.h> #include <__cxx03/__locale> +#include <__cxx03/__memory/addressof.h> #include <__cxx03/__type_traits/enable_if.h> #include <__cxx03/__type_traits/is_same.h> #include <__cxx03/__utility/move.h> @@ -1062,11 +1063,12 @@ private: }; template -inline basic_ifstream<_CharT, _Traits>::basic_ifstream() : basic_istream(&__sb_) {} +inline basic_ifstream<_CharT, _Traits>::basic_ifstream() + : basic_istream(std::addressof(__sb_)) {} template inline basic_ifstream<_CharT, _Traits>::basic_ifstream(const char* __s, ios_base::openmode __mode) - : basic_istream(&__sb_) { + : basic_istream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode | ios_base::in) == nullptr) this->setstate(ios_base::failbit); } @@ -1074,15 +1076,16 @@ inline basic_ifstream<_CharT, _Traits>::basic_ifstream(const char* __s, ios_base # ifdef _LIBCPP_HAS_OPEN_WITH_WCHAR template inline basic_ifstream<_CharT, _Traits>::basic_ifstream(const wchar_t* __s, ios_base::openmode __mode) - : basic_istream(&__sb_) { + : basic_istream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode | ios_base::in) == nullptr) this->setstate(ios_base::failbit); } # endif +// extension template inline 
basic_ifstream<_CharT, _Traits>::basic_ifstream(const string& __s, ios_base::openmode __mode) - : basic_istream(&__sb_) { + : basic_istream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode | ios_base::in) == nullptr) this->setstate(ios_base::failbit); } @@ -1090,7 +1093,7 @@ inline basic_ifstream<_CharT, _Traits>::basic_ifstream(const string& __s, ios_ba template inline basic_ifstream<_CharT, _Traits>::basic_ifstream(basic_ifstream&& __rhs) : basic_istream(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) { - this->set_rdbuf(&__sb_); + this->set_rdbuf(std::addressof(__sb_)); } template @@ -1113,7 +1116,7 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(basic_ifstream<_CharT, _Traits>& __x, bas template inline basic_filebuf<_CharT, _Traits>* basic_ifstream<_CharT, _Traits>::rdbuf() const { - return const_cast*>(&__sb_); + return const_cast*>(std::addressof(__sb_)); } template @@ -1199,11 +1202,12 @@ private: }; template -inline basic_ofstream<_CharT, _Traits>::basic_ofstream() : basic_ostream(&__sb_) {} +inline basic_ofstream<_CharT, _Traits>::basic_ofstream() + : basic_ostream(std::addressof(__sb_)) {} template inline basic_ofstream<_CharT, _Traits>::basic_ofstream(const char* __s, ios_base::openmode __mode) - : basic_ostream(&__sb_) { + : basic_ostream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode | ios_base::out) == nullptr) this->setstate(ios_base::failbit); } @@ -1211,15 +1215,16 @@ inline basic_ofstream<_CharT, _Traits>::basic_ofstream(const char* __s, ios_base # ifdef _LIBCPP_HAS_OPEN_WITH_WCHAR template inline basic_ofstream<_CharT, _Traits>::basic_ofstream(const wchar_t* __s, ios_base::openmode __mode) - : basic_ostream(&__sb_) { + : basic_ostream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode | ios_base::out) == nullptr) this->setstate(ios_base::failbit); } # endif +// extension template inline basic_ofstream<_CharT, _Traits>::basic_ofstream(const string& __s, ios_base::openmode __mode) - : basic_ostream(&__sb_) { + : 
basic_ostream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode | ios_base::out) == nullptr) this->setstate(ios_base::failbit); } @@ -1227,7 +1232,7 @@ inline basic_ofstream<_CharT, _Traits>::basic_ofstream(const string& __s, ios_ba template inline basic_ofstream<_CharT, _Traits>::basic_ofstream(basic_ofstream&& __rhs) : basic_ostream(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) { - this->set_rdbuf(&__sb_); + this->set_rdbuf(std::addressof(__sb_)); } template @@ -1250,7 +1255,7 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(basic_ofstream<_CharT, _Traits>& __x, bas template inline basic_filebuf<_CharT, _Traits>* basic_ofstream<_CharT, _Traits>::rdbuf() const { - return const_cast*>(&__sb_); + return const_cast*>(std::addressof(__sb_)); } template @@ -1340,11 +1345,12 @@ private: }; template -inline basic_fstream<_CharT, _Traits>::basic_fstream() : basic_iostream(&__sb_) {} +inline basic_fstream<_CharT, _Traits>::basic_fstream() + : basic_iostream(std::addressof(__sb_)) {} template inline basic_fstream<_CharT, _Traits>::basic_fstream(const char* __s, ios_base::openmode __mode) - : basic_iostream(&__sb_) { + : basic_iostream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode) == nullptr) this->setstate(ios_base::failbit); } @@ -1352,7 +1358,7 @@ inline basic_fstream<_CharT, _Traits>::basic_fstream(const char* __s, ios_base:: # ifdef _LIBCPP_HAS_OPEN_WITH_WCHAR template inline basic_fstream<_CharT, _Traits>::basic_fstream(const wchar_t* __s, ios_base::openmode __mode) - : basic_iostream(&__sb_) { + : basic_iostream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode) == nullptr) this->setstate(ios_base::failbit); } @@ -1360,15 +1366,16 @@ inline basic_fstream<_CharT, _Traits>::basic_fstream(const wchar_t* __s, ios_bas template inline basic_fstream<_CharT, _Traits>::basic_fstream(const string& __s, ios_base::openmode __mode) - : basic_iostream(&__sb_) { + : basic_iostream(std::addressof(__sb_)) { if (__sb_.open(__s, __mode) == nullptr) 
this->setstate(ios_base::failbit); } +// extension template inline basic_fstream<_CharT, _Traits>::basic_fstream(basic_fstream&& __rhs) : basic_iostream(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) { - this->set_rdbuf(&__sb_); + this->set_rdbuf(std::addressof(__sb_)); } template @@ -1391,7 +1398,7 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(basic_fstream<_CharT, _Traits>& __x, basi template inline basic_filebuf<_CharT, _Traits>* basic_fstream<_CharT, _Traits>::rdbuf() const { - return const_cast*>(&__sb_); + return const_cast*>(std::addressof(__sb_)); } template diff --git a/libcxx/include/__cxx03/ios b/libcxx/include/__cxx03/ios index 7c522909e6428..aa03ce348b624 100644 --- a/libcxx/include/__cxx03/ios +++ b/libcxx/include/__cxx03/ios @@ -218,6 +218,7 @@ storage-class-specifier const error_category& iostream_category() noexcept; # include <__cxx03/__fwd/ios.h> # include <__cxx03/__ios/fpos.h> # include <__cxx03/__locale> +# include <__cxx03/__memory/addressof.h> # include <__cxx03/__system_error/error_category.h> # include <__cxx03/__system_error/error_code.h> # include <__cxx03/__system_error/error_condition.h> @@ -696,7 +697,7 @@ inline _LIBCPP_HIDE_FROM_ABI _CharT basic_ios<_CharT, _Traits>::fill(char_type _ template basic_ios<_CharT, _Traits>& basic_ios<_CharT, _Traits>::copyfmt(const basic_ios& __rhs) { - if (this != &__rhs) { + if (this != std::addressof(__rhs)) { __call_callbacks(erase_event); ios_base::copyfmt(__rhs); __tie_ = __rhs.__tie_; diff --git a/libcxx/include/__cxx03/sstream b/libcxx/include/__cxx03/sstream index de56cd99553e2..44c2423a6e1fa 100644 --- a/libcxx/include/__cxx03/sstream +++ b/libcxx/include/__cxx03/sstream @@ -713,18 +713,19 @@ private: public: // [istringstream.cons] Constructors: - _LIBCPP_HIDE_FROM_ABI basic_istringstream() : basic_istream<_CharT, _Traits>(&__sb_), __sb_(ios_base::in) {} + _LIBCPP_HIDE_FROM_ABI basic_istringstream() + : basic_istream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(ios_base::in) {} 
_LIBCPP_HIDE_FROM_ABI explicit basic_istringstream(ios_base::openmode __wch) - : basic_istream<_CharT, _Traits>(&__sb_), __sb_(__wch | ios_base::in) {} + : basic_istream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(__wch | ios_base::in) {} _LIBCPP_HIDE_FROM_ABI explicit basic_istringstream(const string_type& __s, ios_base::openmode __wch = ios_base::in) - : basic_istream<_CharT, _Traits>(&__sb_), __sb_(__s, __wch | ios_base::in) {} + : basic_istream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(__s, __wch | ios_base::in) {} basic_istringstream(const basic_istringstream&) = delete; _LIBCPP_HIDE_FROM_ABI basic_istringstream(basic_istringstream&& __rhs) : basic_istream<_CharT, _Traits>(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) { - basic_istream<_CharT, _Traits>::set_rdbuf(&__sb_); + basic_istream<_CharT, _Traits>::set_rdbuf(std::addressof(__sb_)); } // [istringstream.assign] Assign and swap: @@ -741,7 +742,7 @@ public: // [istringstream.members] Member functions: _LIBCPP_HIDE_FROM_ABI basic_stringbuf* rdbuf() const { - return const_cast*>(&__sb_); + return const_cast*>(std::addressof(__sb_)); } _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } @@ -774,18 +775,19 @@ private: public: // [ostringstream.cons] Constructors: - _LIBCPP_HIDE_FROM_ABI basic_ostringstream() : basic_ostream<_CharT, _Traits>(&__sb_), __sb_(ios_base::out) {} + _LIBCPP_HIDE_FROM_ABI basic_ostringstream() + : basic_ostream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(ios_base::out) {} _LIBCPP_HIDE_FROM_ABI explicit basic_ostringstream(ios_base::openmode __wch) - : basic_ostream<_CharT, _Traits>(&__sb_), __sb_(__wch | ios_base::out) {} + : basic_ostream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(__wch | ios_base::out) {} _LIBCPP_HIDE_FROM_ABI explicit basic_ostringstream(const string_type& __s, ios_base::openmode __wch = ios_base::out) - : basic_ostream<_CharT, _Traits>(&__sb_), __sb_(__s, __wch | ios_base::out) {} + : basic_ostream<_CharT, 
_Traits>(std::addressof(__sb_)), __sb_(__s, __wch | ios_base::out) {} basic_ostringstream(const basic_ostringstream&) = delete; _LIBCPP_HIDE_FROM_ABI basic_ostringstream(basic_ostringstream&& __rhs) : basic_ostream<_CharT, _Traits>(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) { - basic_ostream<_CharT, _Traits>::set_rdbuf(&__sb_); + basic_ostream<_CharT, _Traits>::set_rdbuf(std::addressof(__sb_)); } // [ostringstream.assign] Assign and swap: @@ -803,7 +805,7 @@ public: // [ostringstream.members] Member functions: _LIBCPP_HIDE_FROM_ABI basic_stringbuf* rdbuf() const { - return const_cast*>(&__sb_); + return const_cast*>(std::addressof(__sb_)); } _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } @@ -836,19 +838,19 @@ private: public: // [stringstream.cons] constructors _LIBCPP_HIDE_FROM_ABI basic_stringstream() - : basic_iostream<_CharT, _Traits>(&__sb_), __sb_(ios_base::in | ios_base::out) {} + : basic_iostream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(ios_base::in | ios_base::out) {} _LIBCPP_HIDE_FROM_ABI explicit basic_stringstream(ios_base::openmode __wch) - : basic_iostream<_CharT, _Traits>(&__sb_), __sb_(__wch) {} + : basic_iostream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(__wch) {} _LIBCPP_HIDE_FROM_ABI explicit basic_stringstream(const string_type& __s, ios_base::openmode __wch = ios_base::in | ios_base::out) - : basic_iostream<_CharT, _Traits>(&__sb_), __sb_(__s, __wch) {} + : basic_iostream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(__s, __wch) {} basic_stringstream(const basic_stringstream&) = delete; _LIBCPP_HIDE_FROM_ABI basic_stringstream(basic_stringstream&& __rhs) : basic_iostream<_CharT, _Traits>(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) { - basic_istream<_CharT, _Traits>::set_rdbuf(&__sb_); + basic_istream<_CharT, _Traits>::set_rdbuf(std::addressof(__sb_)); } // [stringstream.assign] Assign and swap: @@ -865,7 +867,7 @@ public: // [stringstream.members] Member functions: _LIBCPP_HIDE_FROM_ABI 
basic_stringbuf* rdbuf() const { - return const_cast*>(&__sb_); + return const_cast*>(std::addressof(__sb_)); } _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } diff --git a/libcxx/include/__cxx03/string b/libcxx/include/__cxx03/string index 178140486105e..6c43fca08f109 100644 --- a/libcxx/include/__cxx03/string +++ b/libcxx/include/__cxx03/string @@ -2866,13 +2866,13 @@ inline void basic_string<_CharT, _Traits, _Allocator>::swap(basic_string& __str) "swapping non-equal allocators"); if (!__is_long()) __annotate_delete(); - if (this != &__str && !__str.__is_long()) + if (this != std::addressof(__str) && !__str.__is_long()) __str.__annotate_delete(); std::swap(__r_.first(), __str.__r_.first()); std::__swap_allocator(__alloc(), __str.__alloc()); if (!__is_long()) __annotate_new(__get_short_size()); - if (this != &__str && !__str.__is_long()) + if (this != std::addressof(__str) && !__str.__is_long()) __str.__annotate_new(__str.__get_short_size()); } diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 2018b5a194e89..2b246f82ce36d 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -1036,21 +1036,7 @@ private: } _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__hash_table&, false_type) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI void __deallocate_node(__node_pointer __nd) _NOEXCEPT { - auto& __alloc = __node_alloc(); - __node_traits::destroy(__alloc, std::addressof(__nd->__get_value())); - std::__destroy_at(std::__to_address(__nd)); - __node_traits::deallocate(__alloc, __nd, 1); - } - - _LIBCPP_HIDE_FROM_ABI void __deallocate_node_list(__next_pointer __np) _NOEXCEPT { - while (__np != nullptr) { - __next_pointer __next = __np->__next_; - __deallocate_node(__np->__upcast()); - __np = __next; - } - } - + _LIBCPP_HIDE_FROM_ABI void __deallocate_node(__next_pointer __np) _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI __next_pointer __detach() _NOEXCEPT; template ::value, int> = 0> @@ -1188,7 +1174,7 @@ __hash_table<_Tp, 
_Hash, _Equal, _Alloc>::~__hash_table() { static_assert(is_copy_constructible::value, "Hasher must be copy-constructible."); #endif - __deallocate_node_list(__first_node_.__next_); + __deallocate_node(__first_node_.__next_); } template @@ -1264,7 +1250,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::operator=(const __hash_table& __other) // At this point we either have consumed the whole incoming hash table, or we don't have any more nodes to reuse in // the destination. Either continue with constructing new nodes, or deallocate the left over nodes. if (__own_iter->__next_) { - __deallocate_node_list(__own_iter->__next_); + __deallocate_node(__own_iter->__next_); __own_iter->__next_ = nullptr; } else { __copy_construct(__other_iter, __own_iter, __current_chash); @@ -1275,6 +1261,19 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::operator=(const __hash_table& __other) return *this; } +template +void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer __np) _NOEXCEPT { + __node_allocator& __na = __node_alloc(); + while (__np != nullptr) { + __next_pointer __next = __np->__next_; + __node_pointer __real_np = __np->__upcast(); + __node_traits::destroy(__na, std::addressof(__real_np->__get_value())); + std::__destroy_at(std::addressof(*__real_np)); + __node_traits::deallocate(__na, __real_np, 1); + __np = __next; + } +} + template typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::__next_pointer __hash_table<_Tp, _Hash, _Equal, _Alloc>::__detach() _NOEXCEPT { @@ -1330,11 +1329,11 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign(__hash_table& __u, } #if _LIBCPP_HAS_EXCEPTIONS } catch (...) 
{ - __deallocate_node_list(__cache); + __deallocate_node(__cache); throw; } #endif // _LIBCPP_HAS_EXCEPTIONS - __deallocate_node_list(__cache); + __deallocate_node(__cache); } const_iterator __i = __u.begin(); while (__u.size() != 0) @@ -1373,11 +1372,11 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_unique(_InputIterator __ } #if _LIBCPP_HAS_EXCEPTIONS } catch (...) { - __deallocate_node_list(__cache); + __deallocate_node(__cache); throw; } #endif // _LIBCPP_HAS_EXCEPTIONS - __deallocate_node_list(__cache); + __deallocate_node(__cache); } for (; __first != __last; ++__first) __emplace_unique(*__first); @@ -1403,11 +1402,11 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __f } #if _LIBCPP_HAS_EXCEPTIONS } catch (...) { - __deallocate_node_list(__cache); + __deallocate_node(__cache); throw; } #endif // _LIBCPP_HAS_EXCEPTIONS - __deallocate_node_list(__cache); + __deallocate_node(__cache); } for (; __first != __last; ++__first) __emplace_multi(*__first); @@ -1440,7 +1439,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::end() const _NOEXCEPT { template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::clear() _NOEXCEPT { if (size() > 0) { - __deallocate_node_list(__first_node_.__next_); + __deallocate_node(__first_node_.__next_); __first_node_.__next_ = nullptr; size_type __bc = bucket_count(); for (size_type __i = 0; __i < __bc; ++__i) @@ -1900,57 +1899,12 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __p) { template typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __first, const_iterator __last) { - if (__first == __last) - return iterator(__last.__node_); - - // current node - __next_pointer __current = __first.__node_; - size_type __bucket_count = bucket_count(); - size_t __chash = std::__constrain_hash(__current->__hash(), __bucket_count); - // find previous node - __next_pointer __before_first = __bucket_list_[__chash]; - for (; 
__before_first->__next_ != __current; __before_first = __before_first->__next_) - ; - - __next_pointer __last_node = __last.__node_; - - // If __before_first is in the same bucket (i.e. the first element we erase is not the first in the bucket), clear - // this bucket first without re-linking it - if (__before_first != __first_node_.__ptr() && - std::__constrain_hash(__before_first->__hash(), __bucket_count) == __chash) { - while (__current != __last_node) { - if (auto __next_chash = std::__constrain_hash(__current->__hash(), __bucket_count); __next_chash != __chash) { - __chash = __next_chash; - break; - } - auto __next = __current->__next_; - __deallocate_node(__current->__upcast()); - __current = __next; - --__size_; - } + for (const_iterator __p = __first; __first != __last; __p = __first) { + ++__first; + erase(__p); } - - while (__current != __last_node) { - auto __next = __current->__next_; - __deallocate_node(__current->__upcast()); - __current = __next; - --__size_; - - // When switching buckets, set the old bucket to be empty and update the next bucket to have __before_first as its - // before-first element - if (__next) { - if (auto __next_chash = std::__constrain_hash(__next->__hash(), __bucket_count); __next_chash != __chash) { - __bucket_list_[__chash] = nullptr; - __chash = __next_chash; - __bucket_list_[__chash] = __before_first; - } - } - } - - // re-link __before_first with __last - __before_first->__next_ = __current; - - return iterator(__last.__node_); + __next_pointer __np = __last.__node_; + return iterator(__np); } template diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index 21e58f4abc6b3..15368a3bc8955 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -13,10 +13,12 @@ #include <__algorithm/max.h> #include <__algorithm/move.h> #include <__algorithm/move_backward.h> +#include <__assert> #include <__config> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> 
#include <__iterator/move_iterator.h> +#include <__memory/addressof.h> #include <__memory/allocate_at_least.h> #include <__memory/allocator.h> #include <__memory/allocator_traits.h> @@ -45,25 +47,434 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -// __split_buffer allocates a contiguous chunk of memory and stores objects in the range [__begin_, __end_). -// It has uninitialized memory in the ranges [__first_, __begin_) and [__end_, __cap_). That allows -// it to grow both in the front and back without having to move the data. +template class _Layout> +class __split_buffer; + +template +class __split_buffer_pointer_layout { +protected: + using value_type = _Tp; + using allocator_type = _Allocator; + using __alloc_rr _LIBCPP_NODEBUG = __libcpp_remove_reference_t; + using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>; + using reference = value_type&; + using const_reference = const value_type&; + using size_type = typename __alloc_traits::size_type; + using difference_type = typename __alloc_traits::difference_type; + using pointer = typename __alloc_traits::pointer; + using const_pointer = typename __alloc_traits::const_pointer; + using iterator = pointer; + using const_iterator = const_pointer; + using __sentinel_type _LIBCPP_NODEBUG = pointer; -template > -struct __split_buffer { public: - using value_type = _Tp; - using allocator_type = _Allocator; - using __alloc_rr _LIBCPP_NODEBUG = __libcpp_remove_reference_t; - using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>; - using reference = value_type&; - using const_reference = const value_type&; - using size_type = typename __alloc_traits::size_type; - using difference_type = typename __alloc_traits::difference_type; - using pointer = typename __alloc_traits::pointer; - using const_pointer = typename __alloc_traits::const_pointer; - using iterator = pointer; - using const_iterator = const_pointer; + // Can't be defaulted due to _LIBCPP_COMPRESSED_PAIR not being an aggregate in 
C++03 and C++11. + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer_pointer_layout() : __back_cap_(nullptr) {} + + _LIBCPP_CONSTEXPR_SINCE_CXX20 + _LIBCPP_HIDE_FROM_ABI explicit __split_buffer_pointer_layout(const allocator_type& __alloc) + : __back_cap_(nullptr), __alloc_(__alloc) {} + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer __front_cap() _NOEXCEPT { return __front_cap_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_pointer __front_cap() const _NOEXCEPT { + return __front_cap_; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer begin() _NOEXCEPT { return __begin_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_pointer begin() const _NOEXCEPT { return __begin_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer end() _NOEXCEPT { return __end_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer end() const _NOEXCEPT { return __end_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { + return static_cast(__end_ - __begin_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __begin_ == __end_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type capacity() const _NOEXCEPT { + return static_cast(__back_cap_ - __front_cap_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI allocator_type& __get_allocator() _NOEXCEPT { return __alloc_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI allocator_type const& __get_allocator() const _NOEXCEPT { + return __alloc_; + } + + // Returns the sentinel object directly. Should be used in conjunction with automatic type deduction, + // not explicit types. 
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __sentinel_type __raw_sentinel() const _NOEXCEPT { + return __end_; + } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __sentinel_type __raw_capacity() const _NOEXCEPT { + return __back_cap_; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_data(pointer __new_first) _NOEXCEPT { + __front_cap_ = __new_first; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __set_valid_range(pointer __new_begin, pointer __new_end) _NOEXCEPT { + __begin_ = __new_begin; + __end_ = __new_end; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __set_valid_range(pointer __new_begin, size_type __new_size) _NOEXCEPT { + __begin_ = __new_begin; + __end_ = __begin_ + __new_size; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_sentinel(pointer __new_end) _NOEXCEPT { + _LIBCPP_ASSERT_INTERNAL(__front_cap_ <= __new_end, "__new_end cannot precede __front_cap_"); + __end_ = __new_end; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_sentinel(size_type __new_size) _NOEXCEPT { + __end_ = __begin_ + __new_size; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_capacity(size_type __new_capacity) _NOEXCEPT { + __back_cap_ = __front_cap_ + __new_capacity; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_capacity(pointer __new_capacity) _NOEXCEPT { + __back_cap_ = __new_capacity; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __front_spare() const _NOEXCEPT { + return static_cast(__begin_ - __front_cap_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __back_spare() const _NOEXCEPT { + return static_cast(__back_cap_ - __end_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference back() _NOEXCEPT { return *(__end_ - 1); } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { return *(__end_ - 1); } + 
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_without_allocator( + __split_buffer_pointer_layout<__split_buffer, + value_type, + __alloc_rr&>& __other) _NOEXCEPT { + std::swap(__front_cap_, __other.__front_cap_); + std::swap(__begin_, __other.__begin_); + std::swap(__back_cap_, __other.__back_cap_); + std::swap(__end_, __other.__end_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void swap(__split_buffer_pointer_layout& __other) _NOEXCEPT { + std::swap(__front_cap_, __other.__front_cap_); + std::swap(__begin_, __other.__begin_); + std::swap(__back_cap_, __other.__back_cap_); + std::swap(__end_, __other.__end_); + std::__swap_allocator(__alloc_, __other.__alloc_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __reset() _NOEXCEPT { + __front_cap_ = nullptr; + __begin_ = nullptr; + __end_ = nullptr; + __back_cap_ = nullptr; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __copy_without_alloc(__split_buffer_pointer_layout const& __other) + _NOEXCEPT_(is_nothrow_copy_assignable::value) { + __front_cap_ = __other.__front_cap_; + __begin_ = __other.__begin_; + __end_ = __other.__end_; + __back_cap_ = __other.__back_cap_; + } + +private: + pointer __front_cap_ = nullptr; + pointer __begin_ = nullptr; + pointer __end_ = nullptr; + _LIBCPP_COMPRESSED_PAIR(pointer, __back_cap_, allocator_type, __alloc_); + + template + friend class __split_buffer_pointer_layout; +}; + +template +class __split_buffer_size_layout { +protected: + using value_type = _Tp; + using allocator_type = _Allocator; + using __alloc_rr _LIBCPP_NODEBUG = __libcpp_remove_reference_t; + using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>; + using reference = value_type&; + using const_reference = const value_type&; + using size_type = typename __alloc_traits::size_type; + using difference_type = typename __alloc_traits::difference_type; + using pointer = typename __alloc_traits::pointer; + using const_pointer = typename 
__alloc_traits::const_pointer; + using iterator = pointer; + using const_iterator = const_pointer; + using __sentinel_type _LIBCPP_NODEBUG = size_type; + +public: + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer_size_layout() = default; + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer_size_layout(const allocator_type& __alloc) + : __alloc_(__alloc) {} + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer __front_cap() _NOEXCEPT { return __front_cap_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_pointer __front_cap() const _NOEXCEPT { + return __front_cap_; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer begin() _NOEXCEPT { return __begin_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_pointer begin() const _NOEXCEPT { return __begin_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer end() _NOEXCEPT { return __begin_ + __size_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer end() const _NOEXCEPT { return __begin_ + __size_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __size_ == 0; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type capacity() const _NOEXCEPT { return __cap_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI allocator_type& __get_allocator() _NOEXCEPT { return __alloc_; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI allocator_type const& __get_allocator() const _NOEXCEPT { + return __alloc_; + } + + // Returns the sentinel object directly. Should be used in conjunction with automatic type deduction, + // not explicit types. 
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __sentinel_type __raw_sentinel() const _NOEXCEPT { + return __size_; + } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __sentinel_type __raw_capacity() const _NOEXCEPT { + return __cap_; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_data(pointer __new_first) _NOEXCEPT { + __front_cap_ = __new_first; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __set_valid_range(pointer __new_begin, pointer __new_end) _NOEXCEPT { + // Size-based __split_buffers track their size directly: we need to explicitly update the size + // when the front is adjusted. + __size_ -= __new_begin - __begin_; + __begin_ = __new_begin; + __set_sentinel(__new_end); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __set_valid_range(pointer __new_begin, size_type __new_size) _NOEXCEPT { + // Size-based __split_buffers track their size directly: we need to explicitly update the size + // when the front is adjusted. 
+ __size_ -= __new_begin - __begin_; + __begin_ = __new_begin; + __set_sentinel(__new_size); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_sentinel(pointer __new_end) _NOEXCEPT { + _LIBCPP_ASSERT_INTERNAL(__front_cap_ <= __new_end, "__new_end cannot precede __front_cap_"); + __size_ += __new_end - end(); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_sentinel(size_type __new_size) _NOEXCEPT { + __size_ = __new_size; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_capacity(size_type __new_capacity) _NOEXCEPT { + __cap_ = __new_capacity; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __set_capacity(pointer __new_capacity) _NOEXCEPT { + __cap_ = __new_capacity - __begin_; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __front_spare() const _NOEXCEPT { + return static_cast(__begin_ - __front_cap_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __back_spare() const _NOEXCEPT { + // `__cap_ - __end_` tells us the total number of spares when in size-mode. We need to remove + // the __front_spare from the count. 
+ return __cap_ - __size_ - __front_spare(); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference back() _NOEXCEPT { return __begin_[__size_ - 1]; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { + return __begin_[__size_ - 1]; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_without_allocator( + __split_buffer_pointer_layout<__split_buffer, + value_type, + __alloc_rr&>& __other) _NOEXCEPT { + std::swap(__front_cap_, __other.__front_cap_); + std::swap(__begin_, __other.__begin_); + std::swap(__cap_, __other.__cap_); + std::swap(__size_, __other.__size_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void swap(__split_buffer_size_layout& __other) _NOEXCEPT { + std::swap(__front_cap_, __other.__front_cap_); + std::swap(__begin_, __other.__begin_); + std::swap(__cap_, __other.__cap_); + std::swap(__size_, __other.__size_); + std::__swap_allocator(__alloc_, __other.__alloc_); + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __reset() _NOEXCEPT { + __front_cap_ = nullptr; + __begin_ = nullptr; + __size_ = 0; + __cap_ = 0; + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __copy_without_alloc(__split_buffer_size_layout const& __other) + _NOEXCEPT_(is_nothrow_copy_assignable::value) { + __front_cap_ = __other.__front_cap_; + __begin_ = __other.__begin_; + __cap_ = __other.__cap_; + __size_ = __other.__size_; + } + +private: + pointer __front_cap_ = nullptr; + pointer __begin_ = nullptr; + size_type __size_ = 0; + size_type __cap_ = 0; + _LIBCPP_NO_UNIQUE_ADDRESS allocator_type __alloc_; + + template + friend class __split_buffer_size_layout; +}; + +// `__split_buffer` is a contiguous array data structure. It may hold spare capacity at both ends of +// the sequence. This allows for a `__split_buffer` to grow from both the front and the back without +// relocating its contents until it runs out of room. 
This characteristic sets it apart from +// `std::vector`, which only holds spare capacity at its end. As such, `__split_buffer` is useful +// for implementing both `std::vector` and `std::deque`. +// +// The sequence is stored as a contiguous chunk of memory delimited by the following "pointers" (`o` denotes +// uninitialized memory and `x` denotes a valid object): +// +// |oooooooooooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxoooooooooooooooooooooooo| +// ^ ^ ^ ^ +// __front_cap_ __begin_ __end_ __back_cap_ +// +// The range [__front_cap_, __begin_) contains uninitialized memory. It is referred to as the "front spare capacity". +// The range [__begin_, __end_) contains valid objects. It is referred to as the "valid range". +// The range [__end_, __back_cap_) contains uninitialized memory. It is referred to as the "back spare capacity". +// +// The layout of `__split_buffer` is determined by the `_Layout` template template parameter. This +// `_Layout` allows the above pointers to be stored as different representations, such as integer +// offsets. 
A layout class template must provide the following interface: +// +// template +// class __layout { +// protected: +// using value_type = _Tp; +// using allocator_type = _Allocator; +// using __alloc_rr = __libcpp_remove_reference_t; +// using __alloc_traits = allocator_traits<__alloc_rr>; +// using reference = value_type&; +// using const_reference = const value_type&; +// using size_type = typename __alloc_traits::size_type; +// using difference_type = typename __alloc_traits::difference_type; +// using pointer = typename __alloc_traits::pointer; +// using const_pointer = typename __alloc_traits::const_pointer; +// using iterator = pointer; +// using const_iterator = const_pointer; +// using __sentinel_type = /* type that represents the layout's sentinel */; +// +// public: +// __layout() = default; +// explicit __layout(const allocator_type&); +// +// pointer __front_cap(); +// const_pointer __front_cap() const; +// +// pointer begin(); +// const_pointer begin() const; +// +// pointer end(); +// pointer end() const; +// +// size_type size() const; +// bool empty() const; +// size_type capacity() const; +// +// allocator_type& __get_allocator(); +// allocator_type const& __get_allocator() const; +// +// __sentinel_type __raw_sentinel() const; +// __sentinel_type __raw_capacity() const; +// +// void __set_data(pointer); +// void __set_valid_range(pointer __begin, pointer __end); +// void __set_valid_range(pointer __begin, size_type __size); +// void __set_sentinel(pointer __end); +// void __set_sentinel(size_type __size); +// +// void __set_capacity(size_type __capacity); +// void __set_capacity(pointer __capacity); +// +// size_type __front_spare() const; +// size_type __back_spare() const; +// +// reference back(); +// const_reference back() const; +// +// template +// void __swap_without_allocator(_OtherLayout&); +// void swap(__layout&); +// +// void __reset(); +// void __copy_without_alloc(__layout const&); +// }; +// +template class _Layout> +class 
__split_buffer : _Layout<__split_buffer<_Tp, _Allocator, _Layout>, _Tp, _Allocator> { + using __base_type _LIBCPP_NODEBUG = _Layout<__split_buffer<_Tp, _Allocator, _Layout>, _Tp, _Allocator>; + +public: + using __base_type::__back_spare; + using __base_type::__copy_without_alloc; + using __base_type::__front_cap; + using __base_type::__front_spare; + using __base_type::__get_allocator; + using __base_type::__raw_capacity; + using __base_type::__raw_sentinel; + using __base_type::__reset; + using __base_type::__set_capacity; + using __base_type::__set_data; + using __base_type::__set_sentinel; + using __base_type::__set_valid_range; + + using typename __base_type::__alloc_rr; + using typename __base_type::__alloc_traits; + using typename __base_type::allocator_type; + using typename __base_type::const_iterator; + using typename __base_type::const_pointer; + using typename __base_type::const_reference; + using typename __base_type::difference_type; + using typename __base_type::iterator; + using typename __base_type::pointer; + using typename __base_type::reference; + using typename __base_type::size_type; + using typename __base_type::value_type; // A __split_buffer contains the following members which may be trivially relocatable: // - pointer: may be trivially relocatable, so it's checked @@ -78,23 +489,15 @@ public: __split_buffer, void>; - pointer __first_; - pointer __begin_; - pointer __end_; - _LIBCPP_COMPRESSED_PAIR(pointer, __cap_, allocator_type, __alloc_); - __split_buffer(const __split_buffer&) = delete; __split_buffer& operator=(const __split_buffer&) = delete; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer() - _NOEXCEPT_(is_nothrow_default_constructible::value) - : __first_(nullptr), __begin_(nullptr), __end_(nullptr), __cap_(nullptr) {} + _LIBCPP_HIDE_FROM_ABI __split_buffer() = default; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(__alloc_rr& __a) - : __first_(nullptr), __begin_(nullptr), 
__end_(nullptr), __cap_(nullptr), __alloc_(__a) {} + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(__alloc_rr& __a) : __base_type(__a) {} _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(const __alloc_rr& __a) - : __first_(nullptr), __begin_(nullptr), __end_(nullptr), __cap_(nullptr), __alloc_(__a) {} + : __base_type(__a) {} _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(size_type __cap, size_type __start, __alloc_rr& __a); @@ -111,36 +514,16 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI ~__split_buffer(); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __begin_; } - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __end_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __end_; } - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __destruct_at_end(__begin_); } + using __base_type::back; + using __base_type::begin; + using __base_type::capacity; + using __base_type::empty; + using __base_type::end; + using __base_type::size; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type size() const { - return static_cast(__end_ - __begin_); - } - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool empty() const { return __end_ == __begin_; } - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type capacity() const { - return static_cast(__cap_ - __first_); - } - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __front_spare() const { - return static_cast(__begin_ - __first_); - } - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __back_spare() const { - return static_cast(__cap_ - __end_); - } - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI 
reference front() { return *__begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference front() const { return *__begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference back() { return *(__end_ - 1); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference back() const { return *(__end_ - 1); } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __destruct_at_end(begin()); } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference front() { return *begin(); } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference front() const { return *begin(); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void shrink_to_fit() _NOEXCEPT; @@ -149,8 +532,8 @@ public: template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void pop_front() { __destruct_at_begin(__begin_ + 1); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void pop_back() { __destruct_at_end(__end_ - 1); } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void pop_front() { __destruct_at_begin(begin() + 1); } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void pop_back() { __destruct_at_end(end() - 1); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n, const_reference __x); @@ -184,242 +567,240 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void swap(__split_buffer& __x) _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__alloc_rr>); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __invariants() const; + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __invariants() const { + if (__front_cap() == nullptr) { + if (begin() != nullptr) + return false; + + if (!empty()) + return false; + + 
if (capacity() != 0) + return false; + + return true; + } else { + if (begin() < __front_cap()) + return false; + + if (capacity() < size()) + return false; + + if (end() < begin()) + return false; + + return true; + } + } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void + __swap_without_allocator(__split_buffer& __other) _NOEXCEPT { + __base_type::__swap_without_allocator(__other); + } private: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__split_buffer& __c, true_type) _NOEXCEPT_(is_nothrow_move_assignable::value) { - __alloc_ = std::move(__c.__alloc_); + __get_allocator() = std::move(__c.__get_allocator()); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__split_buffer&, false_type) _NOEXCEPT {} struct _ConstructTransaction { _LIBCPP_CONSTEXPR_SINCE_CXX20 - _LIBCPP_HIDE_FROM_ABI explicit _ConstructTransaction(pointer* __p, size_type __n) _NOEXCEPT - : __pos_(*__p), - __end_(*__p + __n), - __dest_(__p) {} + _LIBCPP_HIDE_FROM_ABI explicit _ConstructTransaction(__split_buffer* __parent, pointer __p, size_type __n) _NOEXCEPT + : __pos_(__p), + __end_(__p + __n), + __parent_(__parent) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI ~_ConstructTransaction() { *__dest_ = __pos_; } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI ~_ConstructTransaction() { __parent_->__set_sentinel(__pos_); } pointer __pos_; const pointer __end_; private: - pointer* __dest_; + __split_buffer* __parent_; }; -}; -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 bool __split_buffer<_Tp, _Allocator>::__invariants() const { - if (__first_ == nullptr) { - if (__begin_ != nullptr) - return false; - if (__end_ != nullptr) - return false; - if (__cap_ != nullptr) - return false; - } else { - if (__begin_ < __first_) - return false; - if (__end_ < __begin_) - return false; - if (__cap_ < __end_) - return false; - } - return true; -} + template class _L2> + friend class __split_buffer; +}; -// Default constructs __n 
objects starting at __end_ +// Default constructs __n objects starting at `end()` // throws if construction throws // Precondition: __n > 0 // Precondition: size() + __n <= capacity() // Postcondition: size() == size() + __n -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n) { - _ConstructTransaction __tx(std::addressof(this->__end_), __n); +template class _Layout> +_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::__construct_at_end(size_type __n) { + _ConstructTransaction __tx(this, end(), __n); for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) { - __alloc_traits::construct(__alloc_, std::__to_address(__tx.__pos_)); + __alloc_traits::construct(__get_allocator(), std::__to_address(__tx.__pos_)); } } -// Copy constructs __n objects starting at __end_ from __x +// Copy constructs __n objects starting at `end()` from __x // throws if construction throws // Precondition: __n > 0 // Precondition: size() + __n <= capacity() // Postcondition: size() == old size() + __n // Postcondition: [i] == __x for all i in [size() - __n, __n) -template +template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n, const_reference __x) { - _ConstructTransaction __tx(std::addressof(this->__end_), __n); +__split_buffer<_Tp, _Allocator, _Layout>::__construct_at_end(size_type __n, const_reference __x) { + _ConstructTransaction __tx(this, end(), __n); for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) { - __alloc_traits::construct(__alloc_, std::__to_address(__tx.__pos_), __x); + __alloc_traits::construct(__get_allocator(), std::__to_address(__tx.__pos_), __x); } } -template +template class _Layout> template _LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last) { - __alloc_rr& __a = __alloc_; +__split_buffer<_Tp, _Allocator, 
_Layout>::__construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last) { + __alloc_rr& __a = __get_allocator(); for (; __first != __last; ++__first) { - if (__end_ == __cap_) { - size_type __old_cap = __cap_ - __first_; + if (__back_spare() == 0) { + size_type __old_cap = capacity(); size_type __new_cap = std::max(2 * __old_cap, 8); __split_buffer __buf(__new_cap, 0, __a); - for (pointer __p = __begin_; __p != __end_; ++__p, (void)++__buf.__end_) - __alloc_traits::construct(__buf.__alloc_, std::__to_address(__buf.__end_), std::move(*__p)); + pointer __buf_end = __buf.end(); + pointer __end = end(); + for (pointer __p = begin(); __p != __end; ++__p) { + __alloc_traits::construct(__buf.__get_allocator(), std::__to_address(__buf_end), std::move(*__p)); + __buf.__set_sentinel(++__buf_end); + } swap(__buf); } - __alloc_traits::construct(__a, std::__to_address(this->__end_), *__first); - ++this->__end_; + + __alloc_traits::construct(__a, std::__to_address(end()), *__first); + __set_sentinel(size() + 1); } } -template + +template class _Layout> template ::value, int> > _LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end(_ForwardIterator __first, _ForwardIterator __last) { +__split_buffer<_Tp, _Allocator, _Layout>::__construct_at_end(_ForwardIterator __first, _ForwardIterator __last) { __construct_at_end_with_size(__first, std::distance(__first, __last)); } -template +template class _Layout> template _LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end_with_size(_ForwardIterator __first, size_type __n) { - _ConstructTransaction __tx(std::addressof(this->__end_), __n); +__split_buffer<_Tp, _Allocator, _Layout>::__construct_at_end_with_size(_ForwardIterator __first, size_type __n) { + _ConstructTransaction __tx(this, end(), __n); for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_, (void)++__first) { - __alloc_traits::construct(__alloc_, std::__to_address(__tx.__pos_), *__first); + 
__alloc_traits::construct(__get_allocator(), std::__to_address(__tx.__pos_), *__first); } } -template +template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -__split_buffer<_Tp, _Allocator>::__destruct_at_begin(pointer __new_begin, false_type) { - while (__begin_ != __new_begin) - __alloc_traits::destroy(__alloc_, std::__to_address(__begin_++)); +__split_buffer<_Tp, _Allocator, _Layout>::__destruct_at_begin(pointer __new_begin, false_type) { + pointer __begin = begin(); + // Updating begin at every iteration is unnecessary because destruction can't throw. + while (__begin != __new_begin) + __alloc_traits::destroy(__get_allocator(), std::__to_address(__begin++)); + __set_valid_range(__begin, end()); } -template +template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -__split_buffer<_Tp, _Allocator>::__destruct_at_begin(pointer __new_begin, true_type) { - __begin_ = __new_begin; -} - -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void -__split_buffer<_Tp, _Allocator>::__destruct_at_end(pointer __new_last, false_type) _NOEXCEPT { - while (__new_last != __end_) - __alloc_traits::destroy(__alloc_, std::__to_address(--__end_)); +__split_buffer<_Tp, _Allocator, _Layout>::__destruct_at_begin(pointer __new_begin, true_type) { + __set_valid_range(__new_begin, end()); } -template +template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void -__split_buffer<_Tp, _Allocator>::__destruct_at_end(pointer __new_last, true_type) _NOEXCEPT { - __end_ = __new_last; +__split_buffer<_Tp, _Allocator, _Layout>::__destruct_at_end(pointer __new_last, false_type) _NOEXCEPT { + pointer __end = end(); + // Updating end at every iteration is unnecessary because destruction can't throw. 
+ while (__new_last != __end) + __alloc_traits::destroy(__get_allocator(), std::__to_address(--__end)); + __set_sentinel(__end); } -template +template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 -__split_buffer<_Tp, _Allocator>::__split_buffer(size_type __cap, size_type __start, __alloc_rr& __a) - : __cap_(nullptr), __alloc_(__a) { - if (__cap == 0) { - __first_ = nullptr; - } else { - auto __allocation = std::__allocate_at_least(__alloc_, __cap); - __first_ = __allocation.ptr; - __cap = __allocation.count; +__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(size_type __cap, size_type __start, __alloc_rr& __a) + : __base_type(__a) { + _LIBCPP_ASSERT_INTERNAL(__cap >= __start, "can't have a start point outside the capacity"); + if (__cap > 0) { + auto __allocation = std::__allocate_at_least(__get_allocator(), __cap); + __set_data(__allocation.ptr); + __cap = __allocation.count; } - __begin_ = __end_ = __first_ + __start; - __cap_ = __first_ + __cap; + + pointer __begin = __front_cap() + __start; + __set_valid_range(__begin, __begin); + __set_capacity(__cap); } -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::~__split_buffer() { +template class _Layout> +_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator, _Layout>::~__split_buffer() { clear(); - if (__first_) - __alloc_traits::deallocate(__alloc_, __first_, capacity()); + if (__front_cap()) + __alloc_traits::deallocate(__get_allocator(), __front_cap(), capacity()); } -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::__split_buffer(__split_buffer&& __c) +template class _Layout> +_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(__split_buffer&& __c) _NOEXCEPT_(is_nothrow_move_constructible::value) - : __first_(std::move(__c.__first_)), - __begin_(std::move(__c.__begin_)), - __end_(std::move(__c.__end_)), - __cap_(std::move(__c.__cap_)), - __alloc_(std::move(__c.__alloc_)) { - __c.__first_ = nullptr; - __c.__begin_ = 
nullptr; - __c.__end_ = nullptr; - __c.__cap_ = nullptr; + : __base_type(std::move(__c)) { + __c.__reset(); } -template +template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 -__split_buffer<_Tp, _Allocator>::__split_buffer(__split_buffer&& __c, const __alloc_rr& __a) - : __cap_(nullptr), __alloc_(__a) { - if (__a == __c.__alloc_) { - __first_ = __c.__first_; - __begin_ = __c.__begin_; - __end_ = __c.__end_; - __cap_ = __c.__cap_; - __c.__first_ = nullptr; - __c.__begin_ = nullptr; - __c.__end_ = nullptr; - __c.__cap_ = nullptr; +__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(__split_buffer&& __c, const __alloc_rr& __a) + : __base_type(__a) { + if (__a == __c.__get_allocator()) { + __set_data(__c.__front_cap()); + __set_valid_range(__c.begin(), __c.end()); + __set_capacity(__c.capacity()); + __c.__reset(); } else { - auto __allocation = std::__allocate_at_least(__alloc_, __c.size()); - __first_ = __allocation.ptr; - __begin_ = __end_ = __first_; - __cap_ = __first_ + __allocation.count; + auto __allocation = std::__allocate_at_least(__get_allocator(), __c.size()); + __set_data(__allocation.ptr); + __set_valid_range(__front_cap(), __front_cap()); + __set_capacity(__allocation.count); typedef move_iterator _Ip; __construct_at_end(_Ip(__c.begin()), _Ip(__c.end())); } } -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>& -__split_buffer<_Tp, _Allocator>::operator=(__split_buffer&& __c) +template class _Layout> +_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator, _Layout>& +__split_buffer<_Tp, _Allocator, _Layout>::operator=(__split_buffer&& __c) _NOEXCEPT_((__alloc_traits::propagate_on_container_move_assignment::value && is_nothrow_move_assignable::value) || !__alloc_traits::propagate_on_container_move_assignment::value) { clear(); shrink_to_fit(); - __first_ = __c.__first_; - __begin_ = __c.__begin_; - __end_ = __c.__end_; - __cap_ = __c.__cap_; + __copy_without_alloc(__c); __move_assign_alloc(__c, integral_constant()); - 
__c.__first_ = __c.__begin_ = __c.__end_ = __c.__cap_ = nullptr; + __c.__reset(); return *this; } -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::swap(__split_buffer& __x) +template class _Layout> +_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::swap(__split_buffer& __x) _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__alloc_rr>) { - std::swap(__first_, __x.__first_); - std::swap(__begin_, __x.__begin_); - std::swap(__end_, __x.__end_); - std::swap(__cap_, __x.__cap_); - std::__swap_allocator(__alloc_, __x.__alloc_); + __base_type::swap(__x); } -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT { +template class _Layout> +_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::shrink_to_fit() _NOEXCEPT { if (capacity() > size()) { #if _LIBCPP_HAS_EXCEPTIONS try { #endif // _LIBCPP_HAS_EXCEPTIONS - __split_buffer __t(size(), 0, __alloc_); + __split_buffer __t(size(), 0, __get_allocator()); if (__t.capacity() < capacity()) { - __t.__construct_at_end(move_iterator(__begin_), move_iterator(__end_)); - __t.__end_ = __t.__begin_ + (__end_ - __begin_); - std::swap(__first_, __t.__first_); - std::swap(__begin_, __t.__begin_); - std::swap(__end_, __t.__end_); - std::swap(__cap_, __t.__cap_); + __t.__construct_at_end(move_iterator(begin()), move_iterator(end())); + __t.__set_sentinel(size()); + __swap_without_allocator(__t); } #if _LIBCPP_HAS_EXCEPTIONS } catch (...) { @@ -428,55 +809,56 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::shrink_to_fi } } -template +template class _Layout> template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::emplace_front(_Args&&... 
__args) { - if (__begin_ == __first_) { - if (__end_ < __cap_) { - difference_type __d = __cap_ - __end_; +_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::emplace_front(_Args&&... __args) { + if (__front_spare() == 0) { + pointer __end = end(); + if (__back_spare() > 0) { + // The elements are pressed up against the front of the buffer: we need to move them back a + // little bit to make `emplace_front` have amortised O(1) complexity. + difference_type __d = __back_spare(); __d = (__d + 1) / 2; - __begin_ = std::move_backward(__begin_, __end_, __end_ + __d); - __end_ += __d; + auto __new_end = __end + __d; + __set_valid_range(std::move_backward(begin(), __end, __new_end), __new_end); } else { - size_type __c = std::max(2 * static_cast(__cap_ - __first_), 1); - __split_buffer __t(__c, (__c + 3) / 4, __alloc_); - __t.__construct_at_end(move_iterator(__begin_), move_iterator(__end_)); - std::swap(__first_, __t.__first_); - std::swap(__begin_, __t.__begin_); - std::swap(__end_, __t.__end_); - std::swap(__cap_, __t.__cap_); + size_type __c = std::max(2 * capacity(), 1); + __split_buffer __t(__c, (__c + 3) / 4, __get_allocator()); + __t.__construct_at_end(move_iterator(begin()), move_iterator(__end)); + __base_type::__swap_without_allocator(__t); } } - __alloc_traits::construct(__alloc_, std::__to_address(__begin_ - 1), std::forward<_Args>(__args)...); - --__begin_; + + __alloc_traits::construct(__get_allocator(), std::__to_address(begin() - 1), std::forward<_Args>(__args)...); + __set_valid_range(begin() - 1, size() + 1); } -template +template class _Layout> template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::emplace_back(_Args&&... __args) { - if (__end_ == __cap_) { - if (__begin_ > __first_) { - difference_type __d = __begin_ - __first_; +_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::emplace_back(_Args&&... 
__args) { + pointer __end = end(); + if (__back_spare() == 0) { + if (__front_spare() > 0) { + difference_type __d = __front_spare(); __d = (__d + 1) / 2; - __end_ = std::move(__begin_, __end_, __begin_ - __d); - __begin_ -= __d; + __end = std::move(begin(), __end, begin() - __d); + __set_valid_range(begin() - __d, __end); } else { - size_type __c = std::max(2 * static_cast(__cap_ - __first_), 1); - __split_buffer __t(__c, __c / 4, __alloc_); - __t.__construct_at_end(move_iterator(__begin_), move_iterator(__end_)); - std::swap(__first_, __t.__first_); - std::swap(__begin_, __t.__begin_); - std::swap(__end_, __t.__end_); - std::swap(__cap_, __t.__cap_); + size_type __c = std::max(2 * capacity(), 1); + __split_buffer __t(__c, __c / 4, __get_allocator()); + __t.__construct_at_end(move_iterator(begin()), move_iterator(__end)); + __base_type::__swap_without_allocator(__t); } } - __alloc_traits::construct(__alloc_, std::__to_address(__end_), std::forward<_Args>(__args)...); - ++__end_; + + __alloc_traits::construct(__get_allocator(), std::__to_address(__end), std::forward<_Args>(__args)...); + __set_sentinel(++__end); } -template +template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void -swap(__split_buffer<_Tp, _Allocator>& __x, __split_buffer<_Tp, _Allocator>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(__split_buffer<_Tp, _Allocator, _Layout>& __x, __split_buffer<_Tp, _Allocator, _Layout>& __y) + _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { __x.swap(__y); } diff --git a/libcxx/include/__type_traits/desugars_to.h b/libcxx/include/__type_traits/desugars_to.h index b67baae31b181..029b3c6336837 100644 --- a/libcxx/include/__type_traits/desugars_to.h +++ b/libcxx/include/__type_traits/desugars_to.h @@ -10,6 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_DESUGARS_TO_H #include <__config> +#include <__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -64,6 +65,9 @@ template inline const 
bool __desugars_to_v<_CanonicalTag, _Operation&&, _Args...> = __desugars_to_v<_CanonicalTag, _Operation, _Args...>; +template +struct __desugars_to : integral_constant > {}; + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___TYPE_TRAITS_DESUGARS_TO_H diff --git a/libcxx/include/__utility/default_three_way_comparator.h b/libcxx/include/__utility/default_three_way_comparator.h index ce423c6ce98e4..438ab55b43230 100644 --- a/libcxx/include/__utility/default_three_way_comparator.h +++ b/libcxx/include/__utility/default_three_way_comparator.h @@ -27,9 +27,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __default_three_way_comparator; -template -struct __default_three_way_comparator<_Tp, _Tp, __enable_if_t::value> > { - _LIBCPP_HIDE_FROM_ABI static int operator()(_Tp __lhs, _Tp __rhs) { +template +struct __default_three_way_comparator<_LHS, + _RHS, + __enable_if_t::value && is_arithmetic<_RHS>::value> > { + _LIBCPP_HIDE_FROM_ABI static int operator()(_LHS __lhs, _RHS __rhs) { if (__lhs < __rhs) return -1; if (__lhs > __rhs) @@ -38,12 +40,30 @@ struct __default_three_way_comparator<_Tp, _Tp, __enable_if_t } }; +#if _LIBCPP_STD_VER >= 20 && __has_builtin(__builtin_lt_synthesises_from_spaceship) +template +struct __default_three_way_comparator< + _LHS, + _RHS, + __enable_if_t::value && is_arithmetic<_RHS>::value) && + __builtin_lt_synthesises_from_spaceship(const _LHS&, const _RHS&)>> { + _LIBCPP_HIDE_FROM_ABI static int operator()(const _LHS& __lhs, const _RHS& __rhs) { + auto __res = __lhs <=> __rhs; + if (__res < 0) + return -1; + if (__res > 0) + return 1; + return 0; + } +}; +#endif + template -inline const bool __has_default_three_way_comparator_v = false; +struct __has_default_three_way_comparator : false_type {}; template -inline const bool - __has_default_three_way_comparator_v< _LHS, _RHS, sizeof(__default_three_way_comparator<_LHS, _RHS>) >= 0> = true; +struct __has_default_three_way_comparator<_LHS, _RHS, sizeof(__default_three_way_comparator<_LHS, _RHS>) >= 0> + 
: true_type {}; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__utility/lazy_synth_three_way_comparator.h b/libcxx/include/__utility/lazy_synth_three_way_comparator.h index ca98845f04191..8c78742ccb4e3 100644 --- a/libcxx/include/__utility/lazy_synth_three_way_comparator.h +++ b/libcxx/include/__utility/lazy_synth_three_way_comparator.h @@ -10,6 +10,7 @@ #define _LIBCPP___UTILITY_LAZY_SYNTH_THREE_WAY_COMPARATOR_H #include <__config> +#include <__type_traits/conjunction.h> #include <__type_traits/desugars_to.h> #include <__type_traits/enable_if.h> #include <__utility/default_three_way_comparator.h> @@ -72,8 +73,8 @@ template struct __lazy_synth_three_way_comparator<_Comparator, _LHS, _RHS, - __enable_if_t<__desugars_to_v<__less_tag, _Comparator, _LHS, _RHS> && - __has_default_three_way_comparator_v<_LHS, _RHS> > > { + __enable_if_t<_And<__desugars_to<__less_tag, _Comparator, _LHS, _RHS>, + __has_default_three_way_comparator<_LHS, _RHS> >::value> > { // This lifetimebound annotation is technically incorrect, but other specializations actually capture the lifetime of // the comparator. _LIBCPP_HIDE_FROM_ABI __lazy_synth_three_way_comparator(_LIBCPP_CTOR_LIFETIMEBOUND const _Comparator&) {} @@ -85,6 +86,23 @@ struct __lazy_synth_three_way_comparator<_Comparator, } }; +template +struct __lazy_synth_three_way_comparator<_Comparator, + _LHS, + _RHS, + __enable_if_t<_And<__desugars_to<__greater_tag, _Comparator, _LHS, _RHS>, + __has_default_three_way_comparator<_LHS, _RHS> >::value> > { + // This lifetimebound annotation is technically incorrect, but other specializations actually capture the lifetime of + // the comparator. + _LIBCPP_HIDE_FROM_ABI __lazy_synth_three_way_comparator(_LIBCPP_CTOR_LIFETIMEBOUND const _Comparator&) {} + + // Same comment as above. 
+ _LIBCPP_HIDE_FROM_ABI static __eager_compare_result + operator()(_LIBCPP_LIFETIMEBOUND const _LHS& __lhs, _LIBCPP_LIFETIMEBOUND const _RHS& __rhs) { + return __eager_compare_result(-__default_three_way_comparator<_LHS, _RHS>()(__lhs, __rhs)); + } +}; + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___UTILITY_LAZY_SYNTH_THREE_WAY_COMPARATOR_H diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h index 4307e78f6ddbc..27e681aeef22a 100644 --- a/libcxx/include/__vector/vector.h +++ b/libcxx/include/__vector/vector.h @@ -86,6 +86,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template */> class vector { + template + using __split_buffer _LIBCPP_NODEBUG = std::__split_buffer<_Up, _Alloc, __split_buffer_pointer_layout>; + public: // // Types @@ -820,6 +823,24 @@ class vector { __add_alignment_assumption(_Ptr __p) _NOEXCEPT { return __p; } + + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_layouts(__split_buffer<_Tp, allocator_type&>& __sb) { + auto __vector_begin = __begin_; + auto __vector_sentinel = __end_; + auto __vector_cap = __cap_; + + auto __sb_begin = __sb.begin(); + auto __sb_sentinel = __sb.__raw_sentinel(); + auto __sb_cap = __sb.__raw_capacity(); + + // TODO: replace with __set_valid_range and __set_capacity when vector supports it. 
+ __begin_ = __sb_begin; + __end_ = __sb_sentinel; + __cap_ = __sb_cap; + + __sb.__set_valid_range(__vector_begin, __vector_sentinel); + __sb.__set_capacity(__vector_cap); + } }; #if _LIBCPP_STD_VER >= 17 @@ -850,15 +871,14 @@ template _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v) { __annotate_delete(); - auto __new_begin = __v.__begin_ - (__end_ - __begin_); + auto __new_begin = __v.begin() - size(); std::__uninitialized_allocator_relocate( this->__alloc_, std::__to_address(__begin_), std::__to_address(__end_), std::__to_address(__new_begin)); - __v.__begin_ = __new_begin; + __v.__set_valid_range(__new_begin, __v.end()); __end_ = __begin_; // All the objects have been destroyed by relocating them. - std::swap(this->__begin_, __v.__begin_); - std::swap(this->__end_, __v.__end_); - std::swap(this->__cap_, __v.__cap_); - __v.__first_ = __v.__begin_; + + __swap_layouts(__v); + __v.__set_data(__v.begin()); __annotate_new(size()); } @@ -870,25 +890,23 @@ template _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v, pointer __p) { __annotate_delete(); - pointer __ret = __v.__begin_; + pointer __ret = __v.begin(); // Relocate [__p, __end_) first to avoid having a hole in [__begin_, __end_) // in case something in [__begin_, __p) throws. std::__uninitialized_allocator_relocate( - this->__alloc_, std::__to_address(__p), std::__to_address(__end_), std::__to_address(__v.__end_)); - __v.__end_ += (__end_ - __p); + this->__alloc_, std::__to_address(__p), std::__to_address(__end_), std::__to_address(__v.end())); + auto __relocated_so_far = __end_ - __p; + __v.__set_sentinel(__v.end() + __relocated_so_far); __end_ = __p; // The objects in [__p, __end_) have been destroyed by relocating them. 
- auto __new_begin = __v.__begin_ - (__p - __begin_); + auto __new_begin = __v.begin() - (__p - __begin_); std::__uninitialized_allocator_relocate( this->__alloc_, std::__to_address(__begin_), std::__to_address(__p), std::__to_address(__new_begin)); - __v.__begin_ = __new_begin; - __end_ = __begin_; // All the objects have been destroyed by relocating them. - - std::swap(this->__begin_, __v.__begin_); - std::swap(this->__end_, __v.__end_); - std::swap(this->__cap_, __v.__cap_); - __v.__first_ = __v.__begin_; + __v.__set_valid_range(__new_begin, __v.end()); + __end_ = __begin_; // All the objects have been destroyed by relocating them. + __swap_layouts(__v); + __v.__set_data(__v.begin()); __annotate_new(size()); return __ret; } @@ -1136,12 +1154,31 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) { __split_buffer __v(__recommend(size() + 1), size(), this->__alloc_); // __v.emplace_back(std::forward<_Args>(__args)...); - __alloc_traits::construct(this->__alloc_, std::__to_address(__v.__end_), std::forward<_Args>(__args)...); - __v.__end_++; + pointer __end = __v.end(); + __alloc_traits::construct(this->__alloc_, std::__to_address(__end), std::forward<_Args>(__args)...); + __v.__set_sentinel(++__end); __swap_out_circular_buffer(__v); return this->__end_; } +// This makes the compiler inline `__else()` if `__cond` is known to be false. Currently LLVM doesn't do that without +// the `__builtin_constant_p`, since it considers `__else` unlikely even through it's known to be run. 
+// See https://llvm.org/PR154292 +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __if_likely_else(bool __cond, _If __if, _Else __else) { + if (__builtin_constant_p(__cond)) { + if (__cond) + __if(); + else + __else(); + } else { + if (__cond) [[__likely__]] + __if(); + else + __else(); + } +} + template template _LIBCPP_CONSTEXPR_SINCE_CXX20 inline @@ -1152,12 +1189,14 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 inline #endif vector<_Tp, _Allocator>::emplace_back(_Args&&... __args) { pointer __end = this->__end_; - if (__end < this->__cap_) { - __emplace_back_assume_capacity(std::forward<_Args>(__args)...); - ++__end; - } else { - __end = __emplace_back_slow_path(std::forward<_Args>(__args)...); - } + std::__if_likely_else( + __end < this->__cap_, + [&] { + __emplace_back_assume_capacity(std::forward<_Args>(__args)...); + ++__end; + }, + [&] { __end = __emplace_back_slow_path(std::forward<_Args>(__args)...); }); + this->__end_ = __end; #if _LIBCPP_STD_VER >= 17 return *(__end - 1); @@ -1312,14 +1351,14 @@ vector<_Tp, _Allocator>::__insert_with_sentinel(const_iterator __position, _Inpu __split_buffer __merged( __recommend(size() + __v.size()), __off, __alloc_); // has `__off` positions available at the front std::__uninitialized_allocator_relocate( - __alloc_, std::__to_address(__old_last), std::__to_address(this->__end_), std::__to_address(__merged.__end_)); + __alloc_, std::__to_address(__old_last), std::__to_address(this->__end_), std::__to_address(__merged.end())); __guard.__complete(); // Release the guard once objects in [__old_last_, __end_) have been successfully relocated. 
- __merged.__end_ += this->__end_ - __old_last; + __merged.__set_sentinel(__merged.end() + (this->__end_ - __old_last)); this->__end_ = __old_last; std::__uninitialized_allocator_relocate( - __alloc_, std::__to_address(__v.__begin_), std::__to_address(__v.__end_), std::__to_address(__merged.__end_)); - __merged.__end_ += __v.size(); - __v.__end_ = __v.__begin_; + __alloc_, std::__to_address(__v.begin()), std::__to_address(__v.end()), std::__to_address(__merged.end())); + __merged.__set_sentinel(__merged.size() + __v.size()); + __v.__set_sentinel(__v.begin()); __p = __swap_out_circular_buffer(__merged, __p); } return __make_iter(__p); diff --git a/libcxx/include/deque b/libcxx/include/deque index 395a1076fd3c4..98d1dbbddb7e8 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -487,6 +487,9 @@ const _DiffType __deque_iterator<_ValueType, _Pointer, _Reference, _MapPointer, template */> class deque { + template + using __split_buffer _LIBCPP_NODEBUG = std::__split_buffer<_Up, _Alloc, __split_buffer_pointer_layout>; + public: // types: @@ -1238,8 +1241,8 @@ private: clear(); shrink_to_fit(); } - __alloc() = __c.__alloc(); - __map_.__alloc_ = __c.__map_.__alloc_; + __alloc() = __c.__alloc(); + __map_.__get_allocator() = __c.__map_.__get_allocator(); } _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const deque&, false_type) {} @@ -1318,7 +1321,7 @@ deque<_Tp, _Allocator>::deque(const deque& __c) : __map_(__pointer_allocator(__alloc_traits::select_on_container_copy_construction(__c.__alloc()))), __start_(0), __size_(0), - __alloc_(__map_.__alloc_) { + __alloc_(__map_.__get_allocator()) { __annotate_new(0); __append(__c.begin(), __c.end()); } @@ -2071,7 +2074,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity() { // Else need to allocate 1 buffer, *and* we need to reallocate __map_. 
else { __split_buffer __buf( - std::max(2 * __map_.capacity(), 1), 0, __map_.__alloc_); + std::max(2 * __map_.capacity(), 1), 0, __map_.__get_allocator()); typedef __allocator_destructor<_Allocator> _Dp; unique_ptr __hold(__alloc_traits::allocate(__a, __block_size), _Dp(__a, __block_size)); @@ -2080,10 +2083,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity() { for (__map_pointer __i = __map_.begin(); __i != __map_.end(); ++__i) __buf.emplace_back(*__i); - std::swap(__map_.__first_, __buf.__first_); - std::swap(__map_.__begin_, __buf.__begin_); - std::swap(__map_.__end_, __buf.__end_); - std::swap(__map_.__cap_, __buf.__cap_); + __map_.__swap_without_allocator(__buf); __start_ = __map_.size() == 1 ? __block_size / 2 : __start_ + __block_size; } __annotate_whole_block(0, __asan_poison); @@ -2134,7 +2134,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity(size_type __n) { else { size_type __ds = (__nb + __back_capacity) * __block_size - __map_.empty(); __split_buffer __buf( - std::max(2 * __map_.capacity(), __nb + __map_.size()), 0, __map_.__alloc_); + std::max(2 * __map_.capacity(), __nb + __map_.size()), 0, __map_.__get_allocator()); # if _LIBCPP_HAS_EXCEPTIONS try { # endif // _LIBCPP_HAS_EXCEPTIONS @@ -2157,10 +2157,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity(size_type __n) { } for (__map_pointer __i = __map_.begin(); __i != __map_.end(); ++__i) __buf.emplace_back(*__i); - std::swap(__map_.__first_, __buf.__first_); - std::swap(__map_.__begin_, __buf.__begin_); - std::swap(__map_.__end_, __buf.__end_); - std::swap(__map_.__cap_, __buf.__cap_); + __map_.__swap_without_allocator(__buf); __start_ += __ds; } } @@ -2194,7 +2191,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity() { // Else need to allocate 1 buffer, *and* we need to reallocate __map_. 
else { __split_buffer __buf( - std::max(2 * __map_.capacity(), 1), __map_.size(), __map_.__alloc_); + std::max(2 * __map_.capacity(), 1), __map_.size(), __map_.__get_allocator()); typedef __allocator_destructor<_Allocator> _Dp; unique_ptr __hold(__alloc_traits::allocate(__a, __block_size), _Dp(__a, __block_size)); @@ -2203,10 +2200,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity() { for (__map_pointer __i = __map_.end(); __i != __map_.begin();) __buf.emplace_front(*--__i); - std::swap(__map_.__first_, __buf.__first_); - std::swap(__map_.__begin_, __buf.__begin_); - std::swap(__map_.__end_, __buf.__end_); - std::swap(__map_.__cap_, __buf.__cap_); + __map_.__swap_without_allocator(__buf); __annotate_whole_block(__map_.size() - 1, __asan_poison); } } @@ -2259,7 +2253,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity(size_type __n) { __split_buffer __buf( std::max(2 * __map_.capacity(), __nb + __map_.size()), __map_.size() - __front_capacity, - __map_.__alloc_); + __map_.__get_allocator()); # if _LIBCPP_HAS_EXCEPTIONS try { # endif // _LIBCPP_HAS_EXCEPTIONS @@ -2282,10 +2276,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity(size_type __n) { } for (__map_pointer __i = __map_.end(); __i != __map_.begin();) __buf.emplace_front(*--__i); - std::swap(__map_.__first_, __buf.__first_); - std::swap(__map_.__begin_, __buf.__begin_); - std::swap(__map_.__end_, __buf.__end_); - std::swap(__map_.__cap_, __buf.__cap_); + __map_.__swap_without_allocator(__buf); __start_ -= __ds; } } diff --git a/libcxx/include/string b/libcxx/include/string index 0abdfebcb863f..b0fdd6ac0946d 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1312,13 +1312,17 @@ public: # if _LIBCPP_STD_VER >= 23 template _LIBCPP_HIDE_FROM_ABI constexpr void resize_and_overwrite(size_type __n, _Op __op) { - __resize_default_init(__n); - __erase_to_end(std::move(__op)(data(), _LIBCPP_AUTO_CAST(__n))); + size_type __sz = size(); + size_type __cap = capacity(); + if (__n > __cap) + 
__grow_by_without_replace(__cap, __n - __cap, __sz, __sz, 0); + __annotate_delete(); + __set_size(__n); + __annotate_new(__n); + __erase_to_end(std::move(__op)(data(), auto(__n))); } # endif - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __resize_default_init(size_type __n); - # if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRING_RESERVE) _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI void reserve() _NOEXCEPT { shrink_to_fit(); } # endif @@ -1410,8 +1414,6 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(size_type __n, value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __append_default_init(size_type __n); - template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(_InputIterator __first, _InputIterator __last) { @@ -2521,6 +2523,7 @@ _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST(_LIBCPP_DECLARE, wchar_t) # endif # undef _LIBCPP_DECLARE +# if _LIBCPP_STD_VER <= 17 || !__has_builtin(__builtin_lt_synthesises_from_spaceship) template struct __default_three_way_comparator, basic_string<_CharT, _Traits, _Alloc> > { using __string_t _LIBCPP_NODEBUG = basic_string<_CharT, _Traits, _Alloc>; @@ -2533,6 +2536,7 @@ struct __default_three_way_comparator, bas return __ret; } }; +# endif # if _LIBCPP_STD_VER >= 17 template ::append(size_type __n, value_type __c) return *this; } -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -basic_string<_CharT, _Traits, _Allocator>::__append_default_init(size_type __n) { - if (__n == 0) - return; - size_type __cap = capacity(); - size_type __sz = size(); - if (__cap - __sz < __n) - __grow_by_without_replace(__cap, __sz + __n - __cap, __sz, __sz, 0); - __annotate_increase(__n); - pointer __p = __get_pointer(); - __sz += __n; - __set_size(__sz); - traits_type::assign(__p[__sz], value_type()); -} - template 
_LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::push_back(value_type __c) { bool __is_short = !__is_long(); @@ -3433,16 +3421,6 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re __erase_to_end(__n); } -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -basic_string<_CharT, _Traits, _Allocator>::__resize_default_init(size_type __n) { - size_type __sz = size(); - if (__n > __sz) { - __append_default_init(__n - __sz); - } else - __erase_to_end(__n); -} - template _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::reserve(size_type __requested_capacity) { if (__requested_capacity > max_size()) diff --git a/libcxx/src/filesystem/format_string.h b/libcxx/src/filesystem/format_string.h index ad6c57579a0a6..e91475e440480 100644 --- a/libcxx/src/filesystem/format_string.h +++ b/libcxx/src/filesystem/format_string.h @@ -34,20 +34,19 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 0) string vformat_string(const ch va_list apcopy; va_copy(apcopy, ap); - int ret = ::vsnprintf(buf.data(), buf.size(), msg, apcopy); + int size = ::vsnprintf(buf.data(), buf.size(), msg, apcopy); va_end(apcopy); string result; - if (static_cast(ret) < buf.size()) { - result.assign(buf.data(), static_cast(ret)); + if (static_cast(size) < buf.size()) { + result.assign(buf.data(), static_cast(size)); } else { // we did not provide a long enough buffer on our first attempt. The // return value is the number of bytes (excluding the null byte) that are // needed for formatting. 
- size_t size_with_null = static_cast(ret) + 1; - result.__resize_default_init(size_with_null - 1); - ret = ::vsnprintf(&result[0], size_with_null, msg, ap); - _LIBCPP_ASSERT_INTERNAL(static_cast(ret) == (size_with_null - 1), "TODO"); + result.resize_and_overwrite(size, [&](char* res, size_t n) { return ::vsnprintf(res, n, msg, ap); }); + _LIBCPP_ASSERT_INTERNAL(static_cast(size) == result.size(), + "vsnprintf did not result in the same number of characters as the first attempt?"); } return result; } diff --git a/libcxx/test/benchmarks/algorithms/modifying/unique.bench.cpp b/libcxx/test/benchmarks/algorithms/modifying/unique.bench.cpp index c0aee942eef64..e3ac50187ef4b 100644 --- a/libcxx/test/benchmarks/algorithms/modifying/unique.bench.cpp +++ b/libcxx/test/benchmarks/algorithms/modifying/unique.bench.cpp @@ -76,7 +76,7 @@ int main(int argc, char** argv) { } }) ->Arg(32) - ->Arg(50) // non power-of-two + ->Arg(52) // non power-of-two ->Arg(1024) ->Arg(8192); }; @@ -143,7 +143,7 @@ int main(int argc, char** argv) { } }) ->Arg(32) - ->Arg(50) // non power-of-two + ->Arg(52) // non power-of-two ->Arg(1024) ->Arg(8192); }; diff --git a/libcxx/test/benchmarks/algorithms/modifying/unique_copy.bench.cpp b/libcxx/test/benchmarks/algorithms/modifying/unique_copy.bench.cpp index 45b52dd23b695..1885b53e51413 100644 --- a/libcxx/test/benchmarks/algorithms/modifying/unique_copy.bench.cpp +++ b/libcxx/test/benchmarks/algorithms/modifying/unique_copy.bench.cpp @@ -61,6 +61,7 @@ int main(int argc, char** argv) { } }) ->Arg(32) + ->Arg(52) // non power-of-two ->Arg(1024) ->Arg(8192); }; @@ -115,7 +116,7 @@ int main(int argc, char** argv) { } }) ->Arg(32) - ->Arg(50) // non power-of-two + ->Arg(52) // non power-of-two ->Arg(1024) ->Arg(8192); }; diff --git a/libcxx/test/benchmarks/containers/string.bench.cpp b/libcxx/test/benchmarks/containers/string.bench.cpp index aeff6ad6f6333..966775d31a8cf 100644 --- a/libcxx/test/benchmarks/containers/string.bench.cpp +++ 
b/libcxx/test/benchmarks/containers/string.bench.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +#include #include #include #include @@ -67,6 +68,21 @@ static void BM_StringCtorDefault(benchmark::State& state) { } BENCHMARK(BM_StringCtorDefault); +static void BM_StringResizeAndOverwrite(benchmark::State& state) { + std::string str; + + for (auto _ : state) { + benchmark::DoNotOptimize(str); + str.resize_and_overwrite(10, [](char* ptr, size_t n) { + std::fill_n(ptr, n, 'a'); + return n; + }); + benchmark::DoNotOptimize(str); + str.clear(); + } +} +BENCHMARK(BM_StringResizeAndOverwrite); + enum class Length { Empty, Small, Large, Huge }; struct AllLengths : EnumValuesAsTuple { static constexpr const char* Names[] = {"Empty", "Small", "Large", "Huge"}; diff --git a/libcxx/test/benchmarks/filesystem.bench.cpp b/libcxx/test/benchmarks/filesystem.bench.cpp index c058a5d41a150..61d14a453e72f 100644 --- a/libcxx/test/benchmarks/filesystem.bench.cpp +++ b/libcxx/test/benchmarks/filesystem.bench.cpp @@ -30,9 +30,8 @@ void BM_PathConstructString(benchmark::State& st, GenInputs gen) { const path P(PP.native()); benchmark::DoNotOptimize(P.native().data()); } - st.SetComplexityN(st.range(0)); } -BENCHMARK_CAPTURE(BM_PathConstructString, large_string, getRandomStringInputs)->Range(8, TestNumInputs)->Complexity(); +BENCHMARK_CAPTURE(BM_PathConstructString, large_string, getRandomStringInputs)->Range(8, TestNumInputs); template void BM_PathConstructCStr(benchmark::State& st, GenInputs gen) { @@ -66,7 +65,6 @@ void BM_PathConstructIter(benchmark::State& st, GenInputs gen) { const path P(Start, End); benchmark::DoNotOptimize(P.native().data()); } - st.SetComplexityN(st.range(0)); } template void BM_PathConstructInputIter(benchmark::State& st, GenInputs gen) { @@ -77,11 +75,9 @@ void BM_PathConstructForwardIter(benchmark::State& st, 
GenInputs gen) { BM_PathConstructIter(st, gen); } BENCHMARK_CAPTURE(BM_PathConstructInputIter, large_string, getRandomStringInputs) - ->Range(8, TestNumInputs) - ->Complexity(); + ->Range(8, TestNumInputs); BENCHMARK_CAPTURE(BM_PathConstructForwardIter, large_string, getRandomStringInputs) - ->Range(8, TestNumInputs) - ->Complexity(); + ->Range(8, TestNumInputs); template void BM_PathIterateMultipleTimes(benchmark::State& st, GenInputs gen) { @@ -97,11 +93,9 @@ void BM_PathIterateMultipleTimes(benchmark::State& st, GenInputs gen) { } benchmark::ClobberMemory(); } - st.SetComplexityN(st.range(0)); } BENCHMARK_CAPTURE(BM_PathIterateMultipleTimes, iterate_elements, getRandomStringInputs) - ->Range(8, TestNumInputs) - ->Complexity(); + ->Range(8, TestNumInputs); template void BM_PathIterateOnce(benchmark::State& st, GenInputs gen) { @@ -118,9 +112,8 @@ void BM_PathIterateOnce(benchmark::State& st, GenInputs gen) { } benchmark::ClobberMemory(); } - st.SetComplexityN(st.range(0)); } -BENCHMARK_CAPTURE(BM_PathIterateOnce, iterate_elements, getRandomStringInputs)->Range(8, TestNumInputs)->Complexity(); +BENCHMARK_CAPTURE(BM_PathIterateOnce, iterate_elements, getRandomStringInputs)->Range(8, TestNumInputs); template void BM_PathIterateOnceBackwards(benchmark::State& st, GenInputs gen) { @@ -160,16 +153,13 @@ void BM_LexicallyNormal(benchmark::State& st, GenInput gen, size_t PathLen) { while (st.KeepRunning()) { benchmark::DoNotOptimize(In.lexically_normal()); } - st.SetComplexityN(st.range(0)); } BENCHMARK_CAPTURE(BM_LexicallyNormal, small_path, getRandomPaths, /*PathLen*/ 5) ->RangeMultiplier(2) - ->Range(2, 256) - ->Complexity(); + ->Range(2, 256); BENCHMARK_CAPTURE(BM_LexicallyNormal, large_path, getRandomPaths, /*PathLen*/ 32) ->RangeMultiplier(2) - ->Range(2, 256) - ->Complexity(); + ->Range(2, 256); template void BM_LexicallyRelative(benchmark::State& st, GenInput gen, size_t PathLen) { @@ -180,15 +170,12 @@ void BM_LexicallyRelative(benchmark::State& st, GenInput 
gen, size_t PathLen) { for (auto _ : st) { benchmark::DoNotOptimize(TargetPath.lexically_relative(BasePath)); } - st.SetComplexityN(st.range(0)); } BENCHMARK_CAPTURE(BM_LexicallyRelative, small_path, getRandomPaths, /*PathLen*/ 5) ->RangeMultiplier(2) - ->Range(2, 256) - ->Complexity(); + ->Range(2, 256); BENCHMARK_CAPTURE(BM_LexicallyRelative, large_path, getRandomPaths, /*PathLen*/ 32) ->RangeMultiplier(2) - ->Range(2, 256) - ->Complexity(); + ->Range(2, 256); BENCHMARK_MAIN(); diff --git a/libcxx/test/benchmarks/stringstream.bench.cpp b/libcxx/test/benchmarks/stringstream.bench.cpp index b7c50a96ef51e..367024c49ac69 100644 --- a/libcxx/test/benchmarks/stringstream.bench.cpp +++ b/libcxx/test/benchmarks/stringstream.bench.cpp @@ -80,7 +80,7 @@ static void BM_Istream_numbers(benchmark::State& state) { while (state.KeepRunning()) benchmark::DoNotOptimize(i += istream_numbers(sel.imbue)); } -BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu(); +BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3); static void BM_Ostream_number(benchmark::State& state) { LocaleSelector sel(state); @@ -92,6 +92,6 @@ static void BM_Ostream_number(benchmark::State& state) { benchmark::DoNotOptimize(ss.str().c_str()); } } -BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu(); +BENCHMARK(BM_Ostream_number)->DenseRange(0, 3); BENCHMARK_MAIN(); diff --git a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in index 9bff5021494ef..b2669a713e2c0 100644 --- a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in +++ b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in @@ -13,7 +13,7 @@ config.substitutions.append(('%{compile_flags}', ' -Wno-atomic-alignment' )) config.substitutions.append(('%{link_flags}', - '-nostdlib -nostdlib++ -L %{lib-dir} -lc++ -lc++abi' + '-fuse-ld=lld -nostdlib -nostdlib++ -L %{lib-dir} -lc++ -lc++abi' ' -lc -lm -lclang_rt.builtins -lsemihost 
-lcrt0-semihost' + ' -T {}'.format(libc_linker_script) + ' -Wl,--defsym=__flash=0x0' diff --git a/libcxx/test/libcxx-03/strings/basic.string/string.modifiers/resize_default_initialized.pass.cpp b/libcxx/test/libcxx-03/strings/basic.string/string.modifiers/resize_default_initialized.pass.cpp deleted file mode 100644 index 8e6e07d659c1a..0000000000000 --- a/libcxx/test/libcxx-03/strings/basic.string/string.modifiers/resize_default_initialized.pass.cpp +++ /dev/null @@ -1,75 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// __resize_default_init(size_type) - -#include -#include - -#include "test_macros.h" - -TEST_CONSTEXPR_CXX20 void write_c_str(char* buf, int size) { - for (int i = 0; i < size; ++i) { - buf[i] = 'a'; - } - buf[size] = '\0'; -} - -template -TEST_CONSTEXPR_CXX20 void test_buffer_usage() { - { - unsigned buff_size = 125; - unsigned used_size = buff_size - 16; - S s; - s.__resize_default_init(buff_size); - write_c_str(&s[0], used_size); - assert(s.size() == buff_size); - assert(std::char_traits().length(s.data()) == used_size); - s.__resize_default_init(used_size); - assert(s.size() == used_size); - assert(s.data()[used_size] == '\0'); - for (unsigned i = 0; i < used_size; ++i) { - assert(s[i] == 'a'); - } - } -} - -template -TEST_CONSTEXPR_CXX20 void test_basic() { - { - S s; - s.__resize_default_init(3); - assert(s.size() == 3); - assert(s.data()[3] == '\0'); - for (int i = 0; i < 3; ++i) - s[i] = 'a' + i; - s.__resize_default_init(1); - assert(s[0] == 'a'); - assert(s.data()[1] == '\0'); - assert(s.size() == 1); - } -} - -template -TEST_CONSTEXPR_CXX20 bool test() { - test_basic(); - test_buffer_usage(); - - 
return true; -} - -int main(int, char**) { - test(); -#if TEST_STD_VER > 17 - static_assert(test()); -#endif - - return 0; -} diff --git a/libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp b/libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp index 1acbed55d2b51..a25601698e4ef 100644 --- a/libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp +++ b/libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: availability-filesystem-missing // UNSUPPORTED: no-filesystem // ADDITIONAL_COMPILE_FLAGS: -I %{libcxx-dir}/src diff --git a/libcxx/test/libcxx/input.output/filesystems/convert_file_time.pass.cpp b/libcxx/test/libcxx/input.output/filesystems/convert_file_time.pass.cpp index c501969c31167..6e5c5aa52674c 100644 --- a/libcxx/test/libcxx/input.output/filesystems/convert_file_time.pass.cpp +++ b/libcxx/test/libcxx/input.output/filesystems/convert_file_time.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: availability-filesystem-missing // diff --git a/libcxx/test/libcxx/strings/basic.string/string.modifiers/resize_default_initialized.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.modifiers/resize_default_initialized.pass.cpp deleted file mode 100644 index 8e6e07d659c1a..0000000000000 --- a/libcxx/test/libcxx/strings/basic.string/string.modifiers/resize_default_initialized.pass.cpp +++ /dev/null @@ -1,75 +0,0 @@ 
-//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// __resize_default_init(size_type) - -#include -#include - -#include "test_macros.h" - -TEST_CONSTEXPR_CXX20 void write_c_str(char* buf, int size) { - for (int i = 0; i < size; ++i) { - buf[i] = 'a'; - } - buf[size] = '\0'; -} - -template -TEST_CONSTEXPR_CXX20 void test_buffer_usage() { - { - unsigned buff_size = 125; - unsigned used_size = buff_size - 16; - S s; - s.__resize_default_init(buff_size); - write_c_str(&s[0], used_size); - assert(s.size() == buff_size); - assert(std::char_traits().length(s.data()) == used_size); - s.__resize_default_init(used_size); - assert(s.size() == used_size); - assert(s.data()[used_size] == '\0'); - for (unsigned i = 0; i < used_size; ++i) { - assert(s[i] == 'a'); - } - } -} - -template -TEST_CONSTEXPR_CXX20 void test_basic() { - { - S s; - s.__resize_default_init(3); - assert(s.size() == 3); - assert(s.data()[3] == '\0'); - for (int i = 0; i < 3; ++i) - s[i] = 'a' + i; - s.__resize_default_init(1); - assert(s[0] == 'a'); - assert(s.data()[1] == '\0'); - assert(s.size() == 1); - } -} - -template -TEST_CONSTEXPR_CXX20 bool test() { - test_basic(); - test_buffer_usage(); - - return true; -} - -int main(int, char**) { - test(); -#if TEST_STD_VER > 17 - static_assert(test()); -#endif - - return 0; -} diff --git a/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp index 546240a6c3286..c04e9443c8e67 100644 --- a/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp +++ b/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp @@ -133,16 +133,58 @@ 
static_assert(!std::__is_replaceable::value, ""); // ---------------------- // __split_buffer -static_assert(std::__is_replaceable >::value, ""); -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable > >::value, - ""); -static_assert(!std::__is_replaceable > >::value, - ""); -static_assert(std::__is_replaceable > >::value, +static_assert( + std::__is_replaceable, std::__split_buffer_pointer_layout> >::value, + ""); +static_assert(std::__is_replaceable, + std::__split_buffer_pointer_layout> >::value, ""); -static_assert(std::__is_replaceable > >::value, +static_assert( + !std::__is_replaceable< + std::__split_buffer, std::__split_buffer_pointer_layout > >:: + value, + ""); +static_assert( + !std::__is_replaceable< + std::__split_buffer, std::__split_buffer_pointer_layout > >:: + value, + ""); +static_assert( + std::__is_replaceable< + std::__split_buffer, std::__split_buffer_pointer_layout > >:: + value, + ""); +static_assert( + std::__is_replaceable< + std::__split_buffer, std::__split_buffer_pointer_layout > >:: + value, + ""); + +static_assert( + std::__is_replaceable, std::__split_buffer_size_layout> >::value, ""); +static_assert(std::__is_replaceable, + std::__split_buffer_size_layout> >::value, ""); +static_assert( + !std::__is_replaceable< + std::__split_buffer, std::__split_buffer_size_layout > >::value, + ""); +static_assert( + !std::__is_replaceable< + std::__split_buffer, std::__split_buffer_size_layout > >::value, + ""); +static_assert( + std::__is_replaceable< + std::__split_buffer, std::__split_buffer_size_layout > >:: + value, + ""); +static_assert( + std::__is_replaceable< + std::__split_buffer, std::__split_buffer_size_layout > >:: + value, + ""); // standard library types // ---------------------- diff --git a/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp index c462672616f77..10889eb50870d 100644 --- 
a/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp +++ b/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp @@ -68,9 +68,27 @@ static_assert(!std::__libcpp_is_trivially_relocatable::val // ---------------------- // __split_buffer -static_assert(std::__libcpp_is_trivially_relocatable >::value, ""); -static_assert(std::__libcpp_is_trivially_relocatable >::value, ""); -static_assert(!std::__libcpp_is_trivially_relocatable > >::value, ""); +static_assert(std::__libcpp_is_trivially_relocatable< + std::__split_buffer, std::__split_buffer_pointer_layout> >::value, + ""); +static_assert(std::__libcpp_is_trivially_relocatable, + std::__split_buffer_pointer_layout> >::value, + ""); +static_assert(!std::__libcpp_is_trivially_relocatable< + std::__split_buffer, std::__split_buffer_pointer_layout > >::value, + ""); + +static_assert(std::__libcpp_is_trivially_relocatable< + std::__split_buffer, std::__split_buffer_size_layout> >::value, + ""); +static_assert(std::__libcpp_is_trivially_relocatable, + std::__split_buffer_size_layout> >::value, + ""); +static_assert(!std::__libcpp_is_trivially_relocatable< + std::__split_buffer, std::__split_buffer_size_layout > >::value, + ""); // standard library types // ---------------------- diff --git a/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp b/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp new file mode 100644 index 0000000000000..42b4855a9fddd --- /dev/null +++ b/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include <__utility/default_three_way_comparator.h> +#include +#include + +static_assert(std::__has_default_three_way_comparator::value); +static_assert(std::__has_default_three_way_comparator::value); +static_assert(std::__has_default_three_way_comparator::value); +static_assert(std::__has_default_three_way_comparator::value); +static_assert(std::__has_default_three_way_comparator::value); + +#if __has_builtin(__builtin_lt_synthesises_from_spaceship) +static_assert(std::__has_default_three_way_comparator::value); +static_assert(std::__has_default_three_way_comparator::value); +static_assert(std::__has_default_three_way_comparator::value); +static_assert(std::__has_default_three_way_comparator::value); +static_assert(std::__has_default_three_way_comparator::value); +static_assert(!std::__has_default_three_way_comparator::value); + +static_assert(std::__has_default_three_way_comparator&, const std::vector&>::value); + +struct MyStruct { + int i; + + friend auto operator<=>(MyStruct, MyStruct) = default; +}; + +static_assert(std::__has_default_three_way_comparator::value); +#endif diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/ranges_rotate.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/ranges_rotate.pass.cpp index 5f594400e8321..574e96dea46a0 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/ranges_rotate.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/ranges_rotate.pass.cpp @@ -173,6 +173,7 @@ constexpr bool test() { auto end = adl::Iterator::TrackSwaps(in.data() + in.size(), swaps); for (std::size_t mid = 0; mid != input.size(); ++mid) { + swaps = 0; std::ranges::rotate(begin, begin + mid, end); assert(swaps <= expected); } @@ -186,6 +187,7 @@ constexpr bool 
test() { auto range = std::ranges::subrange(begin, end); for (std::size_t mid = 0; mid != input.size(); ++mid) { + swaps = 0; std::ranges::rotate(range, begin + mid); assert(swaps <= expected); } diff --git a/libcxx/test/std/containers/associative/map/map.ops/find.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/find.pass.cpp index 534d78128407d..63dbcda512803 100644 --- a/libcxx/test/std/containers/associative/map/map.ops/find.pass.cpp +++ b/libcxx/test/std/containers/associative/map/map.ops/find.pass.cpp @@ -72,6 +72,22 @@ int main(int, char**) { assert(r == std::next(m.begin(), 8)); } } + { // Check with std::greater to ensure we're actually using the correct comparator + using Pair = std::pair; + using Map = std::map >; + Pair ar[] = {Pair(5, 5), Pair(6, 6), Pair(7, 7), Pair(8, 8), Pair(9, 9), Pair(10, 10), Pair(11, 11), Pair(12, 12)}; + Map m(ar, ar + sizeof(ar) / sizeof(ar[0])); + assert(m.find(12) == std::next(m.begin(), 0)); + assert(m.find(11) == std::next(m.begin(), 1)); + assert(m.find(10) == std::next(m.begin(), 2)); + assert(m.find(9) == std::next(m.begin(), 3)); + assert(m.find(8) == std::next(m.begin(), 4)); + assert(m.find(7) == std::next(m.begin(), 5)); + assert(m.find(6) == std::next(m.begin(), 6)); + assert(m.find(5) == std::next(m.begin(), 7)); + assert(m.find(4) == std::next(m.begin(), 8)); + assert(std::next(m.begin(), 8) == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::pair V; diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/find.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/find.pass.cpp index 15df6c15bfa78..7939e77da308d 100644 --- a/libcxx/test/std/containers/associative/multimap/multimap.ops/find.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/find.pass.cpp @@ -69,6 +69,19 @@ int main(int, char**) { assert(r == m.end()); } } + { + using Pair = std::pair; + using Map = std::multimap >; + Pair arr[] = { + Pair(5, 1), Pair(5, 2), Pair(5, 3), 
Pair(7, 1), Pair(7, 2), Pair(7, 3), Pair(9, 1), Pair(9, 2), Pair(9, 3)}; + const Map m(arr, arr + sizeof(arr) / sizeof(arr[0])); + assert(iter_in_range(std::next(m.begin(), 6), std::next(m.begin(), 9), m.find(5))); + assert(m.find(6) == m.end()); + assert(iter_in_range(std::next(m.begin(), 3), std::next(m.begin(), 6), m.find(7))); + assert(m.find(8) == m.end()); + assert(iter_in_range(std::next(m.begin(), 0), std::next(m.begin(), 3), m.find(9))); + assert(m.find(10) == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::multimap, min_allocator>> M; diff --git a/libcxx/test/std/containers/associative/multiset/find.pass.cpp b/libcxx/test/std/containers/associative/multiset/find.pass.cpp index 62e6b9dae431d..866de0da5ea93 100644 --- a/libcxx/test/std/containers/associative/multiset/find.pass.cpp +++ b/libcxx/test/std/containers/associative/multiset/find.pass.cpp @@ -71,6 +71,21 @@ int main(int, char**) { assert(r == std::next(m.begin(), 8)); } } + { // Check with std::greater to ensure we're actually using the correct comparator + using Set = std::multiset >; + int ar[] = {5, 6, 7, 8, 9, 10, 11, 12}; + Set m(ar, ar + sizeof(ar) / sizeof(ar[0])); + assert(m.find(12) == std::next(m.begin(), 0)); + assert(m.find(11) == std::next(m.begin(), 1)); + assert(m.find(10) == std::next(m.begin(), 2)); + assert(m.find(9) == std::next(m.begin(), 3)); + assert(m.find(8) == std::next(m.begin(), 4)); + assert(m.find(7) == std::next(m.begin(), 5)); + assert(m.find(6) == std::next(m.begin(), 6)); + assert(m.find(5) == std::next(m.begin(), 7)); + assert(m.find(4) == std::next(m.begin(), 8)); + assert(std::next(m.begin(), 8) == m.end()); + } #if TEST_STD_VER >= 11 { typedef int V; diff --git a/libcxx/test/std/containers/associative/set/find.pass.cpp b/libcxx/test/std/containers/associative/set/find.pass.cpp index 88ceff0cb144f..deb193c17bfa9 100644 --- a/libcxx/test/std/containers/associative/set/find.pass.cpp +++ b/libcxx/test/std/containers/associative/set/find.pass.cpp @@ -71,6 +71,21 
@@ int main(int, char**) { assert(r == std::next(m.begin(), 8)); } } + { // Check with std::greater to ensure we're actually using the correct comparator + using Set = std::set >; + int ar[] = {5, 6, 7, 8, 9, 10, 11, 12}; + Set m(ar, ar + sizeof(ar) / sizeof(ar[0])); + assert(m.find(12) == std::next(m.begin(), 0)); + assert(m.find(11) == std::next(m.begin(), 1)); + assert(m.find(10) == std::next(m.begin(), 2)); + assert(m.find(9) == std::next(m.begin(), 3)); + assert(m.find(8) == std::next(m.begin(), 4)); + assert(m.find(7) == std::next(m.begin(), 5)); + assert(m.find(6) == std::next(m.begin(), 6)); + assert(m.find(5) == std::next(m.begin(), 7)); + assert(m.find(4) == std::next(m.begin(), 8)); + assert(std::next(m.begin(), 8) == m.end()); + } #if TEST_STD_VER >= 11 { typedef int V; diff --git a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.modifiers/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.modifiers/erase_range.pass.cpp index a8a9f5fdbb428..f8a2bdd3fee73 100644 --- a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.modifiers/erase_range.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.modifiers/erase_range.pass.cpp @@ -91,20 +91,6 @@ int main(int, char**) { assert(c.size() == 0); assert(k == c.end()); } - { // Make sure we're properly destroying the elements when erasing - { // When erasing part of a bucket - std::unordered_multimap map; - map.insert(std::make_pair(1, "This is a long string to make sure ASan can detect a memory leak")); - map.insert(std::make_pair(1, "This is another long string to make sure ASan can detect a memory leak")); - map.erase(++map.begin(), map.end()); - } - { // When erasing the whole bucket - std::unordered_multimap map; - map.insert(std::make_pair(1, "This is a long string to make sure ASan can detect a memory leak")); - map.insert(std::make_pair(1, "This is another long string to make sure ASan can detect a memory leak")); - 
map.erase(map.begin(), map.end()); - } - } #if TEST_STD_VER >= 11 { typedef std::unordered_multimap #include "test_macros.h" diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/move.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/move.pass.cpp index 153487898e157..95a04bdfccdbc 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/move.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/move.pass.cpp @@ -13,8 +13,6 @@ // basic_fstream(basic_fstream&& rhs); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/pointer.pass.cpp index ca226242773ad..2e0ebcd684d79 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/pointer.pass.cpp @@ -18,8 +18,6 @@ // XFAIL: LIBCXX-AIX-FIXME -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/string.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/string.pass.cpp index 28cefc77d6a90..ca0921a00b9b6 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/string.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/string.pass.cpp @@ -13,8 +13,6 @@ // explicit basic_fstream(const string& s, ios_base::openmode mode = ios_base::in|ios_base::out); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/default.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/default.pass.cpp index 256380d2c164a..70d1efca20c65 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/default.pass.cpp +++ 
b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/default.pass.cpp @@ -13,8 +13,6 @@ // basic_ifstream(); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include "test_macros.h" diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/move.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/move.pass.cpp index c8be388f40698..81ec800954cc2 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/move.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/move.pass.cpp @@ -8,8 +8,6 @@ // FILE_DEPENDENCIES: test.dat -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/pointer.pass.cpp index 711ab2a74b516..6bbe6f1ff7754 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/pointer.pass.cpp @@ -8,8 +8,6 @@ // FILE_DEPENDENCIES: test.dat -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/string.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/string.pass.cpp index d4bbb3c0cabfc..e1a9b53da1348 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/string.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/string.pass.cpp @@ -15,8 +15,6 @@ // explicit basic_ifstream(const string& s, ios_base::openmode mode = ios_base::in); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/default.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/default.pass.cpp index 4cda1db438342..a7b0918f79365 100644 --- 
a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/default.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/default.pass.cpp @@ -13,8 +13,6 @@ // basic_ofstream(); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include "test_macros.h" diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/move.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/move.pass.cpp index 501a4c90ca3fe..ec02fa2621c19 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/move.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/move.pass.cpp @@ -13,8 +13,6 @@ // basic_ofstream(basic_ofstream&& rhs); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/pointer.pass.cpp index 1b5a55df73717..fbb03f1e85841 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/pointer.pass.cpp @@ -18,8 +18,6 @@ // XFAIL: LIBCXX-AIX-FIXME -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/string.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/string.pass.cpp index bb18c88bd326e..33a7e9b2b6f50 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/string.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/string.pass.cpp @@ -13,8 +13,6 @@ // explicit basic_ofstream(const string& s, ios_base::openmode mode = ios_base::out); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include #include diff --git a/libcxx/test/std/input.output/iostreams.base/ios/basic.ios.members/copyfmt.pass.cpp 
b/libcxx/test/std/input.output/iostreams.base/ios/basic.ios.members/copyfmt.pass.cpp index 768922192038b..d78f7df8f6b5e 100644 --- a/libcxx/test/std/input.output/iostreams.base/ios/basic.ios.members/copyfmt.pass.cpp +++ b/libcxx/test/std/input.output/iostreams.base/ios/basic.ios.members/copyfmt.pass.cpp @@ -15,8 +15,6 @@ // basic_ios& copyfmt(const basic_ios& rhs); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include #include diff --git a/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/default.pass.cpp b/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/default.pass.cpp index 8cd23d45598b8..8c73df42ae4be 100644 --- a/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/default.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/default.pass.cpp @@ -15,8 +15,6 @@ // basic_istringstream() : basic_istringstream(ios_base::in) {} // C++20 // explicit basic_istringstream(ios_base::openmode which); // C++20 -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/move.pass.cpp b/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/move.pass.cpp index 1af3304d08971..00ac7cc6414e9 100644 --- a/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/move.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/move.pass.cpp @@ -13,8 +13,6 @@ // basic_istringstream(basic_istringstream&& rhs); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/string.pass.cpp b/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/string.pass.cpp index 7755dd926c2f6..4a5965e7e96e9 100644 --- a/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/string.pass.cpp 
+++ b/libcxx/test/std/input.output/string.streams/istringstream/istringstream.cons/string.pass.cpp @@ -14,8 +14,6 @@ // explicit basic_istringstream(const basic_string& str, // ios_base::openmode which = ios_base::in); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/default.pass.cpp b/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/default.pass.cpp index eb248a7801a3c..a6b98a4e36293 100644 --- a/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/default.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/default.pass.cpp @@ -15,8 +15,6 @@ // basic_ostringstream() : basic_ostringstream(ios_base::out) {} // C++20 // explicit basic_ostringstream(ios_base::openmode which); // C++20 -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/move.pass.cpp b/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/move.pass.cpp index 62ece7b63b31c..596a3e7d53584 100644 --- a/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/move.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/move.pass.cpp @@ -13,8 +13,6 @@ // basic_ostringstream(basic_ostringstream&& rhs); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/string.pass.cpp b/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/string.pass.cpp index bbec8f79a1862..9e9405ad49217 100644 --- a/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/string.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/ostringstream/ostringstream.cons/string.pass.cpp @@ -14,8 +14,6 @@ // explicit basic_ostringstream(const 
basic_string& str, // ios_base::openmode which = ios_base::in); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/default.pass.cpp b/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/default.pass.cpp index 0e535814ae54e..4f9e7e026c50f 100644 --- a/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/default.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/default.pass.cpp @@ -15,8 +15,6 @@ // basic_stringstream() : basic_stringstream(ios_base::out | ios_base::in) {} // C++20 // explicit basic_stringstream(ios_base::openmode which); // C++20 -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/move.pass.cpp b/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/move.pass.cpp index e905f5f7c686a..0702d9a278d3c 100644 --- a/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/move.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/move.pass.cpp @@ -13,8 +13,6 @@ // basic_stringstream(basic_stringstream&& rhs); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git a/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/string.pass.cpp b/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/string.pass.cpp index d4fe18afbd28f..08880878361da 100644 --- a/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/string.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/stringstream/stringstream.cons/string.pass.cpp @@ -14,8 +14,6 @@ // explicit basic_stringstream(const basic_string& str, // ios_base::openmode which = ios_base::out|ios_base::in); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include diff --git 
a/libcxx/test/support/MinSequenceContainer.h b/libcxx/test/support/MinSequenceContainer.h index 9af5847f08871..f9e67cd726eb7 100644 --- a/libcxx/test/support/MinSequenceContainer.h +++ b/libcxx/test/support/MinSequenceContainer.h @@ -29,6 +29,13 @@ struct MinSequenceContainer { template explicit TEST_CONSTEXPR_CXX20 MinSequenceContainer(It first, It last) : data_(first, last) {} TEST_CONSTEXPR_CXX20 MinSequenceContainer(std::initializer_list il) : data_(il) {} +#if TEST_STD_VER >= 23 + template + constexpr MinSequenceContainer(std::from_range_t, Range&& rg) : data_(std::from_range, std::forward(rg)) {} +#endif + TEST_CONSTEXPR_CXX20 MinSequenceContainer(size_type n, T value) : data_(n, value) {} + + TEST_CONSTEXPR_CXX20 MinSequenceContainer& operator=(std::initializer_list il) { data_ = il; } template TEST_CONSTEXPR_CXX20 void assign(It first, It last) { @@ -36,6 +43,12 @@ struct MinSequenceContainer { } TEST_CONSTEXPR_CXX20 void assign(std::initializer_list il) { data_.assign(il); } TEST_CONSTEXPR_CXX20 void assign(size_type n, value_type t) { data_.assign(n, t); } +#if TEST_STD_VER >= 23 + template + constexpr void assign_range(Range&& rg) { + data_.assign_range(std::forward(rg)); + } +#endif TEST_CONSTEXPR_CXX20 iterator begin() { return iterator(data_.data()); } TEST_CONSTEXPR_CXX20 const_iterator begin() const { return const_iterator(data_.data()); } TEST_CONSTEXPR_CXX20 const_iterator cbegin() const { return const_iterator(data_.data()); } @@ -55,10 +68,20 @@ struct MinSequenceContainer { return from_vector_iterator(data_.insert(to_vector_iterator(p), std::move(value))); } + TEST_CONSTEXPR_CXX20 iterator insert(const_iterator p, size_type n, T value) { + return from_vector_iterator(data_.insert(to_vector_iterator(p), n, value)); + } + + TEST_CONSTEXPR_CXX20 iterator insert(const_iterator p, std::initializer_list il) { + return from_vector_iterator(data_.insert(to_vector_iterator(p), il)); + } + +#if TEST_STD_VER >= 23 template - TEST_CONSTEXPR_CXX20 iterator 
insert_range(const_iterator p, Range&& rg) { + constexpr iterator insert_range(const_iterator p, Range&& rg) { return from_vector_iterator(data_.insert_range(to_vector_iterator(p), std::forward(rg))); } +#endif TEST_CONSTEXPR_CXX20 iterator erase(const_iterator first, const_iterator last) { return from_vector_iterator(data_.erase(to_vector_iterator(first), to_vector_iterator(last))); diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index 79e11569c0d08..8e1c341c10b92 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -76,6 +76,9 @@ RUN sudo apt-get update \ && sudo apt-get install -y \ tzdata +# Install various tools used by the build or the test suite +# TODO add ninja-build once 1.11 is available in Ubuntu, also remove the manual +# installation below. RUN sudo apt-get update \ && sudo apt-get install -y \ bash \ @@ -108,9 +111,6 @@ RUN sudo apt-get update \ xz-utils \ && sudo rm -rf /var/lib/apt/lists/* -# Install various tools used by the build or the test suite -#RUN apt-get update && apt-get install -y ninja-build python3 python3-distutils python3-psutil git gdb ccache -# TODO add ninja-build once 1.11 is available in Ubuntu, also remove the manual installation. RUN < "${picolibc_build_dir}/meson-cross-build.txt" c = ['${CC:-cc}', '--target=${target}', '-mfloat-abi=soft', '-nostdlib'] ar = 'llvm-ar' as = 'llvm-as' -ld = 'lld' +c_ld = 'lld' strip = 'llvm-strip' [host_machine] system = 'none' @@ -95,7 +95,7 @@ EOF venv_dir="${build_dir}/meson-venv" python3 -m venv "${venv_dir}" # Install the version of meson that was the latest at the time this script was written. 
-"${venv_dir}/bin/pip" install "meson==1.1.1" +"${venv_dir}/bin/pip" install "meson==1.9.0" "${venv_dir}/bin/meson" setup \ -Dincludedir=include -Dlibdir=lib -Dspecsdir=none -Dmultilib=false -Dpicoexit=false \ diff --git a/libcxx/utils/parse-google-benchmark-results b/libcxx/utils/parse-google-benchmark-results index 280c8045db6c9..f0bace81a0054 100755 --- a/libcxx/utils/parse-google-benchmark-results +++ b/libcxx/utils/parse-google-benchmark-results @@ -26,6 +26,8 @@ def main(argv): for file in args.filename: js = json.load(file) for bm in js['benchmarks']: + if args.timing not in bm: + raise RuntimeError(f'Benchmark does not contain key for {args.timing}: {bm}') row = [bm['name'], bm[args.timing]] rows.append(row) @@ -39,7 +41,10 @@ def main(argv): benchmark = headers.index('Benchmark') time = headers.index(args.timing) for row in rows: - print(f'{row[benchmark].replace(".", "_")}.execution_time {row[time]}') + # LNT format uses '.' to separate the benchmark name from the metric, and ' ' + # to separate the benchmark name + metric from the numerical value. Escape both. 
+ escaped = row[benchmark].replace(".", "_").replace(" ", "_") + print(f'{escaped}.execution_time {row[time]}') if __name__ == '__main__': main(sys.argv[1:]) diff --git a/libcxx/utils/test-at-commit b/libcxx/utils/test-at-commit index 1ef1ec0c52815..5b3fcede48ab2 100755 --- a/libcxx/utils/test-at-commit +++ b/libcxx/utils/test-at-commit @@ -70,7 +70,9 @@ def main(argv): with tempfile.TemporaryDirectory() as install_dir: # Build the library at the baseline - build_cmd = [os.path.join(PARENT_DIR, 'build-at-commit'), '--install-dir', install_dir, '--commit', args.commit] + build_cmd = [os.path.join(PARENT_DIR, 'build-at-commit'), '--git-repo', args.git_repo, + '--install-dir', install_dir, + '--commit', args.commit] build_cmd += ['--', '-DCMAKE_BUILD_TYPE=RelWithDebInfo'] subprocess.check_call(build_cmd) diff --git a/libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in b/libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in index b4744f935ad85..0594ba4ce89b7 100644 --- a/libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in +++ b/libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in @@ -8,7 +8,7 @@ config.substitutions.append(('%{compile_flags}', '-nostdinc++ -I %{include} -I %{cxx-include} -I %{cxx-target-include} %{maybe-include-libunwind} -I %{libcxx}/test/support -I %{libcxx}/src -D_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS' )) config.substitutions.append(('%{link_flags}', - '-nostdlib -nostdlib++ -L %{lib} -lc++ -lc++abi' + '-fuse-ld=lld -nostdlib -nostdlib++ -L %{lib} -lc++ -lc++abi' ' -lc -lm -lclang_rt.builtins -lsemihost -lcrt0-semihost' + ' -T {}'.format(libc_linker_script) + ' -Wl,--defsym=__flash=0x0' diff --git a/libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in b/libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in index e8f68a51fc53f..fc54900e1e0a1 100644 --- a/libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in +++ b/libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in @@ -8,7 +8,7 @@ 
config.substitutions.append(('%{compile_flags}', '-nostdinc++ -I %{include}' )) config.substitutions.append(('%{link_flags}', - '-nostdlib -nostdlib++ -L %{lib} -lunwind' + '-fuse-ld=lld -nostdlib -nostdlib++ -L %{lib} -lunwind' ' -lc -lm -lclang_rt.builtins -lsemihost -lcrt0-semihost' + ' -T {}'.format(libc_linker_script) + ' -Wl,--defsym=__flash=0x0' diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 85ddf1b6da283..7f2bfefa5578a 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -756,7 +756,7 @@ static void relaxCall(Ctx &ctx, const InputSection &sec, size_t i, uint64_t loc, // When the caller specifies the old value of `remove`, disallow its // increment. - if (remove >= 6 && rvc && isInt<12>(displace) && rd == 0) { + if (remove >= 6 && rvc && isInt<12>(displace) && rd == X_X0) { sec.relaxAux->relocTypes[i] = R_RISCV_RVC_JUMP; sec.relaxAux->writes.push_back(0xa001); // c.j remove = 6; diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp index ae0bee8e942cb..7b31378c3781e 100644 --- a/lld/MachO/ICF.cpp +++ b/lld/MachO/ICF.cpp @@ -449,7 +449,7 @@ void ICF::run() { ConcatInputSection *beginIsec = icfInputs[begin]; for (size_t i = begin + 1; i < end; ++i) { - // Skip keepUnique inputs when using safe_thunks (already handeled above) + // Skip keepUnique inputs when using safe_thunks (already handled above) if (useSafeThunks && icfInputs[i]->keepUnique) { // Assert keepUnique sections are either small or replaced with thunks. 
assert(!icfInputs[i]->live || diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp index 35954b25f7149..ab7f73c3a1df6 100644 --- a/lld/MachO/ObjC.cpp +++ b/lld/MachO/ObjC.cpp @@ -632,7 +632,7 @@ bool ObjcCategoryMerger::collectCategoryWriterInfoFromCategory( tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset); if (!catNameSym) { - // This is an unhandeled case where the category name is not a symbol but + // This is an unhandled case where the category name is not a symbol but // instead points to an CStringInputSection (that doesn't have any symbol) // TODO: Find a small repro and either fix or add a test case for this // scenario diff --git a/lldb/docs/.htaccess b/lldb/docs/.htaccess index f094bd6ebc783..34e7fcb8f5516 100644 --- a/lldb/docs/.htaccess +++ b/lldb/docs/.htaccess @@ -19,6 +19,7 @@ Redirect 301 /resources/architecture.html https://lldb.llvm.org/resources/overvi Redirect 301 /design/sbapi.html https://lldb.llvm.org/resources/sbapi.html Redirect 301 /design/overview.html https://lldb.llvm.org/resources/overview.html Redirect 301 /use/extensions.html https://lldb.llvm.org/resources/extensions.html +Redirect 301 /use/python.html https://lldb.llvm.org/use/tutorials/script-driven-debugging.html Redirect 301 /resources/bots.html https://lldb.llvm.org/resources/test.html # Redirect old Python API to new Python API. diff --git a/lldb/docs/dil-expr-lang.ebnf b/lldb/docs/dil-expr-lang.ebnf index 67328939ba420..70eda3bf40650 100644 --- a/lldb/docs/dil-expr-lang.ebnf +++ b/lldb/docs/dil-expr-lang.ebnf @@ -16,6 +16,7 @@ postfix_expression = primary_expression | postfix_expression "->" id_expression ; primary_expression = numeric_literal + | boolean_literal | id_expression | "(" expression ")" ; @@ -35,6 +36,8 @@ integer_literal = ? Integer constant: hexademical, decimal, octal, binary ? ; numeric_literal = ? Integer constant: hexademical, decimal, octal, binary ? | ? Floating constant ? ; +boolean_literal = "true" | "false" ; + register = "$" ? 
Register name ? ; nested_name_specifier = type_name "::" diff --git a/lldb/docs/use/python-reference.rst b/lldb/docs/use/python-reference.rst index 4292714c9c208..6ac2ec93fbd1f 100644 --- a/lldb/docs/use/python-reference.rst +++ b/lldb/docs/use/python-reference.rst @@ -10,1126 +10,21 @@ command interpreter (we refer to this for brevity as the embedded interpreter). Of course, in this context it has full access to the LLDB API - with some additional conveniences we will call out in the FAQ. -Documentation --------------- - -The LLDB API is contained in a python module named lldb. A useful resource when -writing Python extensions is the lldb Python classes reference guide. - -The documentation is also accessible in an interactive debugger session with -the following command: - -:: - - (lldb) script help(lldb) - Help on package lldb: - - NAME - lldb - The lldb module contains the public APIs for Python binding. - - FILE - /System/Library/PrivateFrameworks/LLDB.framework/Versions/A/Resources/Python/lldb/__init__.py - - DESCRIPTION - ... - -You can also get help using a module class name. The full API that is exposed -for that class will be displayed in a man page style window. Below we want to -get help on the lldb.SBFrame class: - -:: - - (lldb) script help(lldb.SBFrame) - Help on class SBFrame in module lldb: - - class SBFrame(__builtin__.object) - | Represents one of the stack frames associated with a thread. - | SBThread contains SBFrame(s). For example (from test/lldbutil.py), - | - | def print_stacktrace(thread, string_buffer = False): - | '''Prints a simple stack trace of this thread.''' - | - ... - -Or you can get help using any python object, here we use the lldb.process -object which is a global variable in the lldb module which represents the -currently selected process: - -:: - - (lldb) script help(lldb.process) - Help on SBProcess in module lldb object: - - class SBProcess(__builtin__.object) - | Represents the process associated with the target program. 
- | - | SBProcess supports thread iteration. For example (from test/lldbutil.py), - | - | # ================================================== - | # Utility functions related to Threads and Processes - | # ================================================== - | - ... - -Embedded Python Interpreter ---------------------------- - -The embedded python interpreter can be accessed in a variety of ways from -within LLDB. The easiest way is to use the lldb command script with no -arguments at the lldb command prompt: - -:: - - (lldb) script - Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. - >>> 2+3 - 5 - >>> hex(12345) - '0x3039' - >>> - -This drops you into the embedded python interpreter. When running under the -script command, lldb sets some convenience variables that give you quick access -to the currently selected entities that characterize the program and debugger -state. In each case, if there is no currently selected entity of the -appropriate type, the variable's IsValid method will return false. These -variables are: - -+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+ -| Variable | Type | Equivalent | Description | -+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+ -| ``lldb.debugger`` | `lldb.SBDebugger` | `SBTarget.GetDebugger` | Contains the debugger object whose ``script`` command was invoked. | -| | | | The `lldb.SBDebugger` object owns the command interpreter | -| | | | and all the targets in your debug session. There will always be a | -| | | | Debugger in the embedded interpreter. 
| -+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+ -| ``lldb.target`` | `lldb.SBTarget` | `SBDebugger.GetSelectedTarget` | Contains the currently selected target - for instance the one made with the | -| | | | ``file`` or selected by the ``target select `` command. | -| | | `SBProcess.GetTarget` | The `lldb.SBTarget` manages one running process, and all the executable | -| | | | and debug files for the process. | -+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+ -| ``lldb.process`` | `lldb.SBProcess` | `SBTarget.GetProcess` | Contains the process of the currently selected target. | -| | | | The `lldb.SBProcess` object manages the threads and allows access to | -| | | `SBThread.GetProcess` | memory for the process. | -+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+ -| ``lldb.thread`` | `lldb.SBThread` | `SBProcess.GetSelectedThread` | Contains the currently selected thread. | -| | | | The `lldb.SBThread` object manages the stack frames in that thread. | -| | | `SBFrame.GetThread` | A thread is always selected in the command interpreter when a target stops. | -| | | | The ``thread select `` command can be used to change the | -| | | | currently selected thread. So as long as you have a stopped process, there will be | -| | | | some selected thread. | -+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+ -| ``lldb.frame`` | `lldb.SBFrame` | `SBThread.GetSelectedFrame` | Contains the currently selected stack frame. 
| -| | | | The `lldb.SBFrame` object manage the stack locals and the register set for | -| | | | that stack. | -| | | | A stack frame is always selected in the command interpreter when a target stops. | -| | | | The ``frame select `` command can be used to change the | -| | | | currently selected frame. So as long as you have a stopped process, there will | -| | | | be some selected frame. | -+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+ - -While extremely convenient, these variables have a couple caveats that you -should be aware of. First of all, they hold the values of the selected objects -on entry to the embedded interpreter. They do not update as you use the LLDB -API's to change, for example, the currently selected stack frame or thread. - -Moreover, they are only defined and meaningful while in the interactive Python -interpreter. There is no guarantee on their value in any other situation, hence -you should not use them when defining Python formatters, breakpoint scripts and -commands (or any other Python extension point that LLDB provides). For the -latter you'll be passed an `SBDebugger`, `SBTarget`, `SBProcess`, `SBThread` or -`SBFrame` instance and you can use the functions from the "Equivalent" column -to navigate between them. - -As a rationale for such behavior, consider that lldb can run in a multithreaded -environment, and another thread might call the "script" command, changing the -value out from under you. - -To get started with these objects and LLDB scripting, please note that almost -all of the lldb Python objects are able to briefly describe themselves when you -pass them to the Python print function: - -:: - - (lldb) script - Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. 
- >>> print(lldb.debugger) - Debugger (instance: "debugger_1", id: 1) - >>> print(lldb.target) - a.out - >>> print(lldb.process) - SBProcess: pid = 58842, state = stopped, threads = 1, executable = a.out - >>> print(lldb.thread) - thread #1: tid = 0x2265ce3, 0x0000000100000334 a.out`main at t.c:2:3, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1 - >>> print(lldb.frame) - frame #0: 0x0000000100000334 a.out`main at t.c:2:3 - - -Running a python script when a breakpoint gets hit --------------------------------------------------- - -One very powerful use of the lldb Python API is to have a python script run -when a breakpoint gets hit. Adding python scripts to breakpoints provides a way -to create complex breakpoint conditions and also allows for smart logging and -data gathering. - -When your process hits a breakpoint to which you have attached some python -code, the code is executed as the body of a function which takes three -arguments: - -:: - - def breakpoint_function_wrapper(frame, bp_loc, internal_dict): - # Your code goes here - -or: - -:: - - def breakpoint_function_wrapper(frame, bp_loc, extra_args, internal_dict): - # Your code goes here - - -+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| Argument | Type | Description | -+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| ``frame`` | `lldb.SBFrame` | The current stack frame where the breakpoint got hit. | -| | | The object will always be valid. | -| | | This ``frame`` argument might *not* match the currently selected stack frame found in the `lldb` module global variable ``lldb.frame``. 
| -+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| ``bp_loc`` | `lldb.SBBreakpointLocation` | The breakpoint location that just got hit. Breakpoints are represented by `lldb.SBBreakpoint` | -| | | objects. These breakpoint objects can have one or more locations. These locations | -| | | are represented by `lldb.SBBreakpointLocation` objects. | -+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| ``extra_args`` | `lldb.SBStructuredData` | ``Optional`` If your breakpoint callback function takes this extra parameter, then when the callback gets added to a breakpoint, its | -| | | contents can parametrize this use of the callback. For instance, instead of writing a callback that stops when the caller is "Foo", | -| | | you could take the function name from a field in the ``extra_args``, making the callback more general. The ``-k`` and ``-v`` options | -| | | to ``breakpoint command add`` will be passed as a Dictionary in the ``extra_args`` parameter, or you can provide it with the SB API's. | -+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| ``internal_dict`` | ``dict`` | The python session dictionary as a standard python dictionary object. | -+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ - -Optionally, a Python breakpoint command can return a value. Returning False -tells LLDB that you do not want to stop at the breakpoint. 
Any other return -value (including None or leaving out the return statement altogether) is akin -to telling LLDB to actually stop at the breakpoint. This can be useful in -situations where a breakpoint only needs to stop the process when certain -conditions are met, and you do not want to inspect the program state manually -at every stop and then continue. - -An example will show how simple it is to write some python code and attach it -to a breakpoint. The following example will allow you to track the order in -which the functions in a given shared library are first executed during one run -of your program. This is a simple method to gather an order file which can be -used to optimize function placement within a binary for execution locality. - -We do this by setting a regular expression breakpoint that will match every -function in the shared library. The regular expression '.' will match any -string that has at least one character in it, so we will use that. This will -result in one lldb.SBBreakpoint object that contains an -lldb.SBBreakpointLocation object for each function. As the breakpoint gets hit, -we use a counter to track the order in which the function at this particular -breakpoint location got hit. Since our code is passed the location that was -hit, we can get the name of the function from the location, disable the -location so we won't count this function again; then log some info and continue -the process. - -Note we also have to initialize our counter, which we do with the simple -one-line version of the script command. - -Here is the code: - -:: - - (lldb) breakpoint set --func-regex=. --shlib=libfoo.dylib - Breakpoint created: 1: regex = '.', module = libfoo.dylib, locations = 223 - (lldb) script counter = 0 - (lldb) breakpoint command add --script-type python 1 - Enter your Python command(s). Type 'DONE' to end. - > # Increment our counter. 
Since we are in a function, this must be a global python variable - > global counter - > counter += 1 - > # Get the name of the function - > name = frame.GetFunctionName() - > # Print the order and the function name - > print('[%i] %s' % (counter, name)) - > # Disable the current breakpoint location so it doesn't get hit again - > bp_loc.SetEnabled(False) - > # No need to stop here - > return False - > DONE - -The breakpoint command add command above attaches a python script to breakpoint 1. To remove the breakpoint command: - -:: - - (lldb) breakpoint command delete 1 - - -Using the python api's to create custom breakpoints ---------------------------------------------------- - - -Another use of the Python API's in lldb is to create a custom breakpoint -resolver. This facility was added in r342259. - -It allows you to provide the algorithm which will be used in the breakpoint's -search of the space of the code in a given Target to determine where to set the -breakpoint locations - the actual places where the breakpoint will trigger. To -understand how this works you need to know a little about how lldb handles -breakpoints. - -In lldb, a breakpoint is composed of three parts: the Searcher, the Resolver, -and the Stop Options. The Searcher and Resolver cooperate to determine how -breakpoint locations are set and differ between each breakpoint type. Stop -options determine what happens when a location triggers and includes the -commands, conditions, ignore counts, etc. Stop options are common between all -breakpoint types, so for our purposes only the Searcher and Resolver are -relevant. - -The Searcher's job is to traverse in a structured way the code in the current -target. It proceeds from the Target, to search all the Modules in the Target, -in each Module it can recurse into the Compile Units in that module, and within -each Compile Unit it can recurse over the Functions it contains. 
- -The Searcher can be provided with a SearchFilter that it will use to restrict -this search. For instance, if the SearchFilter specifies a list of Modules, the -Searcher will not recurse into Modules that aren't on the list. When you pass -the -s modulename flag to break set you are creating a Module-based search -filter. When you pass -f filename.c to break set -n you are creating a file -based search filter. If neither of these is specified, the breakpoint will have -a no-op search filter, so all parts of the program are searched and all -locations accepted. - -The Resolver has two functions. The most important one is the callback it -provides. This will get called at the appropriate time in the course of the -search. The callback is where the job of adding locations to the breakpoint -gets done. - -The other function is specifying to the Searcher at what depth in the above -described recursion it wants to be called. Setting a search depth also provides -a stop for the recursion. For instance, if you request a Module depth search, -then the callback will be called for each Module as it gets added to the -Target, but the searcher will not recurse into the Compile Units in the module. - -One other slight subtlety is that the depth at which you get called back is not -necessarily the depth at which the SearchFilter is specified. For instance, -if you are doing symbol searches, it is convenient to use the Module depth for -the search, since symbols are stored in the module. But the SearchFilter might -specify some subset of CompileUnits, so not all the symbols you might find in -each module will pass the search. You don't need to handle this situation -yourself, since SBBreakpoint::AddLocation will only add locations that pass the -Search Filter. This API returns an SBError to inform you whether your location -was added. - -When the breakpoint is originally created, its Searcher will process all the -currently loaded modules. 
The Searcher will also visit any new modules as they -are added to the target. This happens, for instance, when a new shared library -gets added to the target in the course of running, or on rerunning if any of -the currently loaded modules have been changed. Note, in the latter case, all -the locations set in the old module will get deleted and you will be asked to -recreate them in the new version of the module when your callback gets called -with that module. For this reason, you shouldn't try to manage the locations -you add to the breakpoint yourself. Note that the Breakpoint takes care of -deduplicating equal addresses in AddLocation, so you shouldn't need to worry -about that anyway. - -At present, when adding a scripted Breakpoint type, you can only provide a -custom Resolver, not a custom SearchFilter. - -The custom Resolver is provided as a Python class with the following methods: - -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ -| Name | Arguments | Description | -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ -| ``__init__`` | ``bkpt``:`lldb.SBBreakpoint` | This is the constructor for the new Resolver. | -| | ``extra_args``:`lldb.SBStructuredData`| | -| | | | -| | | ``bkpt`` is the breakpoint owning this Resolver. | -| | | | -| | | | -| | | ``extra_args`` is an `SBStructuredData` object that the user can pass in when creating instances of this | -| | | breakpoint. It is not required, but is quite handy. 
For instance if you were implementing a breakpoint on some | -| | | symbol name, you could write a generic symbol name based Resolver, and then allow the user to pass | -| | | in the particular symbol in the extra_args | -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ -| ``__callback__`` | ``sym_ctx``:`lldb.SBSymbolContext` | This is the Resolver callback. | -| | | The ``sym_ctx`` argument will be filled with the current stage | -| | | of the search. | -| | | | -| | | | -| | | For instance, if you asked for a search depth of lldb.eSearchDepthCompUnit, then the | -| | | target, module and compile_unit fields of the sym_ctx will be filled. The callback should look just in the | -| | | context passed in ``sym_ctx`` for new locations. If the callback finds an address of interest, it | -| | | can add it to the breakpoint with the `SBBreakpoint.AddLocation` method, using the breakpoint passed | -| | | in to the ``__init__`` method. | -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ -| ``__get_depth__`` | ``None`` | Specify the depth at which you wish your callback to get called. The currently supported options are: | -| | | | -| | | `lldb.eSearchDepthModule` | -| | | `lldb.eSearchDepthCompUnit` | -| | | `lldb.eSearchDepthFunction` | -| | | | -| | | For instance, if you are looking | -| | | up symbols, which are stored at the Module level, you will want to get called back module by module. | -| | | So you would want to return `lldb.eSearchDepthModule`. This method is optional. If not provided the search | -| | | will be done at Module depth. 
| -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ -| ``get_short_help`` | ``None`` | This is an optional method. If provided, the returned string will be printed at the beginning of | -| | | the description for this breakpoint. | -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ - -To define a new breakpoint command defined by this class from the lldb command -line, use the command: - -:: - - (lldb) breakpoint set -P MyModule.MyResolverClass - -You can also populate the extra_args SBStructuredData with a dictionary of -key/value pairs with: - -:: - - (lldb) breakpoint set -P MyModule.MyResolverClass -k key_1 -v value_1 -k key_2 -v value_2 - -Although you can't write a scripted SearchFilter, both the command line and the -SB API's for adding a scripted resolver allow you to specify a SearchFilter -restricted to certain modules or certain compile units. When using the command -line to create the resolver, you can specify a Module specific SearchFilter by -passing the -s ModuleName option - which can be specified multiple times. You -can also specify a SearchFilter restricted to certain compile units by passing -in the -f CompUnitName option. This can also be specified more than once. And -you can mix the two to specify "this comp unit in this module". So, for -instance, - -:: - - (lldb) breakpoint set -P MyModule.MyResolverClass -s a.out - -will use your resolver, but will only recurse into or accept new locations in -the module a.out. - -Another option for creating scripted breakpoints is to use the -SBTarget.BreakpointCreateFromScript API. This one has the advantage that you -can pass in an arbitrary SBStructuredData object, so you can create more -complex parametrizations. 
SBStructuredData has a handy SetFromJSON method which -you can use for this purpose. Your __init__ function gets passed this -SBStructuredData object. This API also allows you to directly provide the list -of Modules and the list of CompileUnits that will make up the SearchFilter. If -you pass in empty lists, the breakpoint will use the default "search -everywhere,accept everything" filter. - -Using the python API' to create custom stepping logic ------------------------------------------------------ - -A slightly esoteric use of the Python API's is to construct custom stepping -types. LLDB's stepping is driven by a stack of "thread plans" and a fairly -simple state machine that runs the plans. You can create a Python class that -works as a thread plan, and responds to the requests the state machine makes to -run its operations. - -There is a longer discussion of scripted thread plans and the state machine, -and several interesting examples of their use in: - -https://github.com/llvm/llvm-project/blob/main/lldb/examples/python/scripted_step.py - -And for a MUCH fuller discussion of the whole state machine, see: - -https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Target/ThreadPlan.h - -If you are reading those comments it is useful to know that scripted thread -plans are set to be "ControllingPlans", and not "OkayToDiscard". - -To implement a scripted step, you define a python class that has the following -methods: - -+-------------------+------------------------------------+---------------------------------------------------------------------------------------+ -| Name | Arguments | Description | -+-------------------+------------------------------------+---------------------------------------------------------------------------------------+ -| ``__init__`` | ``thread_plan``:`lldb.SBThreadPlan`| This is the underlying `SBThreadPlan` that is pushed onto the plan stack. | -| | | You will want to store this away in an ivar. 
Also, if you are going to | -| | | use one of the canned thread plans, you can queue it at this point. | -+-------------------+------------------------------------+---------------------------------------------------------------------------------------+ -| ``explains_stop`` | ``event``: `lldb.SBEvent` | Return True if this stop is part of your thread plans logic, false otherwise. | -+-------------------+------------------------------------+---------------------------------------------------------------------------------------+ -| ``is_stale`` | ``None`` | If your plan is no longer relevant (for instance, you were | -| | | stepping in a particular stack frame, but some other operation | -| | | pushed that frame off the stack) return True and your plan will | -| | | get popped. | -+-------------------+------------------------------------+---------------------------------------------------------------------------------------+ -| ``should_step`` | ``None`` | Return ``True`` if you want lldb to instruction step one instruction, | -| | | or False to continue till the next breakpoint is hit. | -+-------------------+------------------------------------+---------------------------------------------------------------------------------------+ -| ``should_stop`` | ``event``: `lldb.SBEvent` | If your plan wants to stop and return control to the user at this point, return True. | -| | | If your plan is done at this point, call SetPlanComplete on your | -| | | thread plan instance. | -| | | Also, do any work you need here to set up the next stage of stepping. | -+-------------------+------------------------------------+---------------------------------------------------------------------------------------+ - -To use this class to implement a step, use the command: - -:: - - (lldb) thread step-scripted -C MyModule.MyStepPlanClass - -Or use the SBThread.StepUsingScriptedThreadPlan API. 
The SBThreadPlan passed -into your __init__ function can also push several common plans (step -in/out/over and run-to-address) in front of itself on the stack, which can be -used to compose more complex stepping operations. When you use subsidiary plans -your explains_stop and should_stop methods won't get called until the -subsidiary plan is done, or the process stops for an event the subsidiary plan -doesn't explain. For instance, step over plans don't explain a breakpoint hit -while performing the step-over. - - -Create a new lldb command using a Python function -------------------------------------------------- - -Python functions can be used to create new LLDB command interpreter commands, -which will work like all the natively defined lldb commands. This provides a -very flexible and easy way to extend LLDB to meet your debugging requirements. - -To write a python function that implements a new LLDB command define the -function to take five arguments as follows: - -:: - - def command_function(debugger, command, exe_ctx, result, internal_dict): - # Your code goes here - -The meaning of the arguments is given in the table below. - -If you provide a Python docstring in your command function LLDB will use it -when providing "long help" for your command, as in: - -:: - - def command_function(debugger, command, result, internal_dict): - """This command takes a lot of options and does many fancy things""" - # Your code goes here - -though providing help can also be done programmatically (see below). - -Prior to lldb 3.5.2 (April 2015), LLDB Python command definitions didn't take the SBExecutionContext -argument. So you may still see commands where the command definition is: - -:: - - def command_function(debugger, command, result, internal_dict): - # Your code goes here - -Using this form is strongly discouraged because it can only operate on the "currently selected" -target, process, thread, frame. 
The command will behave as expected when run -directly on the command line. But if the command is used in a stop-hook, breakpoint -callback, etc. where the response to the callback determines whether we will select -this or that particular process/frame/thread, the global "currently selected" -entity is not necessarily the one the callback is meant to handle. In that case, this -command definition form can't do the right thing. - -+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+ -| Argument | Type | Description | -+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+ -| ``debugger`` | `lldb.SBDebugger` | The current debugger object. | -+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+ -| ``command`` | ``python string`` | A python string containing all arguments for your command. If you need to chop up the arguments | -| | | try using the ``shlex`` module's ``shlex.split(command)`` to properly extract the | -| | | arguments. 
| -+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+ -| ``exe_ctx`` | `lldb.SBExecutionContext` | An execution context object carrying around information on the inferior process' context in which the command is expected to act | -| | | | -| | | *Optional since lldb 3.5.2, unavailable before* | -+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+ -| ``result`` | `lldb.SBCommandReturnObject` | A return object which encapsulates success/failure information for the command and output text | -| | | that needs to be printed as a result of the command. The plain Python "print" command also works but | -| | | text won't go in the result by default (it is useful as a temporary logging facility). | -+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+ -| ``internal_dict`` | ``python dict object`` | The dictionary for the current embedded script session which contains all variables | -| | | and functions. | -+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+ - -Since lldb 3.7, Python commands can also be implemented by means of a class -which should implement the following interface: - -.. 
code-block:: python - - class CommandObjectType: - def __init__(self, debugger, internal_dict): - this call should initialize the command with respect to the command interpreter for the passed-in debugger - def __call__(self, debugger, command, exe_ctx, result): - this is the actual bulk of the command, akin to Python command functions - def get_short_help(self): - this call should return the short help text for this command[1] - def get_long_help(self): - this call should return the long help text for this command[1] - def get_flags(self): - this will be called when the command is added to the command interpreter, - and should return a flag field made from or-ing together the appropriate - elements of the lldb.CommandFlags enum to specify the requirements of this command. - The CommandInterpreter will make sure all these requirements are met, and will - return the standard lldb error if they are not.[1] - def get_repeat_command(self, command): - The auto-repeat command is what will get executed when the user types just - a return at the next prompt after this command is run. Even if your command - was run because it was specified as a repeat command, that invocation will still - get asked for IT'S repeat command, so you can chain a series of repeats, for instance - to implement a pager. - - The command argument is the command that is about to be executed. - - If this call returns None, then the ordinary repeat mechanism will be used - If this call returns an empty string, then auto-repeat is disabled - If this call returns any other string, that will be the repeat command [1] - -[1] This method is optional. - -As a convenience, you can treat the result object as a Python file object, and -say - -.. code-block:: python - - print("my command does lots of cool stuff", file=result) - -SBCommandReturnObject and SBStream both support this file-like behavior by -providing write() and flush() calls at the Python layer. 
- -The commands that are added using this class definition are what lldb calls -"raw" commands. The command interpreter doesn't attempt to parse the command, -doesn't handle option values, neither generating help for them, or their -completion. Raw commands are useful when the arguments passed to the command -are unstructured, and having to protect them against lldb command parsing would -be onerous. For instance, "expr" is a raw command. - -You can also add scripted commands that implement the "parsed command", where -the options and their types are specified, as well as the argument and argument -types. These commands look and act like the majority of lldb commands, and you -can also add custom completions for the options and/or the arguments if you have -special needs. - -The easiest way to do this is to derive your new command from the lldb.ParsedCommand -class. That responds in the same way to the help & repeat command interfaces, and -provides some convenience methods, and most importantly an LLDBOptionValueParser, -accessed through lldb.ParsedCommand.get_parser(). The parser is used to set -your command definitions, and to retrieve option values in the __call__ method. - -To set up the command definition, implement the ParsedCommand abstract method: - -.. code-block:: python - - def setup_command_definition(self): - -This is called when your command is added to lldb. In this method you add the -options and their types, the option help strings, etc. to the command using the API: - -.. 
code-block:: python - - def add_option(self, short_option, long_option, help, default, - dest = None, required=False, groups = None, - value_type=lldb.eArgTypeNone, completion_type=None, - enum_values=None): - """ - short_option: one character, must be unique, not required - long_option: no spaces, must be unique, required - help: a usage string for this option, will print in the command help - default: the initial value for this option (if it has a value) - dest: the name of the property that gives you access to the value for - this value. Defaults to the long option if not provided. - required: if true, this option must be provided or the command will error out - groups: Which "option groups" does this option belong to. This can either be - a simple list (e.g. [1, 3, 4, 5]) or you can specify ranges by sublists: - so [1, [3,5]] is the same as [1, 3, 4, 5]. - value_type: one of the lldb.eArgType enum values. Some of the common arg - types also have default completers, which will be applied automatically. - completion_type: currently these are values form the lldb.CompletionType enum. If - you need custom completions, implement handle_option_argument_completion. - enum_values: An array of duples: ["element_name", "element_help"]. If provided, - only one of the enum elements is allowed. The value will be the - element_name for the chosen enum element as a string. - """ - -Similarly, you can add argument types to the command: - -.. code-block:: python - - def make_argument_element(self, arg_type, repeat = "optional", groups = None): - """ - arg_type: The argument type, one of the lldb.eArgType enum values. - repeat: Choose from the following options: - "plain" - one value - "optional" - zero or more values - "plus" - one or more values - groups: As with add_option. - """ - -Then implement the body of the command by defining: - -.. code-block:: python - - def __call__(self, debugger, args_array, exe_ctx, result): - """This is the command callback. 
The option values are - provided by the 'dest' properties on the parser. - - args_array: This is the list of arguments provided. - exe_ctx: Gives the SBExecutionContext on which the - command should operate. - result: Any results of the command should be - written into this SBCommandReturnObject. - """ - -This differs from the "raw" command's __call__ in that the arguments are already -parsed into the args_array, and the option values are set in the parser, and -can be accessed using their property name. The LLDBOptionValueParser class has -a couple of other handy methods: - -.. code-block:: python - def was_set(self, long_option_name): - -returns True if the option was specified on the command line. - -.. code-block:: python - - def dest_for_option(self, long_option_name): - """ - This will return the value of the dest variable you defined for opt_name. - Mostly useful for handle_completion where you get passed the long option. - """ - -lldb will handle completing your option names, and all your enum values -automatically. If your option or argument types have associated built-in completers, -then lldb will also handle that completion for you. But if you have a need for -custom completions, either in your arguments or option values, you can handle -completion by hand as well. To handle completion of option value arguments, -your lldb.ParsedCommand subclass should implement: - -.. code-block:: python - - def handle_option_argument_completion(self, long_option, cursor_pos): - """ - long_option: The long option name of the option whose value you are - asked to complete. - cursor_pos: The cursor position in the value for that option - which - you can get from the option parser. - """ - -And to handle the completion of arguments: - -.. 
code-block:: python - - def handle_argument_completion(self, args, arg_pos, cursor_pos): - """ - args: A list of the arguments to the command - arg_pos: An index into the args list of the argument with the cursor - cursor_pos: The cursor position in the arg specified by arg_pos - """ - -When either of these API's is called, the command line will have been parsed up to -the word containing the cursor, and any option values set in that part of the command -string are available from the option value parser. That's useful for instance -if you have a --shared-library option that would constrain the completions for, -say, a symbol name option or argument. - -The return value specifies what the completion options are. You have four -choices: - -- `True`: the completion was handled with no completions. - -- `False`: the completion was not handled, forward it to the regular -completion machinery. - -- A dictionary with the key: "completion": there is one candidate, -whose value is the value of the "completion" key. Optionally you can pass a -"mode" key whose value is either "partial" or "complete". Return partial if -the "completion" string is a prefix for all the completed value. - -For instance, if the string you are completing is "Test" and the available completions are: -"Test1", "Test11" and "Test111", you should return the dictionary: - -.. code-block:: python - - return {"completion": "Test1", "mode" : "partial"} - -and then lldb will add the "1" at the cursor and advance it after the added string, -waiting for more completions. But if "Test1" is the only completion, return: - -.. code-block:: python - - {"completion": "Test1", "mode": "complete"} - -and lldb will add "1 " at the cursor, indicating the command string is complete. - -The default is "complete", you don't need to specify a "mode" in that case. - -- A dictionary with the key: "values" whose value is a list of candidate completion -strings. 
The command interpreter will present those strings as the available choices. -You can optionally include a "descriptions" key, whose value is a parallel array -of description strings, and the completion will show the description next to -each completion. - - -One other handy convenience when defining lldb command-line commands is the -command "command script import" which will import a module specified by file -path, so you don't have to change your PYTHONPATH for temporary scripts. It -also has another convenience that if your new script module has a function of -the form: - -.. code-block python - - def __lldb_init_module(debugger, internal_dict): - # Command Initialization code goes here - -where debugger and internal_dict are as above, that function will get run when -the module is loaded allowing you to add whatever commands you want into the -current debugger. Note that this function will only be run when using the LLDB -command ``command script import``, it will not get run if anyone imports your -module from another module. - -The standard test for ``__main__``, like many python modules do, is useful for -creating scripts that can be run from the command line. However, for command -line scripts, the debugger instance must be created manually. Sample code would -look like: - -.. code-block:: python - - if __name__ == '__main__': - # Initialize the debugger before making any API calls. - lldb.SBDebugger.Initialize() - # Create a new debugger instance in your module if your module - # can be run from the command line. When we run a script from - # the command line, we won't have any debugger object in - # lldb.debugger, so we can just create it if it will be needed - debugger = lldb.SBDebugger.Create() - - # Next, do whatever work this module should do when run as a command. - # ... - - # Finally, dispose of the debugger you just made. 
- lldb.SBDebugger.Destroy(debugger) - # Terminate the debug session - lldb.SBDebugger.Terminate() - - -Now we can create a module called ls.py in the file ~/ls.py that will implement -a function that can be used by LLDB's python command code: - -.. code-block:: python - - #!/usr/bin/env python - - import lldb - import commands - import optparse - import shlex - - def ls(debugger, command, result, internal_dict): - print >>result, (commands.getoutput('/bin/ls %s' % command)) - - # And the initialization code to add your commands - def __lldb_init_module(debugger, internal_dict): - debugger.HandleCommand('command script add -f ls.ls ls') - print('The "ls" python command has been installed and is ready for use.') - -Now we can load the module into LLDB and use it - -:: - - $ lldb - (lldb) command script import ~/ls.py - The "ls" python command has been installed and is ready for use. - (lldb) ls -l /tmp/ - total 365848 - -rw-r--r--@ 1 someuser wheel 6148 Jan 19 17:27 .DS_Store - -rw------- 1 someuser wheel 7331 Jan 19 15:37 crash.log - -You can also make "container" commands to organize the commands you are adding to -lldb. Most of the lldb built-in commands structure themselves this way, and using -a tree structure has the benefit of leaving the one-word command space free for user -aliases. It can also make it easier to find commands if you are adding more than -a few of them. 
Here's a trivial example of adding two "utility" commands into a -"my-utilities" container: - -:: - - #!/usr/bin/env python - - import lldb - - def first_utility(debugger, command, result, internal_dict): - print("I am the first utility") - - def second_utility(debugger, command, result, internal_dict): - print("I am the second utility") - - # And the initialization code to add your commands - def __lldb_init_module(debugger, internal_dict): - debugger.HandleCommand('command container add -h "A container for my utilities" my-utilities') - debugger.HandleCommand('command script add -f my_utilities.first_utility -h "My first utility" my-utilities first') - debugger.HandleCommand('command script add -f my_utilities.second_utility -h "My second utility" my-utilities second') - print('The "my-utilities" python command has been installed and its subcommands are ready for use.') - -Then your new commands are available under the my-utilities node: - -:: - - (lldb) help my-utilities - A container for my utilities - - Syntax: my-utilities - - The following subcommands are supported: - - first -- My first utility Expects 'raw' input (see 'help raw-input'.) - second -- My second utility Expects 'raw' input (see 'help raw-input'.) - - For more help on any particular subcommand, type 'help '. - (lldb) my-utilities first - I am the first utility - - -A more interesting template has been created in the source repository that can -help you to create lldb command quickly: - -https://github.com/llvm/llvm-project/blob/main/lldb/examples/python/cmdtemplate.py - -A commonly required facility is being able to create a command that does some -token substitution, and then runs a different debugger command (usually, it -po'es the result of an expression evaluated on its argument). 
For instance, -given the following program: - -:: - - #import - NSString* - ModifyString(NSString* src) - { - return [src stringByAppendingString:@"foobar"]; - } - - int main() - { - NSString* aString = @"Hello world"; - NSString* anotherString = @"Let's be friends"; - return 1; - } - -you may want a pofoo X command, that equates po [ModifyString(X) -capitalizedString]. The following debugger interaction shows how to achieve -that goal: - -:: - - (lldb) script - Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. - >>> def pofoo_funct(debugger, command, result, internal_dict): - ... cmd = "po [ModifyString(" + command + ") capitalizedString]" - ... debugger.HandleCommand(cmd) - ... - >>> ^D - (lldb) command script add pofoo -f pofoo_funct - (lldb) pofoo aString - $1 = 0x000000010010aa00 Hello Worldfoobar - (lldb) pofoo anotherString - $2 = 0x000000010010aba0 Let's Be Friendsfoobar - -Using the lldb.py module in Python ----------------------------------- - -LLDB has all of its core code built into a shared library which gets used by -the `lldb` command line application. On macOS this shared library is a -framework: LLDB.framework and on other unix variants the program is a shared -library: lldb.so. LLDB also provides an lldb.py module that contains the -bindings from LLDB into Python. To use the LLDB.framework to create your own -stand-alone python programs, you will need to tell python where to look in -order to find this module. This is done by setting the PYTHONPATH environment -variable, adding a path to the directory that contains the lldb.py python -module. The lldb driver program has an option to report the path to the lldb -module. 
You can use that to point to correct lldb.py: - -For csh and tcsh: - -:: - - % setenv PYTHONPATH `lldb -P` - -For sh and bash: - -:: - - $ export PYTHONPATH=`lldb -P` - -Alternatively, you can append the LLDB Python directory to the sys.path list -directly in your Python code before importing the lldb module. - -Now your python scripts are ready to import the lldb module. Below is a python -script that will launch a program from the current working directory called -"a.out", set a breakpoint at "main", and then run and hit the breakpoint, and -print the process, thread and frame objects if the process stopped: - -.. code-block:: python - - #!/usr/bin/env python3 - - import lldb - import os - - - def disassemble_instructions(insts): - for i in insts: - print(i) - - - # Set the path to the executable to debug - exe = "./a.out" - - # Create a new debugger instance - debugger = lldb.SBDebugger.Create() - - # When we step or continue, don't return from the function until the process - # stops. Otherwise we would have to handle the process events ourselves which, while doable is - # a little tricky. We do this by setting the async mode to false. - debugger.SetAsync(False) - - # Create a target from a file and arch - print("Creating a target for '%s'" % exe) - - target = debugger.CreateTargetWithFileAndArch(exe, lldb.LLDB_ARCH_DEFAULT) - - if target: - # If the target is valid set a breakpoint at main - main_bp = target.BreakpointCreateByName( - "main", target.GetExecutable().GetFilename() - ) - - print(main_bp) - - # Launch the process. 
Since we specified synchronous mode, we won't return - # from this function until we hit the breakpoint at main - process = target.LaunchSimple(None, None, os.getcwd()) - - # Make sure the launch went ok - if process: - # Print some simple process info - state = process.GetState() - print(process) - if state == lldb.eStateStopped: - # Get the first thread - thread = process.GetThreadAtIndex(0) - if thread: - # Print some simple thread info - print(thread) - # Get the first frame - frame = thread.GetFrameAtIndex(0) - if frame: - # Print some simple frame info - print(frame) - function = frame.GetFunction() - # See if we have debug info (a function) - if function: - # We do have a function, print some info for the function - print(function) - # Now get all instructions for this function and print them - insts = function.GetInstructions(target) - disassemble_instructions(insts) - else: - # See if we have a symbol in the symbol table for where we stopped - symbol = frame.GetSymbol() - if symbol: - # We do have a symbol, print some info for the symbol - print(symbol) - -Writing lldb frame recognizers in Python ----------------------------------------- - -Frame recognizers allow for retrieving information about special frames based -on ABI, arguments or other special properties of that frame, even without -source code or debug info. Currently, one use case is to extract function -arguments that would otherwise be inaccessible, or augment existing arguments. - -Adding a custom frame recognizer is done by implementing a Python class and -using the 'frame recognizer add' command. The Python class should have a -'get_recognized_arguments' method and it will receive an argument of type -lldb.SBFrame representing the current frame that we are trying to recognize. -The method should return a (possibly empty) list of lldb.SBValue objects that -represent the recognized arguments. 
- -An example of a recognizer that retrieves the file descriptor values from libc -functions 'read', 'write' and 'close' follows: - -:: - - class LibcFdRecognizer(object): - def get_recognized_arguments(self, frame): - if frame.name in ["read", "write", "close"]: - fd = frame.EvaluateExpression("$arg1").unsigned - target = frame.thread.process.target - value = target.CreateValueFromExpression("fd", "(int)%d" % fd) - return [value] - return [] - -The file containing this implementation can be imported via ``command script import`` -and then we can register this recognizer with ``frame recognizer add``. -It's important to restrict the recognizer to the libc library (which is -libsystem_kernel.dylib on macOS) to avoid matching functions with the same name -in other modules: - -:: - - (lldb) command script import .../fd_recognizer.py - (lldb) frame recognizer add -l fd_recognizer.LibcFdRecognizer -n read -s libsystem_kernel.dylib - -When the program is stopped at the beginning of the 'read' function in libc, we can view the recognizer arguments in 'frame variable': - -:: - - (lldb) b read - (lldb) r - Process 1234 stopped - * thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.3 - frame #0: 0x00007fff06013ca0 libsystem_kernel.dylib`read - (lldb) frame variable - (int) fd = 3 - -Writing Target Stop-Hooks in Python ------------------------------------ - -Stop hooks fire whenever the process stops just before control is returned to the -user. Stop hooks can either be a set of lldb command-line commands, or can -be implemented by a suitably defined Python class. The Python-based stop-hooks -can also be passed as a set of -key -value pairs when they are added, and those -will get packaged up into a SBStructuredData Dictionary and passed to the -constructor of the Python object managing the stop hook. This allows for -parameterization of the stop hooks. 
- -To add a Python-based stop hook, first define a class with the following methods: - -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ -| Name | Arguments | Description | -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ -| ``__init__`` | ``target: lldb.SBTarget`` | This is the constructor for the new stop-hook. | -| | ``extra_args: lldb.SBStructuredData`` | | -| | | | -| | | ``target`` is the SBTarget to which the stop hook is added. | -| | | | -| | | ``extra_args`` is an SBStructuredData object that the user can pass in when creating instances of this | -| | | breakpoint. It is not required, but allows for reuse of stop-hook classes. | -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ -| ``handle_stop`` | ``exe_ctx: lldb.SBExecutionContext`` | This is the called when the target stops. | -| | ``stream: lldb.SBStream`` | | -| | | ``exe_ctx`` argument will be filled with the current stop point for which the stop hook is | -| | | being evaluated. | -| | | | -| | | ``stream`` an lldb.SBStream, anything written to this stream will be written to the debugger console. | -| | | | -| | | The return value is a "Should Stop" vote from this thread. If the method returns either True or no return | -| | | this thread votes to stop. If it returns False, then the thread votes to continue after all the stop-hooks | -| | | are evaluated. | -| | | Note, the --auto-continue flag to 'target stop-hook add' overrides a True return value from the method. 
| -+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+ - -To use this class in lldb, run the command: - -:: - - (lldb) command script import MyModule.py - (lldb) target stop-hook add -P MyModule.MyStopHook -k first -v 1 -k second -v 2 - -where MyModule.py is the file containing the class definition MyStopHook. +Python Tutorials +----------------- + +The following tutorials and documentation demonstrate various Python capabilities within LLDB: + +.. toctree:: + :maxdepth: 1 + + tutorials/accessing-documentation + tutorials/python-embedded-interpreter + tutorials/script-driven-debugging + tutorials/breakpoint-triggered-scripts + tutorials/creating-custom-breakpoints + tutorials/automating-stepping-logic + tutorials/writing-custom-commands + tutorials/implementing-standalone-scripts + tutorials/custom-frame-recognizers + tutorials/extending-target-stop-hooks \ No newline at end of file diff --git a/lldb/docs/use/python.rst b/lldb/docs/use/python.rst deleted file mode 100644 index 3a919f2a8cdb1..0000000000000 --- a/lldb/docs/use/python.rst +++ /dev/null @@ -1,799 +0,0 @@ -Python Scripting -================ - -LLDB has been structured from the beginning to be scriptable in two -ways -- a Unix Python session can initiate/run a debug session -non-interactively using LLDB; and within the LLDB debugger tool, Python -scripts can be used to help with many tasks, including inspecting -program data, iterating over containers and determining if a breakpoint -should stop execution or continue. This document will show how to do -some of these things by going through an example, explaining how to use -Python scripting to find a bug in a program that searches for text in a -large binary tree. 
- -The Test Program and Input --------------------------- - -We have a simple C program (dictionary.c) that reads in a text file, -and stores all the words from the file in a Binary Search Tree, sorted -alphabetically. It then enters a loop prompting the user for a word, -searching for the word in the tree (using Binary Search), and reporting -to the user whether or not it found the word in the tree. - -The input text file we are using to test our program contains the text -for William Shakespeare's famous tragedy "Romeo and Juliet". - -The Bug -------- - -When we try running our program, we find there is a problem. While it -successfully finds some of the words we would expect to find, such as -"love" or "sun", it fails to find the word "Romeo", which MUST be in -the input text file: - -:: - - $ ./dictionary Romeo-and-Juliet.txt - Dictionary loaded. - Enter search word: love - Yes! - Enter search word: sun - Yes! - Enter search word: Romeo - No! - Enter search word: ^D - $ - -Using Depth First Search ------------------------- - -Our first job is to determine if the word "Romeo" actually got inserted -into the tree or not. Since "Romeo and Juliet" has thousands of words, -trying to examine our binary search tree by hand is completely -impractical. Therefore we will write a Python script to search the tree -for us. We will write a recursive Depth First Search function that -traverses the entire tree searching for a word, and maintaining -information about the path from the root of the tree to the current -node. If it finds the word in the tree, it returns the path from the -root to the node containing the word. 
This is what our DFS function in -Python would look like, with line numbers added for easy reference in -later explanations: - -:: - - 1: def DFS (root, word, cur_path): - 2: root_word_ptr = root.GetChildMemberWithName ("word") - 3: left_child_ptr = root.GetChildMemberWithName ("left") - 4: right_child_ptr = root.GetChildMemberWithName ("right") - 5: root_word = root_word_ptr.GetSummary() - 6: end = len (root_word) - 1 - 7: if root_word[0] == '"' and root_word[end] == '"': - 8: root_word = root_word[1:end] - 9: end = len (root_word) - 1 - 10: if root_word[0] == '\'' and root_word[end] == '\'': - 11: root_word = root_word[1:end] - 12: if root_word == word: - 13: return cur_path - 14: elif word < root_word: - 15: if left_child_ptr.GetValue() is None: - 16: return "" - 17: else: - 18: cur_path = cur_path + "L" - 19: return DFS (left_child_ptr, word, cur_path) - 20: else: - 21: if right_child_ptr.GetValue() is None: - 22: return "" - 23: else: - 24: cur_path = cur_path + "R" - 25: return DFS (right_child_ptr, word, cur_path) - - -Accessing & Manipulating Program Variables ------------------------------------------- - -Before we can call any Python function on any of our program's -variables, we need to get the variable into a form that Python can -access. To show you how to do this we will look at the parameters for -the DFS function. The first parameter is going to be a node in our -binary search tree, put into a Python variable. The second parameter is -the word we are searching for (a string), and the third parameter is a -string representing the path from the root of the tree to our current -node. - -The most interesting parameter is the first one, the Python variable -that needs to contain a node in our search tree. How can we take a -variable out of our program and put it into a Python variable? What -kind of Python variable will it be? The answers are to use the LLDB API -functions, provided as part of the LLDB Python module. 
Running Python -from inside LLDB, LLDB will automatically give us our current frame -object as a Python variable, "lldb.frame". This variable has the type -`SBFrame` (see the LLDB API for more information about `SBFrame` -objects). One of the things we can do with a frame object, is to ask it -to find and return its local variable. We will call the API function -`SBFrame.FindVariable` on the lldb.frame object to give us our dictionary -variable as a Python variable: - -:: - - root = lldb.frame.FindVariable ("dictionary") - -The line above, executed in the Python script interpreter in LLDB, asks the -current frame to find the variable named "dictionary" and return it. We then -store the returned value in the Python variable named "root". This answers the -question of HOW to get the variable, but it still doesn't explain WHAT actually -gets put into "root". If you examine the LLDB API, you will find that the -`SBFrame` method "FindVariable" returns an object of type `SBValue`. `SBValue` -objects are used, among other things, to wrap up program variables and values. -There are many useful methods defined in the `SBValue` class to allow you to get -information or children values out of SBValues. For complete information, see -the header file SBValue.h. The `SBValue` methods that we use in our DFS function -are ``GetChildMemberWithName()``, ``GetSummary()``, and ``GetValue()``. - - -Explaining DFS Script in Detail -------------------------------- - -Before diving into the details of this code, it would be best to give a -high-level overview of what it does. The nodes in our binary search tree were -defined to have type ``tree_node *``, which is defined as: - -:: - - typedef struct tree_node - { - const char *word; - struct tree_node *left; - struct tree_node *right; - } tree_node; - -Lines 2-11 of DFS are getting data out of the current tree node and getting -ready to do the actual search; lines 12-25 are the actual depth-first search. 
-Lines 2-4 of our DFS function get the word, left and right fields out of the -current node and store them in Python variables. Since root_word_ptr is a -pointer to our word, and we want the actual word, line 5 calls GetSummary() to -get a string containing the value out of the pointer. Since GetSummary() adds -quotes around its result, lines 6-11 strip surrounding quotes off the word. - -Line 12 checks to see if the word in the current node is the one we are -searching for. If so, we are done, and line 13 returns the current path. -Otherwise, line 14 checks to see if we should go left (search word comes before -the current word). If we decide to go left, line 15 checks to see if the left -pointer child is NULL ("None" is the Python equivalent of NULL). If the left -pointer is NULL, then the word is not in this tree and we return an empty path -(line 16). Otherwise, we add an "L" to the end of our current path string, to -indicate we are going left (line 18), and then recurse on the left child (line -19). Lines 20-25 are the same as lines 14-19, except for going right rather -than going left. - -One other note: Typing something as long as our DFS function directly into the -interpreter can be difficult, as making a single typing mistake means having to -start all over. Therefore we recommend doing as we have done: Writing your -longer, more complicated script functions in a separate file (in this case -tree_utils.py) and then importing it into your LLDB Python interpreter. - - -The DFS Script in Action ------------------------- - -At this point we are ready to use the DFS function to see if the word "Romeo" -is in our tree or not. To actually use it in LLDB on our dictionary program, -you would do something like this: - -:: - - $ lldb - (lldb) process attach -n "dictionary" - Architecture set to: x86_64. 
- Process 521 stopped - * thread #1: tid = 0x2c03, 0x00007fff86c8bea0 libSystem.B.dylib`read$NOCANCEL + 8, stop reason = signal SIGSTOP - frame #0: 0x00007fff86c8bea0 libSystem.B.dylib`read$NOCANCEL + 8 - (lldb) breakpoint set -n find_word - Breakpoint created: 1: name = 'find_word', locations = 1, resolved = 1 - (lldb) continue - Process 521 resuming - Process 521 stopped - * thread #1: tid = 0x2c03, 0x0000000100001830 dictionary`find_word + 16 - at dictionary.c:105, stop reason = breakpoint 1.1 - frame #0: 0x0000000100001830 dictionary`find_word + 16 at dictionary.c:105 - 102 int - 103 find_word (tree_node *dictionary, char *word) - 104 { - -> 105 if (!word || !dictionary) - 106 return 0; - 107 - 108 int compare_value = strcmp (word, dictionary->word); - (lldb) script - Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. - >>> import tree_utils - >>> root = lldb.frame.FindVariable ("dictionary") - >>> current_path = "" - >>> path = tree_utils.DFS (root, "Romeo", current_path) - >>> print path - LLRRL - >>> ^D - (lldb) - -The first bit of code above shows starting lldb, attaching to the dictionary -program, and getting to the find_word function in LLDB. The interesting part -(as far as this example is concerned) begins when we enter the script command -and drop into the embedded interactive Python interpreter. We will go over this -Python code line by line. The first line - -:: - - import tree_utils - - -imports the file where we wrote our DFS function, tree_utils.py, into Python. -Notice that to import the file we leave off the ".py" extension. We can now -call any function in that file, giving it the prefix "tree_utils.", so that -Python knows where to look for the function. The line - -:: - - root = lldb.frame.FindVariable ("dictionary") - - -gets our program variable "dictionary" (which contains the binary search tree) -and puts it into the Python variable "root". 
See Accessing & Manipulating -Program Variables in Python above for more details about how this works. The -next line is - -:: - - current_path = "" - -This line initializes the current_path from the root of the tree to our current -node. Since we are starting at the root of the tree, our current path starts as -an empty string. As we go right and left through the tree, the DFS function -will append an 'R' or an 'L' to the current path, as appropriate. The line - -:: - - path = tree_utils.DFS (root, "Romeo", current_path) - -calls our DFS function (prefixing it with the module name so that Python can -find it). We pass in our binary tree stored in the variable root, the word we -are searching for, and our current path. We assign whatever path the DFS -function returns to the Python variable path. - -Finally, we want to see if the word was found or not, and if so we want to see -the path through the tree to the word. So we do - -:: - - print path - -From this we can see that the word "Romeo" was indeed found in the tree, and -the path from the root of the tree to the node containing "Romeo" is -left-left-right-right-left. - -Using Breakpoint Command Scripts --------------------------------- - -We are halfway to figuring out what the problem is. We know the word we are -looking for is in the binary tree, and we know exactly where it is in the -binary tree. Now we need to figure out why our binary search algorithm is not -finding the word. We will do this using breakpoint command scripts. - -The idea is as follows. The binary search algorithm has two main decision -points: the decision to follow the right branch; and, the decision to follow -the left branch. We will set a breakpoint at each of these decision points, and -attach a Python breakpoint command script to each breakpoint. The breakpoint -commands will use the global path Python variable that we got from our DFS -function. 
Each time one of these decision breakpoints is hit, the script will -compare the actual decision with the decision the front of the path variable -says should be made (the first character of the path). If the actual decision -and the path agree, then the front character is stripped off the path, and -execution is resumed. In this case the user never even sees the breakpoint -being hit. But if the decision differs from what the path says it should be, -then the script prints out a message and does NOT resume execution, leaving the -user sitting at the first point where a wrong decision is being made. - -Python Breakpoint Command Scripts Are Not What They Seem --------------------------------------------------------- - -What do we mean by that? When you enter a Python breakpoint command in LLDB, it -appears that you are entering one or more plain lines of Python. BUT LLDB then -takes what you entered and wraps it into a Python FUNCTION (just like using the -"def" Python command). It automatically gives the function an obscure, unique, -hard-to-stumble-across function name, and gives it two parameters: frame and -bp_loc. When the breakpoint gets hit, LLDB wraps up the frame object where the -breakpoint was hit, and the breakpoint location object for the breakpoint that -was hit, and puts them into Python variables for you. It then calls the Python -function that was created for the breakpoint command, and passes in the frame -and breakpoint location objects. - -So, being practical, what does this mean for you when you write your Python -breakpoint commands? It means that there are two things you need to keep in -mind: 1. If you want to access any Python variables created outside your -script, you must declare such variables to be global. If you do not declare -them as global, then the Python function will treat them as local variables, -and you will get unexpected behavior. 2. All Python breakpoint command scripts -automatically have a frame and a bp_loc variable. 
The variables are pre-loaded -by LLDB with the correct context for the breakpoint. You do not have to use -these variables, but they are there if you want them. - -The Decision Point Breakpoint Commands --------------------------------------- - -This is what the Python breakpoint command script would look like for the -decision to go right: - -:: - - global path - if path[0] == 'R': - path = path[1:] - thread = frame.GetThread() - process = thread.GetProcess() - process.Continue() - else: - print "Here is the problem; going right, should go left!" - - -Just as a reminder, LLDB is going to take this script and wrap it up in a function, like this: - -:: - - def some_unique_and_obscure_function_name (frame, bp_loc): - global path - if path[0] == 'R': - path = path[1:] - thread = frame.GetThread() - process = thread.GetProcess() - process.Continue() - else: - print "Here is the problem; going right, should go left!" - -LLDB will call the function, passing in the correct frame and breakpoint -location whenever the breakpoint gets hit. There are several things to notice -about this function. The first one is that we are accessing and updating a -piece of state (the path variable), and actually conditioning our behavior -based upon this variable. Since the variable was defined outside of our script -(and therefore outside of the corresponding function) we need to tell Python -that we are accessing a global variable. That is what the first line of the -script does. Next we check where the path says we should go and compare it to -our decision (recall that we are at the breakpoint for the decision to go -right). If the path agrees with our decision, then we strip the first character -off of the path. - -Since the decision matched the path, we want to resume execution. To do this we -make use of the frame parameter that LLDB guarantees will be there for us. 
We -use LLDB API functions to get the current thread from the current frame, and -then to get the process from the thread. Once we have the process, we tell it -to resume execution (using the Continue() API function). - -If the decision to go right does not agree with the path, then we do not resume -execution. We allow the breakpoint to remain stopped (by doing nothing), and we -print an informational message telling the user we have found the problem, and -what the problem is. - -Actually Using The Breakpoint Commands --------------------------------------- - -Now we will look at what happens when we actually use these breakpoint commands -on our program. Doing a source list -n find_word shows us the function -containing our two decision points. Looking at the code below, we see that we -want to set our breakpoints on lines 113 and 115: - -:: - - (lldb) source list -n find_word - File: /Volumes/Data/HD2/carolinetice/Desktop/LLDB-Web-Examples/dictionary.c. - 101 - 102 int - 103 find_word (tree_node *dictionary, char *word) - 104 { - 105 if (!word || !dictionary) - 106 return 0; - 107 - 108 int compare_value = strcmp (word, dictionary->word); - 109 - 110 if (compare_value == 0) - 111 return 1; - 112 else if (compare_value < 0) - 113 return find_word (dictionary->left, word); - 114 else - 115 return find_word (dictionary->right, word); - 116 } - 117 - - -So, we set our breakpoints, enter our breakpoint command scripts, and see what happens: - -:: - - (lldb) breakpoint set -l 113 - Breakpoint created: 2: file ='dictionary.c', line = 113, locations = 1, resolved = 1 - (lldb) breakpoint set -l 115 - Breakpoint created: 3: file ='dictionary.c', line = 115, locations = 1, resolved = 1 - (lldb) breakpoint command add -s python 2 - Enter your Python command(s). Type 'DONE' to end. - > global path - > if (path[0] == 'L'): - > path = path[1:] - > thread = frame.GetThread() - > process = thread.GetProcess() - > process.Continue() - > else: - > print "Here is the problem. 
Going left, should go right!" - > DONE - (lldb) breakpoint command add -s python 3 - Enter your Python command(s). Type 'DONE' to end. - > global path - > if (path[0] == 'R'): - > path = path[1:] - > thread = frame.GetThread() - > process = thread.GetProcess() - > process.Continue() - > else: - > print "Here is the problem. Going right, should go left!" - > DONE - (lldb) continue - Process 696 resuming - Here is the problem. Going right, should go left! - Process 696 stopped - * thread #1: tid = 0x2d03, 0x000000010000189f dictionary`find_word + 127 at dictionary.c:115, stop reason = breakpoint 3.1 - frame #0: 0x000000010000189f dictionary`find_word + 127 at dictionary.c:115 - 112 else if (compare_value < 0) - 113 return find_word (dictionary->left, word); - 114 else - -> 115 return find_word (dictionary->right, word); - 116 } - 117 - 118 void - (lldb) - - -After setting our breakpoints, adding our breakpoint commands and continuing, -we run for a little bit and then hit one of our breakpoints, printing out the -error message from the breakpoint command. Apparently at this point in the -tree, our search algorithm decided to go right, but our path says the node we -want is to the left. Examining the word at the node where we stopped, and our -search word, we see: - -:: - - (lldb) expr dictionary->word - (const char *) $1 = 0x0000000100100080 "dramatis" - (lldb) expr word - (char *) $2 = 0x00007fff5fbff108 "romeo" - -So the word at our current node is "dramatis", and the word we are searching -for is "romeo". "romeo" comes after "dramatis" alphabetically, so it seems like -going right would be the correct decision. Let's ask Python what it thinks the -path from the current node to our word is: - -:: - - (lldb) script print path - LLRRL - -According to Python we need to go left-left-right-right-left from our current -node to find the word we are looking for. 
Let's double check our tree, and see -what word it has at that node: - -:: - - (lldb) expr dictionary->left->left->right->right->left->word - (const char *) $4 = 0x0000000100100880 "Romeo" - -So the word we are searching for is "romeo" and the word at our DFS location is -"Romeo". Aha! One is uppercase and the other is lowercase: We seem to have a -case conversion problem somewhere in our program (we do). - -This is the end of our example on how you might use Python scripting in LLDB to -help you find bugs in your program. - -Source Files for The Example ----------------------------- - -The complete code for the Dictionary program (with case-conversion bug), the -DFS function and other Python script examples (tree_utils.py) used for this -example are available below. - -tree_utils.py - Example Python functions using LLDB's API, including DFS - -:: - - """ - # ===-- tree_utils.py ---------------------------------------*- Python -*-===// - # - # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - # See https://llvm.org/LICENSE.txt for license information. - # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - # - # ===----------------------------------------------------------------------===// - - tree_utils.py - A set of functions for examining binary - search trees, based on the example search tree defined in - dictionary.c. These functions contain calls to LLDB API - functions, and assume that the LLDB Python module has been - imported. - - For a thorough explanation of how the DFS function works, and - for more information about dictionary.c go to - http://lldb.llvm.org/scripting.html - """ - - - def DFS(root, word, cur_path): - """ - Recursively traverse a binary search tree containing - words sorted alphabetically, searching for a particular - word in the tree. Also maintains a string representing - the path from the root of the tree to the current node. - If the word is found in the tree, return the path string. 
- Otherwise return an empty string. - - This function assumes the binary search tree is - the one defined in dictionary.c It uses LLDB API - functions to examine and traverse the tree nodes. - """ - - # Get pointer field values out of node 'root' - - root_word_ptr = root.GetChildMemberWithName("word") - left_child_ptr = root.GetChildMemberWithName("left") - right_child_ptr = root.GetChildMemberWithName("right") - - # Get the word out of the word pointer and strip off - # surrounding quotes (added by call to GetSummary). - - root_word = root_word_ptr.GetSummary() - end = len(root_word) - 1 - if root_word[0] == '"' and root_word[end] == '"': - root_word = root_word[1:end] - end = len(root_word) - 1 - if root_word[0] == '\'' and root_word[end] == '\'': - root_word = root_word[1:end] - - # Main depth first search - - if root_word == word: - return cur_path - elif word < root_word: - - # Check to see if left child is NULL - - if left_child_ptr.GetValue() is None: - return "" - else: - cur_path = cur_path + "L" - return DFS(left_child_ptr, word, cur_path) - else: - - # Check to see if right child is NULL - - if right_child_ptr.GetValue() is None: - return "" - else: - cur_path = cur_path + "R" - return DFS(right_child_ptr, word, cur_path) - - - def tree_size(root): - """ - Recursively traverse a binary search tree, counting - the nodes in the tree. Returns the final count. - - This function assumes the binary search tree is - the one defined in dictionary.c It uses LLDB API - functions to examine and traverse the tree nodes. - """ - if (root.GetValue is None): - return 0 - - if (int(root.GetValue(), 16) == 0): - return 0 - - left_size = tree_size(root.GetChildAtIndex(1)) - right_size = tree_size(root.GetChildAtIndex(2)) - - total_size = left_size + right_size + 1 - return total_size - - - def print_tree(root): - """ - Recursively traverse a binary search tree, printing out - the words at the nodes in alphabetical order (the - search order for the binary tree). 
- - This function assumes the binary search tree is - the one defined in dictionary.c It uses LLDB API - functions to examine and traverse the tree nodes. - """ - if (root.GetChildAtIndex(1).GetValue() is not None) and ( - int(root.GetChildAtIndex(1).GetValue(), 16) != 0): - print_tree(root.GetChildAtIndex(1)) - - print root.GetChildAtIndex(0).GetSummary() - - if (root.GetChildAtIndex(2).GetValue() is not None) and ( - int(root.GetChildAtIndex(2).GetValue(), 16) != 0): - print_tree(root.GetChildAtIndex(2)) - - -dictionary.c - Sample dictionary program, with bug - -:: - - //===-- dictionary.c ---------------------------------------------*- C -*-===// - // - // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - // See https://llvm.org/LICENSE.txt for license information. - // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - // - //===----------------------------------------------------------------------===// - #include - #include - #include - #include - - typedef struct tree_node { - const char *word; - struct tree_node *left; - struct tree_node *right; - } tree_node; - - /* Given a char*, returns a substring that starts at the first - alphabet character and ends at the last alphabet character, i.e. it - strips off beginning or ending quotes, punctuation, etc. */ - - char *strip(char **word) { - char *start = *word; - int len = strlen(start); - char *end = start + len - 1; - - while ((start < end) && (!isalpha(start[0]))) - start++; - - while ((end > start) && (!isalpha(end[0]))) - end--; - - if (start > end) - return NULL; - - end[1] = '\0'; - *word = start; - - return start; - } - - /* Given a binary search tree (sorted alphabetically by the word at - each node), and a new word, inserts the word at the appropriate - place in the tree. 
*/ - - void insert(tree_node *root, char *word) { - if (root == NULL) - return; - - int compare_value = strcmp(word, root->word); - - if (compare_value == 0) - return; - - if (compare_value < 0) { - if (root->left != NULL) - insert(root->left, word); - else { - tree_node *new_node = (tree_node *)malloc(sizeof(tree_node)); - new_node->word = strdup(word); - new_node->left = NULL; - new_node->right = NULL; - root->left = new_node; - } - } else { - if (root->right != NULL) - insert(root->right, word); - else { - tree_node *new_node = (tree_node *)malloc(sizeof(tree_node)); - new_node->word = strdup(word); - new_node->left = NULL; - new_node->right = NULL; - root->right = new_node; - } - } - } - - /* Read in a text file and storea all the words from the file in a - binary search tree. */ - - void populate_dictionary(tree_node **dictionary, char *filename) { - FILE *in_file; - char word[1024]; - - in_file = fopen(filename, "r"); - if (in_file) { - while (fscanf(in_file, "%s", word) == 1) { - char *new_word = (strdup(word)); - new_word = strip(&new_word); - if (*dictionary == NULL) { - tree_node *new_node = (tree_node *)malloc(sizeof(tree_node)); - new_node->word = new_word; - new_node->left = NULL; - new_node->right = NULL; - *dictionary = new_node; - } else - insert(*dictionary, new_word); - } - } - } - - /* Given a binary search tree and a word, search for the word - in the binary search tree. */ - - int find_word(tree_node *dictionary, char *word) { - if (!word || !dictionary) - return 0; - - int compare_value = strcmp(word, dictionary->word); - - if (compare_value == 0) - return 1; - else if (compare_value < 0) - return find_word(dictionary->left, word); - else - return find_word(dictionary->right, word); - } - - /* Print out the words in the binary search tree, in sorted order. 
*/ - - void print_tree(tree_node *dictionary) { - if (!dictionary) - return; - - if (dictionary->left) - print_tree(dictionary->left); - - printf("%s\n", dictionary->word); - - if (dictionary->right) - print_tree(dictionary->right); - } - - int main(int argc, char **argv) { - tree_node *dictionary = NULL; - char buffer[1024]; - char *filename; - int done = 0; - - if (argc == 2) - filename = argv[1]; - - if (!filename) - return -1; - - populate_dictionary(&dictionary, filename); - fprintf(stdout, "Dictionary loaded.\nEnter search word: "); - while (!done && fgets(buffer, sizeof(buffer), stdin)) { - char *word = buffer; - int len = strlen(word); - int i; - - for (i = 0; i < len; ++i) - word[i] = tolower(word[i]); - - if ((len > 0) && (word[len - 1] == '\n')) { - word[len - 1] = '\0'; - len = len - 1; - } - - if (find_word(dictionary, word)) - fprintf(stdout, "Yes!\n"); - else - fprintf(stdout, "No!\n"); - - fprintf(stdout, "Enter search word: "); - } - - fprintf(stdout, "\n"); - return 0; - } - - -The text for "Romeo and Juliet" can be obtained from the Gutenberg Project -(http://www.gutenberg.org). - diff --git a/lldb/docs/use/tutorials/accessing-documentation.md b/lldb/docs/use/tutorials/accessing-documentation.md new file mode 100644 index 0000000000000..d14efa5f3c428 --- /dev/null +++ b/lldb/docs/use/tutorials/accessing-documentation.md @@ -0,0 +1,62 @@ +# Accessing Script Documentation + +The LLDB API is contained in a python module named lldb. A useful resource when +writing Python extensions is the lldb Python classes reference guide. + +The documentation is also accessible in an interactive debugger session with +the following command: + +```python3 +(lldb) script help(lldb) + Help on package lldb: + + NAME + lldb - The lldb module contains the public APIs for Python binding. + + FILE + /System/Library/PrivateFrameworks/LLDB.framework/Versions/A/Resources/Python/lldb/__init__.py + + DESCRIPTION +... +``` + +You can also get help using a module class name. 
The full API that is exposed +for that class will be displayed in a man page style window. Below we want to +get help on the lldb.SBFrame class: + +```python3 +(lldb) script help(lldb.SBFrame) + Help on class SBFrame in module lldb: + + class SBFrame(builtins.object) + | SBFrame(*args) + | + | Represents one of the stack frames associated with a thread. + | + | SBThread contains SBFrame(s). For example (from test/lldbutil.py), :: + | + | def print_stacktrace(thread, string_buffer = False): + | '''Prints a simple stack trace of this thread.''' +... +``` + +Or you can get help using any python object, here we use the lldb.process +object which is a global variable in the lldb module which represents the +currently selected process: + +```python3 +(lldb) script help(lldb.process) + Help on SBProcess in module lldb object: + + class SBProcess(builtins.object) + | SBProcess(*args) + | + | Represents the process associated with the target program. + | + | SBProcess supports thread iteration. For example (from test/lldbutil.py), :: + | + | # ================================================== + | # Utility functions related to Threads and Processes + | # ================================================== +... +``` \ No newline at end of file diff --git a/lldb/docs/use/tutorials/automating-stepping-logic.md b/lldb/docs/use/tutorials/automating-stepping-logic.md new file mode 100644 index 0000000000000..564d3ec1f14d4 --- /dev/null +++ b/lldb/docs/use/tutorials/automating-stepping-logic.md @@ -0,0 +1,42 @@ +# Automating Stepping Logic + +A slightly esoteric use of the Python API's is to construct custom stepping +types. LLDB's stepping is driven by a stack of "thread plans" and a fairly +simple state machine that runs the plans. You can create a Python class that +works as a thread plan, and responds to the requests the state machine makes to +run its operations. 
+ +The base class for the [ScriptedThreadPlan](https://lldb.llvm.org/python_api/lldb.plugins.scripted_thread_plan.ScriptedThreadPlan.html) is provided as part of the lldb python module, making it easy to derive a new class from it. + +There is a longer discussion of scripted thread plans and the state machine, +and several interesting examples of their use in [scripted_step.py](https://github.com/llvm/llvm-project/blob/main/lldb/examples/python/scripted_step.py) +and for a **MUCH** fuller discussion of the whole state machine, see [ThreadPlan.h](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Target/ThreadPlan.h) + +If you are reading those comments it is useful to know that scripted thread +plans are set to be either ***"ControllingPlans"*** or ***"OkayToDiscard"***. + +To implement a scripted step, you define a python class that has the following +methods: + +| Name | Arguments | Description | +|------|-----------|-------------| +| `__init__` | `thread_plan`: `lldb.SBThreadPlan` | This is the underlying `SBThreadPlan` that is pushed onto the plan stack. You will want to store this away in an ivar. Also, if you are going to use one of the canned thread plans, you can queue it at this point. | +| `explains_stop` | `event`: `lldb.SBEvent` | Return True if this stop is part of your thread plans logic, false otherwise. | +| `is_stale` | `None` | If your plan is no longer relevant (for instance, you were stepping in a particular stack frame, but some other operation pushed that frame off the stack) return True and your plan will get popped. | +| `should_step` | `None` | Return `True` if you want lldb to instruction step one instruction, or False to continue till the next breakpoint is hit. | +| `should_stop` | `event`: `lldb.SBEvent` | If your plan wants to stop and return control to the user at this point, return True. If your plan is done at this point, call SetPlanComplete on your thread plan instance. 
Also, do any work you need here to set up the next stage of stepping. | + +To use this class to implement a step, use the command: + +```python3 +(lldb) thread step-scripted -C MyModule.MyStepPlanClass +``` + +Or use the `SBThread.StepUsingScriptedThreadPlan` API. The `SBThreadPlan` passed +into your `__init__` function can also push several common plans (step +in/out/over and run-to-address) in front of itself on the stack, which can be +used to compose more complex stepping operations. When you use subsidiary plans +your explains_stop and should_stop methods won't get called until the +subsidiary plan is done, or the process stops for an event the subsidiary plan +doesn't explain. For instance, step over plans don't explain a breakpoint hit +while performing the step-over. \ No newline at end of file diff --git a/lldb/docs/use/tutorials/breakpoint-triggered-scripts.md b/lldb/docs/use/tutorials/breakpoint-triggered-scripts.md new file mode 100644 index 0000000000000..0cd9f945f0d11 --- /dev/null +++ b/lldb/docs/use/tutorials/breakpoint-triggered-scripts.md @@ -0,0 +1,85 @@ +# Breakpoint-Triggered Scripts + +One very powerful use of the lldb Python API is to have a python script run +when a breakpoint gets hit. Adding python scripts to breakpoints provides a way +to create complex breakpoint conditions and also allows for smart logging and +data gathering. + +When your process hits a breakpoint to which you have attached some python +code, the code is executed as the body of a function which takes three +arguments: + +```python3 +def breakpoint_function_wrapper(frame, bp_loc, internal_dict): + # Your code goes here +``` + +or: + +```python3 +def breakpoint_function_wrapper(frame, bp_loc, extra_args, internal_dict): + # Your code goes here +``` + +| Argument | Type | Description | +|----------|------|-------------| +| `frame` | `lldb.SBFrame` | The current stack frame where the breakpoint got hit. The object will always be valid. 
This `frame` argument might *not* match the currently selected stack frame found in the `lldb` module global variable `lldb.frame`. | +| `bp_loc` | `lldb.SBBreakpointLocation` | The breakpoint location that just got hit. Breakpoints are represented by `lldb.SBBreakpoint` objects. These breakpoint objects can have one or more locations. These locations are represented by `lldb.SBBreakpointLocation` objects. | +| `extra_args` | `lldb.SBStructuredData` | **Optional** If your breakpoint callback function takes this extra parameter, then when the callback gets added to a breakpoint, its contents can parametrize this use of the callback. For instance, instead of writing a callback that stops when the caller is "Foo", you could take the function name from a field in the `extra_args`, making the callback more general. The `-k` and `-v` options to `breakpoint command add` will be passed as a Dictionary in the `extra_args` parameter, or you can provide it with the SB API's. | +| `internal_dict` | `dict` | The python session dictionary as a standard python dictionary object. | + +Optionally, a Python breakpoint command can return a value. Returning `False` +tells LLDB that you do not want to stop at the breakpoint. Any other return +value (including None or leaving out the return statement altogether) is akin +to telling LLDB to actually stop at the breakpoint. This can be useful in +situations where a breakpoint only needs to stop the process when certain +conditions are met, and you do not want to inspect the program state manually +at every stop and then continue. + +An example will show how simple it is to write some python code and attach it +to a breakpoint. The following example will allow you to track the order in +which the functions in a given shared library are first executed during one run +of your program. This is a simple method to gather an order file which can be +used to optimize function placement within a binary for execution locality. 
+ +We do this by setting a regular expression breakpoint that will match every +function in the shared library. The regular expression '.' will match any +string that has at least one character in it, so we will use that. This will +result in one lldb.SBBreakpoint object that contains an +lldb.SBBreakpointLocation object for each function. As the breakpoint gets hit, +we use a counter to track the order in which the function at this particular +breakpoint location got hit. Since our code is passed the location that was +hit, we can get the name of the function from the location, disable the +location so we won't count this function again; then log some info and continue +the process. + +Note we also have to initialize our counter, which we do with the simple +one-line version of the script command. + +Here is the code: + +```python3 +(lldb) breakpoint set --func-regex=. --shlib=libfoo.dylib +Breakpoint created: 1: regex = '.', module = libfoo.dylib, locations = 223 +(lldb) script counter = 0 +(lldb) breakpoint command add --script-type python 1 +Enter your Python command(s). Type 'DONE' to end. +> # Increment our counter. Since we are in a function, this must be a global python variable +> global counter +> counter += 1 +> # Get the name of the function +> name = frame.GetFunctionName() +> # Print the order and the function name +> print('[%i] %s' % (counter, name)) +> # Disable the current breakpoint location so it doesn't get hit again +> bp_loc.SetEnabled(False) +> # No need to stop here +> return False +> DONE +``` + +The breakpoint command add command above attaches a python script to breakpoint 1. 
To remove the breakpoint command: + +```python3 +(lldb) breakpoint command delete 1 +``` \ No newline at end of file diff --git a/lldb/docs/use/tutorials/creating-custom-breakpoints.md b/lldb/docs/use/tutorials/creating-custom-breakpoints.md new file mode 100644 index 0000000000000..e3081c44e3650 --- /dev/null +++ b/lldb/docs/use/tutorials/creating-custom-breakpoints.md @@ -0,0 +1,128 @@ +# Custom Breakpoint Resolvers + +Another use of the Python API's in lldb is to create a custom breakpoint +resolver. + +It allows you to provide the algorithm which will be used in the breakpoint's +search of the space of the code in a given Target to determine where to set the +breakpoint locations - the actual places where the breakpoint will trigger. To +understand how this works you need to know a little about how lldb handles +breakpoints. + +In lldb, a breakpoint is composed of three parts: +1. the Searcher +2. the Resolver, +3. the Stop Options. + +The Searcher and Resolver cooperate to determine how breakpoint locations are +set and differ between each breakpoint type. Stop options determine what +happens when a location triggers and includes the commands, conditions, ignore +counts, etc. Stop options are common between all breakpoint types, so for our +purposes only the Searcher and Resolver are relevant. + +### Breakpoint Searcher + +The Searcher's job is to traverse in a structured way the code in the current +target. It proceeds from the Target, to search all the Modules in the Target, +in each Module it can recurse into the Compile Units in that module, and within +each Compile Unit it can recurse over the Functions it contains. + +The Searcher can be provided with a SearchFilter that it will use to restrict +this search. For instance, if the SearchFilter specifies a list of Modules, the +Searcher will not recurse into Modules that aren't on the list. When you pass +the -s modulename flag to break set you are creating a Module-based search +filter. 
When you pass -f filename.c to break set -n you are creating a file +based search filter. If neither of these is specified, the breakpoint will have +a no-op search filter, so all parts of the program are searched and all +locations accepted. + +### Breakpoint Resolver + +The Resolver has two functions: + +The most important one is the callback it provides. This will get called at the +appropriate time in the course of the search. The callback is where the job of +adding locations to the breakpoint gets done. + +The other function is specifying to the Searcher at what depth in the above +described recursion it wants to be called. Setting a search depth also provides +a stop for the recursion. For instance, if you request a Module depth search, +then the callback will be called for each Module as it gets added to the +Target, but the searcher will not recurse into the Compile Units in the module. + +One other slight subtlety is that the depth at which you get called back is not +necessarily the depth at which the SearchFilter is specified. For instance, +if you are doing symbol searches, it is convenient to use the Module depth for +the search, since symbols are stored in the module. But the SearchFilter might +specify some subset of CompileUnits, so not all the symbols you might find in +each module will pass the search. You don't need to handle this situation +yourself, since SBBreakpoint::AddLocation will only add locations that pass the +Search Filter. This API returns an SBError to inform you whether your location +was added. + +When the breakpoint is originally created, its Searcher will process all the +currently loaded modules. The Searcher will also visit any new modules as they +are added to the target. This happens, for instance, when a new shared library +gets added to the target in the course of running, or on rerunning if any of +the currently loaded modules have been changed. 
Note, in the latter case, all +the locations set in the old module will get deleted and you will be asked to +recreate them in the new version of the module when your callback gets called +with that module. For this reason, you shouldn't try to manage the locations +you add to the breakpoint yourself. Note that the Breakpoint takes care of +deduplicating equal addresses in AddLocation, so you shouldn't need to worry +about that anyway. + +### Scripted Breakpoint Resolver + +At present, when adding a ScriptedBreakpoint type, you can only provide a +custom Resolver, not a custom SearchFilter. + +The custom Resolver is provided as a Python class with the following methods: + +| Name | Arguments | Description | +|------|-----------|-------------| +| `__init__` | `bkpt`: `lldb.SBBreakpoint` `extra_args`: `lldb.SBStructuredData` | This is the constructor for the new Resolver. `bkpt` is the breakpoint owning this Resolver. `extra_args` is an `SBStructuredData` object that the user can pass in when creating instances of this breakpoint. It is not required, but is quite handy. For instance if you were implementing a breakpoint on some symbol name, you could write a generic symbol name based Resolver, and then allow the user to pass in the particular symbol in the extra_args | +| `__callback__` | `sym_ctx`: `lldb.SBSymbolContext` | This is the Resolver callback. The `sym_ctx` argument will be filled with the current stage of the search. For instance, if you asked for a search depth of lldb.eSearchDepthCompUnit, then the target, module and compile_unit fields of the sym_ctx will be filled. The callback should look just in the context passed in `sym_ctx` for new locations. If the callback finds an address of interest, it can add it to the breakpoint with the `SBBreakpoint.AddLocation` method, using the breakpoint passed in to the `__init__` method. | +| `__get_depth__` | `None` | Specify the depth at which you wish your callback to get called. 
The currently supported options are: `lldb.eSearchDepthModule` `lldb.eSearchDepthCompUnit` `lldb.eSearchDepthFunction` For instance, if you are looking up symbols, which are stored at the Module level, you will want to get called back module by module. So you would want to return `lldb.eSearchDepthModule`. This method is optional. If not provided the search will be done at Module depth. | +| `get_short_help` | `None` | This is an optional method. If provided, the returned string will be printed at the beginning of the description for this breakpoint. | + +To define a new breakpoint command defined by this class from the lldb command +line, use the command: + +``` +(lldb) breakpoint set -P MyModule.MyResolverClass +``` + +You can also populate the extra_args SBStructuredData with a dictionary of +key/value pairs with: + +``` +(lldb) breakpoint set -P MyModule.MyResolverClass -k key_1 -v value_1 -k key_2 -v value_2 +``` + +Although you can't write a scripted SearchFilter, both the command line and the +SB API's for adding a scripted resolver allow you to specify a SearchFilter +restricted to certain modules or certain compile units. When using the command +line to create the resolver, you can specify a Module specific SearchFilter by +passing the -s ModuleName option - which can be specified multiple times. You +can also specify a SearchFilter restricted to certain compile units by passing +in the -f CompUnitName option. This can also be specified more than once. And +you can mix the two to specify "this comp unit in this module". So, for +instance, + +``` +(lldb) breakpoint set -P MyModule.MyResolverClass -s a.out +``` + +will use your resolver, but will only recurse into or accept new locations in +the module a.out. + +Another option for creating scripted breakpoints is to use the +SBTarget.BreakpointCreateFromScript API. This one has the advantage that you +can pass in an arbitrary SBStructuredData object, so you can create more +complex parametrizations. 
SBStructuredData has a handy SetFromJSON method which +you can use for this purpose. Your __init__ function gets passed this +SBStructuredData object. This API also allows you to directly provide the list +of Modules and the list of CompileUnits that will make up the SearchFilter. If +you pass in empty lists, the breakpoint will use the default "search +everywhere,accept everything" filter. \ No newline at end of file diff --git a/lldb/docs/use/tutorials/custom-frame-recognizers.md b/lldb/docs/use/tutorials/custom-frame-recognizers.md new file mode 100644 index 0000000000000..17bf9637d9a85 --- /dev/null +++ b/lldb/docs/use/tutorials/custom-frame-recognizers.md @@ -0,0 +1,51 @@ +# Detecting Patterns With Recognizers + +Frame recognizers allow for retrieving information about special frames based +on ABI, arguments or other special properties of that frame, even without +source code or debug info. Currently, one use case is to extract function +arguments that would otherwise be inaccessible, or augment existing arguments. + +Adding a custom frame recognizer is done by implementing a Python class and +using the `frame recognizer add` command. The Python class should implement the +`get_recognized_arguments` method and it will receive an argument of type +`lldb.SBFrame` representing the current frame that we are trying to recognize. +The method should return a (possibly empty) list of `lldb.SBValue` objects that +represent the recognized arguments. 
+ +An example of a recognizer that retrieves the file descriptor values from libc +functions 'read', 'write' and 'close' follows: + +```python3 +class LibcFdRecognizer: + def get_recognized_arguments(self, frame: lldb.SBFrame): + if frame.name in ["read", "write", "close"]: + fd = frame.EvaluateExpression("$arg1").unsigned + target = frame.thread.process.target + value = target.CreateValueFromExpression("fd", "(int)%d" % fd) + return [value] + return [] +``` + +The file containing this implementation can be imported via `command script import` +and then we can register this recognizer with `frame recognizer add`. + +It's important to restrict the recognizer to the libc library (which is +`libsystem_kernel.dylib` on macOS) to avoid matching functions with the same name +in other modules: + +```c++ +(lldb) command script import .../fd_recognizer.py +(lldb) frame recognizer add -l fd_recognizer.LibcFdRecognizer -n read -s libsystem_kernel.dylib +``` + +When the program is stopped at the beginning of the 'read' function in libc, we can view the recognizer arguments in 'frame variable': + +```c++ +(lldb) b read +(lldb) r +Process 1234 stopped +* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.3 + frame #0: 0x00007fff06013ca0 libsystem_kernel.dylib`read +(lldb) frame variable +(int) fd = 3 +``` \ No newline at end of file diff --git a/lldb/docs/use/tutorials/extending-target-stop-hooks.md b/lldb/docs/use/tutorials/extending-target-stop-hooks.md new file mode 100644 index 0000000000000..232187d0dcf11 --- /dev/null +++ b/lldb/docs/use/tutorials/extending-target-stop-hooks.md @@ -0,0 +1,25 @@ +# Extending Target Stop-Hooks + +Stop hooks fire whenever the process stops just before control is returned to the +user. Stop hooks can either be a set of lldb command-line commands, or can +be implemented by a suitably defined Python class. 
The Python-based stop-hooks +can also be passed as a set of -key -value pairs when they are added, and those +will get packaged up into a `SBStructuredData` Dictionary and passed to the +constructor of the Python object managing the stop hook. This allows for +parameterization of the stop hooks. + +To add a Python-based stop hook, first define a class with the following methods: + +| Name | Arguments | Description | +|------|-----------|-------------| +| `__init__` | `target: lldb.SBTarget` `extra_args: lldb.SBStructuredData` | This is the constructor for the new stop-hook. `target` is the SBTarget to which the stop hook is added. `extra_args` is an SBStructuredData object that the user can pass in when creating instances of this breakpoint. It is not required, but allows for reuse of stop-hook classes. | +| `handle_stop` | `exe_ctx: lldb.SBExecutionContext` `stream: lldb.SBStream` | This is the called when the target stops. `exe_ctx` argument will be filled with the current stop point for which the stop hook is being evaluated. `stream` an lldb.SBStream, anything written to this stream will be written to the debugger console. The return value is a "Should Stop" vote from this thread. If the method returns either True or no return this thread votes to stop. If it returns False, then the thread votes to continue after all the stop-hooks are evaluated. Note, the --auto-continue flag to 'target stop-hook add' overrides a True return value from the method. | + +To use this class in lldb, run the command: + +``` +(lldb) command script import MyModule.py +(lldb) target stop-hook add -P MyModule.MyStopHook -k first -v 1 -k second -v 2 +``` + +where `MyModule.py` is the file containing the class definition `MyStopHook`. 
\ No newline at end of file diff --git a/lldb/docs/use/tutorials/implementing-standalone-scripts.md b/lldb/docs/use/tutorials/implementing-standalone-scripts.md new file mode 100644 index 0000000000000..b8aaacf22fc2e --- /dev/null +++ b/lldb/docs/use/tutorials/implementing-standalone-scripts.md @@ -0,0 +1,134 @@ +# Implementing Standalone Scripts + +### Configuring `PYTHONPATH` + +LLDB has all of its core code built into a shared library which gets used by +the `lldb` command line application. +- On macOS this shared library is a framework: `LLDB.framework`. +- On other unix variants the program is a shared library: lldb.so. + +LLDB also provides an `lldb.py` module that contains the bindings from LLDB +into Python. To use the `LLDB.framework` to create your own stand-alone python +programs, you will need to tell python where to look in order to find this +module. This is done by setting the `PYTHONPATH` environment variable, +adding a path to the directory that contains the `lldb.py` python +module. The lldb driver program has an option to report the path to the lldb +module. You can use that to point to correct lldb.py: + +For csh and tcsh: + +```csh +% setenv PYTHONPATH `lldb -P` +``` + +For sh and bash: + +```bash +$ export PYTHONPATH=`lldb -P` +``` + +Alternatively, you can append the LLDB Python directory to the sys.path list +directly in your Python code before importing the lldb module. + +### Initialization + +The standard test for `__main__`, like many python modules do, is useful for +creating scripts that can be run from the command line. However, for command +line scripts, the debugger instance must be created manually. Sample code would +look like: + +```python3 +if __name__ == '__main__': + # Initialize the debugger before making any API calls. + lldb.SBDebugger.Initialize() + # Create a new debugger instance in your module if your module + # can be run from the command line. 
When we run a script from + # the command line, we won't have any debugger object in + # lldb.debugger, so we can just create it if it will be needed + debugger = lldb.SBDebugger.Create() + + # Next, do whatever work this module should do when run as a command. + # ... + + # Finally, dispose of the debugger you just made. + lldb.SBDebugger.Destroy(debugger) + # Terminate the debug session + lldb.SBDebugger.Terminate() +``` + +### Example + +Now your python scripts are ready to import the lldb module. Below is a python +script that will launch a program from the current working directory called +`a.out`, set a breakpoint at `main`, and then run and hit the breakpoint, and +print the process, thread and frame objects if the process stopped: + +```python3 +#!/usr/bin/env python3 + +import lldb +import os + +def disassemble_instructions(insts): + for i in insts: + print(i) + +# Set the path to the executable to debug +exe = "./a.out" + +# Create a new debugger instance +debugger = lldb.SBDebugger.Create() + +# When we step or continue, don't return from the function until the process +# stops. Otherwise we would have to handle the process events ourselves which, while doable is +# a little tricky. We do this by setting the async mode to false. +debugger.SetAsync(False) + +# Create a target from a file and arch +print("Creating a target for '%s'" % exe) + +target = debugger.CreateTargetWithFileAndArch(exe, lldb.LLDB_ARCH_DEFAULT) + +if target: + # If the target is valid set a breakpoint at main + main_bp = target.BreakpointCreateByName( + "main", target.GetExecutable().GetFilename() + ) + + print(main_bp) + + # Launch the process. 
Since we specified synchronous mode, we won't return + # from this function until we hit the breakpoint at main + process = target.LaunchSimple(None, None, os.getcwd()) + + # Make sure the launch went ok + if process: + # Print some simple process info + state = process.GetState() + print(process) + if state == lldb.eStateStopped: + # Get the first thread + thread = process.GetThreadAtIndex(0) + if thread: + # Print some simple thread info + print(thread) + # Get the first frame + frame = thread.GetFrameAtIndex(0) + if frame: + # Print some simple frame info + print(frame) + function = frame.GetFunction() + # See if we have debug info (a function) + if function: + # We do have a function, print some info for the function + print(function) + # Now get all instructions for this function and print them + insts = function.GetInstructions(target) + disassemble_instructions(insts) + else: + # See if we have a symbol in the symbol table for where we stopped + symbol = frame.GetSymbol() + if symbol: + # We do have a symbol, print some info for the symbol + print(symbol) +``` \ No newline at end of file diff --git a/lldb/docs/use/tutorials/python-embedded-interpreter.md b/lldb/docs/use/tutorials/python-embedded-interpreter.md new file mode 100644 index 0000000000000..719d746b35d43 --- /dev/null +++ b/lldb/docs/use/tutorials/python-embedded-interpreter.md @@ -0,0 +1,66 @@ +# Embedded Python Interpreter + +The embedded python interpreter can be accessed in a variety of ways from +within LLDB. The easiest way is to use the lldb command script with no +arguments at the lldb command prompt: + +```python3 +(lldb) script +Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. +>>> 2+3 +5 +>>> hex(12345) +'0x3039' +>>> +``` + +This drops you into the embedded python interpreter. 
When running under the +script command, lldb sets some convenience variables that give you quick access +to the currently selected entities that characterize the program and debugger +state. In each case, if there is no currently selected entity of the +appropriate type, the variable's IsValid method will return false. These +variables are: + +| Variable | Type | Equivalent | Description | +|----------|------|------------|-------------| +| `lldb.debugger` | `lldb.SBDebugger` | `SBTarget.GetDebugger` | Contains the debugger object whose `script` command was invoked. The `lldb.SBDebugger` object owns the command interpreter and all the targets in your debug session. There will always be a Debugger in the embedded interpreter. | +| `lldb.target` | `lldb.SBTarget` | `SBDebugger.GetSelectedTarget` `SBProcess.GetTarget` | Contains the currently selected target - for instance the one made with the `file` or selected by the `target select ` command. The `lldb.SBTarget` manages one running process, and all the executable and debug files for the process. | +| `lldb.process` | `lldb.SBProcess` | `SBTarget.GetProcess` `SBThread.GetProcess` | Contains the process of the currently selected target. The `lldb.SBProcess` object manages the threads and allows access to memory for the process. | +| `lldb.thread` | `lldb.SBThread` | `SBProcess.GetSelectedThread` `SBFrame.GetThread` | Contains the currently selected thread. The `lldb.SBThread` object manages the stack frames in that thread. A thread is always selected in the command interpreter when a target stops. The `thread select ` command can be used to change the currently selected thread. So as long as you have a stopped process, there will be some selected thread. | +| `lldb.frame` | `lldb.SBFrame` | `SBThread.GetSelectedFrame` | Contains the currently selected stack frame. The `lldb.SBFrame` object manage the stack locals and the register set for that stack. 
A stack frame is always selected in the command interpreter when a target stops. The `frame select ` command can be used to change the currently selected frame. So as long as you have a stopped process, there will be some selected frame. | + +While extremely convenient, these variables have a couple caveats that you +should be aware of. First of all, they hold the values of the selected objects +on entry to the embedded interpreter. They do not update as you use the LLDB +API's to change, for example, the currently selected stack frame or thread. + +Moreover, they are only defined and meaningful while in the interactive Python +interpreter. There is no guarantee on their value in any other situation, hence +you should not use them when defining Python formatters, breakpoint scripts and +commands (or any other Python extension point that LLDB provides). For the +latter you'll be passed an `SBDebugger`, `SBTarget`, `SBProcess`, `SBThread` or +`SBFrame` instance and you can use the functions from the "Equivalent" column +to navigate between them. + +As a rationale for such behavior, consider that lldb can run in a multithreaded +environment, and another thread might call the "script" command, changing the +value out from under you. + +To get started with these objects and LLDB scripting, please note that almost +all of the lldb Python objects are able to briefly describe themselves when you +pass them to the Python print function: + +```python3 +(lldb) script +Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. 
+>>> print(lldb.debugger) +Debugger (instance: "debugger_1", id: 1) +>>> print(lldb.target) +a.out +>>> print(lldb.process) +SBProcess: pid = 58842, state = stopped, threads = 1, executable = a.out +>>> print(lldb.thread) +thread #1: tid = 0x2265ce3, 0x0000000100000334 a.out`main at t.c:2:3, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1 +>>> print(lldb.frame) +frame #0: 0x0000000100000334 a.out`main at t.c:2:3 +``` \ No newline at end of file diff --git a/lldb/docs/use/tutorials/script-driven-debugging.md b/lldb/docs/use/tutorials/script-driven-debugging.md new file mode 100644 index 0000000000000..55b90b1e25bf5 --- /dev/null +++ b/lldb/docs/use/tutorials/script-driven-debugging.md @@ -0,0 +1,492 @@ +# Script-Driven Debugging + +LLDB has been structured from the beginning to be scriptable in two +ways: +- a Unix Python session can initiate/run a debug session non-interactively +using LLDB; +- and within the LLDB debugger tool, Python scripts can be used to help with +many tasks, including inspecting program data, iterating over containers and +determining if a breakpoint should stop execution or continue. + +This document will show how to do some of these things by going through an +example, explaining how to use Python scripting to find a bug in a program +that searches for text in a large binary tree. + +### The Test Program and Input + +We have a simple C program ([dictionary.c](https://github.com/llvm/llvm-project/blob/main/lldb/examples/scripting/dictionary.c)) +that reads in a text file, and stores all the words from the file in a +Binary Search Tree, sorted alphabetically. It then enters a loop +prompting the user for a word, searching for the word in the tree +(using Binary Search), and reporting to the user whether or not it found +the word in the tree. + +The input text file we are using to test our program contains the text +for William Shakespeare's famous tragedy "Romeo and Juliet". 
+ +### The Bug + +When we try running our program, we find there is a problem. While it +successfully finds some of the words we would expect to find, such as +"love" or "sun", it fails to find the word "Romeo", which **MUST** be in +the input text file: + +```shell +$ ./dictionary Romeo-and-Juliet.txt +Dictionary loaded. +Enter search word: love +Yes! +Enter search word: sun +Yes! +Enter search word: Romeo +No! +Enter search word: ^D +$ +``` + +### Using Depth First Search + +Our first job is to determine if the word "Romeo" actually got inserted +into the tree or not. Since "Romeo and Juliet" has thousands of words, +trying to examine our binary search tree by hand is completely +impractical. Therefore we will write a Python script to search the tree +for us. We will write a recursive Depth First Search function that +traverses the entire tree searching for a word, and maintaining +information about the path from the root of the tree to the current +node. If it finds the word in the tree, it returns the path from the +root to the node containing the word. 
This is what our DFS function in +Python would look like, with line numbers added for easy reference in +later explanations: + +```python3 +1: def DFS (root, word, cur_path): +2: root_word_ptr = root.GetChildMemberWithName ("word") +3: left_child_ptr = root.GetChildMemberWithName ("left") +4: right_child_ptr = root.GetChildMemberWithName ("right") +5: root_word = root_word_ptr.GetSummary() +6: end = len (root_word) - 1 +7: if root_word[0] == '"' and root_word[end] == '"': +8: root_word = root_word[1:end] +9: end = len (root_word) - 1 +10: if root_word[0] == '\'' and root_word[end] == '\'': +11: root_word = root_word[1:end] +12: if root_word == word: +13: return cur_path +14: elif word < root_word: +15: if left_child_ptr.GetValue() is None: +16: return "" +17: else: +18: cur_path = cur_path + "L" +19: return DFS (left_child_ptr, word, cur_path) +20: else: +21: if right_child_ptr.GetValue() is None: +22: return "" +23: else: +24: cur_path = cur_path + "R" +25: return DFS (right_child_ptr, word, cur_path) +``` + +### Accessing & Manipulating Program Variables + +Before we can call any Python function on any of our program's +variables, we need to get the variable into a form that Python can +access. To show you how to do this we will look at the parameters for +the DFS function. The first parameter is going to be a node in our +binary search tree, put into a Python variable. The second parameter is +the word we are searching for (a string), and the third parameter is a +string representing the path from the root of the tree to our current +node. + +The most interesting parameter is the first one, the Python variable +that needs to contain a node in our search tree. How can we take a +variable out of our program and put it into a Python variable? What +kind of Python variable will it be? The answers are to use the LLDB API +functions, provided as part of the LLDB Python module. 
Running Python +from inside LLDB, LLDB will automatically give us our current frame +object as a Python variable, "lldb.frame". This variable has the type +`SBFrame` (see the LLDB API for more information about `SBFrame` +objects). One of the things we can do with a frame object, is to ask it +to find and return its local variable. We will call the API function +`SBFrame.FindVariable` on the `lldb.frame` object to give us our +dictionary variable as a Python variable: + +```python3 +root = lldb.frame.FindVariable ("dictionary") +``` + +The line above, executed in the Python script interpreter in LLDB, asks the +current frame to find the variable named "dictionary" and return it. We then +store the returned value in the Python variable named "root". This answers the +question of HOW to get the variable, but it still doesn't explain WHAT actually +gets put into "root". If you examine the LLDB API, you will find that the +`SBFrame` method "FindVariable" returns an object of type `SBValue`. `SBValue` +objects are used, among other things, to wrap up program variables and values. +There are many useful methods defined in the `SBValue` class to allow you to get +information or children values out of SBValues. For complete information, see +the header file SBValue.h. The `SBValue` methods that we use in our DFS function +are `GetChildMemberWithName()`, `GetSummary()`, and `GetValue()`. + +### Explaining DFS Script in Detail + +Before diving into the details of this code, it would be best to give a +high-level overview of what it does. The nodes in our binary search tree were +defined to have type `tree_node *`, which is defined as: + +```c++ +typedef struct tree_node +{ + const char *word; + struct tree_node *left; + struct tree_node *right; +} tree_node; +``` + +Lines 2-11 of DFS are getting data out of the current tree node and getting +ready to do the actual search; lines 12-25 are the actual depth-first search. 
+Lines 2-4 of our DFS function get the word, left and right fields out of the +current node and store them in Python variables. Since root_word_ptr is a +pointer to our word, and we want the actual word, line 5 calls GetSummary() to +get a string containing the value out of the pointer. Since GetSummary() adds +quotes around its result, lines 6-11 strip surrounding quotes off the word. + +Line 12 checks to see if the word in the current node is the one we are +searching for. If so, we are done, and line 13 returns the current path. +Otherwise, line 14 checks to see if we should go left (search word comes before +the current word). If we decide to go left, line 15 checks to see if the left +pointer child is NULL ("None" is the Python equivalent of NULL). If the left +pointer is NULL, then the word is not in this tree and we return an empty path +(line 16). Otherwise, we add an "L" to the end of our current path string, to +indicate we are going left (line 18), and then recurse on the left child (line +19). Lines 20-25 are the same as lines 14-19, except for going right rather +than going left. + +One other note: Typing something as long as our DFS function directly into the +interpreter can be difficult, as making a single typing mistake means having to +start all over. Therefore we recommend doing as we have done: Writing your +longer, more complicated script functions in a separate file (in this case +tree_utils.py) and then importing it into your LLDB Python interpreter. + +### The DFS Script in Action + +At this point we are ready to use the DFS function to see if the word "Romeo" +is in our tree or not. To actually use it in LLDB on our dictionary program, +you would do something like this: + +```c++ +$ lldb +(lldb) process attach -n "dictionary" +Architecture set to: x86_64. 
+Process 521 stopped +* thread #1: tid = 0x2c03, 0x00007fff86c8bea0 libSystem.B.dylib`read$NOCANCEL + 8, stop reason = signal SIGSTOP +frame #0: 0x00007fff86c8bea0 libSystem.B.dylib`read$NOCANCEL + 8 +(lldb) breakpoint set -n find_word +Breakpoint created: 1: name = 'find_word', locations = 1, resolved = 1 +(lldb) continue +Process 521 resuming +Process 521 stopped +* thread #1: tid = 0x2c03, 0x0000000100001830 dictionary`find_word + 16 +at dictionary.c:105, stop reason = breakpoint 1.1 +frame #0: 0x0000000100001830 dictionary`find_word + 16 at dictionary.c:105 +102 int +103 find_word (tree_node *dictionary, char *word) +104 { +-> 105 if (!word || !dictionary) +106 return 0; +107 +108 int compare_value = strcmp (word, dictionary->word); +(lldb) script +``` +```python3 +Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. +>>> import tree_utils +>>> root = lldb.frame.FindVariable ("dictionary") +>>> current_path = "" +>>> path = tree_utils.DFS (root, "Romeo", current_path) +>>> print path +LLRRL +>>> ^D +(lldb) +``` + +The first bit of code above shows starting lldb, attaching to the dictionary +program, and getting to the find_word function in LLDB. The interesting part +(as far as this example is concerned) begins when we enter the script command +and drop into the embedded interactive Python interpreter. We will go over this +Python code line by line. The first line + +```python3 +import tree_utils +``` + +imports the file where we wrote our DFS function, tree_utils.py, into Python. +Notice that to import the file we leave off the ".py" extension. We can now +call any function in that file, giving it the prefix "tree_utils.", so that +Python knows where to look for the function. The line + +```python3 +root = lldb.frame.FindVariable ("dictionary") +``` + +gets our program variable "dictionary" (which contains the binary search tree) +and puts it into the Python variable "root". 
See Accessing & Manipulating +Program Variables in Python above for more details about how this works. The +next line is + +```python3 +current_path = "" +``` + +This line initializes the current_path from the root of the tree to our current +node. Since we are starting at the root of the tree, our current path starts as +an empty string. As we go right and left through the tree, the DFS function +will append an 'R' or an 'L' to the current path, as appropriate. The line + +```python3 +path = tree_utils.DFS (root, "Romeo", current_path) +``` + +calls our DFS function (prefixing it with the module name so that Python can +find it). We pass in our binary tree stored in the variable root, the word we +are searching for, and our current path. We assign whatever path the DFS +function returns to the Python variable path. + +Finally, we want to see if the word was found or not, and if so we want to see +the path through the tree to the word. So we do + +```python3 +print path +``` + +From this we can see that the word "Romeo" was indeed found in the tree, and +the path from the root of the tree to the node containing "Romeo" is +left-left-right-right-left. + +### Using Breakpoint Command Scripts + +We are halfway to figuring out what the problem is. We know the word we are +looking for is in the binary tree, and we know exactly where it is in the +binary tree. Now we need to figure out why our binary search algorithm is not +finding the word. We will do this using breakpoint command scripts. + +The idea is as follows. The binary search algorithm has two main decision +points: the decision to follow the right branch; and, the decision to follow +the left branch. We will set a breakpoint at each of these decision points, and +attach a Python breakpoint command script to each breakpoint. The breakpoint +commands will use the global path Python variable that we got from our DFS +function. 
Each time one of these decision breakpoints is hit, the script will +compare the actual decision with the decision the front of the path variable +says should be made (the first character of the path). If the actual decision +and the path agree, then the front character is stripped off the path, and +execution is resumed. In this case the user never even sees the breakpoint +being hit. But if the decision differs from what the path says it should be, +then the script prints out a message and does NOT resume execution, leaving the +user sitting at the first point where a wrong decision is being made. + +### Python Breakpoint Command Scripts Are Not What They Seem + +What do we mean by that? When you enter a Python breakpoint command in LLDB, it +appears that you are entering one or more plain lines of Python. BUT LLDB then +takes what you entered and wraps it into a Python FUNCTION (just like using the +"def" Python command). It automatically gives the function an obscure, unique, +hard-to-stumble-across function name, and gives it two parameters: frame and +bp_loc. When the breakpoint gets hit, LLDB wraps up the frame object where the +breakpoint was hit, and the breakpoint location object for the breakpoint that +was hit, and puts them into Python variables for you. It then calls the Python +function that was created for the breakpoint command, and passes in the frame +and breakpoint location objects. + +So, being practical, what does this mean for you when you write your Python +breakpoint commands? It means that there are two things you need to keep in +mind: 1. If you want to access any Python variables created outside your +script, you must declare such variables to be global. If you do not declare +them as global, then the Python function will treat them as local variables, +and you will get unexpected behavior. 2. All Python breakpoint command scripts +automatically have a frame and a bp_loc variable. 
The variables are pre-loaded +by LLDB with the correct context for the breakpoint. You do not have to use +these variables, but they are there if you want them. + +### The Decision Point Breakpoint Commands + +This is what the Python breakpoint command script would look like for the +decision to go right: + +```python3 +global path +if path[0] == 'R': + path = path[1:] + thread = frame.GetThread() + process = thread.GetProcess() + process.Continue() +else: + print "Here is the problem; going right, should go left!" +``` + +Just as a reminder, LLDB is going to take this script and wrap it up in a function, like this: + +```python3 +def some_unique_and_obscure_function_name (frame, bp_loc): + global path + if path[0] == 'R': + path = path[1:] + thread = frame.GetThread() + process = thread.GetProcess() + process.Continue() + else: + print "Here is the problem; going right, should go left!" +``` + +LLDB will call the function, passing in the correct frame and breakpoint +location whenever the breakpoint gets hit. There are several things to notice +about this function. The first one is that we are accessing and updating a +piece of state (the path variable), and actually conditioning our behavior +based upon this variable. Since the variable was defined outside of our script +(and therefore outside of the corresponding function) we need to tell Python +that we are accessing a global variable. That is what the first line of the +script does. Next we check where the path says we should go and compare it to +our decision (recall that we are at the breakpoint for the decision to go +right). If the path agrees with our decision, then we strip the first character +off of the path. + +Since the decision matched the path, we want to resume execution. To do this we +make use of the frame parameter that LLDB guarantees will be there for us. We +use LLDB API functions to get the current thread from the current frame, and +then to get the process from the thread. 
Once we have the process, we tell it +to resume execution (using the Continue() API function). + +If the decision to go right does not agree with the path, then we do not resume +execution. We allow the breakpoint to remain stopped (by doing nothing), and we +print an informational message telling the user we have found the problem, and +what the problem is. + +### Actually Using The Breakpoint Commands + +Now we will look at what happens when we actually use these breakpoint commands +on our program. Doing a source list -n find_word shows us the function +containing our two decision points. Looking at the code below, we see that we +want to set our breakpoints on lines 113 and 115: + +```c++ +(lldb) source list -n find_word +File: /Volumes/Data/HD2/carolinetice/Desktop/LLDB-Web-Examples/dictionary.c. +101 +102 int +103 find_word (tree_node *dictionary, char *word) +104 { +105 if (!word || !dictionary) +106 return 0; +107 +108 int compare_value = strcmp (word, dictionary->word); +109 +110 if (compare_value == 0) +111 return 1; +112 else if (compare_value < 0) +113 return find_word (dictionary->left, word); +114 else +115 return find_word (dictionary->right, word); +116 } +117 +``` + +So, we set our breakpoints, enter our breakpoint command scripts, and see what happens: + +```c++ +(lldb) breakpoint set -l 113 +Breakpoint created: 2: file ="dictionary.c", line = 113, locations = 1, resolved = 1 +(lldb) breakpoint set -l 115 +Breakpoint created: 3: file ="dictionary.c", line = 115, locations = 1, resolved = 1 +(lldb) breakpoint command add -s python 2 +``` +```python3 +Enter your Python command(s). Type 'DONE' to end. +> global path +> if (path[0] == 'L'): +> path = path[1:] +> thread = frame.GetThread() +> process = thread.GetProcess() +> process.Continue() +> else: +> print "Here is the problem. Going left, should go right!" +> DONE +``` +```c++ +(lldb) breakpoint command add -s python 3 +``` +```python3 +Enter your Python command(s). Type 'DONE' to end. 
+> global path +> if (path[0] == 'R'): +> path = path[1:] +> thread = frame.GetThread() +> process = thread.GetProcess() +> process.Continue() +> else: +> print "Here is the problem. Going right, should go left!" +> DONE +``` +```c++ +(lldb) continue +Process 696 resuming +Here is the problem. Going right, should go left! +Process 696 stopped +* thread #1: tid = 0x2d03, 0x000000010000189f dictionary`find_word + 127 at dictionary.c:115, stop reason = breakpoint 3.1 +frame #0: 0x000000010000189f dictionary`find_word + 127 at dictionary.c:115 + 112 else if (compare_value < 0) + 113 return find_word (dictionary->left, word); + 114 else +-> 115 return find_word (dictionary->right, word); + 116 } + 117 + 118 void +(lldb) +``` + +After setting our breakpoints, adding our breakpoint commands and continuing, +we run for a little bit and then hit one of our breakpoints, printing out the +error message from the breakpoint command. Apparently at this point in the +tree, our search algorithm decided to go right, but our path says the node we +want is to the left. Examining the word at the node where we stopped, and our +search word, we see: + +```c++ +(lldb) expr dictionary->word +(const char *) $1 = 0x0000000100100080 "dramatis" +(lldb) expr word +(char *) $2 = 0x00007fff5fbff108 "romeo" +``` + +So the word at our current node is "dramatis", and the word we are searching +for is "romeo". "romeo" comes after "dramatis" alphabetically, so it seems like +going right would be the correct decision. Let's ask Python what it thinks the +path from the current node to our word is: + +```c++ +(lldb) script print path +LLRRL +``` + +According to Python we need to go left-left-right-right-left from our current +node to find the word we are looking for. 
Let's double check our tree, and see +what word it has at that node: + +```c++ +(lldb) expr dictionary->left->left->right->right->left->word +(const char *) $4 = 0x0000000100100880 "Romeo" +``` + +So the word we are searching for is "romeo" and the word at our DFS location is +"Romeo". Aha! One is uppercase and the other is lowercase: We seem to have a +case conversion problem somewhere in our program (we do). + +This is the end of our example on how you might use Python scripting in LLDB to +help you find bugs in your program. + +### Sources + +The complete code for the Dictionary program (with case-conversion bug), the +DFS function and other Python script examples used for this example are +available below. + +- [tree_utils.py](https://github.com/llvm/llvm-project/blob/main/lldb/examples/scripting/tree_utils.py) - Example Python functions using LLDB's API, including DFS +- [dictionary.c](https://github.com/llvm/llvm-project/blob/main/lldb/examples/scripting/dictionary.c) - Sample dictionary program, with bug +- The text for "Romeo and Juliet" can be obtained from [the Gutenberg Project](https://www.gutenberg.org). + diff --git a/lldb/docs/use/tutorials/writing-custom-commands.md b/lldb/docs/use/tutorials/writing-custom-commands.md new file mode 100644 index 0000000000000..d53b7e473a505 --- /dev/null +++ b/lldb/docs/use/tutorials/writing-custom-commands.md @@ -0,0 +1,429 @@ +# Writing Custom Commands + +### Create a new command using a Python function + +Python functions can be used to create new LLDB command interpreter commands, +which will work like all the natively defined lldb commands. This provides a +very flexible and easy way to extend LLDB to meet your debugging requirements. 
+ +To write a python function that implements a new LLDB command define the +function to take five arguments as follows: + +```python3 +def command_function(debugger, command, exe_ctx, result, internal_dict): + # Your code goes here +``` + +The meaning of the arguments is given in the table below. + +If you provide a Python docstring in your command function LLDB will use it +when providing "long help" for your command, as in: + +```python3 +def command_function(debugger, command, result, internal_dict): + """This command takes a lot of options and does many fancy things""" + # Your code goes here +``` + +though providing help can also be done programmatically (see below). + +Prior to lldb 3.5.2 (April 2015), LLDB Python command definitions didn't take the SBExecutionContext +argument. So you may still see commands where the command definition is: + +```python3 +def command_function(debugger, command, result, internal_dict): + # Your code goes here +``` + +Using this form is strongly discouraged because it can only operate on the "currently selected" +target, process, thread, frame. The command will behave as expected when run +directly on the command line. But if the command is used in a stop-hook, breakpoint +callback, etc. where the response to the callback determines whether we will select +this or that particular process/frame/thread, the global "currently selected" +entity is not necessarily the one the callback is meant to handle. In that case, this +command definition form can't do the right thing. + +| Argument | Type | Description | +|----------|------|-------------| +| `debugger` | `lldb.SBDebugger` | The current debugger object. | +| `command` | `python string` | A python string containing all arguments for your command. If you need to chop up the arguments try using the `shlex` module's `shlex.split(command)` to properly extract the arguments. 
| +| `exe_ctx` | `lldb.SBExecutionContext` | An execution context object carrying around information on the inferior process' context in which the command is expected to act *Optional since lldb 3.5.2, unavailable before* | +| `result` | `lldb.SBCommandReturnObject` | A return object which encapsulates success/failure information for the command and output text that needs to be printed as a result of the command. The plain Python "print" command also works but text won't go in the result by default (it is useful as a temporary logging facility). | +| `internal_dict` | `python dict object` | The dictionary for the current embedded script session which contains all variables and functions. | + +### Create a new command using a Python class + +Since lldb 3.7, Python commands can also be implemented by means of a class +which should implement the following interface: + +```python3 +class CommandObjectType: + def __init__(self, debugger, internal_dict): + # this call should initialize the command with respect to the command interpreter for the passed-in debugger + + def __call__(self, debugger, command, exe_ctx, result): + # this is the actual bulk of the command, akin to Python command functions + + def get_short_help(self): + # this call should return the short help text for this command[1] + + def get_long_help(self): + # this call should return the long help text for this command[1] + + def get_flags(self): + # this will be called when the command is added to the command interpreter, + # and should return a flag field made from or-ing together the appropriate + # elements of the lldb.CommandFlags enum to specify the requirements of this command. + # The CommandInterpreter will make sure all these requirements are met, and will + # return the standard lldb error if they are not.[1] + + def get_repeat_command(self, command): + # The auto-repeat command is what will get executed when the user types just + # a return at the next prompt after this command is run. 
Even if your command
+        # was run because it was specified as a repeat command, that invocation will still
+        # get asked for ITS repeat command, so you can chain a series of repeats, for instance
+        # to implement a pager.
+
+        # The command argument is the command that is about to be executed.
+
+        # If this call returns None, then the ordinary repeat mechanism will be used
+        # If this call returns an empty string, then auto-repeat is disabled
+        # If this call returns any other string, that will be the repeat command [1]
+```
+
+[1] This method is optional.
+
+As a convenience, you can treat the result object as a Python file object, and
+say
+
+```python3
+print("my command does lots of cool stuff", file=result)
+```
+
+`SBCommandReturnObject` and `SBStream` both support this file-like behavior by
+providing `write()` and `flush()` calls at the Python layer.
+
+### Parsed Commands
+
+The commands that are added using this class definition are what lldb calls
+"raw" commands. The command interpreter doesn't attempt to parse the command,
+doesn't handle option values, and doesn't generate help or completion for
+them. Raw commands are useful when the arguments passed to the command
+are unstructured, and having to protect them against lldb command parsing would
+be onerous. For instance, "expr" is a raw command.
+
+You can also add scripted commands that implement the "parsed command", where
+the options and their types are specified, as well as the argument and argument
+types. These commands look and act like the majority of lldb commands, and you
+can also add custom completions for the options and/or the arguments if you have
+special needs.
+
+The easiest way to do this is to derive your new command from the lldb.ParsedCommand
+class. That responds in the same way to the help & repeat command interfaces, and
+provides some convenience methods, and most importantly an LLDBOptionValueParser,
+accessed through lldb.ParsedCommand.get_parser(). 
The parser is used to set
+your command definitions, and to retrieve option values in the `__call__` method.
+
+To set up the command definition, implement the ParsedCommand abstract method:
+
+```python3
+def setup_command_definition(self):
+```
+
+This is called when your command is added to lldb. In this method you add the
+options and their types, the option help strings, etc. to the command using the API:
+
+```python3
+def add_option(self, short_option, long_option, help, default,
+               dest = None, required=False, groups = None,
+               value_type=lldb.eArgTypeNone, completion_type=None,
+               enum_values=None):
+    """
+    short_option: one character, must be unique, not required
+    long_option: no spaces, must be unique, required
+    help: a usage string for this option, will print in the command help
+    default: the initial value for this option (if it has a value)
+    dest: the name of the property that gives you access to the value for
+          this value. Defaults to the long option if not provided.
+    required: if true, this option must be provided or the command will error out
+    groups: Which "option groups" does this option belong to. This can either be
+            a simple list (e.g. [1, 3, 4, 5]) or you can specify ranges by sublists:
+            so [1, [3,5]] is the same as [1, 3, 4, 5].
+    value_type: one of the lldb.eArgType enum values. Some of the common arg
+                types also have default completers, which will be applied automatically.
+    completion_type: currently these are values from the lldb.CompletionType enum. If
+                     you need custom completions, implement handle_option_argument_completion.
+    enum_values: An array of duples: ["element_name", "element_help"]. If provided,
+                 only one of the enum elements is allowed. The value will be the
+                 element_name for the chosen enum element as a string. 
+ """ +``` + +Similarly, you can add argument types to the command: + +```python3 +def make_argument_element(self, arg_type, repeat = "optional", groups = None): + """ + arg_type: The argument type, one of the lldb.eArgType enum values. + repeat: Choose from the following options: + "plain" - one value + "optional" - zero or more values + "plus" - one or more values + groups: As with add_option. + """ +``` + +Then implement the body of the command by defining: + +```python3 +def __call__(self, debugger, args_array, exe_ctx, result): + """This is the command callback. The option values are + provided by the 'dest' properties on the parser. + + args_array: This is the list of arguments provided. + exe_ctx: Gives the SBExecutionContext on which the + command should operate. + result: Any results of the command should be + written into this SBCommandReturnObject. + """ +``` + +This differs from the "raw" command's `__call__` in that the arguments are already +parsed into the args_array, and the option values are set in the parser, and +can be accessed using their property name. The LLDBOptionValueParser class has +a couple of other handy methods: + +```python3 +def was_set(self, long_option_name): +``` + +returns `True` if the option was specified on the command line. + +```python +def dest_for_option(self, long_option_name): +""" +This will return the value of the dest variable you defined for opt_name. +Mostly useful for handle_completion where you get passed the long option. +""" +``` + +### Completion + +lldb will handle completing your option names, and all your enum values +automatically. If your option or argument types have associated built-in completers, +then lldb will also handle that completion for you. But if you have a need for +custom completions, either in your arguments or option values, you can handle +completion by hand as well. 
To handle completion of option value arguments,
+your lldb.ParsedCommand subclass should implement:
+
+```python3
+def handle_option_argument_completion(self, long_option, cursor_pos):
+"""
+long_option: The long option name of the option whose value you are
+    asked to complete.
+cursor_pos: The cursor position in the value for that option - which
+you can get from the option parser.
+"""
+```
+
+And to handle the completion of arguments:
+
+```python3
+def handle_argument_completion(self, args, arg_pos, cursor_pos):
+"""
+args: A list of the arguments to the command
+arg_pos: An index into the args list of the argument with the cursor
+cursor_pos: The cursor position in the arg specified by arg_pos
+"""
+```
+
+When either of these APIs is called, the command line will have been parsed up to
+the word containing the cursor, and any option values set in that part of the command
+string are available from the option value parser. That's useful for instance
+if you have a --shared-library option that would constrain the completions for,
+say, a symbol name option or argument.
+
+The return value specifies what the completion options are. You have four
+choices:
+
+- `True`: the completion was handled with no completions.
+
+- `False`: the completion was not handled, forward it to the regular
+completion machinery.
+
+- A dictionary with the key: "completion": there is one candidate,
+whose value is the value of the "completion" key. Optionally you can pass a
+"mode" key whose value is either "partial" or "complete". Return partial if
+the "completion" string is a prefix for all the completed values.
+
+For instance, if the string you are completing is "Test" and the available completions are:
+"Test1", "Test11" and "Test111", you should return the dictionary:
+
+```python3
+return {"completion": "Test1", "mode" : "partial"}
+```
+
+and then lldb will add the "1" at the cursor and advance it after the added string,
+waiting for more completions. 
But if "Test1" is the only completion, return: + +```python3 +{"completion": "Test1", "mode": "complete"} +``` + +and lldb will add "1 " at the cursor, indicating the command string is complete. + +The default is "complete", you don't need to specify a "mode" in that case. + +- A dictionary with the key: "values" whose value is a list of candidate completion +strings. The command interpreter will present those strings as the available choices. +You can optionally include a "descriptions" key, whose value is a parallel array +of description strings, and the completion will show the description next to +each completion. + +### Loading Commands + +One other handy convenience when defining lldb command-line commands is the +command "command script import" which will import a module specified by file +path, so you don't have to change your PYTHONPATH for temporary scripts. It +also has another convenience that if your new script module has a function of +the form: + +```python +def __lldb_init_module(debugger, internal_dict): + # Command Initialization code goes here +``` + +where debugger and internal_dict are as above, that function will get run when +the module is loaded allowing you to add whatever commands you want into the +current debugger. Note that this function will only be run when using the LLDB +command `command script import`, it will not get run if anyone imports your +module from another module. + +Another way to load custom commands in lldb is to use the +`@lldb.command(command_name=None, doc=None)` decorator. 
+ +```python3 +@lldb.command() +def goodstuff(debugger, command, ctx, result, internal_dict): + """command help string""" + # Command Implementation code goes here +``` + +### Examples + +Now we can create a module called ls.py in the file ~/ls.py that will implement +a function that can be used by LLDB's python command code: + +```python3 +#!/usr/bin/env python3 + +import lldb +import subprocess + +def ls(debugger, command, result, internal_dict): + output = subprocess.check_output(["/bin/ls"] + command.split(), text=True) + print(output, file=result) + +# And the initialization code to add your commands +def __lldb_init_module(debugger, internal_dict): + debugger.HandleCommand('command script add -f ls.ls ls') + print('The "ls" python command has been installed and is ready for use.') +``` + +Now we can load the module into LLDB and use it + +```shell +$ lldb +(lldb) command script import ~/ls.py +The "ls" python command has been installed and is ready for use. +(lldb) ls -l /tmp/ +total 365848 +-rw------- 1 someuser wheel 7331 Jan 19 15:37 crash.log +``` + +You can also make "container" commands to organize the commands you are adding to +lldb. Most of the lldb built-in commands structure themselves this way, and using +a tree structure has the benefit of leaving the one-word command space free for user +aliases. It can also make it easier to find commands if you are adding more than +a few of them. 
Here's a trivial example of adding two "utility" commands into a +"my-utilities" container: + +```python3 +#!/usr/bin/env python + +import lldb + +def first_utility(debugger, command, result, internal_dict): + print("I am the first utility") + +def second_utility(debugger, command, result, internal_dict): + print("I am the second utility") + +# And the initialization code to add your commands +def __lldb_init_module(debugger, internal_dict): + debugger.HandleCommand('command container add -h "A container for my utilities" my-utilities') + debugger.HandleCommand('command script add -f my_utilities.first_utility -h "My first utility" my-utilities first') + debugger.HandleCommand('command script add -f my_utilities.second_utility -h "My second utility" my-utilities second') + print('The "my-utilities" python command has been installed and its subcommands are ready for use.') +``` + +Then your new commands are available under the my-utilities node: + +``` +(lldb) help my-utilities +A container for my utilities + +Syntax: my-utilities + +The following subcommands are supported: + + first -- My first utility Expects 'raw' input (see 'help raw-input'.) + second -- My second utility Expects 'raw' input (see 'help raw-input'.) + +For more help on any particular subcommand, type 'help '. +(lldb) my-utilities first +I am the first utility +``` + +A more interesting [template](https://github.com/llvm/llvm-project/blob/main/lldb/examples/python/cmdtemplate.py) +has been created in the source repository that can help you to create lldb command quickly. + +A commonly required facility is being able to create a command that does some +token substitution, and then runs a different debugger command (usually, it +po'es the result of an expression evaluated on its argument). 
For instance, +given the following program: + +```objc +#import +NSString* +ModifyString(NSString* src) +{ + return [src stringByAppendingString:@"foobar"]; +} + +int main() +{ + NSString* aString = @"Hello world"; + NSString* anotherString = @"Let's be friends"; + return 1; +} +``` + +you may want a `pofoo` X command, that equates po [ModifyString(X) +capitalizedString]. The following debugger interaction shows how to achieve +that goal: + +```python3 +(lldb) script +Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. +>>> def pofoo_funct(debugger, command, result, internal_dict): +... cmd = "po [ModifyString(" + command + ") capitalizedString]" +... debugger.HandleCommand(cmd) +... +>>> ^D +(lldb) command script add pofoo -f pofoo_funct +(lldb) pofoo aString +$1 = 0x000000010010aa00 Hello Worldfoobar +(lldb) pofoo anotherString +$2 = 0x000000010010aba0 Let's Be Friendsfoobar +``` \ No newline at end of file diff --git a/lldb/examples/synthetic/libcxx.py b/lldb/examples/synthetic/libcxx.py index 5abeb3061f4f5..549255e280c1d 100644 --- a/lldb/examples/synthetic/libcxx.py +++ b/lldb/examples/synthetic/libcxx.py @@ -1,3 +1,6 @@ +from enum import Enum +from sys import stderr +import sys import lldb import lldb.formatters.Logger @@ -74,6 +77,59 @@ def stdstring_SummaryProvider(valobj, dict): return '"' + strval + '"' +def get_buffer_end(buffer, begin): + """ + Returns a pointer to where the next element would be pushed. + + For libc++'s stable ABI and unstable < LLVM 22, returns `__end_`. + For libc++'s unstable ABI, returns `__begin_ + __size_`. + """ + map_end = buffer.GetChildMemberWithName("__end_") + if map_end.IsValid(): + return map_end.GetValueAsUnsigned(0) + map_size = buffer.GetChildMemberWithName("__size_").GetValueAsUnsigned(0) + return begin + map_size + + +def get_buffer_endcap(parent, buffer, begin, has_compressed_pair_layout, is_size_based): + """ + Returns a pointer to the end of the buffer. 
+ + For libc++'s stable ABI and unstable < LLVM 22, returns: + * `__end_cap_`, if `__compressed_pair` is being used + * `__cap_`, otherwise + For libc++'s unstable ABI, returns `__begin_ + __cap_`. + """ + if has_compressed_pair_layout: + map_endcap = parent._get_value_of_compressed_pair( + buffer.GetChildMemberWithName("__end_cap_") + ) + elif buffer.GetType().GetNumberOfDirectBaseClasses() == 1: + # LLVM 22's __split_buffer is derived from a base class that describes its layout. When the + # compressed pair ABI is required, we also use an anonymous struct. Per [#158131], LLDB + # is unable to access members of an anonymous struct to a base class, through the derived + # class. This means that in order to access the compressed pair's pointer, we need to first + # get to its base class. + # + # [#158131]: https://github.com/llvm/llvm-project/issues/158131 + buffer = buffer.GetChildAtIndex(0) + if is_size_based: + map_endcap = buffer.GetChildMemberWithName("__cap_") + else: + map_endcap = buffer.GetChildMemberWithName("__back_cap_") + map_endcap = map_endcap.GetValueAsUnsigned(0) + else: + map_endcap = buffer.GetChildMemberWithName("__cap_") + if not map_endcap.IsValid(): + map_endcap = buffer.GetChildMemberWithName("__end_cap_") + map_endcap = map_endcap.GetValueAsUnsigned(0) + + if is_size_based: + return begin + map_endcap + + return map_endcap + + class stdvector_SynthProvider: def __init__(self, valobj, dict): logger = lldb.formatters.Logger.Logger() @@ -755,23 +811,21 @@ def update(self): if self.block_size < 0: logger.write("block_size < 0") return - map_ = self.valobj.GetChildMemberWithName("__map_") start = self.valobj.GetChildMemberWithName("__start_").GetValueAsUnsigned(0) + + map_ = self.valobj.GetChildMemberWithName("__map_") + is_size_based = map_.GetChildMemberWithName("__size_").IsValid() first = map_.GetChildMemberWithName("__first_") + # LLVM 22 renames __map_.__begin_ to __map_.__front_cap_ + if not first: + first = 
map_.GetChildMemberWithName("__front_cap_") map_first = first.GetValueAsUnsigned(0) self.map_begin = map_.GetChildMemberWithName("__begin_") map_begin = self.map_begin.GetValueAsUnsigned(0) - map_end = map_.GetChildMemberWithName("__end_").GetValueAsUnsigned(0) - - if has_compressed_pair_layout: - map_endcap = self._get_value_of_compressed_pair( - map_.GetChildMemberWithName("__end_cap_") - ) - else: - map_endcap = map_.GetChildMemberWithName("__cap_") - if not map_endcap.IsValid(): - map_endcap = map_.GetChildMemberWithName("__end_cap_") - map_endcap = map_endcap.GetValueAsUnsigned(0) + map_end = get_buffer_end(map_, map_begin) + map_endcap = get_buffer_endcap( + self, map_, map_begin, has_compressed_pair_layout, is_size_based + ) # check consistency if not map_first <= map_begin <= map_end <= map_endcap: diff --git a/lldb/include/lldb/Protocol/MCP/Server.h b/lldb/include/lldb/Protocol/MCP/Server.h index b674d58159550..1f916ae525b5c 100644 --- a/lldb/include/lldb/Protocol/MCP/Server.h +++ b/lldb/include/lldb/Protocol/MCP/Server.h @@ -108,8 +108,7 @@ bool fromJSON(const llvm::json::Value &, ServerInfo &, llvm::json::Path); /// once it is no longer referenced. class ServerInfoHandle { public: - ServerInfoHandle(); - explicit ServerInfoHandle(llvm::StringRef filename); + explicit ServerInfoHandle(llvm::StringRef filename = ""); ~ServerInfoHandle(); ServerInfoHandle(ServerInfoHandle &&other); @@ -121,6 +120,9 @@ class ServerInfoHandle { ServerInfoHandle &operator=(const ServerInfoHandle &) = delete; /// @} + /// Remove the file. 
+ void Remove(); + private: llvm::SmallString<128> m_filename; }; diff --git a/lldb/include/lldb/Symbol/SymbolFile.h b/lldb/include/lldb/Symbol/SymbolFile.h index ff67e002e5b02..3b4d7bc01d132 100644 --- a/lldb/include/lldb/Symbol/SymbolFile.h +++ b/lldb/include/lldb/Symbol/SymbolFile.h @@ -297,7 +297,8 @@ class SymbolFile : public PluginInterface { lldb::SymbolContextItem resolve_scope, SymbolContextList &sc_list); - virtual void DumpClangAST(Stream &s, llvm::StringRef filter) {} + virtual void DumpClangAST(Stream &s, llvm::StringRef filter, + bool show_colors) {} virtual void FindGlobalVariables(ConstString name, const CompilerDeclContext &parent_decl_ctx, uint32_t max_matches, diff --git a/lldb/include/lldb/Symbol/SymbolFileOnDemand.h b/lldb/include/lldb/Symbol/SymbolFileOnDemand.h index 6e3c2477d1769..b376de73419d4 100644 --- a/lldb/include/lldb/Symbol/SymbolFileOnDemand.h +++ b/lldb/include/lldb/Symbol/SymbolFileOnDemand.h @@ -127,7 +127,8 @@ class SymbolFileOnDemand : public lldb_private::SymbolFile { lldb_private::SymbolContextList &sc_list) override; void Dump(lldb_private::Stream &s) override; - void DumpClangAST(lldb_private::Stream &s, llvm::StringRef filter) override; + void DumpClangAST(lldb_private::Stream &s, llvm::StringRef filter, + bool show_color) override; void FindGlobalVariables(lldb_private::ConstString name, diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 16a2e0b5a52fb..0ec3a28898329 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -448,7 +448,9 @@ class TypeSystem : public PluginInterface, /// \param[out] output Stream to dup the AST into. /// \param[in] filter If empty, dump whole AST. If non-empty, will only /// dump decls whose names contain \c filter. - virtual void Dump(llvm::raw_ostream &output, llvm::StringRef filter) = 0; + /// \param[in] show_color If true, prints the AST color-highlighted. 
+ virtual void Dump(llvm::raw_ostream &output, llvm::StringRef filter, + bool show_color) = 0; /// This is used by swift. virtual bool IsRuntimeGeneratedType(lldb::opaque_compiler_type_t type) = 0; diff --git a/lldb/include/lldb/Target/StackID.h b/lldb/include/lldb/Target/StackID.h index c2a5d733dcd69..18461533d648a 100644 --- a/lldb/include/lldb/Target/StackID.h +++ b/lldb/include/lldb/Target/StackID.h @@ -26,7 +26,11 @@ class StackID { lldb::addr_t GetPC() const { return m_pc; } - lldb::addr_t GetCallFrameAddress() const { return m_cfa; } + lldb::addr_t GetCallFrameAddressWithMetadata() const { + return m_cfa_with_metadata; + } + + lldb::addr_t GetCallFrameAddressWithoutMetadata() const { return m_cfa; } SymbolContextScope *GetSymbolContextScope() const { return m_symbol_scope; } @@ -62,6 +66,9 @@ class StackID { /// below) lldb::addr_t m_cfa = LLDB_INVALID_ADDRESS; + /// The cfa with metadata (i.e. prior to Process::FixAddress). + lldb::addr_t m_cfa_with_metadata = LLDB_INVALID_ADDRESS; + /// If nullptr, there is no block or symbol for this frame. If not nullptr, /// this will either be the scope for the lexical block for the frame, or the /// scope for the symbol. Symbol context scopes are always be unique pointers diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index 96bd5e3597b68..361108fd8f0e7 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -327,6 +327,11 @@ class ArchSpec { /// \return a boolean value. bool IsMIPS() const; + /// If NVPTX architecture return true. + /// + /// \return a boolean value. + bool IsNVPTX() const; + /// Returns a string representing current architecture as a target CPU for /// tools like compiler, disassembler etc. 
/// diff --git a/lldb/include/lldb/ValueObject/DILAST.h b/lldb/include/lldb/ValueObject/DILAST.h index 1d10755c46e39..0f05d753f1b56 100644 --- a/lldb/include/lldb/ValueObject/DILAST.h +++ b/lldb/include/lldb/ValueObject/DILAST.h @@ -20,6 +20,7 @@ namespace lldb_private::dil { enum class NodeKind { eArraySubscriptNode, eBitExtractionNode, + eBooleanLiteralNode, eErrorNode, eFloatLiteralNode, eIdentifierNode, @@ -226,6 +227,23 @@ class FloatLiteralNode : public ASTNode { llvm::APFloat m_value; }; +class BooleanLiteralNode : public ASTNode { +public: + BooleanLiteralNode(uint32_t location, bool value) + : ASTNode(location, NodeKind::eBooleanLiteralNode), m_value(value) {} + + llvm::Expected Accept(Visitor *v) const override; + + bool GetValue() const & { return m_value; } + + static bool classof(const ASTNode *node) { + return node->GetKind() == NodeKind::eBooleanLiteralNode; + } + +private: + bool m_value; +}; + /// This class contains one Visit method for each specialized type of /// DIL AST node. 
The Visit methods are used to dispatch a DIL AST node to /// the correct function in the DIL expression evaluator for evaluating that @@ -247,6 +265,8 @@ class Visitor { Visit(const IntegerLiteralNode *node) = 0; virtual llvm::Expected Visit(const FloatLiteralNode *node) = 0; + virtual llvm::Expected + Visit(const BooleanLiteralNode *node) = 0; }; } // namespace lldb_private::dil diff --git a/lldb/include/lldb/ValueObject/DILEval.h b/lldb/include/lldb/ValueObject/DILEval.h index 5a48c2c989f4d..eab3218ff828f 100644 --- a/lldb/include/lldb/ValueObject/DILEval.h +++ b/lldb/include/lldb/ValueObject/DILEval.h @@ -58,6 +58,8 @@ class Interpreter : Visitor { Visit(const IntegerLiteralNode *node) override; llvm::Expected Visit(const FloatLiteralNode *node) override; + llvm::Expected + Visit(const BooleanLiteralNode *node) override; llvm::Expected PickIntegerType(lldb::TypeSystemSP type_system, diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h index 4345e6ce7f26b..28b94a79c5902 100644 --- a/lldb/include/lldb/ValueObject/DILLexer.h +++ b/lldb/include/lldb/ValueObject/DILLexer.h @@ -31,6 +31,8 @@ class Token { float_constant, identifier, integer_constant, + kw_false, + kw_true, l_paren, l_square, minus, diff --git a/lldb/include/lldb/ValueObject/DILParser.h b/lldb/include/lldb/ValueObject/DILParser.h index 90df109337dcf..d17ed66d9b3ee 100644 --- a/lldb/include/lldb/ValueObject/DILParser.h +++ b/lldb/include/lldb/ValueObject/DILParser.h @@ -99,11 +99,14 @@ class DILParser { ASTNodeUP ParseNumericLiteral(); ASTNodeUP ParseIntegerLiteral(); ASTNodeUP ParseFloatingPointLiteral(); + ASTNodeUP ParseBooleanLiteral(); void BailOut(const std::string &error, uint32_t loc, uint16_t err_len); void Expect(Token::Kind kind); + void ExpectOneOf(std::vector kinds_vec); + void TentativeParsingRollback(uint32_t saved_idx) { if (m_error) llvm::consumeError(std::move(m_error)); diff --git 
a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 51debcf477a9d..daa3e76df6d82 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -215,6 +215,7 @@ def __init__( self.terminated: bool = False self.events: List[Event] = [] self.progress_events: List[Event] = [] + self.invalidated_event: Optional[Event] = None self.reverse_requests: List[Request] = [] self.module_events: List[Dict] = [] self.sequence: int = 1 @@ -440,6 +441,8 @@ def _handle_event(self, packet: Event) -> None: elif event == "capabilities" and body: # Update the capabilities with new ones from the event. self.capabilities.update(body["capabilities"]) + elif event == "invalidated": + self.invalidated_event = packet def _handle_reverse_request(self, request: Request) -> None: if request in self.reverse_requests: @@ -1014,6 +1017,7 @@ def request_initialize(self, sourceInitFile=False): "supportsVariableType": True, "supportsStartDebuggingRequest": True, "supportsProgressReporting": True, + "supportsInvalidatedEvent": True, "$__lldb_sourceInitFile": sourceInitFile, }, } @@ -1035,6 +1039,7 @@ def request_launch( disableSTDIO=False, shellExpandArguments=False, console: Optional[str] = None, + stdio: Optional[list[str]] = None, enableAutoVariableSummaries=False, displayExtendedBacktrace=False, enableSyntheticChildDebugging=False, @@ -1086,6 +1091,8 @@ def request_launch( args_dict["sourceMap"] = sourceMap if console: args_dict["console"] = console + if stdio: + args_dict["stdio"] = stdio if postRunCommands: args_dict["postRunCommands"] = postRunCommands if customFrameFormat: diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index fffd4c23d6fcd..a0a009ae6cc9a 100644 --- 
a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -241,6 +241,13 @@ def verify_commands(self, flavor: str, output: str, commands: list[str]): f"Command '{flavor}' - '{cmd}' not found in output: {output}", ) + def verify_invalidated_event(self, expected_areas): + event = self.dap_server.invalidated_event + self.dap_server.invalidated_event = None + self.assertIsNotNone(event) + areas = event["body"].get("areas", []) + self.assertEqual(set(expected_areas), set(areas)) + def get_dict_value(self, d: dict, key_path: list[str]) -> Any: """Verify each key in the key_path array is in contained in each dictionary within "d". Assert if any key isn't in the @@ -352,13 +359,20 @@ def get_local_as_int(self, name, threadId=None): else: return int(value) + def set_variable(self, varRef, name, value, id=None): + """Set a variable.""" + response = self.dap_server.request_setVariable(varRef, name, str(value), id=id) + if response["success"]: + self.verify_invalidated_event(["variables"]) + return response + def set_local(self, name, value, id=None): """Set a top level local variable only.""" - return self.dap_server.request_setVariable(1, name, str(value), id=id) + return self.set_variable(1, name, str(value), id=id) def set_global(self, name, value, id=None): """Set a top level global variable only.""" - return self.dap_server.request_setVariable(2, name, str(value), id=id) + return self.set_variable(2, name, str(value), id=id) def stepIn( self, @@ -577,4 +591,6 @@ def writeMemory(self, memoryReference, data=None, offset=0, allowPartial=False): response = self.dap_server.request_writeMemory( memoryReference, encodedData, offset=offset, allowPartial=allowPartial ) + if response["success"]: + self.verify_invalidated_event(["all"]) return response diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py index 36c5c67c59d36..3447dfc29a761 100755 --- 
a/lldb/scripts/framework-header-fix.py +++ b/lldb/scripts/framework-header-fix.py @@ -115,8 +115,10 @@ def main(): unifdef_guards = ["-U" + guard for guard in args.unifdef_guards] # Create the framework's header dir if it doesn't already exist - if not os.path.exists(os.path.dirname(output_file_path)): + try: os.makedirs(os.path.dirname(output_file_path)) + except FileExistsError: + pass if framework_version == "lldb_main": modify_main_includes(input_file_path, output_file_path) diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp index b6724bb0c4119..42dbed490a33d 100644 --- a/lldb/source/API/SBFrame.cpp +++ b/lldb/source/API/SBFrame.cpp @@ -267,7 +267,7 @@ lldb::addr_t SBFrame::GetCFA() const { } if (StackFrame *frame = exe_ctx->GetFramePtr()) - return frame->GetStackID().GetCallFrameAddress(); + return frame->GetStackID().GetCallFrameAddressWithoutMetadata(); return LLDB_INVALID_ADDRESS; } diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 004542e3e6aed..940be42d1b6e3 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -2210,7 +2210,9 @@ class CommandObjectTargetModulesDumpClangPCMInfo : public CommandObjectParsed { const char *clang_args[] = {"clang", pcm_path}; clang::CompilerInstance compiler(clang::createInvocation(clang_args)); - compiler.createDiagnostics(*FileSystem::Instance().GetVirtualFileSystem()); + compiler.setVirtualFileSystem( + FileSystem::Instance().GetVirtualFileSystem()); + compiler.createDiagnostics(); // Pass empty deleter to not attempt to free memory that was allocated // outside of the current scope, possibly statically. 
@@ -2275,7 +2277,8 @@ class CommandObjectTargetModulesDumpClangAST if (INTERRUPT_REQUESTED(GetDebugger(), "Interrupted dumping clang ast")) break; if (SymbolFile *sf = module_sp->GetSymbolFile()) - sf->DumpClangAST(result.GetOutputStream(), filter); + sf->DumpClangAST(result.GetOutputStream(), filter, + GetCommandInterpreter().GetDebugger().GetUseColor()); } result.SetStatus(eReturnStatusSuccessFinishResult); return; @@ -2304,7 +2307,8 @@ class CommandObjectTargetModulesDumpClangAST Module *m = module_list.GetModulePointerAtIndex(i); if (SymbolFile *sf = m->GetSymbolFile()) - sf->DumpClangAST(result.GetOutputStream(), filter); + sf->DumpClangAST(result.GetOutputStream(), filter, + GetCommandInterpreter().GetDebugger().GetUseColor()); } } result.SetStatus(eReturnStatusSuccessFinishResult); @@ -5294,7 +5298,8 @@ class CommandObjectTargetDumpTypesystem : public CommandObjectParsed { // Go over every scratch TypeSystem and dump to the command output. for (lldb::TypeSystemSP ts : GetTarget().GetScratchTypeSystems()) if (ts) - ts->Dump(result.GetOutputStream().AsRawOstream(), ""); + ts->Dump(result.GetOutputStream().AsRawOstream(), "", + GetCommandInterpreter().GetDebugger().GetUseColor()); result.SetStatus(eReturnStatusSuccessFinishResult); } diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index 332cf2c86024a..5040351f4975b 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -2195,7 +2195,7 @@ llvm::Expected DWARFExpression::Evaluate( // Note that we don't have to parse FDEs because this DWARF expression // is commonly evaluated with a valid stack frame. 
StackID id = frame->GetStackID(); - addr_t cfa = id.GetCallFrameAddress(); + addr_t cfa = id.GetCallFrameAddressWithMetadata(); if (cfa != LLDB_INVALID_ADDRESS) { stack.push_back(Scalar(cfa)); stack.back().SetValueType(Value::ValueType::LoadAddress); diff --git a/lldb/source/Host/common/File.cpp b/lldb/source/Host/common/File.cpp index 8fd1ca069dc01..1272f13c1d82d 100644 --- a/lldb/source/Host/common/File.cpp +++ b/lldb/source/Host/common/File.cpp @@ -659,7 +659,7 @@ Status NativeFile::Write(const void *buf, size_t &num_bytes) { #ifdef _WIN32 if (is_windows_console) { llvm::raw_fd_ostream(_fileno(m_stream), false) - .write((char *)buf, num_bytes); + .write((const char *)buf, num_bytes); return error; } #endif diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp index 3511cde8bb36f..bc3d849c5c6c6 100644 --- a/lldb/source/Host/common/Socket.cpp +++ b/lldb/source/Host/common/Socket.cpp @@ -506,7 +506,7 @@ Socket::GetProtocolAndMode(llvm::StringRef scheme) { .Case("unix-abstract-accept", ProtocolModePair{SocketProtocol::ProtocolUnixAbstract, SocketMode::ModeAccept}) - .Cases("connect", "tcp-connect", + .Cases("connect", "tcp-connect", "connection", ProtocolModePair{SocketProtocol::ProtocolTcp, SocketMode::ModeConnect}) .Case("udp", ProtocolModePair{SocketProtocol::ProtocolTcp, diff --git a/lldb/source/Host/windows/Host.cpp b/lldb/source/Host/windows/Host.cpp index e8973a3fb937a..d5704eed10ecb 100644 --- a/lldb/source/Host/windows/Host.cpp +++ b/lldb/source/Host/windows/Host.cpp @@ -321,7 +321,6 @@ void Host::SystemLog(Severity severity, llvm::StringRef message) { stream << "[Error] "; break; case lldb::eSeverityInfo: - default: stream << "[Info] "; break; } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 097a4661e0c96..6885977baa24e 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ 
b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -720,14 +720,16 @@ ClangExpressionParser::ClangExpressionParser( m_compiler = std::make_unique(); // Make sure clang uses the same VFS as LLDB. - m_compiler->createFileManager(FileSystem::Instance().GetVirtualFileSystem()); + m_compiler->setVirtualFileSystem( + FileSystem::Instance().GetVirtualFileSystem()); + m_compiler->createFileManager(); // 2. Configure the compiler with a set of default options that are // appropriate for most situations. SetupTargetOpts(*m_compiler, *target_sp); // 3. Create and install the target on the compiler. - m_compiler->createDiagnostics(m_compiler->getVirtualFileSystem()); + m_compiler->createDiagnostics(); // Limit the number of error diagnostics we emit. // A value of 0 means no limit for both LLDB and Clang. m_compiler->getDiagnostics().setErrorLimit(target_sp->GetExprErrorLimit()); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp index d54f0729659eb..67984c5f44bf0 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp @@ -746,7 +746,8 @@ ClangModulesDeclVendor::Create(Target &target) { auto instance = std::make_unique(invocation); // Make sure clang uses the same VFS as LLDB. 
- instance->createFileManager(FileSystem::Instance().GetVirtualFileSystem()); + instance->setVirtualFileSystem(FileSystem::Instance().GetVirtualFileSystem()); + instance->createFileManager(); instance->setDiagnostics(diagnostics_engine); std::unique_ptr action(new clang::SyntaxOnlyAction); diff --git a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp index 89da4d200699f..f5f077ffb0bfc 100644 --- a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp +++ b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp @@ -14135,7 +14135,13 @@ EmulateInstructionARM::AddWithCarry(uint32_t x, uint32_t y, uint8_t carry_in) { uint8_t overflow; uint64_t unsigned_sum = x + y + carry_in; - int64_t signed_sum = (int32_t)x + (int32_t)y + (int32_t)carry_in; + int64_t signed_sum = 0; + int32_t signed_sum32; + if (llvm::AddOverflow((int32_t)x, (int32_t)y, signed_sum32)) + signed_sum++; + signed_sum += signed_sum32; + + signed_sum += (int32_t)carry_in; result = UnsignedBits(unsigned_sum, 31, 0); // carry_out = (result == unsigned_sum ? 
0 : 1); diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 277de8f444828..4e8a430af8c6c 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -190,14 +190,16 @@ static bool IsTrivialBasename(const llvm::StringRef &basename) { if (basename.size() <= idx) return false; // Empty string or "~" - if (!std::isalpha(basename[idx]) && basename[idx] != '_') + if (!std::isalpha(static_cast(basename[idx])) && + basename[idx] != '_') return false; // First character (after removing the possible '~'') isn't in // [A-Za-z_] // Read all characters matching [A-Za-z_0-9] ++idx; while (idx < basename.size()) { - if (!std::isalnum(basename[idx]) && basename[idx] != '_') + if (!std::isalnum(static_cast(basename[idx])) && + basename[idx] != '_') break; ++idx; } @@ -2197,6 +2199,7 @@ bool CPlusPlusLanguage::GetFunctionDisplayName( case FunctionNameRepresentation::eName: return false; } + llvm_unreachable("Fully covered switch above"); } bool CPlusPlusLanguage::HandleFrameFormatVariable( diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index 6053d042b29b1..141c5c9a2caf9 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -40,8 +40,10 @@ using namespace lldb_private::formatters; static void consumeInlineNamespace(llvm::StringRef &name) { // Delete past an inline namespace, if any: __[a-zA-Z0-9_]+:: auto scratch = name; - if (scratch.consume_front("__") && std::isalnum(scratch[0])) { - scratch = scratch.drop_while([](char c) { return std::isalnum(c); }); + if (scratch.consume_front("__") && + std::isalnum(static_cast(scratch[0]))) { + scratch = scratch.drop_while( + [](char c) { return std::isalnum(static_cast(c)); }); if (scratch.consume_front("::")) { // Successfully consumed 
a namespace. name = scratch; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp index f88a5319068a2..4b183a8d62e53 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp @@ -113,10 +113,11 @@ CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: // wraps a std::pair. Peel away the internal wrapper type - whose structure is // of no value to users, to expose the std::pair. This matches the structure // returned by the std::map synthetic provider. - if (isUnorderedMap(m_backend.GetCompilerType() - .GetNonReferenceType() - .GetCanonicalType() - .GetTypeName())) { + CompilerType backend_type = m_backend.GetCompilerType(); + if (backend_type.IsPointerOrReferenceType()) + backend_type = backend_type.GetPointeeType(); + + if (isUnorderedMap(backend_type.GetCanonicalType().GetTypeName())) { std::string name; CompilerType field_type = element_type.GetFieldAtIndex(0, name, nullptr, nullptr, nullptr); diff --git a/lldb/source/Plugins/Process/Utility/HistoryUnwind.cpp b/lldb/source/Plugins/Process/Utility/HistoryUnwind.cpp index 3b0618fa10374..1204faf4fa61e 100644 --- a/lldb/source/Plugins/Process/Utility/HistoryUnwind.cpp +++ b/lldb/source/Plugins/Process/Utility/HistoryUnwind.cpp @@ -60,6 +60,7 @@ static bool BehavesLikeZerothFrame(HistoryPCType pc_type, uint32_t frame_idx) { case HistoryPCType::Calls: return true; } + llvm_unreachable("Fully covered switch above"); } bool HistoryUnwind::DoGetFrameInfoAtIndex(uint32_t frame_idx, lldb::addr_t &cfa, diff --git a/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp b/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp index 29a64a2a03bf0..6853121f3e01c 100644 --- a/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp +++ b/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp @@ -77,6 +77,35 @@ 
static void DescribeAddressBriefly(Stream &strm, const Address &addr, strm.Printf(".\n"); } +static constexpr uint8_t g_mte_tag_shift = 64 - 8; +static constexpr uintptr_t g_mte_tag_mask = (uintptr_t)0x0f << g_mte_tag_shift; + +bool StopInfoMachException::DetermineTagMismatch(ExecutionContext &exe_ctx) { + const bool IsBadAccess = m_value == 1; // EXC_BAD_ACCESS + const bool IsMTETagFault = (m_exc_code == 0x106); // EXC_ARM_MTE_TAG_FAULT + if (!IsBadAccess || !IsMTETagFault) + return false; + + if (m_exc_data_count < 2) + return false; + + const uint64_t bad_address = m_exc_subcode; + + StreamString strm; + strm.Printf("EXC_ARM_MTE_TAG_FAULT (code=%" PRIu64 ", address=0x%" PRIx64 + ")\n", + m_exc_code, bad_address); + + const uint8_t tag = (bad_address & g_mte_tag_mask) >> g_mte_tag_shift; + const uint64_t canonical_addr = bad_address & ~g_mte_tag_mask; + strm.Printf( + "Note: MTE tag mismatch detected: pointer tag=%d, address=0x%" PRIx64, + tag, canonical_addr); + m_description = std::string(strm.GetString()); + + return true; +} + bool StopInfoMachException::DeterminePtrauthFailure(ExecutionContext &exe_ctx) { bool IsBreakpoint = m_value == 6; // EXC_BREAKPOINT bool IsBadAccess = m_value == 1; // EXC_BAD_ACCESS @@ -266,6 +295,8 @@ const char *StopInfoMachException::GetDescription() { case llvm::Triple::aarch64: if (DeterminePtrauthFailure(exe_ctx)) return m_description.c_str(); + if (DetermineTagMismatch(exe_ctx)) + return m_description.c_str(); break; default: diff --git a/lldb/source/Plugins/Process/Utility/StopInfoMachException.h b/lldb/source/Plugins/Process/Utility/StopInfoMachException.h index c612ac400b4c4..c02389e5b3642 100644 --- a/lldb/source/Plugins/Process/Utility/StopInfoMachException.h +++ b/lldb/source/Plugins/Process/Utility/StopInfoMachException.h @@ -27,6 +27,8 @@ class StopInfoMachException : public StopInfo { /// is auth-related failure, and returns false otherwise. 
bool DeterminePtrauthFailure(ExecutionContext &exe_ctx); + bool DetermineTagMismatch(ExecutionContext &exe_ctx); + public: // Constructors and Destructors StopInfoMachException(Thread &thread, uint32_t exc_type, diff --git a/lldb/source/Plugins/Process/wasm/ProcessWasm.cpp b/lldb/source/Plugins/Process/wasm/ProcessWasm.cpp index 580e8c1d9cfa4..62bcf442d097a 100644 --- a/lldb/source/Plugins/Process/wasm/ProcessWasm.cpp +++ b/lldb/source/Plugins/Process/wasm/ProcessWasm.cpp @@ -98,6 +98,7 @@ size_t ProcessWasm::ReadMemory(lldb::addr_t vm_addr, void *buf, size_t size, "Wasm read failed for invalid address 0x%" PRIx64, vm_addr); return 0; } + llvm_unreachable("Fully covered switch above"); } llvm::Expected> diff --git a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp index dc18c8e06803a..d3af3cf25c4a1 100644 --- a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp +++ b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp @@ -10,8 +10,6 @@ #include "Resource.h" #include "Tool.h" #include "lldb/Core/PluginManager.h" -#include "lldb/Host/FileSystem.h" -#include "lldb/Host/HostInfo.h" #include "lldb/Protocol/MCP/Server.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" @@ -60,7 +58,9 @@ void ProtocolServerMCP::Extend(lldb_protocol::mcp::Server &server) const { "MCP initialization complete"); }); server.AddTool( - std::make_unique("lldb_command", "Run an lldb command.")); + std::make_unique("command", "Run an lldb command.")); + server.AddTool(std::make_unique( + "debugger_list", "List debugger instances with their debugger_id.")); server.AddResourceProvider(std::make_unique()); } @@ -145,8 +145,8 @@ llvm::Error ProtocolServerMCP::Stop() { if (m_loop_thread.joinable()) m_loop_thread.join(); + m_server_info_handle.Remove(); m_listen_handlers.clear(); - m_server_info_handle = ServerInfoHandle(); m_instances.clear(); return llvm::Error::success(); diff --git 
a/lldb/source/Plugins/Protocol/MCP/Tool.cpp b/lldb/source/Plugins/Protocol/MCP/Tool.cpp index 2f451bf76e81d..cb134b965c2e2 100644 --- a/lldb/source/Plugins/Protocol/MCP/Tool.cpp +++ b/lldb/source/Plugins/Protocol/MCP/Tool.cpp @@ -7,26 +7,36 @@ //===----------------------------------------------------------------------===// #include "Tool.h" +#include "lldb/Core/Debugger.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Interpreter/CommandReturnObject.h" #include "lldb/Protocol/MCP/Protocol.h" +#include "lldb/Utility/UriParser.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include +#include using namespace lldb_private; using namespace lldb_protocol; using namespace lldb_private::mcp; +using namespace lldb; using namespace llvm; namespace { + +static constexpr StringLiteral kSchemeAndHost = "lldb-mcp://debugger/"; + struct CommandToolArguments { - uint64_t debugger_id; - std::string arguments; + /// Either an id like '1' or a uri like 'lldb-mcp://debugger/1'. + std::string debugger; + std::string command; }; -bool fromJSON(const llvm::json::Value &V, CommandToolArguments &A, - llvm::json::Path P) { - llvm::json::ObjectMapper O(V, P); - return O && O.map("debugger_id", A.debugger_id) && - O.mapOptional("arguments", A.arguments); +bool fromJSON(const json::Value &V, CommandToolArguments &A, json::Path P) { + json::ObjectMapper O(V, P); + return O && O.mapOptional("debugger", A.debugger) && + O.mapOptional("command", A.command); } /// Helper function to create a CallToolResult from a string output. 
@@ -39,9 +49,13 @@ createTextResult(std::string output, bool is_error = false) { return text_result; } +std::string to_uri(DebuggerSP debugger) { + return (kSchemeAndHost + std::to_string(debugger->GetID())).str(); +} + } // namespace -llvm::Expected +Expected CommandTool::Call(const lldb_protocol::mcp::ToolArguments &args) { if (!std::holds_alternative(args)) return createStringError("CommandTool requires arguments"); @@ -52,19 +66,35 @@ CommandTool::Call(const lldb_protocol::mcp::ToolArguments &args) { if (!fromJSON(std::get(args), arguments, root)) return root.getError(); - lldb::DebuggerSP debugger_sp = - Debugger::FindDebuggerWithID(arguments.debugger_id); + lldb::DebuggerSP debugger_sp; + + if (!arguments.debugger.empty()) { + llvm::StringRef debugger_specifier = arguments.debugger; + debugger_specifier.consume_front(kSchemeAndHost); + uint32_t debugger_id = 0; + if (debugger_specifier.consumeInteger(10, debugger_id)) + return createStringError( + formatv("malformed debugger specifier {0}", arguments.debugger)); + + debugger_sp = Debugger::FindDebuggerWithID(debugger_id); + } else { + for (size_t i = 0; i < Debugger::GetNumDebuggers(); i++) { + debugger_sp = Debugger::GetDebuggerAtIndex(i); + if (debugger_sp) + break; + } + } + if (!debugger_sp) - return createStringError( - llvm::formatv("no debugger with id {0}", arguments.debugger_id)); + return createStringError("no debugger found"); // FIXME: Disallow certain commands and their aliases. 
CommandReturnObject result(/*colors=*/false); - debugger_sp->GetCommandInterpreter().HandleCommand( - arguments.arguments.c_str(), eLazyBoolYes, result); + debugger_sp->GetCommandInterpreter().HandleCommand(arguments.command.c_str(), + eLazyBoolYes, result); std::string output; - llvm::StringRef output_str = result.GetOutputString(); + StringRef output_str = result.GetOutputString(); if (!output_str.empty()) output += output_str.str(); @@ -78,14 +108,42 @@ CommandTool::Call(const lldb_protocol::mcp::ToolArguments &args) { return createTextResult(output, !result.Succeeded()); } -std::optional CommandTool::GetSchema() const { - llvm::json::Object id_type{{"type", "number"}}; - llvm::json::Object str_type{{"type", "string"}}; - llvm::json::Object properties{{"debugger_id", std::move(id_type)}, - {"arguments", std::move(str_type)}}; - llvm::json::Array required{"debugger_id"}; - llvm::json::Object schema{{"type", "object"}, - {"properties", std::move(properties)}, - {"required", std::move(required)}}; +std::optional CommandTool::GetSchema() const { + using namespace llvm::json; + Object properties{ + {"debugger", + Object{{"type", "string"}, + {"description", + "The debugger ID or URI to a specific debug session. If not " + "specified, the first debugger will be used."}}}, + {"command", + Object{{"type", "string"}, {"description", "An lldb command to run."}}}}; + Object schema{{"type", "object"}, {"properties", std::move(properties)}}; return schema; } + +Expected +DebuggerListTool::Call(const lldb_protocol::mcp::ToolArguments &args) { + llvm::json::Path::Root root; + + // Return a nested Markdown list with debuggers and target. + // Example output: + // + // - lldb-mcp://debugger/1 + // - lldb-mcp://debugger/2 + // + // FIXME: Use Structured Content when we adopt protocol version 2025-06-18. 
+ std::string output; + llvm::raw_string_ostream os(output); + + const size_t num_debuggers = Debugger::GetNumDebuggers(); + for (size_t i = 0; i < num_debuggers; ++i) { + lldb::DebuggerSP debugger_sp = Debugger::GetDebuggerAtIndex(i); + if (!debugger_sp) + continue; + + os << "- " << to_uri(debugger_sp) << '\n'; + } + + return createTextResult(output); +} diff --git a/lldb/source/Plugins/Protocol/MCP/Tool.h b/lldb/source/Plugins/Protocol/MCP/Tool.h index 1886525b9168f..8450ce3d6c2dd 100644 --- a/lldb/source/Plugins/Protocol/MCP/Tool.h +++ b/lldb/source/Plugins/Protocol/MCP/Tool.h @@ -28,6 +28,15 @@ class CommandTool : public lldb_protocol::mcp::Tool { std::optional GetSchema() const override; }; +class DebuggerListTool : public lldb_protocol::mcp::Tool { +public: + using lldb_protocol::mcp::Tool::Tool; + ~DebuggerListTool() = default; + + llvm::Expected + Call(const lldb_protocol::mcp::ToolArguments &args) override; +}; + } // namespace lldb_private::mcp #endif diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index d3d0110d5e302..881268bc4ca03 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -2502,6 +2502,7 @@ static llvm::StringRef ClangToItaniumCtorKind(clang::CXXCtorType kind) { case clang::CXXCtorType::Ctor_Comdat: llvm_unreachable("Unexpected constructor kind."); } + llvm_unreachable("Fully covered switch above"); } static llvm::StringRef ClangToItaniumDtorKind(clang::CXXDtorType kind) { @@ -2517,6 +2518,7 @@ static llvm::StringRef ClangToItaniumDtorKind(clang::CXXDtorType kind) { case clang::CXXDtorType::Dtor_Comdat: llvm_unreachable("Unexpected destructor kind."); } + llvm_unreachable("Fully covered switch above"); } static llvm::StringRef @@ -4325,7 +4327,8 @@ void SymbolFileDWARF::Dump(lldb_private::Stream &s) { m_index->Dump(s); } -void SymbolFileDWARF::DumpClangAST(Stream &s, 
llvm::StringRef filter) { +void SymbolFileDWARF::DumpClangAST(Stream &s, llvm::StringRef filter, + bool show_color) { auto ts_or_err = GetTypeSystemForLanguage(eLanguageTypeC_plus_plus); if (!ts_or_err) return; @@ -4333,7 +4336,7 @@ void SymbolFileDWARF::DumpClangAST(Stream &s, llvm::StringRef filter) { TypeSystemClang *clang = llvm::dyn_cast_or_null(ts.get()); if (!clang) return; - clang->Dump(s.AsRawOstream(), filter); + clang->Dump(s.AsRawOstream(), filter, show_color); } bool SymbolFileDWARF::GetSeparateDebugInfo(StructuredData::Dictionary &d, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 85306d8b4fb5f..a60527b8eda33 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -277,7 +277,8 @@ class SymbolFileDWARF : public SymbolFileCommon { void Dump(Stream &s) override; - void DumpClangAST(Stream &s, llvm::StringRef filter) override; + void DumpClangAST(Stream &s, llvm::StringRef filter, + bool show_colors) override; /// List separate dwo files. 
bool GetSeparateDebugInfo(StructuredData::Dictionary &d, bool errors_only, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp index 8b8229a7020c5..a44bf82d3b705 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp @@ -1267,9 +1267,10 @@ CompilerDeclContext SymbolFileDWARFDebugMap::FindNamespace( return matching_namespace; } -void SymbolFileDWARFDebugMap::DumpClangAST(Stream &s, llvm::StringRef filter) { +void SymbolFileDWARFDebugMap::DumpClangAST(Stream &s, llvm::StringRef filter, + bool show_color) { ForEachSymbolFile("Dumping clang AST", [&](SymbolFileDWARF &oso_dwarf) { - oso_dwarf.DumpClangAST(s, filter); + oso_dwarf.DumpClangAST(s, filter, show_color); // The underlying assumption is that DumpClangAST(...) will obtain the // AST from the underlying TypeSystem and therefore we only need to do // this once and can stop after the first iteration hence we return true. diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h index bce1ed2671af0..74b97f610f29c 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h @@ -129,7 +129,8 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon { std::vector> ParseCallEdgesInFunction(UserID func_id) override; - void DumpClangAST(Stream &s, llvm::StringRef filter) override; + void DumpClangAST(Stream &s, llvm::StringRef filter, + bool show_color) override; /// List separate oso files. 
bool GetSeparateDebugInfo(StructuredData::Dictionary &d, bool errors_only, diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp index 933c4361d93da..51bdcc92b05a8 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp @@ -1454,8 +1454,9 @@ PdbAstBuilder::FromCompilerDeclContext(CompilerDeclContext context) { return static_cast(context.GetOpaqueDeclContext()); } -void PdbAstBuilder::Dump(Stream &stream, llvm::StringRef filter) { - m_clang.Dump(stream.AsRawOstream(), filter); +void PdbAstBuilder::Dump(Stream &stream, llvm::StringRef filter, + bool show_color) { + m_clang.Dump(stream.AsRawOstream(), filter, show_color); } clang::NamespaceDecl * diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.h b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.h index fef65227bc8f5..16247fcf88b12 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.h +++ b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.h @@ -87,7 +87,7 @@ class PdbAstBuilder { TypeSystemClang &clang() { return m_clang; } ClangASTImporter &GetClangASTImporter() { return m_importer; } - void Dump(Stream &stream, llvm::StringRef filter); + void Dump(Stream &stream, llvm::StringRef filter, bool show_color); clang::NamespaceDecl *FindNamespaceDecl(const clang::DeclContext *parent, llvm::StringRef name); diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp index e99c585d7eb1f..b866e473853f3 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp @@ -1054,7 +1054,44 @@ lldb::LanguageType SymbolFileNativePDB::ParseLanguage(CompileUnit &comp_unit) { return TranslateLanguage(item->m_compile_opts->getLanguage()); } -void 
SymbolFileNativePDB::AddSymbols(Symtab &symtab) {} +void SymbolFileNativePDB::AddSymbols(Symtab &symtab) { + auto *section_list = m_objfile_sp->GetSectionList(); + if (!section_list) + return; + + for (auto pid : m_index->publics().getPublicsTable()) { + PdbGlobalSymId global{pid, true}; + CVSymbol sym = m_index->ReadSymbolRecord(global); + auto kind = sym.kind(); + if (kind != S_PUB32) + continue; + PublicSym32 pub = + llvm::cantFail(SymbolDeserializer::deserializeAs(sym)); + + auto section_sp = section_list->FindSectionByID(pub.Segment); + if (!section_sp) + continue; + + lldb::SymbolType type = eSymbolTypeData; + if ((pub.Flags & PublicSymFlags::Function) != PublicSymFlags::None || + (pub.Flags & PublicSymFlags::Code) != PublicSymFlags::None) + type = eSymbolTypeCode; + + symtab.AddSymbol(Symbol(/*symID=*/pid, + /*name=*/pub.Name, + /*type=*/type, + /*external=*/true, + /*is_debug=*/true, + /*is_trampoline=*/false, + /*is_artificial=*/false, + /*section_sp=*/section_sp, + /*value=*/pub.Offset, + /*size=*/0, + /*size_is_valid=*/false, + /*contains_linker_annotations=*/false, + /*flags=*/0)); + } +} size_t SymbolFileNativePDB::ParseFunctions(CompileUnit &comp_unit) { std::lock_guard guard(GetModuleMutex()); @@ -1646,7 +1683,8 @@ size_t SymbolFileNativePDB::ParseSymbolArrayInScope( return count; } -void SymbolFileNativePDB::DumpClangAST(Stream &s, llvm::StringRef filter) { +void SymbolFileNativePDB::DumpClangAST(Stream &s, llvm::StringRef filter, + bool show_color) { auto ts_or_err = GetTypeSystemForLanguage(eLanguageTypeC_plus_plus); if (!ts_or_err) return; @@ -1654,7 +1692,7 @@ void SymbolFileNativePDB::DumpClangAST(Stream &s, llvm::StringRef filter) { TypeSystemClang *clang = llvm::dyn_cast_or_null(ts.get()); if (!clang) return; - clang->GetNativePDBParser()->Dump(s, filter); + clang->GetNativePDBParser()->Dump(s, filter, show_color); } void SymbolFileNativePDB::CacheGlobalBaseNames() { diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h 
b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h index 095b40c72c52a..2405f8b299339 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h @@ -157,7 +157,8 @@ class SymbolFileNativePDB : public SymbolFileCommon { PdbIndex &GetIndex() { return *m_index; }; - void DumpClangAST(Stream &s, llvm::StringRef filter) override; + void DumpClangAST(Stream &s, llvm::StringRef filter, + bool show_color) override; std::optional GetParentType(llvm::codeview::TypeIndex ti); diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index 2f0b6539445fb..0e2ca1784e7e9 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -1515,7 +1515,8 @@ void SymbolFilePDB::AddSymbols(lldb_private::Symtab &symtab) { symtab.Finalize(); } -void SymbolFilePDB::DumpClangAST(Stream &s, llvm::StringRef filter) { +void SymbolFilePDB::DumpClangAST(Stream &s, llvm::StringRef filter, + bool show_color) { auto type_system_or_err = GetTypeSystemForLanguage(lldb::eLanguageTypeC_plus_plus); if (auto err = type_system_or_err.takeError()) { @@ -1529,7 +1530,7 @@ void SymbolFilePDB::DumpClangAST(Stream &s, llvm::StringRef filter) { llvm::dyn_cast_or_null(ts.get()); if (!clang_type_system) return; - clang_type_system->Dump(s.AsRawOstream(), filter); + clang_type_system->Dump(s.AsRawOstream(), filter, show_color); } void SymbolFilePDB::FindTypesByRegex( diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.h b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.h index e6560813ce75e..ccbf02db1159f 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.h +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.h @@ -159,7 +159,8 @@ class SymbolFilePDB : public lldb_private::SymbolFileCommon { const llvm::pdb::IPDBSession &GetPDBSession() const; - void 
DumpClangAST(lldb_private::Stream &s, llvm::StringRef filter) override; + void DumpClangAST(lldb_private::Stream &s, llvm::StringRef filter, + bool show_color) override; private: struct SecContribInfo { diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 39aacdb58e694..1948f51c3f2e1 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -11,6 +11,7 @@ #include "clang/AST/DeclBase.h" #include "clang/AST/ExprCXX.h" #include "clang/Frontend/ASTConsumers.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatAdapters.h" #include "llvm/Support/FormatVariadic.h" @@ -3962,8 +3963,6 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, return 0; case clang::Type::DependentSizedExtVector: return eTypeHasChildren | eTypeIsVector; - case clang::Type::DependentTemplateSpecialization: - return eTypeIsTemplate; case clang::Type::Enum: if (pointee_or_element_clang_type) @@ -4237,8 +4236,6 @@ TypeSystemClang::GetTypeClass(lldb::opaque_compiler_type_t type) { break; case clang::Type::DependentName: break; - case clang::Type::DependentTemplateSpecialization: - break; case clang::Type::PackExpansion: break; @@ -5108,7 +5105,6 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::SubstTemplateTypeParmPack: case clang::Type::InjectedClassName: case clang::Type::DependentName: - case clang::Type::DependentTemplateSpecialization: case clang::Type::PackExpansion: case clang::Type::ObjCObject: @@ -5277,7 +5273,6 @@ lldb::Format TypeSystemClang::GetFormat(lldb::opaque_compiler_type_t type) { case clang::Type::SubstTemplateTypeParmPack: case clang::Type::InjectedClassName: case clang::Type::DependentName: - case clang::Type::DependentTemplateSpecialization: case clang::Type::PackExpansion: case clang::Type::ObjCObject: @@ -6171,8 
+6166,6 @@ uint32_t TypeSystemClang::GetNumPointeeChildren(clang::QualType type) { return 0; case clang::Type::DependentName: return 1; - case clang::Type::DependentTemplateSpecialization: - return 1; case clang::Type::ObjCObject: return 0; case clang::Type::ObjCInterface: @@ -8540,7 +8533,24 @@ TypeSystemClang::dump(lldb::opaque_compiler_type_t type) const { } #endif -void TypeSystemClang::Dump(llvm::raw_ostream &output, llvm::StringRef filter) { +namespace { +struct ScopedASTColor { + ScopedASTColor(clang::ASTContext &ast, bool show_colors) + : ast(ast), old_show_colors(ast.getDiagnostics().getShowColors()) { + ast.getDiagnostics().setShowColors(show_colors); + } + + ~ScopedASTColor() { ast.getDiagnostics().setShowColors(old_show_colors); } + + clang::ASTContext &ast; + const bool old_show_colors; +}; +} // namespace + +void TypeSystemClang::Dump(llvm::raw_ostream &output, llvm::StringRef filter, + bool show_color) { + ScopedASTColor colored(getASTContext(), show_color); + auto consumer = clang::CreateASTDumper(output, filter, /*DumpDecls=*/true, @@ -9683,10 +9693,10 @@ GetNameForIsolatedASTKind(ScratchTypeSystemClang::IsolatedASTKind kind) { } void ScratchTypeSystemClang::Dump(llvm::raw_ostream &output, - llvm::StringRef filter) { + llvm::StringRef filter, bool show_color) { // First dump the main scratch AST. output << "State of scratch Clang type system:\n"; - TypeSystemClang::Dump(output, filter); + TypeSystemClang::Dump(output, filter, show_color); // Now sort the isolated sub-ASTs.
typedef std::pair KeyAndTS; @@ -9701,7 +9711,7 @@ void ScratchTypeSystemClang::Dump(llvm::raw_ostream &output, static_cast(a.first); output << "State of scratch Clang type subsystem " << GetNameForIsolatedASTKind(kind) << ":\n"; - a.second->Dump(output, filter); + a.second->Dump(output, filter, show_color); } } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 709f89590ba3b..9e0a54209345d 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -1074,7 +1074,8 @@ class TypeSystemClang : public TypeSystem { #endif /// \see lldb_private::TypeSystem::Dump - void Dump(llvm::raw_ostream &output, llvm::StringRef filter) override; + void Dump(llvm::raw_ostream &output, llvm::StringRef filter, + bool show_color) override; /// Dump clang AST types from the symbol file. /// @@ -1318,7 +1319,8 @@ class ScratchTypeSystemClang : public TypeSystemClang { } /// \see lldb_private::TypeSystem::Dump - void Dump(llvm::raw_ostream &output, llvm::StringRef filter) override; + void Dump(llvm::raw_ostream &output, llvm::StringRef filter, + bool show_color) override; UserExpression *GetUserExpression(llvm::StringRef expr, llvm::StringRef prefix, diff --git a/lldb/source/Protocol/MCP/Server.cpp b/lldb/source/Protocol/MCP/Server.cpp index f3489c620832f..a08874e7321af 100644 --- a/lldb/source/Protocol/MCP/Server.cpp +++ b/lldb/source/Protocol/MCP/Server.cpp @@ -22,34 +22,32 @@ using namespace llvm; using namespace lldb_private; using namespace lldb_protocol::mcp; -ServerInfoHandle::ServerInfoHandle() : ServerInfoHandle("") {} - ServerInfoHandle::ServerInfoHandle(StringRef filename) : m_filename(filename) { if (!m_filename.empty()) sys::RemoveFileOnSignal(m_filename); } -ServerInfoHandle::~ServerInfoHandle() { - if (m_filename.empty()) - return; - - sys::fs::remove(m_filename); - sys::DontRemoveFileOnSignal(m_filename); - 
m_filename.clear(); -} +ServerInfoHandle::~ServerInfoHandle() { Remove(); } -ServerInfoHandle::ServerInfoHandle(ServerInfoHandle &&other) - : m_filename(other.m_filename) { +ServerInfoHandle::ServerInfoHandle(ServerInfoHandle &&other) { *this = std::move(other); } ServerInfoHandle & ServerInfoHandle::operator=(ServerInfoHandle &&other) noexcept { - m_filename = other.m_filename; - other.m_filename.clear(); + m_filename = std::move(other.m_filename); return *this; } +void ServerInfoHandle::Remove() { + if (m_filename.empty()) + return; + + sys::fs::remove(m_filename); + sys::DontRemoveFileOnSignal(m_filename); + m_filename.clear(); +} + json::Value lldb_protocol::mcp::toJSON(const ServerInfo &SM) { return json::Object{{"connection_uri", SM.connection_uri}}; } diff --git a/lldb/source/Symbol/SymbolFileOnDemand.cpp b/lldb/source/Symbol/SymbolFileOnDemand.cpp index 807c2124e48d9..9fc13c5989fc2 100644 --- a/lldb/source/Symbol/SymbolFileOnDemand.cpp +++ b/lldb/source/Symbol/SymbolFileOnDemand.cpp @@ -306,13 +306,13 @@ void SymbolFileOnDemand::Dump(lldb_private::Stream &s) { } void SymbolFileOnDemand::DumpClangAST(lldb_private::Stream &s, - llvm::StringRef filter) { + llvm::StringRef filter, bool show_color) { if (!m_debug_info_enabled) { LLDB_LOG(GetLog(), "[{0}] {1} is skipped", GetSymbolFileName(), __FUNCTION__); return; } - return m_sym_file_impl->DumpClangAST(s, filter); + return m_sym_file_impl->DumpClangAST(s, filter, show_color); } void SymbolFileOnDemand::FindGlobalVariables(const RegularExpression ®ex, diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index 787eb94be3b48..3b018c09b8b72 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -2039,8 +2039,6 @@ bool RegisterContextUnwind::ReadFrameAddress( reg_info, cfa_reg_contents, reg_info->byte_size, reg_value); if (error.Success()) { address = reg_value.GetAsUInt64(); - if (abi_sp) - address = 
abi_sp->FixCodeAddress(address); UnwindLogMsg( "CFA value via dereferencing reg %s (%d): reg has val 0x%" PRIx64 ", CFA value is 0x%" PRIx64, @@ -2062,8 +2060,6 @@ bool RegisterContextUnwind::ReadFrameAddress( RegisterNumber cfa_reg(m_thread, row_register_kind, fa.GetRegisterNumber()); if (ReadGPRValue(cfa_reg, cfa_reg_contents)) { - if (abi_sp) - cfa_reg_contents = abi_sp->FixDataAddress(cfa_reg_contents); if (cfa_reg_contents == LLDB_INVALID_ADDRESS || cfa_reg_contents == 0 || cfa_reg_contents == 1) { UnwindLogMsg( @@ -2100,9 +2096,6 @@ bool RegisterContextUnwind::ReadFrameAddress( dwarfexpr.Evaluate(&exe_ctx, this, 0, nullptr, nullptr); if (result) { address = result->GetScalar().ULongLong(); - if (ABISP abi_sp = m_thread.GetProcess()->GetABI()) - address = abi_sp->FixCodeAddress(address); - UnwindLogMsg("CFA value set by DWARF expression is 0x%" PRIx64, address); return true; @@ -2143,7 +2136,6 @@ bool RegisterContextUnwind::ReadFrameAddress( } case UnwindPlan::Row::FAValue::isConstant: { address = fa.GetConstant(); - address = m_thread.GetProcess()->FixDataAddress(address); UnwindLogMsg("CFA value set by constant is 0x%" PRIx64, address); return true; } diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index fa5d159c0c91a..ccf874fc03ebd 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -449,7 +449,7 @@ bool StackFrameList::FetchFramesUpTo(uint32_t end_idx, } } else { unwind_frame_sp = m_frames.front(); - cfa = unwind_frame_sp->m_id.GetCallFrameAddress(); + cfa = unwind_frame_sp->m_id.GetCallFrameAddressWithoutMetadata(); } } else { // Check for interruption when building the frames. 
diff --git a/lldb/source/Target/StackID.cpp b/lldb/source/Target/StackID.cpp index f879276527dda..137c776a84d2f 100644 --- a/lldb/source/Target/StackID.cpp +++ b/lldb/source/Target/StackID.cpp @@ -17,7 +17,8 @@ using namespace lldb_private; StackID::StackID(lldb::addr_t pc, lldb::addr_t cfa, SymbolContextScope *symbol_scope, Process *process) - : m_pc(pc), m_cfa(cfa), m_symbol_scope(symbol_scope) { + : m_pc(pc), m_cfa(cfa), m_cfa_with_metadata(cfa), + m_symbol_scope(symbol_scope) { if (process) { m_pc = process->FixCodeAddress(m_pc); m_cfa = process->FixDataAddress(m_cfa); @@ -29,6 +30,7 @@ void StackID::SetPC(lldb::addr_t pc, Process *process) { } void StackID::SetCFA(lldb::addr_t cfa, Process *process) { + m_cfa_with_metadata = cfa; m_cfa = process ? process->FixDataAddress(cfa) : cfa; } @@ -49,7 +51,8 @@ void StackID::Dump(Stream *s) { } bool lldb_private::operator==(const StackID &lhs, const StackID &rhs) { - if (lhs.GetCallFrameAddress() != rhs.GetCallFrameAddress()) + if (lhs.GetCallFrameAddressWithoutMetadata() != + rhs.GetCallFrameAddressWithoutMetadata()) return false; SymbolContextScope *lhs_scope = lhs.GetSymbolContextScope(); @@ -67,8 +70,8 @@ bool lldb_private::operator!=(const StackID &lhs, const StackID &rhs) { } bool lldb_private::operator<(const StackID &lhs, const StackID &rhs) { - const lldb::addr_t lhs_cfa = lhs.GetCallFrameAddress(); - const lldb::addr_t rhs_cfa = rhs.GetCallFrameAddress(); + const lldb::addr_t lhs_cfa = lhs.GetCallFrameAddressWithoutMetadata(); + const lldb::addr_t rhs_cfa = rhs.GetCallFrameAddressWithoutMetadata(); // FIXME: We are assuming that the stacks grow downward in memory. 
That's not // necessary, but true on diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 1b8dae39735df..2a87cc6bf7de9 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -545,6 +545,8 @@ const char *ArchSpec::GetArchitectureName() const { bool ArchSpec::IsMIPS() const { return GetTriple().isMIPS(); } +bool ArchSpec::IsNVPTX() const { return GetTriple().isNVPTX(); } + std::string ArchSpec::GetTargetABI() const { std::string abi; diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp index 7fbe46d46194f..c8766bdf2aee7 100644 --- a/lldb/source/Utility/Scalar.cpp +++ b/lldb/source/Utility/Scalar.cpp @@ -565,12 +565,13 @@ const Scalar lldb_private::operator-(Scalar lhs, Scalar rhs) { const Scalar lldb_private::operator/(Scalar lhs, Scalar rhs) { Scalar result; - if ((result.m_type = Scalar::PromoteToMaxType(lhs, rhs)) != Scalar::e_void && - !rhs.IsZero()) { + if ((result.m_type = Scalar::PromoteToMaxType(lhs, rhs)) != Scalar::e_void) { switch (result.m_type) { case Scalar::e_void: break; case Scalar::e_int: + if (rhs.IsZero()) + break; result.m_integer = lhs.m_integer / rhs.m_integer; return result; case Scalar::e_float: diff --git a/lldb/source/ValueObject/DILAST.cpp b/lldb/source/ValueObject/DILAST.cpp index 70564663a62cd..7ed34db6e20df 100644 --- a/lldb/source/ValueObject/DILAST.cpp +++ b/lldb/source/ValueObject/DILAST.cpp @@ -46,4 +46,9 @@ llvm::Expected FloatLiteralNode::Accept(Visitor *v) const { return v->Visit(this); } +llvm::Expected +BooleanLiteralNode::Accept(Visitor *v) const { + return v->Visit(this); +} + } // namespace lldb_private::dil diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp index c6cf41ee9e9ee..a9dbfad298d05 100644 --- a/lldb/source/ValueObject/DILEval.cpp +++ b/lldb/source/ValueObject/DILEval.cpp @@ -602,4 +602,10 @@ Interpreter::Visit(const FloatLiteralNode *node) { "result"); } +llvm::Expected 
+Interpreter::Visit(const BooleanLiteralNode *node) { + bool value = node->GetValue(); + return ValueObject::CreateValueObjectFromBool(m_target, value, "result"); +} + } // namespace lldb_private::dil diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index 0b2288a9d9230..e0202a2fe24cc 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -34,6 +34,10 @@ llvm::StringRef Token::GetTokenName(Kind kind) { return "identifier"; case Kind::integer_constant: return "integer_constant"; + case Kind::kw_false: + return "false"; + case Kind::kw_true: + return "true"; case Kind::l_paren: return "l_paren"; case Kind::l_square: @@ -42,7 +46,6 @@ llvm::StringRef Token::GetTokenName(Kind kind) { return "minus"; case Kind::period: return "period"; - return "l_square"; case Kind::plus: return "plus"; case Kind::r_paren: @@ -137,8 +140,14 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, return Token(kind, maybe_number->str(), position); } std::optional maybe_word = IsWord(expr, remainder); - if (maybe_word) - return Token(Token::identifier, maybe_word->str(), position); + if (maybe_word) { + llvm::StringRef word = *maybe_word; + Token::Kind kind = llvm::StringSwitch(word) + .Case("false", Token::kw_false) + .Case("true", Token::kw_true) + .Default(Token::identifier); + return Token(kind, word.str(), position); + } constexpr std::pair operators[] = { {Token::amp, "&"}, {Token::arrow, "->"}, {Token::coloncolon, "::"}, diff --git a/lldb/source/ValueObject/DILParser.cpp b/lldb/source/ValueObject/DILParser.cpp index 8c4f7fdb25bea..566bcaf81094a 100644 --- a/lldb/source/ValueObject/DILParser.cpp +++ b/lldb/source/ValueObject/DILParser.cpp @@ -180,12 +180,15 @@ ASTNodeUP DILParser::ParsePostfixExpression() { // // primary_expression: // numeric_literal +// boolean_literal // id_expression // "(" expression ")" // ASTNodeUP DILParser::ParsePrimaryExpression() { if 
(CurToken().IsOneOf({Token::integer_constant, Token::float_constant})) return ParseNumericLiteral(); + if (CurToken().IsOneOf({Token::kw_true, Token::kw_false})) + return ParseBooleanLiteral(); if (CurToken().IsOneOf( {Token::coloncolon, Token::identifier, Token::l_paren})) { // Save the source location for the diagnostics message. @@ -336,6 +339,20 @@ std::string DILParser::ParseUnqualifiedId() { return identifier; } +// Parse a boolean_literal. +// +// boolean_literal: +// "true" +// "false" +// +ASTNodeUP DILParser::ParseBooleanLiteral() { + ExpectOneOf(std::vector{Token::kw_true, Token::kw_false}); + uint32_t loc = CurToken().GetLocation(); + bool literal_value = CurToken().Is(Token::kw_true); + m_dil_lexer.Advance(); + return std::make_unique(loc, literal_value); +} + void DILParser::BailOut(const std::string &error, uint32_t loc, uint16_t err_len) { if (m_error) @@ -444,4 +461,12 @@ void DILParser::Expect(Token::Kind kind) { } } +void DILParser::ExpectOneOf(std::vector kinds_vec) { + if (!CurToken().IsOneOf(kinds_vec)) { + BailOut(llvm::formatv("expected any of ({0}), got: {1}", + llvm::iterator_range(kinds_vec), CurToken()), + CurToken().GetLocation(), CurToken().GetSpelling().length()); + } +} + } // namespace lldb_private::dil diff --git a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py index 492d49f008a9e..82ff59f74f41f 100644 --- a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py +++ b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py @@ -16,7 +16,7 @@ def _run_cmd(self, cmd: str) -> str: self.ci.HandleCommand(cmd, result) return result.GetOutput().rstrip() - VAR_IDENT = re.compile(r"(?:\$\d+|[\w.]+) = ") + VAR_IDENT = re.compile(r"(?:\$\d+|(?:::)?[\w.]+) = ") def _strip_result_var(self, string: str) -> str: """ @@ -185,3 +185,11 @@ def test_direct_child_access(self): self, "break inside", lldb.SBFileSpec("main.cpp") ) self._expect_cmd("dwim-print number", "frame variable") + + def 
test_global_variables(self): + """Test dwim-print supports global variables.""" + self.build() + lldbutil.run_to_source_breakpoint( + self, "break here", lldb.SBFileSpec("main.cpp") + ) + self._expect_cmd("dwim-print gGlobal", "frame variable") diff --git a/lldb/test/API/commands/dwim-print/main.cpp b/lldb/test/API/commands/dwim-print/main.cpp index d1abb5a85dd45..5b7cbd7da764b 100644 --- a/lldb/test/API/commands/dwim-print/main.cpp +++ b/lldb/test/API/commands/dwim-print/main.cpp @@ -1,5 +1,8 @@ extern "C" int puts(const char *s); +extern int gGlobal; +int gGlobal = 23; + struct Structure { int number = 30; void f() { puts("break inside"); } diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py index 1c32222e64f14..5f43fa1ea5662 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py @@ -13,7 +13,9 @@ class TestDbgInfoContentVector(TestBase): @skipIf(compiler=no_match("clang")) @skipIf(compiler="clang", compiler_version=["<", "12.0"]) @skipIf(macos_version=["<", "14.0"]) - @skipIfDarwin # https://github.com/llvm/llvm-project/issues/106475 + @skipIf( + bugnumber="ASTImport of lambdas not supported: https://github.com/llvm/llvm-project/issues/149477" + ) def test(self): self.build() diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py index 2cddce09aa8b8..f0eb4758ca54b 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py +++ 
b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py @@ -11,6 +11,9 @@ class TestVectorOfVectors(TestBase): @add_test_categories(["libc++"]) @skipIf(compiler=no_match("clang")) @skipIf(macos_version=["<", "15.0"]) + @skipIf( + bugnumber="ASTImport of lambdas not supported: https://github.com/llvm/llvm-project/issues/149477" + ) def test(self): self.build() diff --git a/lldb/test/API/commands/frame/var-dil/expr/Literals/TestFrameVarDILLiterals.py b/lldb/test/API/commands/frame/var-dil/expr/Literals/TestFrameVarDILLiterals.py index 431ec2829bc75..ca3357cd683a0 100644 --- a/lldb/test/API/commands/frame/var-dil/expr/Literals/TestFrameVarDILLiterals.py +++ b/lldb/test/API/commands/frame/var-dil/expr/Literals/TestFrameVarDILLiterals.py @@ -19,6 +19,10 @@ def test_literals(self): self.runCmd("settings set target.experimental.use-DIL true") + # Check boolean literals parsing + self.expect_var_path("true", value="true", type="bool") + self.expect_var_path("false", value="false", type="bool") + # Check number literals parsing self.expect_var_path("1.0", value="1", type="double") self.expect_var_path("1.0f", value="1", type="float") diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/Makefile index 8b322ff320bb0..5ef65e0c08451 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/Makefile +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/Makefile @@ -1,6 +1,6 @@ OBJC_SOURCES := main.m -CFLAGS_EXTRAS := -w +CFLAGS_EXTRAS := -w -Wno-error=incompatible-pointer-types diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered_map-iterator/TestDataFormatterStdUnorderedMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered_map-iterator/TestDataFormatterStdUnorderedMap.py index d2382373f4810..1e920faab6397 100644 
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered_map-iterator/TestDataFormatterStdUnorderedMap.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered_map-iterator/TestDataFormatterStdUnorderedMap.py @@ -113,7 +113,6 @@ def do_test_ptr(self): Test that pointers to std::unordered_map are formatted correctly. """ - self.build() (self.target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( self, "Stop here", lldb.SBFileSpec("main.cpp", False) ) diff --git a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py index ff1ef21e02e31..179dbdf88fa8a 100644 --- a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py +++ b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py @@ -117,6 +117,8 @@ def check_backtrace(self, thread, filename, backtrace): ) def do_test(self, filename, pid, region_count): + # Temporary workaround for https://github.com/llvm/llvm-project/issues/159377 + self.runCmd("settings set target.parallel-module-load false") target = self.dbg.CreateTarget(filename) process = target.LoadCore(filename + ".core") diff --git a/lldb/test/API/functionalities/thread/exit_during_expression/main.c b/lldb/test/API/functionalities/thread/exit_during_expression/main.c index f633632e96cc4..c4a9d20627802 100644 --- a/lldb/test/API/functionalities/thread/exit_during_expression/main.c +++ b/lldb/test/API/functionalities/thread/exit_during_expression/main.c @@ -27,7 +27,7 @@ void *exiting_thread_func(void *unused) { } int main() { - char *exit_ptr; + void *exit_ptr; pthread_t exiting_thread; pthread_create(&exiting_thread, NULL, exiting_thread_func, NULL); diff --git a/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py b/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py index 
30b2a525eaab1..768dd6fe6867c 100644 --- a/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py +++ b/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py @@ -26,10 +26,7 @@ class TestCortexMExceptionUnwind(TestBase): # the frame pointer, and we can walk the stack. # ABISysV_arm::CreateDefaultUnwindPlan will only get one frame and # not be able to continue. - # - # This may only be occuring on a 32-bit Ubuntu bot; need to test - # 64-bit Ubuntu and confirm. - @skipUnlessDarwin + @skipIfRemote def test_no_fpu(self): """Test that we can backtrace correctly through an ARM Cortex-M Exception return stack""" diff --git a/lldb/test/API/functionalities/valobj_errors/hidden.c b/lldb/test/API/functionalities/valobj_errors/hidden.c index d3b93ce1ab9cf..616a2c330b9f3 100644 --- a/lldb/test/API/functionalities/valobj_errors/hidden.c +++ b/lldb/test/API/functionalities/valobj_errors/hidden.c @@ -1,4 +1,4 @@ struct Opaque { int i, j, k; } *global; -struct Opaque *getOpaque() { return &global; } +struct Opaque *getOpaque() { return (struct Opaque *)&global; } diff --git a/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/Makefile b/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/Makefile new file mode 100644 index 0000000000000..f0de8ffca59fc --- /dev/null +++ b/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/Makefile @@ -0,0 +1,11 @@ +ASM_SOURCES := main.s + +# This is to appease Makefile.rules, there is no main.c +C_SOURCES := main.c + +ASM_OBJS := $(ASM_SOURCES:.s=.o) + +%.o: %.s + $(CC) -c -x assembler $< -o $@ + +include Makefile.rules diff --git a/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/TestArmPointerMetadataCFADwarfExpr.py b/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/TestArmPointerMetadataCFADwarfExpr.py new file mode 100644 index 0000000000000..839e0e1a4fc4d --- /dev/null +++ 
b/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/TestArmPointerMetadataCFADwarfExpr.py @@ -0,0 +1,35 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +@skipUnlessDarwin +@skipIf(archs=no_match(["arm64"])) +class TestArmPointerMetadataStripping(TestBase): + def test(self): + self.build() + target, process, thread, bkpt = lldbutil.run_to_name_breakpoint(self, "foo") + + # Step over the first two instructions of foo in order to + # toggle the bit of fp and save it on the stack: + # orr x29, x29, #0x1000000000000000 + # stp x29, x30, [sp, #-16]! + # This is effectively adding metadata to the CFA of the caller frame (main). + thread.StepInstruction(False) + thread.StepInstruction(False) + + # The location of `argv` has been artificially made equal to the CFA of the frame. + # As such, it should have the metadata artificially set previously. + argv_addr = thread.frames[1].GetValueForVariablePath("&argv") + self.assertTrue(argv_addr.IsValid()) + argv_addr_uint = argv_addr.GetValueAsUnsigned() + self.assertNotEqual((argv_addr_uint & (1 << 60)), 0) + + # GetCFA strips metadata. + cfa = thread.frames[1].GetCFA() + self.assertEqual((cfa & (1 << 60)), 0) + + # If the test worked correctly, the cfa and the location should be identical, + # modulo the metadata. + self.assertEqual(cfa | (1 << 60), argv_addr_uint) diff --git a/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/main.s b/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/main.s new file mode 100644 index 0000000000000..0825c5ddd08b5 --- /dev/null +++ b/lldb/test/API/macosx/arm-pointer-metadata-cfa-dwarf-expr/main.s @@ -0,0 +1,226 @@ +; The assembly below corresponds to this program: +; __attribute__((nodebug)) +; int foo() { +; return 10; +; } +; int main(int argc, char **argv) { +; foo(); +; return 0; +; } +; +; The assembly was edited in two places (search for "EDIT"): +; 1. 
A "orr x29, x29, #0x1000000000000000" instruction was added in foo. This +; effectively changes the CFA value of the frame above foo (i.e. main). +; 2. In main, the DWARF location of `argv` was changed to DW_AT_call_frame_cfa. +; +; This allows us to stop in foo, go to frame 1 (main) and do `v &argv`, +; obtaining the result of evaluating DW_AT_call_frame_cfa. + + .section __TEXT,__text,regular,pure_instructions + .globl _foo ; -- Begin function foo + .p2align 2 +_foo: ; @foo +Lfunc_begin0: + .cfi_startproc + orr x29, x29, #0x1000000000000000 ; EDIT: Set top byte of fp. + stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill + mov x29, sp + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + mov w0, #10 ; =0xa + ldp x29, x30, [sp], #16 ; 16-byte Folded Reload + ret +Lfunc_end0: + .cfi_endproc + ; -- End function + .globl _main ; -- Begin function main + .p2align 2 +_main: ; @main +Lfunc_begin1: + .file 1 "/test" "test.c" + .loc 1 6 0 ; test.c:6:0 + .cfi_startproc + sub sp, sp, #48 + stp x29, x30, [sp, #32] ; 16-byte Folded Spill + add x29, sp, #32 + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + mov w8, #0 ; =0x0 + str w8, [sp, #12] ; 4-byte Folded Spill + stur wzr, [x29, #-4] + stur w0, [x29, #-8] + str x1, [sp, #16] +Ltmp0: + bl _foo + ldr w0, [sp, #12] ; 4-byte Folded Reload + ldp x29, x30, [sp, #32] ; 16-byte Folded Reload + add sp, sp, #48 + ret +Ltmp1: +Lfunc_end1: + .cfi_endproc + ; -- End function + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ; Abbreviation Code + .byte 17 ; DW_TAG_compile_unit + .byte 1 ; DW_CHILDREN_yes + .byte 37 ; DW_AT_producer + .byte 14 ; DW_FORM_strp + .byte 19 ; DW_AT_language + .byte 5 ; DW_FORM_data2 + .byte 3 ; DW_AT_name + .byte 14 ; DW_FORM_strp + .ascii "\202|" ; DW_AT_LLVM_sysroot + .byte 14 ; DW_FORM_strp + .ascii "\357\177" ; DW_AT_APPLE_sdk + .byte 14 ; DW_FORM_strp + .byte 16 ; DW_AT_stmt_list + .byte 23 ; DW_FORM_sec_offset + .byte 27 ; DW_AT_comp_dir + .byte 
14 ; DW_FORM_strp + .byte 17 ; DW_AT_low_pc + .byte 1 ; DW_FORM_addr + .byte 18 ; DW_AT_high_pc + .byte 6 ; DW_FORM_data4 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 2 ; Abbreviation Code + .byte 46 ; DW_TAG_subprogram + .byte 1 ; DW_CHILDREN_yes + .byte 17 ; DW_AT_low_pc + .byte 1 ; DW_FORM_addr + .byte 18 ; DW_AT_high_pc + .byte 6 ; DW_FORM_data4 + .byte 64 ; DW_AT_frame_base + .byte 24 ; DW_FORM_exprloc + .byte 3 ; DW_AT_name + .byte 14 ; DW_FORM_strp + .byte 58 ; DW_AT_decl_file + .byte 11 ; DW_FORM_data1 + .byte 59 ; DW_AT_decl_line + .byte 11 ; DW_FORM_data1 + .byte 39 ; DW_AT_prototyped + .byte 25 ; DW_FORM_flag_present + .byte 73 ; DW_AT_type + .byte 19 ; DW_FORM_ref4 + .byte 63 ; DW_AT_external + .byte 25 ; DW_FORM_flag_present + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 3 ; Abbreviation Code + .byte 5 ; DW_TAG_formal_parameter + .byte 0 ; DW_CHILDREN_no + .byte 2 ; DW_AT_location + .byte 24 ; DW_FORM_exprloc + .byte 3 ; DW_AT_name + .byte 14 ; DW_FORM_strp + .byte 58 ; DW_AT_decl_file + .byte 11 ; DW_FORM_data1 + .byte 59 ; DW_AT_decl_line + .byte 11 ; DW_FORM_data1 + .byte 73 ; DW_AT_type + .byte 19 ; DW_FORM_ref4 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 4 ; Abbreviation Code + .byte 36 ; DW_TAG_base_type + .byte 0 ; DW_CHILDREN_no + .byte 3 ; DW_AT_name + .byte 14 ; DW_FORM_strp + .byte 62 ; DW_AT_encoding + .byte 11 ; DW_FORM_data1 + .byte 11 ; DW_AT_byte_size + .byte 11 ; DW_FORM_data1 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 5 ; Abbreviation Code + .byte 15 ; DW_TAG_pointer_type + .byte 0 ; DW_CHILDREN_no + .byte 73 ; DW_AT_type + .byte 19 ; DW_FORM_ref4 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 0 ; EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: +.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ; Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 4 ; DWARF version number +.set Lset1, Lsection_abbrev-Lsection_abbrev ; Offset Into Abbrev. 
Section + .long Lset1 + .byte 8 ; Address Size (in bytes) + .byte 1 ; Abbrev [1] 0xb:0x76 DW_TAG_compile_unit + .long 0 ; DW_AT_producer + .short 12 ; DW_AT_language + .long 47 ; DW_AT_name + .long 54 ; DW_AT_LLVM_sysroot + .long 165 ; DW_AT_APPLE_sdk +.set Lset2, Lline_table_start0-Lsection_line ; DW_AT_stmt_list + .long Lset2 + .long 180 ; DW_AT_comp_dir + .quad Lfunc_begin1 ; DW_AT_low_pc +.set Lset3, Lfunc_end1-Lfunc_begin1 ; DW_AT_high_pc + .long Lset3 + .byte 2 ; Abbrev [2] 0x32:0x36 DW_TAG_subprogram + .quad Lfunc_begin1 ; DW_AT_low_pc +.set Lset4, Lfunc_end1-Lfunc_begin1 ; DW_AT_high_pc + .long Lset4 + .byte 1 ; DW_AT_frame_base + .byte 109 + .long 247 ; DW_AT_name + .byte 1 ; DW_AT_decl_file + .byte 6 ; DW_AT_decl_line + ; DW_AT_prototyped + .long 107 ; DW_AT_type + ; DW_AT_external + .byte 3 ; Abbrev [3] 0x4b:0xe DW_TAG_formal_parameter + .byte 2 ; DW_AT_location + .byte 145 + .byte 120 + .long 256 ; DW_AT_name + .byte 1 ; DW_AT_decl_file + .byte 6 ; DW_AT_decl_line + .long 103 ; DW_AT_type + .byte 3 ; Abbrev [3] 0x59:0xe DW_TAG_formal_parameter + .byte 1 ; DW_AT_location + .byte 0x9c ; EDIT: DW_AT_call_frame_cfa + .long 261 ; DW_AT_name + .byte 1 ; DW_AT_decl_file + .byte 6 ; DW_AT_decl_line + .long 110 ; DW_AT_type + .byte 0 ; End Of Children Mark + .byte 4 ; Abbrev [4] 0x68:0x7 DW_TAG_base_type + .long 252 ; DW_AT_name + .byte 5 ; DW_AT_encoding + .byte 4 ; DW_AT_byte_size + .byte 5 ; Abbrev [5] 0x6f:0x5 DW_TAG_pointer_type + .long 115 ; DW_AT_type + .byte 5 ; Abbrev [5] 0x74:0x5 DW_TAG_pointer_type + .long 120 ; DW_AT_type + .byte 4 ; Abbrev [4] 0x79:0x7 DW_TAG_base_type + .long 266 ; DW_AT_name + .byte 6 ; DW_AT_encoding + .byte 1 ; DW_AT_byte_size + .byte 0 ; End Of Children Mark +Ldebug_info_end0: + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "Apple clang " ; string offset=0 + .asciz "test.c" ; string offset=47 + .asciz 
"/Applications/Xcode..........................................................................................." ; string offset=54 + .asciz ".............." ; string offset=165 + .asciz "......................................................../llvm_src1" ; string offset=180 + .asciz "main" ; string offset=247 + .asciz "int" ; string offset=252 + .asciz "argc" ; string offset=256 + .asciz "argv" ; string offset=261 + .asciz "char" ; string offset=266 +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: diff --git a/lldb/test/API/macosx/ignore_exceptions/Makefile b/lldb/test/API/macosx/ignore_exceptions/Makefile index 695335e068c0c..14ed92da5296b 100644 --- a/lldb/test/API/macosx/ignore_exceptions/Makefile +++ b/lldb/test/API/macosx/ignore_exceptions/Makefile @@ -1,4 +1,4 @@ C_SOURCES := main.c -CFLAGS_EXTRAS := -std=c99 +CFLAGS_EXTRAS := -std=c99 -Wno-error=incompatible-pointer-types include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py index d7d25ca20f85a..d3952e150e125 100644 --- a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py +++ b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py @@ -56,6 +56,7 @@ def test_by_pid(self): self.set_and_hit_breakpoint(continueToExit=True) @skipIfNetBSD # Hangs on NetBSD as well + @skipIfWindows # https://github.com/llvm/llvm-project/issues/137660 def test_by_name(self): """ Tests attaching to a process by process name. 
diff --git a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py index 109f34ff10a5d..9dea325694f00 100644 --- a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py +++ b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py @@ -70,6 +70,7 @@ def test_pending_request(self): self.assertEqual(cancel_resp["success"], True) self.continue_to_exit() + @skipIfWindows # https://github.com/llvm/llvm-project/issues/137660 def test_inflight_request(self): """ Tests cancelling an inflight request. diff --git a/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py b/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py index af5c62a8c4eb5..246ad3ae944cc 100644 --- a/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py +++ b/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py @@ -76,6 +76,7 @@ def test_incorrect_content_length(self): process.stdin.close() self.assertEqual(process.wait(timeout=5.0), EXIT_FAILURE) + @skipIfWindows # https://github.com/llvm/llvm-project/issues/137660 def test_partial_content_length(self): """ lldb-dap returns a failure exit code when the input stream is closed diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index 22fcd42b3d36a..096ce5e0236a2 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -6,6 +6,7 @@ from lldbsuite.test.lldbtest import * import lldbdap_testcase import os +import pathlib import re import tempfile @@ -143,6 +144,7 @@ def test_cwd(self): ) self.assertTrue(found, "verified program working directory") + @skipIfWindows # https://github.com/llvm/llvm-project/issues/137660 def test_debuggerRoot(self): """ Tests the "debuggerRoot" will change the working directory of @@ -624,3 +626,18 @@ def test_no_lldbinit_flag(self): # Verify the initCommands were executed self.verify_commands("initCommands", output, initCommands) + + def test_stdio_redirection(self): 
+ """ + Test stdio redirection. + """ + self.build_and_create_debug_adapter() + program = self.getBuildArtifact("a.out") + + with tempfile.NamedTemporaryFile("rt") as f: + self.launch(program, stdio=[None, f.name, None]) + self.continue_to_exit() + lines = f.readlines() + self.assertIn( + program, lines[0], "make sure program path is in first argument" + ) diff --git a/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py b/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py index f51056d7020c6..7c9ad0c0f75ee 100644 --- a/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py +++ b/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py @@ -72,9 +72,7 @@ def test_memory_refs_set_variable(self): ptr_value = self.get_local_as_int("rawptr") self.assertIn( "memoryReference", - self.dap_server.request_setVariable(1, "rawptr", ptr_value + 2)[ - "body" - ].keys(), + self.set_local("rawptr", ptr_value + 2)["body"].keys(), ) @skipIfWindows diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py index a3a4bdaaf40a6..13a694602f230 100644 --- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py +++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py @@ -298,7 +298,7 @@ def do_test_scopes_variables_setVariable_evaluate( # Set a variable value whose name is synthetic, like a variable index # and verify the value by reading it variable_value = 100 - response = self.dap_server.request_setVariable(varRef, "[0]", variable_value) + response = self.set_variable(varRef, "[0]", variable_value) # Verify dap sent the correct response verify_response = { "type": "int", @@ -315,7 +315,7 @@ def do_test_scopes_variables_setVariable_evaluate( # Set a variable value whose name is a real child value, like "pt.x" # and verify the value by reading it varRef = varref_dict["pt"] - self.dap_server.request_setVariable(varRef, "x", 111) + self.set_variable(varRef, "x", 111) response = 
self.dap_server.request_variables(varRef, start=0, count=1) value = response["body"]["variables"][0]["value"] self.assertEqual( @@ -341,27 +341,15 @@ def do_test_scopes_variables_setVariable_evaluate( self.verify_variables(verify_locals, self.dap_server.get_local_variables()) # Now we verify that we correctly change the name of a variable with and without differentiator suffix - self.assertFalse(self.dap_server.request_setVariable(1, "x2", 9)["success"]) - self.assertFalse( - self.dap_server.request_setVariable(1, "x @ main.cpp:0", 9)["success"] - ) + self.assertFalse(self.set_local("x2", 9)["success"]) + self.assertFalse(self.set_local("x @ main.cpp:0", 9)["success"]) - self.assertTrue( - self.dap_server.request_setVariable(1, "x @ main.cpp:19", 19)["success"] - ) - self.assertTrue( - self.dap_server.request_setVariable(1, "x @ main.cpp:21", 21)["success"] - ) - self.assertTrue( - self.dap_server.request_setVariable(1, "x @ main.cpp:23", 23)["success"] - ) + self.assertTrue(self.set_local("x @ main.cpp:19", 19)["success"]) + self.assertTrue(self.set_local("x @ main.cpp:21", 21)["success"]) + self.assertTrue(self.set_local("x @ main.cpp:23", 23)["success"]) # The following should have no effect - self.assertFalse( - self.dap_server.request_setVariable(1, "x @ main.cpp:23", "invalid")[ - "success" - ] - ) + self.assertFalse(self.set_local("x @ main.cpp:23", "invalid")["success"]) verify_locals["x @ main.cpp:19"]["equals"]["value"] = "19" verify_locals["x @ main.cpp:21"]["equals"]["value"] = "21" @@ -370,7 +358,7 @@ def do_test_scopes_variables_setVariable_evaluate( self.verify_variables(verify_locals, self.dap_server.get_local_variables()) # The plain x variable shold refer to the innermost x - self.assertTrue(self.dap_server.request_setVariable(1, "x", 22)["success"]) + self.assertTrue(self.set_local("x", 22)["success"]) verify_locals["x @ main.cpp:23"]["equals"]["value"] = "22" self.verify_variables(verify_locals, self.dap_server.get_local_variables()) @@ -708,9 
+696,7 @@ def test_return_variables(self): self.verify_variables(verify_locals, local_variables, varref_dict) break - self.assertFalse( - self.dap_server.request_setVariable(1, "(Return Value)", 20)["success"] - ) + self.assertFalse(self.set_local("(Return Value)", 20)["success"]) @skipIfWindows def test_indexedVariables(self): diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 39462560c4b98..8116f4c3c823a 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -250,6 +250,7 @@ llvm_canonicalize_cmake_booleans( LLDB_ENABLE_LZMA LLVM_ENABLE_ZLIB LLVM_ENABLE_SHARED_LIBS + LLVM_ENABLE_DIA_SDK LLDB_HAS_LIBCXX LLDB_TEST_SHELL_DISABLE_REMOTE LLDB_TOOL_LLDB_SERVER_BUILD diff --git a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test new file mode 100644 index 0000000000000..355ef6bb1d199 --- /dev/null +++ b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test @@ -0,0 +1,34 @@ +# Test AST dumping with and without color. 
+ +# RUN: split-file %s %t +# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s + +#--- main.cpp + +int main() {} + +#--- commands.input + +settings set use-color true + +target modules dump ast + +# CHECK: target modules dump ast +# CHECK: TranslationUnitDecl +# CHECK-SAME:  + +target dump typesystem + +# CHECK: target dump typesystem +# CHECK: TranslationUnitDecl +# CHECK-SAME:  + +settings set use-color false + +target modules dump ast +target dump typesystem + +# CHECK: settings set use-color false +# CHECK-NOT:  diff --git a/lldb/test/Shell/SymbolFile/NativePDB/disassembly.cpp b/lldb/test/Shell/SymbolFile/NativePDB/disassembly.cpp index db3b85fa7e59f..b3f7b098a95d9 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/disassembly.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/disassembly.cpp @@ -18,9 +18,7 @@ int main(int argc, char **argv) { // CHECK: (lldb) disassemble --flavor=intel -m -n main -// CHECK: 12 int foo() { return 42; } -// CHECK-NEXT: 13 -// CHECK-NEXT: ** 14 int main(int argc, char **argv) { +// CHECK: ** 14 int main(int argc, char **argv) { // CHECK: disassembly.cpp.tmp.exe`main: // CHECK-NEXT: disassembly.cpp.tmp.exe[{{.*}}] <+0>: sub rsp, 0x38 // CHECK-NEXT: disassembly.cpp.tmp.exe[{{.*}}] <+4>: mov dword ptr [rsp + 0x34], 0x0 diff --git a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites.test b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites.test index 6293148d90ce4..769f18de51472 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites.test +++ b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites.test @@ -61,6 +61,7 @@ # CHECK: Function: id = {{.*}}, name = "main", range = [0x0000000140001000-0x0000000140001046) # CHECK: Blocks: id = {{.*}}, range = [0x140001000-0x140001046) # CHECK: LineEntry: [0x0000000140001000-0x0000000140001004): /tmp/a.cpp:2 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001046), name="main" # 
CHECK-NEXT: Variable: id = {{.*}}, name = "argc", type = "int", valid ranges = , location = [0x0000000140001000, 0x000000014000102d) -> DW_OP_reg26 XMM9 # CHECK-NEXT: Variable: id = {{.*}}, name = "argv", type = "char **", valid ranges = , location = [0x0000000140001000, 0x0000000140001045) -> DW_OP_reg3 RBX @@ -71,6 +72,7 @@ # CHECK: Blocks: id = {{.*}}, range = [0x140001000-0x140001046) # CHECK-NEXT: id = {{.*}}, ranges = [0x140001004-0x140001039)[0x14000103f-0x140001046), name = "Namespace1::foo", decl = a.h:4 # CHECK: LineEntry: [0x0000000140001004-0x000000014000100c): /tmp/a.h:5 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001046), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "x", type = "int", valid ranges = , location = , decl = # CHECK-NEXT: Variable: id = {{.*}}, name = "foo_local", type = "int", valid ranges = , location = [0x0000000140001004, 0x0000000140001039) -> DW_OP_breg7 RSP+44 # CHECK-NEXT: Variable: id = {{.*}}, name = "argc", type = "int", valid ranges = , location = [0x0000000140001000, 0x000000014000102d) -> DW_OP_reg26 XMM9 @@ -84,6 +86,7 @@ # CHECK: Blocks: id = {{.*}}, range = [0x140001000-0x140001046) # CHECK-NEXT: id = {{.*}}, ranges = [0x140001004-0x140001039)[0x14000103f-0x140001046), name = "Namespace1::foo", decl = a.h:4 # CHECK: LineEntry: [0x0000000140001010-0x0000000140001018): /tmp/a.h:7 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001046), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "x", type = "int", valid ranges = , location = , decl = # CHECK-NEXT: Variable: id = {{.*}}, name = "foo_local", type = "int", valid ranges = , location = [0x0000000140001004, 0x0000000140001039) -> DW_OP_breg7 RSP+44 # CHECK-NEXT: Variable: id = {{.*}}, name = "argc", type = "int", valid ranges = , location = [0x0000000140001000, 0x000000014000102d) -> DW_OP_reg26 XMM9 @@ -99,6 +102,7 @@ # CHECK-NEXT: id = {{.*}}, ranges = 
[0x140001004-0x140001039)[0x14000103f-0x140001046), name = "Namespace1::foo", decl = a.h:4 # CHECK-NEXT: id = {{.*}}, range = [0x14000101c-0x140001039), name = "Class1::bar", decl = b.h:4 # CHECK: LineEntry: [0x000000014000101c-0x0000000140001022): /tmp/b.h:5 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001046), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "x", type = "int", valid ranges = , location = [0x000000014000101c, 0x000000014000101e) -> DW_OP_reg24 XMM7 # CHECK-NEXT: Variable: id = {{.*}}, name = "bar_local", type = "int", valid ranges = , location = [0x000000014000101c, 0x0000000140001039) -> DW_OP_breg7 RSP+52 # CHECK-NEXT: Variable: id = {{.*}}, name = "x", type = "int", valid ranges = , location = , decl = @@ -118,6 +122,7 @@ # CHECK-NEXT: id = {{.*}}, range = [0x14000101c-0x140001039), name = "Class1::bar", decl = b.h:4 # CHECK-NEXT: id = {{.*}}, range = [0x14000102a-0x140001039), name = "Namespace2::Class2::func", decl = c.h:4 # CHECK: LineEntry: [0x000000014000102a-0x0000000140001031): /tmp/c.h:5 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001046), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "x", type = "int", valid ranges = , location = [0x000000014000102a, 0x0000000140001039) -> DW_OP_reg24 XMM7 # CHECK-NEXT: Variable: id = {{.*}}, name = "func_local", type = "int", valid ranges = , location = [0x000000014000102a, 0x0000000140001039) -> DW_OP_breg7 RSP+48 # CHECK-NEXT: Variable: id = {{.*}}, name = "bar_local", type = "int", valid ranges = , location = [0x000000014000101c, 0x0000000140001039) -> DW_OP_breg7 RSP+52 @@ -132,6 +137,7 @@ # CHECK: Function: id = {{.*}}, name = "main", range = [0x0000000140001000-0x0000000140001046) # CHECK: Blocks: id = {{.*}}, range = [0x140001000-0x140001046) # CHECK: LineEntry: [0x0000000140001039-0x000000014000103d): /tmp/a.cpp:3 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001046), name="main" # 
CHECK-NEXT: Variable: id = {{.*}}, name = "argv", type = "char **", valid ranges = , location = [0x0000000140001000, 0x0000000140001045) -> DW_OP_reg3 RBX # CHECK-NEXT: Variable: id = {{.*}}, name = "main_local", type = "int", valid ranges = , location = [0x0000000140001004, 0x0000000140001046) -> DW_OP_breg7 RSP+48 @@ -142,6 +148,7 @@ # CHECK: Blocks: id = {{.*}}, range = [0x140001000-0x140001046) # CHECK-NEXT: id = {{.*}}, ranges = [0x140001004-0x140001039)[0x14000103f-0x140001046), name = "Namespace1::foo", decl = a.h:4 # CHECK: LineEntry: [0x0000000140001044-0x0000000140001046): /tmp/a.h:8 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001046), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "x", type = "int", valid ranges = , location = , decl = # CHECK-NEXT: Variable: id = {{.*}}, name = "foo_local", type = "int", valid ranges = , location = [0x0000000140001044, 0x0000000140001046) -> DW_OP_breg7 RSP+44 # CHECK-NEXT: Variable: id = {{.*}}, name = "argc", type = "int", valid ranges = , location = [0x0000000140001044, 0x0000000140001045) -> DW_OP_reg26 XMM9 diff --git a/lldb/test/Shell/SymbolFile/NativePDB/local-variables-registers.s b/lldb/test/Shell/SymbolFile/NativePDB/local-variables-registers.s index 85d92a2447939..fe2f397d60c01 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/local-variables-registers.s +++ b/lldb/test/Shell/SymbolFile/NativePDB/local-variables-registers.s @@ -34,38 +34,46 @@ # CHECK: (lldb) image lookup -a 0x140001000 -v # CHECK: LineEntry: [0x0000000140001000-0x0000000140001003): C:\src\test\a.cpp:10 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001011), name="struct S CreateS(int, char)", mangled="?CreateS@@YA?AUS@@HD@Z" # CHECK-NEXT: Variable: id = {{.*}}, name = "p1", type = "int", valid ranges = , location = [0x0000000140001000, 0x0000000140001003) -> DW_OP_reg26 XMM9 # CHECK-NEXT: Variable: id = {{.*}}, name = "p2", type = "char", valid ranges = , location = 
[0x0000000140001000, 0x0000000140001006) -> DW_OP_regx 0x3f # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001003 -v # CHECK: LineEntry: [0x0000000140001003-0x0000000140001006): C:\src\test\a.cpp:11 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001011), name="struct S CreateS(int, char)", mangled="?CreateS@@YA?AUS@@HD@Z" # CHECK-NEXT: Variable: id = {{.*}}, name = "p2", type = "char", valid ranges = , location = [0x0000000140001000, 0x0000000140001006) -> DW_OP_regx 0x3f # CHECK-NEXT: Variable: id = {{.*}}, name = "s", type = "S", valid ranges = , location = [0x0000000140001003, 0x0000000140001006) -> DW_OP_piece 0x4, DW_OP_regx 0x3f, DW_OP_piece 0x1, DW_OP_piece 0x3 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001006 -v # CHECK: LineEntry: [0x0000000140001006-0x0000000140001011): C:\src\test\a.cpp:12 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001000-0x0000000140001011), name="struct S CreateS(int, char)", mangled="?CreateS@@YA?AUS@@HD@Z" # CHECK-NEXT: Variable: id = {{.*}}, name = "s", type = "S", valid ranges = , location = [0x0000000140001006, 0x0000000140001011) -> DW_OP_reg26 XMM9, DW_OP_piece 0x4, DW_OP_regx 0x3f, DW_OP_piece 0x1, DW_OP_piece 0x3 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001011 -v # CHECK: LineEntry: [0x0000000140001011-0x0000000140001015): C:\src\test\a.cpp:15 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "argc", type = "int", valid ranges = , location = [0x0000000140001011, 0x0000000140001017) -> DW_OP_reg26 XMM9 # CHECK-NEXT: Variable: id = {{.*}}, name = "argv", type = "char **", valid ranges = , location = [0x0000000140001011, 0x0000000140001019) -> DW_OP_reg3 RBX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001017 -v # CHECK: LineEntry: [0x0000000140001017-0x000000014000101e): C:\src\test\a.cpp:17 +# CHECK-NEXT: Symbol: id = {{.*}}, range = 
[0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "argv", type = "char **", valid ranges = , location = [0x0000000140001011, 0x0000000140001019) -> DW_OP_reg3 RBX # CHECK-NEXT: Variable: id = {{.*}}, name = "local", type = "int", valid ranges = , location = [0x0000000140001017, 0x000000014000101e) -> DW_OP_reg26 XMM9 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001019 -v # CHECK: LineEntry: [0x0000000140001017-0x000000014000101e): C:\src\test\a.cpp:17 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "local", type = "int", valid ranges = , location = [0x0000000140001017, 0x000000014000101e) -> DW_OP_reg26 XMM9 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x14000101e -v # CHECK: LineEntry: [0x000000014000101e-0x0000000140001031): C:\src\test\a.cpp:18 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "s", type = "S", valid ranges = , location = [0x000000014000101e, 0x000000014000102c) -> DW_OP_reg24 XMM7, DW_OP_piece 0x4, DW_OP_piece 0x4 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x14000102c -v # CHECK: LineEntry: [0x000000014000101e-0x0000000140001031): C:\src\test\a.cpp:18 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" .text .def @feat.00; @@ -406,14 +414,17 @@ main: # @main .short .Ltmp103-.Ltmp102 # CHECK: (lldb) image lookup -a 0x140001031 -v # CHECK: LineEntry: [0x0000000140001031-0x0000000140001034): C:\src\test\a.cpp:1000 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "non_overlapped_ranges", type = "S1", valid ranges = , location = [0x0000000140001031, 0x0000000140001032) -> DW_OP_reg3 RBX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001032 -v # CHECK: LineEntry: 
[0x0000000140001031-0x0000000140001034): C:\src\test\a.cpp:1000 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "non_overlapped_ranges", type = "S1", valid ranges = , location = [0x0000000140001032, 0x0000000140001033) -> DW_OP_reg2 RCX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001033 -v # CHECK: LineEntry: [0x0000000140001031-0x0000000140001034): C:\src\test\a.cpp:1000 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "non_overlapped_ranges", type = "S1", valid ranges = , location = [0x0000000140001033, 0x0000000140001034) -> DW_OP_reg8 R8 # CHECK-EMPTY: @@ -431,18 +442,22 @@ main: # @main .short .Ltmp105-.Ltmp104 # CHECK: (lldb) image lookup -a 0x140001034 -v # CHECK: LineEntry: [0x0000000140001034-0x000000014000103b): C:\src\test\a.cpp:1001 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_subfield_ranges", type = "S1", valid ranges = , location = [0x0000000140001034, 0x0000000140001035) -> DW_OP_regx 0x3f, DW_OP_piece 0x1, DW_OP_piece 0x7 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001035 -v # CHECK: LineEntry: [0x0000000140001034-0x000000014000103b): C:\src\test\a.cpp:1001 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_subfield_ranges", type = "S1", valid ranges = , location = [0x0000000140001035, 0x0000000140001036) -> DW_OP_regx 0x3f, DW_OP_piece 0x1, DW_OP_piece 0x3, DW_OP_reg24 XMM7, DW_OP_piece 0x4 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001036 -v # CHECK: LineEntry: [0x0000000140001034-0x000000014000103b): C:\src\test\a.cpp:1001 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: 
Variable: id = {{.*}}, name = "overlapped_subfield_ranges", type = "S1", valid ranges = , location = [0x0000000140001036, 0x0000000140001037) -> DW_OP_piece 0x4, DW_OP_reg24 XMM7, DW_OP_piece 0x4 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001037 -v # CHECK: LineEntry: [0x0000000140001034-0x000000014000103b): C:\src\test\a.cpp:1001 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_subfield_ranges", type = "S1", valid ranges = , location = [0x0000000140001037, 0x0000000140001039) -> DW_OP_piece 0x4, DW_OP_reg26 XMM9, DW_OP_piece 0x4 # CHECK-EMPTY: @@ -461,22 +476,27 @@ main: # @main .short .Ltmp107-.Ltmp106 # CHECK: (lldb) image lookup -a 0x14000103b -v # CHECK: LineEntry: [0x000000014000103b-0x0000000140001045): C:\src\test\a.cpp:1002 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_2", type = "S1", valid ranges = , location = [0x000000014000103b, 0x000000014000103c) -> DW_OP_reg3 RBX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x14000103d -v # CHECK: LineEntry: [0x000000014000103b-0x0000000140001045): C:\src\test\a.cpp:1002 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_2", type = "S1", valid ranges = , location = [0x000000014000103c, 0x000000014000103e) -> DW_OP_reg2 RCX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x14000103f -v # CHECK: LineEntry: [0x000000014000103b-0x0000000140001045): C:\src\test\a.cpp:1002 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_2", type = "S1", valid ranges = , location = [0x000000014000103f, 0x0000000140001041) -> DW_OP_reg11 R11 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001041 
-v # CHECK: LineEntry: [0x000000014000103b-0x0000000140001045): C:\src\test\a.cpp:1002 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_2", type = "S1", valid ranges = , location = [0x0000000140001041, 0x0000000140001043) -> DW_OP_reg0 RAX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001043 -v # CHECK: LineEntry: [0x000000014000103b-0x0000000140001045): C:\src\test\a.cpp:1002 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_2", type = "S1", valid ranges = , location = [0x0000000140001043, 0x0000000140001044) -> DW_OP_reg11 R11 # CHECK-EMPTY: @@ -505,33 +525,41 @@ main: # @main .short .Ltmp109-.Ltmp108 # CHECK: (lldb) image lookup -a 0x140001045 -v # CHECK: LineEntry: [0x0000000140001045-0x000000014000104e): C:\src\test\a.cpp:1003 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_3", type = "S1", valid ranges = , location = [0x0000000140001045, 0x0000000140001046) -> DW_OP_regx 0x3f, DW_OP_piece 0x1, DW_OP_piece 0x7 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001046 -v # CHECK: LineEntry: [0x0000000140001045-0x000000014000104e): C:\src\test\a.cpp:1003 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_3", type = "S1", valid ranges = , location = [0x0000000140001046, 0x0000000140001047) -> DW_OP_regx 0x3f, DW_OP_piece 0x1, DW_OP_piece 0x3, DW_OP_reg24 XMM7, DW_OP_piece 0x4 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001047 -v # CHECK: LineEntry: [0x0000000140001045-0x000000014000104e): C:\src\test\a.cpp:1003 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # 
CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_3", type = "S1", valid ranges = , location = [0x0000000140001047, 0x0000000140001048) -> DW_OP_reg3 RBX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001048 -v # CHECK: LineEntry: [0x0000000140001045-0x000000014000104e): C:\src\test\a.cpp:1003 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_3", type = "S1", valid ranges = , location = [0x0000000140001048, 0x0000000140001049) -> DW_OP_regx 0x3f, DW_OP_piece 0x1, DW_OP_piece 0x3, DW_OP_reg24 XMM7, DW_OP_piece 0x4 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x140001049 -v # CHECK: LineEntry: [0x0000000140001045-0x000000014000104e): C:\src\test\a.cpp:1003 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_3", type = "S1", valid ranges = , location = [0x0000000140001049, 0x000000014000104a) -> DW_OP_reg0 RAX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x14000104a -v # CHECK: LineEntry: [0x0000000140001045-0x000000014000104e): C:\src\test\a.cpp:1003 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x14000104b -v # CHECK: LineEntry: [0x0000000140001045-0x000000014000104e): C:\src\test\a.cpp:1003 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "overlapped_ranges_3", type = "S1", valid ranges = , location = [0x000000014000104b, 0x000000014000104e) -> DW_OP_reg2 RCX # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x14000104c -v # CHECK: LineEntry: [0x0000000140001045-0x000000014000104e): C:\src\test\a.cpp:1003 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = 
"overlapped_ranges_3", type = "S1", valid ranges = , location = [0x000000014000104b, 0x000000014000104e) -> DW_OP_reg2 RCX # CHECK-EMPTY: @@ -549,10 +577,12 @@ main: # @main .short 4431 # Record kind: S_PROC_ID_END # CHECK: (lldb) image lookup -a 0x14000104e -v # CHECK: LineEntry: [0x000000014000104e-0x0000000140001050): C:\src\test\a.cpp:1004 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "simple_type1", type = "int64_t", valid ranges = , location = [0x000000014000104e, 0x000000014000104f) -> DW_OP_reg26 XMM9, DW_OP_piece 0x4, DW_OP_reg24 XMM7, DW_OP_piece 0x4 # CHECK-EMPTY: # CHECK: (lldb) image lookup -a 0x14000104f -v # CHECK: LineEntry: [0x000000014000104e-0x0000000140001050): C:\src\test\a.cpp:1004 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001011-0x0000000140001050), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "simple_type1", type = "int64_t", valid ranges = , location = [0x000000014000104f, 0x0000000140001050) -> DW_OP_reg26 XMM9, DW_OP_piece 0x4, DW_OP_piece 0x4 # CHECK-EMPTY: diff --git a/lldb/test/Shell/SymbolFile/NativePDB/nested-blocks-same-address.s b/lldb/test/Shell/SymbolFile/NativePDB/nested-blocks-same-address.s index dc3ee844fe364..e51b280d4213e 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/nested-blocks-same-address.s +++ b/lldb/test/Shell/SymbolFile/NativePDB/nested-blocks-same-address.s @@ -30,6 +30,7 @@ # CHECK-NEXT: id = {{.*}}, range = [0x140001025-0x140001046) # CHECK-NEXT: id = {{.*}}, range = [0x140001025-0x140001046) # CHECK-NEXT: LineEntry: [0x0000000140001035-0x0000000140001046): /tmp/test.cpp:10 +# CHECK-NEXT: Symbol: id = {{.*}}, range = [0x0000000140001020-0x000000014000104d), name="main" # CHECK-NEXT: Variable: id = {{.*}}, name = "path", type = "volatile char[10]", valid ranges = , location = [0x0000000140001025, 0x0000000140001046) -> DW_OP_breg7 RSP+40, decl = # CHECK-NEXT: Variable: id = {{.*}}, name = 
"kMfDLL", type = "const char *", valid ranges = , location = [0x000000014000103c, 0x0000000140001046) -> DW_OP_reg2 RCX, decl = # CHECK-NEXT: Variable: id = {{.*}}, name = "__range1", type = "const char *const (&)[1]", valid ranges = , location = , decl = diff --git a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp new file mode 100644 index 0000000000000..81d643d9572d8 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp @@ -0,0 +1,59 @@ +// REQUIRES: x86 + +// Test symtab reading +// RUN: %build --compiler=clang-cl --arch=64 --nodefaultlib -o %t.exe -- %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symtab %t.exe --find-symbols-by-regex=".*" | FileCheck %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symtab %t.exe --find-symbols-by-regex=".*" | FileCheck %s + +struct A { + void something() {} +}; + +namespace ns { +template struct B { + struct C { + static int static_fn() { return 1; } + }; + + int b_func() const { return 3; } +}; + +struct Dyn { + virtual ~Dyn() = default; +}; + +int a_function() { return 1; } +} // namespace ns + +void *operator new(unsigned long long n) { return nullptr; } +void operator delete(void *p, unsigned long long i) {} + +A global_a; +ns::B::C global_c; +int global_int; + +int main(int argc, char **argv) { + A a; + a.something(); + ns::B::C::static_fn(); + ns::B::C::static_fn(); + ns::B b; + ns::Dyn dyn; + return ns::a_function() + b.b_func(); +} + +// CHECK-DAG: Code {{.*}} main +// CHECK-DAG: Code {{.*}} ?b_func@?$B@F@ns@@QEBAHXZ +// CHECK-DAG: Code {{.*}} ?something@A@@QEAAXXZ +// CHECK-DAG: Code {{.*}} ??_GDyn@ns@@UEAAPEAXI@Z +// CHECK-DAG: Code {{.*}} ??2@YAPEAX_K@Z +// CHECK-DAG: Code {{.*}} ??3@YAXPEAX_K@Z +// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@H@ns@@SAHXZ +// CHECK-DAG: Code {{.*}} ?a_function@ns@@YAHXZ +// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@_N@ns@@SAHXZ +// CHECK-DAG: Code {{.*}} ??1Dyn@ns@@UEAA@XZ +// CHECK-DAG: Code {{.*}} 
??0Dyn@ns@@QEAA@XZ +// CHECK-DAG: Data {{.*}} ?global_int@@3HA +// CHECK-DAG: Data {{.*}} ??_7Dyn@ns@@6B@ +// CHECK-DAG: Data {{.*}} ?global_a@@3UA@@A +// CHECK-DAG: Data {{.*}} ?global_c@@3UC@?$B@_J@ns@@A diff --git a/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp b/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp index ce188e75553c7..a3077252f08f1 100644 --- a/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp +++ b/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp @@ -1,4 +1,4 @@ -// REQUIRES: target-windows +// REQUIRES: diasdk, target-windows // Test plugin.symbol-file.pdb.reader setting // RUN: %build -o %t.exe -- %s diff --git a/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test b/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test index 0356960424328..9f0a97527de4e 100644 --- a/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test +++ b/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test @@ -5,7 +5,8 @@ # REQUIRES: target-x86_64, native, system-windows # RUN: %build %p/Inputs/windows-unaligned-x86_64.cpp %p/Inputs/windows-unaligned-x86_64-asm.s -o %t -# RUN: %lldb %t -s %s -o exit | FileCheck %s +# RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb %t -s %s -o exit | FileCheck %s +# RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t -s %s -o exit | FileCheck %s # Future TODO: If %build could compile the source file in C mode, the symbol # name handling would be easier across msvc and mingw build configurations. 
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py index 46e2117cdb8e7..505847fb763e0 100644 --- a/lldb/test/Shell/lit.cfg.py +++ b/lldb/test/Shell/lit.cfg.py @@ -170,6 +170,9 @@ def calculate_arch_features(arch_string): ) ) +if config.have_dia_sdk: + config.available_features.add("diasdk") + # NetBSD permits setting dbregs either if one is root # or if user_set_dbregs is enabled can_set_dbregs = True diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index beaa41e6fd379..47beac002a19c 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -34,6 +34,7 @@ config.have_lldb_server = @LLDB_TOOL_LLDB_SERVER_BUILD@ config.lldb_system_debugserver = @LLDB_USE_SYSTEM_DEBUGSERVER@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" config.lldb_has_lldbrpc = @LLDB_BUILD_LLDBRPC@ +config.have_dia_sdk = @LLVM_ENABLE_DIA_SDK@ # The shell tests use their own module caches. config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp index ecd630cb530d6..6eb468e76b16c 100644 --- a/lldb/tools/lldb-dap/EventHelper.cpp +++ b/lldb/tools/lldb-dap/EventHelper.cpp @@ -12,9 +12,11 @@ #include "JSONUtils.h" #include "LLDBUtils.h" #include "Protocol/ProtocolEvents.h" +#include "Protocol/ProtocolRequests.h" #include "Protocol/ProtocolTypes.h" #include "lldb/API/SBFileSpec.h" #include "llvm/Support/Error.h" +#include #if defined(_WIN32) #define NOMINMAX @@ -273,4 +275,13 @@ void SendProcessExitedEvent(DAP &dap, lldb::SBProcess &process) { dap.SendJSON(llvm::json::Value(std::move(event))); } +void SendInvalidatedEvent( + DAP &dap, llvm::ArrayRef areas) { + if (!dap.clientFeatures.contains(protocol::eClientFeatureInvalidatedEvent)) + return; + protocol::InvalidatedEventBody body; + body.areas = areas; + 
dap.Send(protocol::Event{"invalidated", std::move(body)}); +} + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/EventHelper.h b/lldb/tools/lldb-dap/EventHelper.h index 592c1b81c46af..0c57afbaf1f33 100644 --- a/lldb/tools/lldb-dap/EventHelper.h +++ b/lldb/tools/lldb-dap/EventHelper.h @@ -10,6 +10,8 @@ #define LLDB_TOOLS_LLDB_DAP_EVENTHELPER_H #include "DAPForward.h" +#include "Protocol/ProtocolEvents.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Error.h" namespace lldb_dap { @@ -32,6 +34,9 @@ void SendContinuedEvent(DAP &dap); void SendProcessExitedEvent(DAP &dap, lldb::SBProcess &process); +void SendInvalidatedEvent( + DAP &dap, llvm::ArrayRef areas); + } // namespace lldb_dap #endif diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp index 4fadf1c22e0e3..773891353db6a 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp @@ -51,6 +51,33 @@ static uint32_t SetLaunchFlag(uint32_t flags, bool flag, return flags; } +static void +SetupIORedirection(const std::vector> &stdio, + lldb::SBLaunchInfo &launch_info) { + size_t n = std::max(stdio.size(), static_cast(3)); + for (size_t i = 0; i < n; i++) { + std::optional path; + if (stdio.size() < i) + path = stdio.back(); + else + path = stdio[i]; + if (!path) + continue; + switch (i) { + case 0: + launch_info.AddOpenFileAction(i, path->c_str(), true, false); + break; + case 1: + case 2: + launch_info.AddOpenFileAction(i, path->c_str(), false, true); + break; + default: + launch_info.AddOpenFileAction(i, path->c_str(), true, true); + break; + } + } +} + static llvm::Error RunInTerminal(DAP &dap, const protocol::LaunchRequestArguments &arguments) { if (!dap.clientFeatures.contains( @@ -177,6 +204,9 @@ llvm::Error BaseRequestHandler::LaunchProcess( launch_info.SetEnvironment(env, true); } + if (!arguments.stdio.empty() && !arguments.disableSTDIO) + SetupIORedirection(arguments.stdio, 
launch_info); + launch_info.SetDetachOnError(arguments.detachOnError); launch_info.SetShellExpandArguments(arguments.shellExpandArguments); diff --git a/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp index d07c0d6c9afa8..2a50dea0b4ada 100644 --- a/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp @@ -9,6 +9,7 @@ #include "DAP.h" #include "EventHelper.h" #include "JSONUtils.h" +#include "Protocol/ProtocolEvents.h" #include "RequestHandler.h" using namespace lldb_dap::protocol; @@ -77,6 +78,10 @@ SetVariableRequestHandler::Run(const SetVariableArguments &args) const { if (ValuePointsToCode(variable)) body.valueLocationReference = new_var_ref; + // Also send invalidated event to signal client that some variables + // (e.g. references) can be changed. + SendInvalidatedEvent(dap, {InvalidatedEventBody::eAreaVariables}); + return body; } diff --git a/lldb/tools/lldb-dap/Handler/WriteMemoryRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/WriteMemoryRequestHandler.cpp index 313f59dceab24..3e34e488d1158 100644 --- a/lldb/tools/lldb-dap/Handler/WriteMemoryRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/WriteMemoryRequestHandler.cpp @@ -7,21 +7,24 @@ //===----------------------------------------------------------------------===// #include "DAP.h" +#include "EventHelper.h" #include "JSONUtils.h" +#include "Protocol/ProtocolEvents.h" #include "RequestHandler.h" #include "lldb/API/SBMemoryRegionInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Base64.h" +using namespace lldb_dap::protocol; + namespace lldb_dap { // Writes bytes to memory at the provided location. // // Clients should only call this request if the corresponding capability // supportsWriteMemoryRequest is true. 
-llvm::Expected -WriteMemoryRequestHandler::Run( - const protocol::WriteMemoryArguments &args) const { +llvm::Expected +WriteMemoryRequestHandler::Run(const WriteMemoryArguments &args) const { const lldb::addr_t address = args.memoryReference + args.offset; lldb::SBProcess process = dap.target.GetProcess(); @@ -91,8 +94,13 @@ WriteMemoryRequestHandler::Run( if (bytes_written == 0) { return llvm::make_error(write_error.GetCString()); } - protocol::WriteMemoryResponseBody response; + WriteMemoryResponseBody response; response.bytesWritten = bytes_written; + + // Also send invalidated event to signal client that some things + // (e.g. variables) can be changed. + SendInvalidatedEvent(dap, {InvalidatedEventBody::eAreaAll}); + return response; } diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolEvents.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolEvents.cpp index 4faf65567c3ea..062b9494ec10f 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolEvents.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolEvents.cpp @@ -33,4 +33,27 @@ json::Value toJSON(const ModuleEventBody &MEB) { return json::Object{{"reason", MEB.reason}, {"module", MEB.module}}; } +llvm::json::Value toJSON(const InvalidatedEventBody::Area &IEBA) { + switch (IEBA) { + case InvalidatedEventBody::eAreaAll: + return "all"; + case InvalidatedEventBody::eAreaStacks: + return "stacks"; + case InvalidatedEventBody::eAreaThreads: + return "threads"; + case InvalidatedEventBody::eAreaVariables: + return "variables"; + } + llvm_unreachable("unhandled invalidated event area!."); +} + +llvm::json::Value toJSON(const InvalidatedEventBody &IEB) { + json::Object Result{{"areas", IEB.areas}}; + if (IEB.threadId) + Result.insert({"threadID", IEB.threadId}); + if (IEB.stackFrameId) + Result.insert({"stackFrameId", IEB.stackFrameId}); + return Result; +} + } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolEvents.h b/lldb/tools/lldb-dap/Protocol/ProtocolEvents.h index ee9e03c499eae..cb976d3395217 
100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolEvents.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolEvents.h @@ -21,7 +21,11 @@ #define LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_EVENTS_H #include "Protocol/ProtocolTypes.h" +#include "lldb/lldb-types.h" #include "llvm/Support/JSON.h" +#include +#include +#include namespace lldb_dap::protocol { @@ -56,6 +60,34 @@ struct ModuleEventBody { llvm::json::Value toJSON(const ModuleEventBody::Reason &); llvm::json::Value toJSON(const ModuleEventBody &); +/// This event signals that some state in the debug adapter has changed and +/// requires that the client needs to re-render the data snapshot previously +/// requested. +/// +/// Debug adapters do not have to emit this event for runtime changes like +/// stopped or thread events because in that case the client refetches the new +/// state anyway. But the event can be used for example to refresh the UI after +/// rendering formatting has changed in the debug adapter. +/// +/// This event should only be sent if the corresponding capability +/// supportsInvalidatedEvent is true. +struct InvalidatedEventBody { + enum Area : unsigned { eAreaAll, eAreaStacks, eAreaThreads, eAreaVariables }; + + /// Set of logical areas that got invalidated. + std::vector areas; + + /// If specified, the client only needs to refetch data related to this + /// thread. + std::optional threadId; + + /// If specified, the client only needs to refetch data related to this stack + /// frame (and the `threadId` is ignored). 
+ std::optional stackFrameId; +}; +llvm::json::Value toJSON(const InvalidatedEventBody::Area &); +llvm::json::Value toJSON(const InvalidatedEventBody &); + } // end namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index e1806d6230a80..b455112cd37d9 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -303,7 +303,8 @@ bool fromJSON(const json::Value &Params, LaunchRequestArguments &LRA, O.mapOptional("disableSTDIO", LRA.disableSTDIO) && O.mapOptional("shellExpandArguments", LRA.shellExpandArguments) && O.mapOptional("runInTerminal", LRA.console) && - O.mapOptional("console", LRA.console) && parseEnv(Params, LRA.env, P); + O.mapOptional("console", LRA.console) && + O.mapOptional("stdio", LRA.stdio) && parseEnv(Params, LRA.env, P); } bool fromJSON(const json::Value &Params, AttachRequestArguments &ARA, diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index 0848ee53b4410..92dada2295841 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -300,6 +300,8 @@ struct LaunchRequestArguments { /// terminal or external terminal. Console console = eConsoleInternal; + std::vector> stdio; + /// @} }; bool fromJSON(const llvm::json::Value &, LaunchRequestArguments &, diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md index 39dabcc1342c8..e83384f89bc1f 100644 --- a/lldb/tools/lldb-dap/README.md +++ b/lldb/tools/lldb-dap/README.md @@ -44,6 +44,34 @@ adds `FOO=1` and `bar` to the environment: } ``` +#### Launch in integrated terminal + +This will launch process in IDE's integrated terminal. 
+ +```javascript +{ + "type": "lldb-dap", + "request": "launch", + "name": "Debug", + "program": "/tmp/a.out", + "console": "integratedTerminal" +} +``` + +#### Setup IO redirection + +This will launch process and connect `stdin` to `in.txt`, both of `stdout` and `stderr` to `out.txt`. + +```javascript +{ + "type": "lldb-dap", + "request": "launch", + "name": "Debug", + "program": "/tmp/a.out", + "stdio": ["in.txt", "out.txt"] +} +``` + ### Attaching to a process When attaching to a process using LLDB, you can attach in multiple ways: @@ -237,6 +265,7 @@ contain the following key/value pairs: | **stopOnEntry** | boolean | | Whether to stop program immediately after launching. | **runInTerminal** (deprecated) | boolean | | Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs. | **console** | string | | Specify where to launch the program: internal console (`internalConsole`), integrated terminal (`integratedTerminal`) or external terminal (`externalTerminal`). Supported from lldb-dap 21.0 version. +| **stdio** | [string] | | The stdio property specifies the redirection targets for the debuggee's stdio streams. A null value redirects a stream to the default debug terminal. String can be a path to file, named pipe or TTY device. If less than three values are provided, the list will be padded with the last value. Specifying more than three values will create additional file descriptors (4, 5, etc.). Supported from lldb-dap 22.0 version. | **launchCommands** | [string] | | LLDB commands executed to launch the program. 
For JSON configurations of `"type": "attach"`, the JSON configuration can contain diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json index 9cc653cee405b..6566ba3bdee13 100644 --- a/lldb/tools/lldb-dap/package.json +++ b/lldb/tools/lldb-dap/package.json @@ -409,7 +409,7 @@ "anyOf": [ { "type": "object", - "markdownDescription": "Additional environment variables to set when launching the debug adapter executable. E.g. `{ \"FOO\": \"1\" }`", + "markdownDescription": "Additional environment variables to set when launching the debug adapter executable. For example `{ \"FOO\": \"1\" }`", "patternProperties": { ".*": { "type": "string" @@ -419,10 +419,10 @@ }, { "type": "array", - "markdownDescription": "Additional environment variables to set when launching the debug adapter executable. E.g. `[\"FOO=1\", \"BAR\"]`", + "markdownDescription": "Additional environment variables to set when launching the debug adapter executable. For example `[\"FOO=1\", \"BAR\"]`", "items": { "type": "string", - "pattern": "^((\\w+=.*)|^\\w+)$" + "pattern": "^\\w+(=.*)?$" }, "default": [] } @@ -615,6 +615,14 @@ "description": "Specify where to launch the program: internal console, integrated terminal or external terminal.", "default": "internalConsole" }, + "stdio": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The stdio property specifies the redirection targets for the debuggee's stdio streams. A null value redirects a stream to the default debug terminal. String can be a path to file, named pipe or TTY device. If less than three values are provided, the list will be padded with the last value. Specifying more than three values will create additional file descriptors (4, 5, etc.).", + "default": [] + }, "timeout": { "type": "number", "description": "The time in seconds to wait for a program to stop at entry point when launching with \"launchCommands\". Defaults to 30 seconds." 
@@ -672,6 +680,29 @@ }, "markdownDescription": "The list of additional arguments used to launch the debug adapter executable. Overrides any user or workspace settings." }, + "debugAdapterEnv": { + "anyOf": [ + { + "type": "object", + "markdownDescription": "Additional environment variables to set when launching the debug adapter executable. For example `{ \"FOO\": \"1\" }`", + "patternProperties": { + ".*": { + "type": "string" + } + }, + "default": {} + }, + { + "type": "array", + "markdownDescription": "Additional environment variables to set when launching the debug adapter executable. For example `[\"FOO=1\", \"BAR\"]`", + "items": { + "type": "string", + "pattern": "^\\w+(=.*)?$" + }, + "default": [] + } + ] + }, "program": { "type": "string", "description": "Path to the program to attach to." diff --git a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts index f7e92ee95ca32..7060638a94864 100644 --- a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts +++ b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts @@ -92,7 +92,7 @@ function validateDAPEnv(debugConfigEnv: any): boolean { Array.isArray(debugConfigEnv) && debugConfigEnv.findIndex( (entry) => - typeof entry !== "string" || !/^((\\w+=.*)|^\\w+)$/.test(entry), + typeof entry !== "string" || !/^\w+(=.*)?$/.test(entry), ) !== -1 ) { return false; diff --git a/lldb/tools/lldb-mcp/lldb-mcp.cpp b/lldb/tools/lldb-mcp/lldb-mcp.cpp index 12545dcf3a3cc..68e987237cc69 100644 --- a/lldb/tools/lldb-mcp/lldb-mcp.cpp +++ b/lldb/tools/lldb-mcp/lldb-mcp.cpp @@ -8,12 +8,16 @@ #include "lldb/Host/Config.h" #include "lldb/Host/File.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Host/Host.h" #include "lldb/Host/MainLoop.h" #include "lldb/Host/MainLoopBase.h" +#include "lldb/Host/ProcessLaunchInfo.h" #include "lldb/Host/Socket.h" #include "lldb/Initialization/SystemInitializerCommon.h" #include "lldb/Initialization/SystemLifetimeManager.h" #include 
"lldb/Protocol/MCP/Server.h" +#include "lldb/Utility/FileSpec.h" #include "lldb/Utility/Status.h" #include "lldb/Utility/UriParser.h" #include "lldb/lldb-forward.h" @@ -24,8 +28,10 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Signals.h" #include "llvm/Support/WithColor.h" +#include #include #include +#include #if defined(_WIN32) #include @@ -35,13 +41,25 @@ using namespace llvm; using namespace lldb; using namespace lldb_protocol::mcp; +using lldb_private::Environment; using lldb_private::File; +using lldb_private::FileSpec; +using lldb_private::FileSystem; +using lldb_private::Host; using lldb_private::MainLoop; using lldb_private::MainLoopBase; using lldb_private::NativeFile; namespace { +#if defined(_WIN32) +constexpr StringLiteral kDriverName = "lldb.exe"; +#else +constexpr StringLiteral kDriverName = "lldb"; +#endif + +constexpr size_t kForwardIOBufferSize = 1024; + inline void exitWithError(llvm::Error Err, StringRef Prefix = "") { handleAllErrors(std::move(Err), [&](ErrorInfoBase &Info) { WithColor::error(errs(), Prefix) << Info.message() << '\n'; @@ -49,10 +67,67 @@ inline void exitWithError(llvm::Error Err, StringRef Prefix = "") { std::exit(EXIT_FAILURE); } -constexpr size_t kForwardIOBufferSize = 1024; +FileSpec driverPath() { + Environment host_env = Host::GetEnvironment(); + + // Check if an override for which lldb we're using exists, otherwise look next + // to the current binary. 
+ std::string lldb_exe_path = host_env.lookup("LLDB_EXE_PATH"); + auto &fs = FileSystem::Instance(); + if (fs.Exists(lldb_exe_path)) + return FileSpec(lldb_exe_path); + + FileSpec lldb_exec_spec = lldb_private::HostInfo::GetProgramFileSpec(); + lldb_exec_spec.SetFilename(kDriverName); + return lldb_exec_spec; +} + +llvm::Error launch() { + FileSpec lldb_exec = driverPath(); + lldb_private::ProcessLaunchInfo info; + info.SetExecutableFile(lldb_exec, + /*add_exe_file_as_first_arg=*/true); + info.GetArguments().AppendArgument("-O"); + info.GetArguments().AppendArgument("protocol start MCP"); + return Host::LaunchProcess(info).takeError(); +} + +Expected loadOrStart( + // FIXME: This should become a CLI arg. + lldb_private::Timeout timeout = std::chrono::seconds(30)) { + using namespace std::chrono; + bool started = false; + + const auto deadline = steady_clock::now() + *timeout; + while (steady_clock::now() < deadline) { + Expected> servers = ServerInfo::Load(); + if (!servers) + return servers.takeError(); + + if (servers->empty()) { + if (!started) { + started = true; + if (llvm::Error err = launch()) + return std::move(err); + } + + // FIXME: Can we use MainLoop to watch the directory? + std::this_thread::sleep_for(microseconds(250)); + continue; + } + + // FIXME: Support selecting / multiplexing a specific lldb instance. 
+ if (servers->size() > 1) + return createStringError("too many MCP servers running, picking a " + "specific one is not yet implemented"); + + return servers->front(); + } + + return createStringError("timed out waiting for MCP server to start"); +} -void forwardIO(lldb_private::MainLoopBase &loop, lldb::IOObjectSP &from, - lldb::IOObjectSP &to) { +void forwardIO(MainLoopBase &loop, IOObjectSP &from, IOObjectSP &to) { char buf[kForwardIOBufferSize]; size_t num_bytes = sizeof(buf); @@ -67,21 +142,24 @@ void forwardIO(lldb_private::MainLoopBase &loop, lldb::IOObjectSP &from, exitWithError(std::move(err)); } -void connectAndForwardIO(lldb_private::MainLoop &loop, ServerInfo &info, - IOObjectSP &input_sp, IOObjectSP &output_sp) { +llvm::Error connectAndForwardIO(lldb_private::MainLoop &loop, ServerInfo &info, + IOObjectSP &input_sp, IOObjectSP &output_sp) { auto uri = lldb_private::URI::Parse(info.connection_uri); if (!uri) - exitWithError(createStringError("invalid connection_uri")); + return createStringError("invalid connection_uri"); std::optional protocol_and_mode = lldb_private::Socket::GetProtocolAndMode(uri->scheme); + if (!protocol_and_mode) + return createStringError("unknown protocol scheme"); + lldb_private::Status status; std::unique_ptr sock = lldb_private::Socket::Create(protocol_and_mode->first, status); if (status.Fail()) - exitWithError(status.takeError()); + return status.takeError(); if (uri->port && !uri->hostname.empty()) status = sock->Connect( @@ -89,24 +167,22 @@ void connectAndForwardIO(lldb_private::MainLoop &loop, ServerInfo &info, else status = sock->Connect(uri->path); if (status.Fail()) - exitWithError(status.takeError()); + return status.takeError(); IOObjectSP sock_sp = std::move(sock); auto input_handle = loop.RegisterReadObject( input_sp, std::bind(forwardIO, std::placeholders::_1, input_sp, sock_sp), status); if (status.Fail()) - exitWithError(status.takeError()); + return status.takeError(); auto socket_handle = 
loop.RegisterReadObject( sock_sp, std::bind(forwardIO, std::placeholders::_1, sock_sp, output_sp), status); if (status.Fail()) - exitWithError(status.takeError()); + return status.takeError(); - status = loop.Run(); - if (status.Fail()) - exitWithError(status.takeError()); + return loop.Run().takeError(); } llvm::ManagedStatic g_debugger_lifetime; @@ -147,30 +223,19 @@ int main(int argc, char *argv[]) { IOObjectSP output_sp = std::make_shared( fileno(stdout), File::eOpenOptionWriteOnly, NativeFile::Unowned); - static MainLoop loop; + Expected server_info = loadOrStart(); + if (!server_info) + exitWithError(server_info.takeError()); + static MainLoop loop; sys::SetInterruptFunction([]() { loop.AddPendingCallback( [](MainLoopBase &loop) { loop.RequestTermination(); }); }); - auto existing_servers = ServerInfo::Load(); - - if (!existing_servers) - exitWithError(existing_servers.takeError()); - - // FIXME: Launch `lldb -o 'protocol start MCP'`. - if (existing_servers->empty()) - exitWithError(createStringError("No MCP servers running")); - - // FIXME: Support selecting a specific server. 
- if (existing_servers->size() != 1) - exitWithError( - createStringError("To many MCP servers running, picking a specific " - "one is not yet implemented.")); - - ServerInfo &info = existing_servers->front(); - connectAndForwardIO(loop, info, input_sp, output_sp); + if (llvm::Error error = + connectAndForwardIO(loop, *server_info, input_sp, output_sp)) + exitWithError(std::move(error)); return EXIT_SUCCESS; } diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index c5d47fcb08da4..61d197a705e0e 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -1073,3 +1073,18 @@ TEST(ProtocolTypesTest, CompletionsResponseBody) { ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); EXPECT_EQ(pp(*expected), pp(response)); } + +TEST(ProtocolTypesTest, InvalidatedEventBody) { + InvalidatedEventBody body; + body.areas = {InvalidatedEventBody::eAreaStacks, + InvalidatedEventBody::eAreaThreads}; + body.stackFrameId = 1; + StringRef json = R"({ + "areas": [ + "stacks", + "threads" + ], + "stackFrameId": 1 +})"; + EXPECT_EQ(json, pp(body)); +} diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp index 256d456783583..6d5caef42bee4 100644 --- a/lldb/unittests/Utility/ScalarTest.cpp +++ b/lldb/unittests/Utility/ScalarTest.cpp @@ -337,6 +337,12 @@ TEST(ScalarTest, Division) { Scalar r = lhs / rhs; EXPECT_TRUE(r.IsValid()); EXPECT_EQ(r, Scalar(2.5)); + + Scalar inf = Scalar(1) / Scalar(0.0f); + Scalar int0 = Scalar(1) / Scalar(0); + Scalar ref_inf = llvm::APFloat::getInf(llvm::APFloat::IEEEsingle()); + EXPECT_EQ(inf, ref_inf); + EXPECT_FALSE(int0.IsValid()); } TEST(ScalarTest, Promotion) { diff --git a/llvm/benchmarks/RuntimeLibcalls.cpp b/llvm/benchmarks/RuntimeLibcalls.cpp index 9ac77bb74a3df..707bdca7ceab7 100644 --- a/llvm/benchmarks/RuntimeLibcalls.cpp +++ b/llvm/benchmarks/RuntimeLibcalls.cpp @@ -54,10 +54,7 @@ static std::vector 
readSymbolsFromFile(StringRef InputFile) { // Hackily figure out if there's a prefix on the symbol names - llvm-nm // appears to not have a flag to skip this. llvm::Triple HostTriple(LLVM_HOST_TRIPLE); - std::string DummyDatalayout = "e"; - DummyDatalayout += DataLayout::getManglingComponent(HostTriple); - - DataLayout DL(DummyDatalayout); + DataLayout DL(HostTriple.computeDataLayout()); char GlobalPrefix = DL.getGlobalPrefix(); std::vector Lines; diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index c98e78da97b39..80e59a4df2433 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1442,6 +1442,7 @@ if(NOT LLVM_TOOLCHAIN_TOOLS) llvm-lib llvm-mca llvm-ml + llvm-ml64 llvm-nm llvm-objcopy llvm-objdump diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 5ab34bc3b9c74..24142c934b918 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -876,39 +876,45 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS) append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS) - # Disable -Wnonnull for GCC warning as it is emitting a lot of false positives. if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Disable -Wnonnull for GCC warning as it is emitting a lot of false positives. append("-Wno-nonnull" CMAKE_CXX_FLAGS) - endif() - # Disable -Wclass-memaccess, a C++-only warning from GCC 8 that fires on - # LLVM's ADT classes. - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Disable -Wclass-memaccess, a C++-only warning from GCC 8 that fires on + # LLVM's ADT classes. 
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1) append("-Wno-class-memaccess" CMAKE_CXX_FLAGS) endif() - endif() - # Disable -Wdangling-reference, a C++-only warning from GCC 13 that seems - # to produce a large number of false positives. - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Disable -Wdangling-reference, a C++-only warning from GCC 13 that seems + # to produce a large number of false positives. if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.1) append("-Wno-dangling-reference" CMAKE_CXX_FLAGS) endif() - endif() - # Disable -Wredundant-move and -Wpessimizing-move on GCC>=9. GCC wants to - # remove std::move in code like - # "A foo(ConvertibleToA a) { return std::move(a); }", - # but this code does not compile (or uses the copy - # constructor instead) on clang<=3.8. Clang also has a -Wredundant-move and - # -Wpessimizing-move, but they only fire when the types match exactly, so we - # can keep them here. - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Disable -Wredundant-move and -Wpessimizing-move on GCC>=9. GCC wants to + # remove std::move in code like + # "A foo(ConvertibleToA a) { return std::move(a); }", + # but this code does not compile (or uses the copy + # constructor instead) on clang<=3.8. Clang also has a -Wredundant-move and + # -Wpessimizing-move, but they only fire when the types match exactly, so we + # can keep them here. if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.1) append("-Wno-redundant-move" CMAKE_CXX_FLAGS) append("-Wno-pessimizing-move" CMAKE_CXX_FLAGS) endif() + + # Disable -Warray-bounds on GCC; this warning exists since a very long time, + # but since GCC 11, it produces a lot of very noisy, seemingly false positive + # warnings (potentially originating in libstdc++). + append("-Wno-array-bounds" CMAKE_CXX_FLAGS) + + # Disable -Wstringop-overread on GCC; this warning produces a number of very + # noisy diagnostics when -Warray-bounds is disabled above; this option exists + # since GCC 11. 
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 11.1) + append("-Wno-stringop-overread" CMAKE_CXX_FLAGS) + endif() endif() # The LLVM libraries have no stable C++ API, so -Wnoexcept-type is not useful. @@ -1463,3 +1469,11 @@ if(LLVM_ENABLE_LLVM_LIBC) message(WARNING "Unable to link against LLVM libc. LLVM will be built without linking against the LLVM libc overlay.") endif() endif() + +check_symbol_exists(flock "sys/file.h" HAVE_FLOCK) +set(LLVM_ENABLE_ONDISK_CAS_default OFF) +if(HAVE_FLOCK OR LLVM_ON_WIN32) + # LLVM OnDisk CAS currently requires flock on Unix. + set(LLVM_ENABLE_ONDISK_CAS_default ON) +endif() +option(LLVM_ENABLE_ONDISK_CAS "Build OnDiskCAS." ${LLVM_ENABLE_ONDISK_CAS_default}) diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index c15b9576cd5d5..70c807abea98a 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -36,6 +36,8 @@ set(LLVM_TARGETS_WITH_JIT @LLVM_TARGETS_WITH_JIT@) set(LLVM_TARGET_TRIPLE "@LLVM_TARGET_TRIPLE@") +set(LLVM_TARGET_TRIPLE_ENV "@LLVM_TARGET_TRIPLE_ENV@") + set(LLVM_HOST_TRIPLE "@LLVM_HOST_TRIPLE@") set(LLVM_ABI_BREAKING_CHECKS @LLVM_ABI_BREAKING_CHECKS@) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 37563203f2f83..1265ec40c06d6 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1672,6 +1672,15 @@ The AMDGPU backend supports the following LLVM IR attributes. "amdgpu-no-workgroup-id-z" The same as amdgpu-no-workitem-id-x, except for the llvm.amdgcn.workgroup.id.z intrinsic. + "amdgpu-no-cluster-id-x" The same as amdgpu-no-workitem-id-x, except for the + llvm.amdgcn.cluster.id.x intrinsic. + + "amdgpu-no-cluster-id-y" The same as amdgpu-no-workitem-id-x, except for the + llvm.amdgcn.cluster.id.y intrinsic. + + "amdgpu-no-cluster-id-z" The same as amdgpu-no-workitem-id-x, except for the + llvm.amdgcn.cluster.id.z intrinsic. 
+ "amdgpu-no-dispatch-ptr" The same as amdgpu-no-workitem-id-x, except for the llvm.amdgcn.dispatch.ptr intrinsic. @@ -1812,6 +1821,13 @@ The AMDGPU backend supports the following LLVM IR attributes. offset by one less than the number of dynamic VGPR blocks required by the function encoded in bits 5..3. + "amdgpu-cluster-dims"="x,y,z" Specify the cluster workgroup dimensions. A value of "0,0,0" indicates that + cluster is disabled. A value of "1024,1024,1024" indicates that cluster is enabled, + but the dimensions cannot be determined at compile time. Any other value explicitly + specifies the cluster dimensions. + + This is only relevant on targets with cluster support. + ================================================ ========================================================== Calling Conventions @@ -4823,7 +4839,24 @@ Code object V5 metadata is the same as ====================== ============== ========= ================================ -.. +.. _amdgpu-amdhsa-code-object-metadata-v6: + +Code Object V6 Metadata ++++++++++++++++++++++++ + +Code object V6 metadata is the same as +:ref:`amdgpu-amdhsa-code-object-metadata-v5` with the changes defined in table +:ref:`amdgpu-amdhsa-code-object-kernel-metadata-map-table-v6`. + + .. table:: AMDHSA Code Object V6 Kernel Metadata Map Additions + :name: amdgpu-amdhsa-code-object-kernel-metadata-map-table-v6 + + ============================= ============= ========== ======================================= + String Key Value Type Required? Description + ============================= ============= ========== ======================================= + ".cluster_dims" sequence of The dimension of the cluster. 
+ 3 integers + ============================= ============= ========== ======================================= Kernel Dispatch ~~~~~~~~~~~~~~~ diff --git a/llvm/docs/Atomics.rst b/llvm/docs/Atomics.rst index 4dee3e6bd9f4f..522aed150bf62 100644 --- a/llvm/docs/Atomics.rst +++ b/llvm/docs/Atomics.rst @@ -43,8 +43,8 @@ address, the first store can be erased. This transformation is not allowed for a pair of volatile stores. On the other hand, a non-volatile non-atomic load can be moved across a volatile load freely, but not an Acquire load. -This document is intended to provide a guide to anyone either writing a frontend -for LLVM or working on optimization passes for LLVM with a guide for how to deal +This document is intended to guide anyone writing a frontend +for LLVM or working on optimization passes for LLVM on how to deal with instructions with special semantics in the presence of concurrency. This is not intended to be a precise guide to the semantics; the details can get extremely complicated and unreadable, and are not usually necessary. @@ -94,7 +94,7 @@ The following is equivalent in non-concurrent situations: However, LLVM is not allowed to transform the former to the latter: it could indirectly introduce undefined behavior if another thread can access ``x`` at -the same time. That thread would read `undef` instead of the value it was +the same time. That thread would read ``undef`` instead of the value it was expecting, which can lead to undefined behavior down the line. (This example is particularly of interest because before the concurrency model was implemented, LLVM would perform this transformation.) @@ -149,7 +149,7 @@ NotAtomic NotAtomic is the obvious, a load or store which is not atomic. (This isn't really a level of atomicity, but is listed here for comparison.) This is essentially a regular load or store. If there is a race on a given memory -location, loads from that location return undef. +location, loads from that location return ``undef``. 
Relevant standard This is intended to match shared variables in C/C++, and to be used in any @@ -429,7 +429,7 @@ support *ALL* operations of that size in a lock-free manner. When the target implements atomic ``cmpxchg`` or LL/SC instructions (as most do) this is trivial: all the other operations can be implemented on top of those -primitives. However, on many older CPUs (e.g. ARMv5, SparcV8, Intel 80386) there +primitives. However, on many older CPUs (e.g. ARMv5, Sparc V8, Intel 80386) there are atomic load and store instructions, but no ``cmpxchg`` or LL/SC. As it is invalid to implement ``atomic load`` using the native instruction, but ``cmpxchg`` using a library call to a function that uses a mutex, ``atomic @@ -475,7 +475,7 @@ atomic constructs. Here are some lowerings it can do: ``shouldExpandAtomicRMWInIR``, ``emitMaskedAtomicRMWIntrinsic``, ``shouldExpandAtomicCmpXchgInIR``, and ``emitMaskedAtomicCmpXchgIntrinsic``. -For an example of these look at the ARM (first five lowerings) or RISC-V (last +For an example of these, look at the ARM (first five lowerings) or RISC-V (last lowering) backend. AtomicExpandPass supports two strategies for lowering atomicrmw/cmpxchg to @@ -542,7 +542,7 @@ to take note of: - They support all sizes and alignments -- including those which cannot be implemented natively on any existing hardware. Therefore, they will certainly - use mutexes in for some sizes/alignments. + use mutexes for some sizes/alignments. - As a consequence, they cannot be shipped in a statically linked compiler-support library, as they have state which must be shared amongst all @@ -568,7 +568,7 @@ Libcalls: __sync_* Some targets or OS/target combinations can support lock-free atomics, but for various reasons, it is not practical to emit the instructions inline. -There's two typical examples of this. +There are two typical examples of this. Some CPUs support multiple instruction sets which can be switched back and forth on function-call boundaries. 
For example, MIPS supports the MIPS16 ISA, which @@ -589,7 +589,7 @@ case. The only common architecture without that property is SPARC -- SPARCV8 SMP systems were common, yet it doesn't support any sort of compare-and-swap operation. -Some targets (like RISCV) support a ``+forced-atomics`` target feature, which +Some targets (like RISC-V) support a ``+forced-atomics`` target feature, which enables the use of lock-free atomics even if LLVM is not aware of any specific OS support for them. In this case, the user is responsible for ensuring that necessary ``__sync_*`` implementations are available. Code using @@ -653,6 +653,6 @@ implemented in both ``compiler-rt`` and ``libgcc`` libraries iN __aarch64_ldeorN_ORDER(iN val, iN *ptr) iN __aarch64_ldsetN_ORDER(iN val, iN *ptr) -Please note, if LSE instruction set is specified for AArch64 target then +Please note, if LSE instruction set is specified for AArch64 target, then out-of-line atomics calls are not generated and single-instruction atomic operations are used in place. diff --git a/llvm/docs/ConvergentOperations.rst b/llvm/docs/ConvergentOperations.rst index 5081efffc89ac..cdd3e89aba1f4 100644 --- a/llvm/docs/ConvergentOperations.rst +++ b/llvm/docs/ConvergentOperations.rst @@ -13,7 +13,7 @@ Some parallel execution environments execute threads in groups that allow efficient communication within the group using special primitives called *convergent* operations. The outcome of a convergent operation is sensitive to the set of threads that executes it "together", i.e., convergently. When control -flow :ref:`diverges `, i.e. threads of the same +flow :ref:`diverges `, i.e., threads of the same group follow different paths through the CFG, not all threads of the group may be available to participate in this communication. 
This is the defining characteristic that @@ -41,7 +41,7 @@ In structured programming languages, there is often an intuitive and unambiguous way of determining the threads that are expected to communicate. However, this is not always the case even in structured programming languages, and the intuition breaks down entirely in unstructured control flow. This -document describes the formal semantics in LLVM, i.e. how to determine the set +document describes the formal semantics in LLVM, i.e., how to determine the set of communicating threads for convergent operations. The definitions in this document leave many details open, such as how groups of @@ -449,15 +449,15 @@ Consider the following example: // E } -In this program, the call to convergent_op() is lexically "inside" the ``for`` +In this program, the call to ``convergent_op()`` is lexically "inside" the ``for`` loop. But when translated to LLVM IR, the basic block B is an exiting block ending in a divergent branch, and the basic block C is an exit of the loop. -Thus, the call to convergent_op() is outside the loop. This causes a mismatch +Thus, the call to ``convergent_op()`` is outside the loop. This causes a mismatch between the programmer's expectation and the compiled program. The call should be executed convergently on every iteration of the loop, by threads that together take the branch to exit the loop. But when compiled, all threads that take the divergent exit on different iterations first converge at the beginning -of basic block C and then together execute the call to convergent_op(). +of basic block C and then together execute the call to ``convergent_op()``. In this case, :ref:`llvm.experimental.convergence.loop ` can be used to express the desired @@ -588,18 +588,18 @@ indirectly. 
token @llvm.experimental.convergence.entry() convergent readnone -This intrinsic is used to tie the dynamic instances inside of a function to +This intrinsic is used to tie the dynamic instances inside a function to those in the caller. 1. If the function is called from outside the scope of LLVM, the convergence of - dynamic instances of this intrinsic are environment-defined. For example: + dynamic instances of this intrinsic is environment-defined. For example: a. In an OpenCL *kernel launch*, the maximal set of threads that can communicate outside the memory model is a *workgroup*. Hence, a suitable choice is to specify that all the threads from a single workgroup in OpenCL execute converged dynamic instances of this intrinsic. - b. In a C/C++ program, threads are launched independently and they can + b. In a C/C++ program, threads are launched independently and can communicate only through the memory model. Hence the dynamic instances of this intrinsic in a C/C++ program are never converged. 2. If the function is called from a call-site in LLVM IR, then two @@ -701,7 +701,7 @@ convergent operation in the same basic block. token @llvm.experimental.convergence.anchor() convergent readnone -This intrinsic produces an initial convergence token that is independent from +This intrinsic produces an initial convergence token that is independent of any "outer scope". The set of threads executing converged dynamic instances of this intrinsic is implementation-defined. @@ -1483,7 +1483,7 @@ There is no guarantee about the value of ``%id`` in the threads where hoisting ``@subgroupShuffle`` might introduce UB. On the other hand, if ``@subgroupShuffle`` is defined such that it merely -produces an undefined value or poison as result when ``%id`` is "out of range", +produces an undefined value or poison as a result when ``%id`` is "out of range", then speculating is okay. 
Even though @@ -1502,7 +1502,7 @@ Assuming that ``%tok`` is only used inside the conditional block, the anchor can be sunk. The rationale is two-fold. First, the anchor has implementation-defined behavior, and the sinking is part of the implementation. Second, already in the original program, the set of threads that communicates in the -``@convergent.operation`` is automatically subset to the threads for which +``@convergent.operation`` is automatically a subset of the threads for which ``condition`` is true. Anchors can be hoisted in acyclic control flow. For example: diff --git a/llvm/docs/GarbageCollection.rst b/llvm/docs/GarbageCollection.rst index 06ef93bd8dedd..67be080db1310 100644 --- a/llvm/docs/GarbageCollection.rst +++ b/llvm/docs/GarbageCollection.rst @@ -15,14 +15,14 @@ garbage collector.** You must provide your own. Quick Start ============ -First, you should pick a collector strategy. LLVM includes a number of built -in ones, but you can also implement a loadable plugin with a custom definition. +First, you should pick a collector strategy. LLVM includes a number of built-in +ones, but you can also implement a loadable plugin with a custom definition. Note that the collector strategy is a description of how LLVM should generate code such that it interacts with your collector and runtime, not a description of the collector itself. Next, mark your generated functions as using your chosen collector strategy. -From c++, you can call: +From C++, you can call: .. code-block:: c++ @@ -40,7 +40,7 @@ When generating LLVM IR for your functions, you will need to: * Use ``@llvm.gcread`` and/or ``@llvm.gcwrite`` in place of standard load and store instructions. These intrinsics are used to represent load and store - barriers. If you collector does not require such barriers, you can skip + barriers. If your collector does not require such barriers, you can skip this step. 
* Use the memory allocation routines provided by your garbage collector's @@ -49,7 +49,7 @@ When generating LLVM IR for your functions, you will need to: * If your collector requires them, generate type maps according to your runtime's binary interface. LLVM is not involved in the process. In particular, the LLVM type system is not suitable for conveying such - information though the compiler. + information through the compiler. * Insert any coordination code required for interacting with your collector. Many collectors require running application code to periodically check a @@ -59,7 +59,7 @@ When generating LLVM IR for your functions, you will need to: You will need to identify roots (i.e. references to heap objects your collector needs to know about) in your generated IR, so that LLVM can encode them into your final stack maps. Depending on the collector strategy chosen, this is -accomplished by using either the ``@llvm.gcroot`` intrinsics or an +accomplished by using either the ``@llvm.gcroot`` intrinsics or a ``gc.statepoint`` relocation sequence. Don't forget to create a root for each intermediate value that is generated when @@ -142,11 +142,11 @@ Perl, Python, Lua, Ruby, other scripting languages, and more. Note that LLVM **does not itself provide a garbage collector** --- this should be part of your language's runtime library. LLVM provides a framework for -describing the garbage collectors requirements to the compiler. In particular, +describing the garbage collector's requirements to the compiler. In particular, LLVM provides support for generating stack maps at call sites, polling for a safepoint, and emitting load and store barriers. You can also extend LLVM - possibly through a loadable :ref:`code generation plugins ` - to -generate code and data structures which conforms to the *binary interface* +generate code and data structures which conform to the *binary interface* specified by the *runtime library*. 
This is similar to the relationship between LLVM and DWARF debugging info, for example. The difference primarily lies in the lack of an established standard in the domain of garbage collection --- thus @@ -185,10 +185,10 @@ adequately addressed with other features of the IR and does not specify a particular binary interface. On the plus side, this means that you should be able to integrate LLVM with an existing runtime. On the other hand, it can have the effect of leaving a lot of work for the developer of a novel -language. We try to mitigate this by providing built in collector strategy +language. We try to mitigate this by providing built-in collector strategy descriptions that can work with many common collector designs and easy extension points. If you don't already have a specific binary interface -you need to support, we recommend trying to use one of these built in collector +you need to support, we recommend trying to use one of these built-in collector strategies. .. _gc_intrinsics: @@ -257,7 +257,7 @@ associated with the pointer, and **must** be a constant or global value address. If your target collector uses tags, use a null pointer for metadata. A compiler which performs manual SSA construction **must** ensure that SSA -values representing GC references are stored in to the alloca passed to the +values representing GC references are stored into the alloca passed to the respective ``gcroot`` before every call site and reloaded after every call. A compiler which uses mem2reg to raise imperative code using ``alloca`` into SSA form need only add a call to ``@llvm.gcroot`` for those variables which @@ -265,8 +265,8 @@ are pointers into the GC heap. It is also important to mark intermediate values with ``llvm.gcroot``. For example, consider ``h(f(), g())``. Beware leaking the result of ``f()`` in the -case that ``g()`` triggers a collection. Note, that stack variables must be -initialized and marked with ``llvm.gcroot`` in function's prologue. 
+case that ``g()`` triggers a collection. Note that stack variables must be +initialized and marked with ``llvm.gcroot`` in the function's prologue. The ``%metadata`` argument can be used to avoid requiring heap objects to have 'isa' pointers or tag bits. [Appel89_, Goldberg91_, Tolmach94_] If specified, @@ -388,10 +388,10 @@ greater performance impact since pointer reads are more frequent than writes. .. _builtin-gc-strategies: -Built In GC Strategies +Built-In GC Strategies ====================== -LLVM includes built in support for several varieties of garbage collectors. +LLVM includes built-in support for several varieties of garbage collectors. The Shadow Stack GC ---------------------- @@ -481,17 +481,17 @@ data structure, but there are only 20 lines of meaningful code.) } -The 'Erlang' and 'Ocaml' GCs +The 'Erlang' and 'OCaml' GCs ----------------------------- LLVM ships with two example collectors which leverage the ``gcroot`` mechanisms. To our knowledge, these are not actually used by any language runtime, but they do provide a reasonable starting point for someone interested in writing an ``gcroot`` compatible GC plugin. In particular, these are the -only in tree examples of how to produce a custom binary stack map format using +only in-tree examples of how to produce a custom binary stack map format using a ``gcroot`` strategy. -As there names imply, the binary format produced is intended to model that +As their names imply, the binary format produced is intended to model that used by the Erlang and OCaml compilers respectively. .. _statepoint_example_gc: @@ -544,14 +544,14 @@ certain aspects like: Custom GC Strategies ==================== -If none of the built in GC strategy descriptions met your needs above, you will +If none of the built-in GC strategy descriptions met your needs above, you will need to define a custom GCStrategy and possibly, a custom LLVM pass to perform lowering. 
Your best example of where to start defining a custom GCStrategy -would be to look at one of the built in strategies. +would be to look at one of the built-in strategies. You may be able to structure this additional code as a loadable plugin library. Loadable plugins are sufficient if all you need is to enable a different -combination of built in functionality, but if you need to provide a custom +combination of built-in functionality, but if you need to provide a custom lowering pass, you will need to build a patched version of LLVM. If you think you need a patched build, please ask for advice on llvm-dev. There may be an easy way we can extend the support to make it work for your use case without @@ -576,7 +576,7 @@ You should be able to leverage any existing collector library that includes the #. A mechanism for identifying references in global locations (e.g. global variables). -#. If you collector requires them, an LLVM IR implementation of your collectors +#. If your collector requires them, an LLVM IR implementation of your collector's load and store barriers. Note that since many collectors don't require barriers at all, LLVM defaults to lowering such barriers to normal loads and stores unless you arrange otherwise. @@ -598,7 +598,7 @@ runtime library). This can be accomplished in about 100 lines of code. This is not the appropriate place to implement a garbage collected heap or a garbage collector itself. That code should exist in the language's runtime library. The compiler plugin is responsible for generating code which conforms -to the binary interface defined by library, most essentially the :ref:`stack map +to the binary interface defined by the library, most essentially the :ref:`stack map `. 
To subclass ``llvm::GCStrategy`` and register it with the compiler: @@ -850,11 +850,11 @@ Custom lowering of intrinsics ------------------------------ For GCs which use barriers or unusual treatment of stack roots, the -implementor is responsibly for providing a custom pass to lower the +implementor is responsible for providing a custom pass to lower the intrinsics with the desired semantics. If you have opted in to custom lowering of a particular intrinsic your pass **must** eliminate all instances of the corresponding intrinsic in functions which opt in to -your GC. The best example of such a pass is the ShadowStackGC and it's +your GC. The best example of such a pass is the ShadowStackGC and its ShadowStackGCLowering pass. There is currently no way to register such a custom lowering pass diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 43d31b03932cf..5fd0f6573bb97 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2256,9 +2256,16 @@ For example: behavior at runtime if the function ever does dynamically return. Annotated functions may still raise an exception, i.a., ``nounwind`` is not implied. ``norecurse`` - This function attribute indicates that the function does not call itself - either directly or indirectly down any possible call path. This produces - undefined behavior at runtime if the function ever does recurse. + This function attribute indicates that the function is not recursive and + does not participate in recursion. This means that the function never + occurs inside a cycle in the dynamic call graph. + For example: + +.. code-block:: llvm + + fn -> other_fn -> fn ; fn is not norecurse + other_fn -> fn -> other_fn ; fn is not norecurse + fn -> other_fn -> other_fn ; fn is norecurse .. _langref_willreturn: @@ -7840,6 +7847,54 @@ If a loop was successfully processed by the loop distribution pass, this metadata is added (i.e., has been distributed). See :ref:`Transformation Metadata ` for details. 
+'``llvm.loop.estimated_trip_count``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata records an estimated trip count for the loop. The first operand +is the string ``llvm.loop.estimated_trip_count``. The second operand is an +integer constant of type ``i32`` or smaller specifying the estimate. For +example: + +.. code-block:: llvm + + !0 = !{!"llvm.loop.estimated_trip_count", i32 8} + +Purpose +""""""" + +A loop's estimated trip count is an estimate of the average number of loop +iterations (specifically, the number of times the loop's header executes) each +time execution reaches the loop. It is usually only an estimate based on, for +example, profile data. The actual number of iterations might vary widely. + +The estimated trip count serves as a parameter for various loop transformations +and typically helps estimate transformation cost. For example, it can help +determine how many iterations to peel or how aggressively to unroll. + +Initialization and Maintenance +"""""""""""""""""""""""""""""" + +Passes should interact with estimated trip counts always via +``llvm::getLoopEstimatedTripCount`` and ``llvm::setLoopEstimatedTripCount``. + +When the ``llvm.loop.estimated_trip_count`` metadata is not present on a loop, +``llvm::getLoopEstimatedTripCount`` estimates the loop's trip count from the +loop's ``branch_weights`` metadata under the assumption that the latter still +accurately encodes the program's original profile data. However, as passes +transform existing loops and create new loops, they must be free to update and +create ``branch_weights`` metadata in a way that maintains accurate block +frequencies. Trip counts estimated from this new ``branch_weights`` metadata +are not necessarily useful to the passes that consume estimated trip counts. 
+ +For this reason, when a pass transforms or creates loops, the pass should +separately estimate new trip counts based on the estimated trip counts that +``llvm::getLoopEstimatedTripCount`` returns at the start of the pass, and the +pass should record the new estimates by calling +``llvm::setLoopEstimatedTripCount``, which creates or updates +``llvm.loop.estimated_trip_count`` metadata. Once this metadata is present on a +loop, ``llvm::getLoopEstimatedTripCount`` returns its value instead of +estimating the trip count from the loop's ``branch_weights`` metadata. + '``llvm.licm.disable``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20514,7 +20569,7 @@ Note that it has the following implications: - If ``%cnt`` is non-zero, the return value is non-zero as well. - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``. -'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic +'``llvm.vector.partial.reduce.add.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -20523,15 +20578,15 @@ This is an overloaded intrinsic. 
:: - declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) - declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) - declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) - declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) + declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) + declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) Overview: """"""""" -The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the +The '``llvm.vector.partial.reduce.add.*``' intrinsics reduce the concatenation of the two vector arguments down to the number of elements of the result vector type. diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst index e297d3883c39e..8cbe4eeb8a869 100644 --- a/llvm/docs/Passes.rst +++ b/llvm/docs/Passes.rst @@ -16,15 +16,15 @@ Introduction is most likely incomplete. It is possible to list passes known by the opt tool using ``opt -print-passes``. -This document serves as a high level summary of the optimization features that +This document serves as a high-level summary of the optimization features that LLVM provides. Optimizations are implemented as Passes that traverse some portion of a program to either collect information or transform the program. The table below divides the passes that LLVM provides into three categories. Analysis passes compute information that other passes can use or for debugging or program visualization purposes. Transform passes can use (or invalidate) the analysis passes. Transform passes all mutate the program in some way. 
-Utility passes provides some utility but don't otherwise fit categorization. -For example passes to extract functions to bitcode or write a module to bitcode +Utility passes provide some utility but don't otherwise fit categorization. +For example, passes to extract functions to bitcode or write a module to bitcode are neither analysis nor transform passes. The table of contents above provides a quick summary of each pass and links to the more complete pass description later in the document. @@ -61,7 +61,7 @@ Yet to be written. ``da``: Dependence Analysis --------------------------- -Dependence analysis framework, which is used to detect dependences in memory +Dependence analysis framework, which is used to detect dependencies in memory accesses. ``domfrontier``: Dominance Frontier Construction @@ -90,7 +90,7 @@ postscript or some other suitable format. This pass, only available in ``opt``, prints the control flow graph into a ``.dot`` graph. This graph can then be processed with the :program:`dot` tool to convert it to postscript or some other suitable format. -Additionally the ``-cfg-func-name=`` option can be used to filter the +Additionally, the ``-cfg-func-name=`` option can be used to filter the functions that are printed. All functions that contain the specified substring will be printed. @@ -101,7 +101,7 @@ This pass, only available in ``opt``, prints the control flow graph into a ``.dot`` graph, omitting the function bodies. This graph can then be processed with the :program:`dot` tool to convert it to postscript or some other suitable format. -Additionally the ``-cfg-func-name=`` option can be used to filter the +Additionally, the ``-cfg-func-name=`` option can be used to filter the functions that are printed. All functions that contain the specified substring will be printed. @@ -240,7 +240,7 @@ standard error in a human-readable form. 
--------------------------------------------------- This pass, only available in ``opt``, prints the SCCs of each function CFG to -standard error in a human-readable fom. +standard error in a human-readable form. ``function(print)``: Print function to stderr --------------------------------------------- @@ -486,7 +486,7 @@ Bottom-up inlining of functions into callees. ``instcombine``: Combine redundant instructions ----------------------------------------------- -Combine instructions to form fewer, simple instructions. This pass does not +Combine instructions to form fewer, simpler instructions. This pass does not modify the CFG. This pass is where algebraic simplification happens. This pass combines things like: @@ -502,7 +502,7 @@ into: %Z = add i32 %X, 2 -This is a simple worklist driven algorithm. +This is a simple worklist-driven algorithm. This pass guarantees that the following canonicalizations are performed on the program: @@ -532,9 +532,9 @@ library calls on different targets. ``aggressive-instcombine``: Combine expression patterns -------------------------------------------------------- -Combine expression patterns to form expressions with fewer, simple instructions. +Combine expression patterns to form expressions with fewer, simpler instructions. -For example, this pass reduce width of expressions post-dominated by TruncInst +For example, this pass reduces the width of expressions post-dominated by ``TruncInst`` into smaller width when applicable. It differs from instcombine pass in that it can modify CFG and contains pattern @@ -722,7 +722,7 @@ determine the trip counts of loops easily. --------------------------------------------- This pass implements a simple unroll and jam classical loop optimisation pass. -It transforms loop from: +It transforms a loop from: .. code-block:: c++ @@ -799,11 +799,11 @@ This pass looks for equivalent functions that are mergeable and folds them. 
Total-ordering is introduced among the functions set: we define comparison that answers for every two functions which of them is greater. It allows to -arrange functions into the binary tree. +arrange functions into a binary tree. -For every new function we check for equivalent in tree. +For every new function we check for equivalent in the tree. -If equivalent exists we fold such functions. If both functions are overridable, +If equivalent exists, we fold such functions. If both functions are overridable, we move the functionality into a new internal function and leave two overridable thunks to it. @@ -838,7 +838,7 @@ For example: 4 + (x + 5) ⇒ x + (4 + 5) In the implementation of this algorithm, constants are assigned rank = 0, function arguments are rank = 1, and other values are assigned ranks -corresponding to the reverse post order traversal of current function (starting +corresponding to the reverse post-order traversal of the current function (starting at 2), which effectively gives values in deep loops higher rank than values not in loops. @@ -1019,7 +1019,7 @@ noisy. ``verify``: Module Verifier --------------------------- -Verifies an LLVM IR code. This is useful to run after an optimization which is +Verifies LLVM IR code. This is useful to run after an optimization which is undergoing testing. Note that llvm-as verifies its input before emitting bitcode, and also that malformed bitcode is likely to make LLVM crash. All language front-ends are therefore encouraged to verify their output before @@ -1059,7 +1059,7 @@ instead just tries to ensure that code is well-formed. ---------------------------------- Displays the control flow graph using the GraphViz tool. -Additionally the ``-cfg-func-name=`` option can be used to filter the +Additionally, the ``-cfg-func-name=`` option can be used to filter the functions that are displayed. All functions that contain the specified substring will be displayed. @@ -1068,7 +1068,7 @@ will be displayed. 
Displays the control flow graph using the GraphViz tool, but omitting function bodies. -Additionally the ``-cfg-func-name=`` option can be used to filter the +Additionally, the ``-cfg-func-name=`` option can be used to filter the functions that are displayed. All functions that contain the specified substring will be displayed. diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index 0fa91bcfe2d10..602922fcb3b9c 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -1169,7 +1169,8 @@ It also supports a `level` argument to control the verbosity of the output. LDBG(2) << "I am here!"; -A ``DEBUG_TYPE`` macro should be defined in the file before using ``LDBG()``. +A ``DEBUG_TYPE`` macro may optionally be defined in the file before using +``LDBG()``, otherwise the file name is used as the debug type. The file name and line number are automatically added to the output, as well as a terminating newline. @@ -1180,7 +1181,7 @@ The debug output can be enabled by passing the ``-debug`` command line argument. $ opt < a.bc > /dev/null -mypass $ opt < a.bc > /dev/null -mypass -debug - [my-pass:2] MyPass.cpp:123 I am here! + [my-pass MyPass.cpp:123 2] I am here! While `LDBG()` is useful to add debug output to your code, there are cases where you may need to guard a block of code with a debug check. The @@ -1222,29 +1223,29 @@ Fine grained debug info with ``DEBUG_TYPE`` and the ``-debug-only`` option Sometimes you may find yourself in a situation where enabling ``-debug`` just turns on **too much** information (such as when working on the code generator). If you want to enable debug information with more fine-grained control, you -should define the ``DEBUG_TYPE`` macro and use the ``-debug-only`` option as -follows: +can control the debug type and level associated with each logging statement +as follows: ..
code-block:: c++ - #define DEBUG_TYPE "foo" + #define DEBUG_TYPE "foo" // Optional: the file name is used instead if not defined LDBG(2) << "Hello,"; // DEBUG_TYPE can be overridden locally, here with "bar" LDBG("bar", 3) << "'bar' debug type"; -A more fine-grained control can be achieved by passing the ``-debug-only`` -command line argument: +A more fine-grained control of the output can be achieved by passing the +``-debug-only`` command line argument: .. code-block:: none $ opt < a.bc > /dev/null -mypass -debug-only=foo - [foo:2] MyPass.cpp:123 Hello, + [foo MyPass.cpp:123 2] Hello, $ opt < a.bc > /dev/null -mypass -debug-only=foo,bar - [foo:2] MyPass.cpp:123 Hello, - [bar:3] MyPass.cpp:124 World! + [foo MyPass.cpp:123 2] Hello, + [bar MyPass.cpp:124 3] World! $ opt < a.bc > /dev/null -mypass -debug-only=bar - [bar:3] MyPass.cpp:124 World! + [bar MyPass.cpp:124 3] World! The debug-only argument is a comma separated list of debug types and levels. The level is an optional integer setting the maximum debug level to enable: @@ -1252,9 +1253,9 @@ The level is an optional integer setting the maximum debug level to enable: .. code-block:: none $ opt < a.bc > /dev/null -mypass -debug-only=foo:2,bar:2 - [foo:2] MyPass.cpp:123 Hello, + [foo MyPass.cpp:123 2] Hello, $ opt < a.bc > /dev/null -mypass -debug-only=foo:1,bar:3 - [bar:3] MyPass.cpp:124 World! + [bar MyPass.cpp:124 3] World! Instead of opting in specific debug types, the ``-debug-only`` option also works to filter out debug output for specific debug types, by omitting the @@ -1263,7 +1264,7 @@ level (or setting it to 0): .. code-block:: none $ opt < a.bc > /dev/null -mypass -debug-only=foo: - [bar:3] MyPass.cpp:124 World! + [bar MyPass.cpp:124 3] World! 
$ opt < a.bc > /dev/null -mypass -debug-only=bar:0,foo: diff --git a/llvm/docs/QualGroup.rst b/llvm/docs/QualGroup.rst index 61ef4fd2d4b07..b83e2808d9373 100644 --- a/llvm/docs/QualGroup.rst +++ b/llvm/docs/QualGroup.rst @@ -41,17 +41,141 @@ The Qualification Group aims to: The group is non-enforcing and does not control any part of the codebase. All technical decisions remain subject to the standard LLVM review and governance process. +Group Composition +================= + +Group Members +------------- + +The members of the LLVM Qualification Group represent a diverse cross-section of the LLVM community, including individual contributors, researchers, vendor representatives, and experts in the field of software qualification, including reliability, quality, safety, and/or security. +They meet the criteria for inclusion below. Knowing their handles help us keep track of who’s who across platforms, coordinate activities, and recognize contributions. + +.. list-table:: + :widths: 20 20 20 20 20 + :header-rows: 1 + + * - Name + - Affiliation + - Discourse handle + - Discord handle + - GitHub handle + * - Alan Phipps + - Texas Instruments + - evodius96 + - \- + - evodius96 + * - Carlos Andrés Ramírez + - Woven by Toyota + - CarlosAndresRamirez + - carlos\_andres\_ramirez + - CarlosAndresRamirez + * - Davide Cunial + - BMW A.G. + - capitan-davide + - capitan_davide + - capitan-davide + * - Oscar Slotosch + - Validas + - slotosch + - oscarslotosch_66740 + - \- + * - Petar Jovanovic + - HTECH + - petarj + - petarjovanovic_18635 + - petar-jovanovic + * - Petter Berntsson + - Arm Limited + - petbernt + - petbernt + - petbernt + * - Wendi Urribarri + - Woven by Toyota + - uwendi + - uwendi + - uwendi + * - YoungJun Lee + - NSHC + - YoungJunLee + - YoungJunLee + - IamYJLee + + +Organizations are limited to three representatives within the group to maintain diversity. + Participation -============= +------------- -Participation is open to anyone interested. 
There are several ways to get involved: +There are several ways to participate: * Join discussions on the `LLVM Discourse `_ forum, under the "Community" category. -* Engage in conversations on the LLVM Community Discord in the `#fusa-qual-wg `_ channel. +* Engage in conversations on the LLVM Community Discord in the `#fusa-qual-wg `_ channel. Note: You need to join the community's `Discord chat server `_ first. * Join our monthly sync-up calls. Details on working sessions and meeting minutes are shared on the :doc:`GettingInvolved` page. * Contribute ideas, feedback, or patches via GitHub, Discourse, or directly in working documents. -We welcome contributors from diverse backgrounds, organizations, and experience levels. +Contribution Principles +----------------------- + +We understand that most members contribute in a limited capacity due to their primary responsibilities. This initiative is volunteer-driven, and we operate with the following shared principles: + +* **Acknowledgement of limited bandwidth:** We recognize that no one is working full-time on this group, and participation will vary based on individual availability and priorities. +* **Small and consistent contributions are valuable:** We believe that steady ongoing contributions, even if minimal, are crucial for long-term success, as long as there is coordination and respect for each other's time. Even small contributions (e.g., a few hours per month) can significantly advance the group's goals and have an impact. +* **Realistic progress expectations:** Given the voluntary nature and no full-time involvement, we expect our progress to be slow. This group was initiated in July 2025. Concrete outcomes in 1-2 years would be considered excellent for this type of cross-company and voluntary collaboration. +* **Respect for differing capacities:** We value every member’s engagement, whether large or small, often or sporadically, as it all contributes to the overall effort. 
Even contributions that may seem small, such as sharing an idea or pointing out a relevant resource, are meaningful and important. + +However, we need a balance between flexibility, structure, and enough organization to move forward together. Members are expected to remain engaged through one or more of the following: + +* Regular participation in meetings or asynchronous discussions. +* Contributions to qualification artifacts, methodologies, or documentation. +* Active involvement in at least one qualification-related task over the past year. + +Membership Criteria +------------------- + +Membership in the LLVM Qualification Group is intended for individuals with relevant experience or active engagement in qualification-related efforts. Categories include: + +**Individual Contributors** + + * Experience in software/tool qualification (e.g., reliability, quality, safety, security); OR + * Active involvement in LLVM-related qualification efforts; OR + * Significant LLVM contributions related to qualification in the past year (code, discussion, resolving related challenges). + +**Researchers** + + * Active research, publication, or development of methodologies, frameworks, or tools aimed at improving LLVM quality and reliability. + +**Vendor Contacts** + + * Represent organizations building or using LLVM-based tools in safety-critical environments; OR + * Require involvement due to organizational role in qualification or compliance. + +Nomination Process +------------------ + +Individuals may nominate themselves or be nominated by an existing member. Nominations should: + +* Explain the nominee’s background and relevance to qualification efforts. +* Be submitted via this form: `Participant Introduction & Membership `_ +* Be communicated to an active LLVM Qualification Group member (e.g., on the Discord channel). + +Nominations are discussed within the group. If consensus is reached, the nominee is accepted. Otherwise, a majority vote will decide. 
+ +Membership Review +----------------- + +To ensure the group remains active and focused, member participation will be reviewed every six months. Inactive members may be removed following this review. + +Current Topics & Backlog +======================== + +Our working group is actively engaged in discussions about the project's +direction and tackling technical challenges. You can find our current +discussions, challenges, and the project backlog in the following +document: `Backlog `_ + +This document serves as our central hub for all ongoing topics and will +be updated regularly to reflect our progress. We welcome your +contributions and feedback. Current Topics & Backlog ======================== @@ -70,8 +194,7 @@ contributions and feedback. Meeting Materials ================= -Agendas, meeting notes, and presentation slides for the LLVM Qualification Working Group sync-ups -are shared to ensure transparency and continuity. +Agendas, meeting notes, and presentation slides for the sync-ups are shared to ensure transparency and continuity. Upcoming and past meeting agendas, and meeting minutes are published in a dedicated thread on the LLVM Discourse forum: `Meeting Agendas and Minutes `_ @@ -85,22 +208,9 @@ Available slides: * `July 2025 `_ * (add future entries here) -A future patch will migrate these slide files to the `llvm-www` repository, once +Note: A future patch will migrate these slide files to the `llvm-www` repository, once a suitable hosting location is confirmed with the community. -Contributors -============ - -The LLVM Qualification Working Group is a collaborative effort involving participants -from across the LLVM ecosystem. These include community members and industry contributors -with experience in compiler development, tool qualification, and functional safety. 
- -While contributor names are recorded in the meeting minutes for those who attend -sync-up calls, we also recognize contributions made asynchronously via Discord, GitHub, -and other discussion channels. - -All forms of constructive participation are valued and acknowledged. - Code of Conduct =============== @@ -135,6 +245,7 @@ Transparency and Openness Unacceptable Behavior --------------------- We will not tolerate: + * Harassment, discrimination, or exclusionary behavior. * Disruptive conduct in meetings or communication channels. * Using this group for marketing, lobbying, or promoting non-collaborative commercial agendas. diff --git a/llvm/docs/RISCVUsage.rst index d6c7b46485ccf..cfe090eddfa09 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -327,6 +327,9 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zalasr`` LLVM implements the `0.0.5 draft specification `__. +``experimental-zibi`` + LLVM implements the `0.1 release specification `__. + ``experimental-zicfilp``, ``experimental-zicfiss`` LLVM implements the `1.0 release specification `__. diff --git a/llvm/docs/ReleaseNotes.md index 16174553ba7f2..3c3799321606a 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -124,6 +124,7 @@ Changes to the RISC-V Backend using `$x` with an architecture string suffix is not yet supported. * Ssctr and Smctr extensions are no longer experimental. * Add support for Zvfbfa (Additional BF16 vector compute support) +* Add experimental support for the `Zibi` (Branch with Immediate) extension.
Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst index f79d7eacd5b6c..6ab33383e929b 100644 --- a/llvm/docs/TestingGuide.rst +++ b/llvm/docs/TestingGuide.rst @@ -35,7 +35,7 @@ tests are contained inside the LLVM repository itself under ``llvm/unittests`` and ``llvm/test`` respectively and are expected to always pass. They should be run before every commit. -The whole programs tests are referred to as the "LLVM test suite" (or +The whole-program tests are referred to as the "LLVM test suite" (or "test-suite") and are in the ``test-suite`` `repository on GitHub `_. For historical reasons, these tests are also referred to as the "nightly @@ -49,7 +49,7 @@ Unit tests are written using `Google Test `_ and are located in the ``llvm/unittests`` directory. In general, unit tests are reserved for targeting the support library and other -generic data structure, we prefer relying on regression tests for testing +generic data structures. We prefer relying on regression tests for testing transformations and analysis on the IR. Regression tests @@ -69,7 +69,7 @@ piece of LLVM IR distilled from an actual application or benchmark. Testing Analysis ---------------- -An analysis is a pass that infer properties on some part of the IR and not +An analysis is a pass to infer properties on some part of the IR without transforming it. They are tested in general using the same infrastructure as the regression tests, by creating a separate "Printer" pass to consume the analysis result and print it on the standard output in a textual format suitable for @@ -90,7 +90,7 @@ flags, and then executed to capture the program output and timing information. The output of these programs is compared to a reference output to ensure that the program is being compiled correctly.
-In addition to compiling and executing programs, whole program tests +In addition to compiling and executing programs, whole-program tests serve as a way of benchmarking LLVM performance, both in terms of the efficiency of the programs generated as well as the speed with which LLVM compiles, optimizes, and generates code. @@ -104,7 +104,7 @@ Debugging Information tests --------------------------- The test suite contains tests to check the quality of debugging information. -The tests are written in C based languages or in LLVM assembly language. +The tests are written in C-based languages or in LLVM assembly language. These tests are compiled and run under a debugger. The debugger output is checked to validate the debugging information. See ``README.txt`` in the @@ -139,7 +139,7 @@ To run all of the LLVM regression tests, use the ``check-llvm`` target: % make check-llvm In order to get reasonable testing performance, build LLVM and subprojects -in release mode, i.e. +in release mode, i.e., .. code-block:: bash @@ -159,7 +159,7 @@ variable to pass the required options to lit. For example, you can use: % make check LIT_OPTS="-v --vg --vg-leak" -to enable testing with valgrind and with leak checking enabled. +to enable testing with Valgrind and with leak checking enabled. To run individual tests or subsets of tests, you can use the ``llvm-lit`` script which is built as part of LLVM. For example, to run the @@ -202,13 +202,13 @@ The LLVM regression tests are driven by :program:`lit` and are located in the This directory contains a large array of small tests that exercise various features of LLVM and to ensure that regressions do not occur. -The directory is broken into several sub-directories, each focused on a +The directory is broken into several subdirectories, each focused on a particular area of LLVM. 
Writing new regression tests ---------------------------- -The regression test structure is very simple, but does require some +The regression test structure is very simple but does require some information to be set. This information is gathered via ``cmake`` and is written to a file, ``test/lit.site.cfg.py`` in the build directory. The ``llvm/test`` Makefile does this work for you. @@ -299,7 +299,7 @@ top to indicate that assertions were automatically generated. If you want to update assertions in an existing test case, pass the `-u` option which first checks the ``NOTE:`` line exists and matches the script name. -Sometimes a test absolutely depends on hand-written assertions and should not +Sometimes, a test absolutely depends on hand-written assertions and should not have assertions automatically generated. In that case, add the text ``NOTE: Do not autogenerate`` to the first line, and the scripts will skip that test. It is a good idea to explain why generated assertions will not work for the test @@ -428,7 +428,7 @@ For convenience, these are the contents: !llvm.ident = !{!0} !0 = metadata !{metadata !"Compiler V3"} -For symmetry reasons, ``ident.ll`` is just a dummy file that doesn't +For symmetry, ``ident.ll`` is just a dummy file that doesn't actually participate in the test besides holding the ``RUN:`` lines. .. note:: @@ -470,7 +470,7 @@ content. The script will prepare extra files with ``split-file``, invoke ``gen``, and then rewrite the part after ``gen`` with its stdout. -For convenience, if the test needs one single assembly file, you can also wrap +For convenience, if the test needs a single assembly file, you can also wrap ``gen`` and its required files with ``.ifdef`` and ``.endif``. Then you can skip ``split-file`` in ``RUN`` lines. @@ -869,7 +869,7 @@ Additional substitutions can be defined as follows: substitutions for all tests in a test directory. They do so by extending the substitution list, ``config.substitutions``. 
Each item in the list is a tuple consisting of a pattern and its replacement, which lit applies as plain text - (even if it contains sequences that python's ``re.sub`` considers to be + (even if it contains sequences that Python's ``re.sub`` considers to be escape sequences). - To define substitutions within a single test file, lit supports the ``DEFINE:`` and ``REDEFINE:`` directives, described in detail below. So that @@ -976,7 +976,7 @@ directives: colons. This syntax has a few advantages: - It is impossible for ``%{name}`` to contain sequences that are special in - python's ``re.sub`` patterns. Otherwise, attempting to specify + Python's ``re.sub`` patterns. Otherwise, attempting to specify ``%{name}`` as a substitution pattern in a lit configuration file could produce confusing expansions. - The braces help avoid the possibility that another substitution's pattern @@ -1039,7 +1039,7 @@ To address such use cases, lit configuration files support to specify the maximum number of passes through the substitution list. Thus, in the above example, setting the limit to 2 would cause lit to make a second pass that expands ``%{inner}`` in the ``RUN:`` line, and the output from the ``echo`` -command when then be: +command would then be: .. code-block:: shell @@ -1094,7 +1094,7 @@ a test fails. Finally, any line that contains "END." will cause the special interpretation of lines to terminate. This is generally done right after -the last RUN: line. This has two side effects: +the last ``RUN:`` line. 
This has two side effects: (a) it prevents special interpretation of lines that are part of the test program, not the instructions to the test case, and diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 18dd7f30c5616..e13a2cb09a412 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -372,8 +372,8 @@ class DenseMapBase : public DebugEpochBase { // Ensure that "NumEntries * 4 < NumBuckets * 3" if (NumEntries == 0) return 0; - // +1 is required because of the strict equality. - // For example if NumEntries is 48, we need to return 401. + // +1 is required because of the strict inequality. + // For example, if NumEntries is 48, we need to return 128. return NextPowerOf2(NumEntries * 4 / 3 + 1); } @@ -710,9 +710,11 @@ class DenseMap : public DenseMapBase, unsigned NumBuckets; public: - /// Create a DenseMap with an optional \p InitialReserve that guarantee that - /// this number of elements can be inserted in the map without grow() - explicit DenseMap(unsigned InitialReserve = 0) { init(InitialReserve); } + /// Create a DenseMap with an optional \p NumElementsToReserve to guarantee + /// that this number of elements can be inserted in the map without grow(). 
+ explicit DenseMap(unsigned NumElementsToReserve = 0) { + init(NumElementsToReserve); + } DenseMap(const DenseMap &other) : BaseT() { init(0); @@ -738,7 +740,7 @@ class DenseMap : public DenseMapBase, ~DenseMap() { this->destroyAll(); - deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); + deallocateBuckets(); } void swap(DenseMap &RHS) { @@ -758,7 +760,7 @@ class DenseMap : public DenseMapBase, DenseMap &operator=(DenseMap &&other) { this->destroyAll(); - deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); + deallocateBuckets(); init(0); swap(other); return *this; @@ -766,7 +768,7 @@ class DenseMap : public DenseMapBase, void copyFrom(const DenseMap &other) { this->destroyAll(); - deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); + deallocateBuckets(); if (allocateBuckets(other.NumBuckets)) { this->BaseT::copyFrom(other); } else { @@ -827,6 +829,10 @@ class DenseMap : public DenseMapBase, unsigned getNumBuckets() const { return NumBuckets; } + void deallocateBuckets() { + deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); + } + bool allocateBuckets(unsigned Num) { NumBuckets = Num; if (NumBuckets == 0) { @@ -883,10 +889,8 @@ class SmallDenseMap AlignedCharArrayUnion storage; public: - explicit SmallDenseMap(unsigned NumInitBuckets = 0) { - if (NumInitBuckets > InlineBuckets) - NumInitBuckets = llvm::bit_ceil(NumInitBuckets); - init(NumInitBuckets); + explicit SmallDenseMap(unsigned NumElementsToReserve = 0) { + init(NumElementsToReserve); } SmallDenseMap(const SmallDenseMap &other) : BaseT() { @@ -901,7 +905,7 @@ class SmallDenseMap template SmallDenseMap(const InputIt &I, const InputIt &E) { - init(NextPowerOf2(std::distance(I, E))); + init(std::distance(I, E)); this->insert(I, E); } @@ -1013,7 +1017,8 @@ class SmallDenseMap this->BaseT::copyFrom(other); } - void init(unsigned InitBuckets) { + void init(unsigned InitNumEntries) { + auto InitBuckets = 
BaseT::getMinBucketToReserveForEntries(InitNumEntries); Small = true; if (InitBuckets > InlineBuckets) { Small = false; diff --git a/llvm/include/llvm/ADT/DenseSet.h b/llvm/include/llvm/ADT/DenseSet.h index 281d4d1c78cc0..60ad9b2eb7762 100644 --- a/llvm/include/llvm/ADT/DenseSet.h +++ b/llvm/include/llvm/ADT/DenseSet.h @@ -250,20 +250,24 @@ bool operator!=(const DenseSetImpl &LHS, return !(LHS == RHS); } +template +using DenseSet = DenseSetImpl< + ValueT, DenseMap>, + ValueInfoT>; + +template +using SmallDenseSet = + DenseSetImpl>, + ValueInfoT>; + } // end namespace detail /// Implements a dense probed hash-table based set. template > -class DenseSet : public detail::DenseSetImpl< - ValueT, - DenseMap>, - ValueInfoT> { - using BaseT = - detail::DenseSetImpl>, - ValueInfoT>; +class DenseSet : public detail::DenseSet { + using BaseT = detail::DenseSet; public: using BaseT::BaseT; @@ -274,16 +278,8 @@ class DenseSet : public detail::DenseSetImpl< template > class SmallDenseSet - : public detail::DenseSetImpl< - ValueT, - SmallDenseMap>, - ValueInfoT> { - using BaseT = detail::DenseSetImpl< - ValueT, - SmallDenseMap>, - ValueInfoT>; + : public detail::SmallDenseSet { + using BaseT = detail::SmallDenseSet; public: using BaseT::BaseT; diff --git a/llvm/include/llvm/ADT/EnumeratedArray.h b/llvm/include/llvm/ADT/EnumeratedArray.h index 93e1327306175..2fe6be434f11b 100644 --- a/llvm/include/llvm/ADT/EnumeratedArray.h +++ b/llvm/include/llvm/ADT/EnumeratedArray.h @@ -15,8 +15,9 @@ #ifndef LLVM_ADT_ENUMERATEDARRAY_H #define LLVM_ADT_ENUMERATEDARRAY_H +#include "llvm/ADT/STLExtras.h" +#include #include -#include namespace llvm { @@ -24,12 +25,15 @@ template (LargestEnum)> class EnumeratedArray { -public: - using iterator = ValueType *; - using const_iterator = const ValueType *; + static_assert(Size > 0); + using ArrayTy = std::array; + ArrayTy Underlying; - using const_reverse_iterator = std::reverse_iterator; - using reverse_iterator = std::reverse_iterator; +public: + 
using iterator = typename ArrayTy::iterator; + using const_iterator = typename ArrayTy::const_iterator; + using reverse_iterator = typename ArrayTy::reverse_iterator; + using const_reverse_iterator = typename ArrayTy::const_reverse_iterator; using value_type = ValueType; using reference = ValueType &; @@ -38,16 +42,10 @@ class EnumeratedArray { using const_pointer = const ValueType *; EnumeratedArray() = default; - EnumeratedArray(ValueType V) { - for (IndexType IX = 0; IX < Size; ++IX) { - Underlying[IX] = V; - } - } + EnumeratedArray(ValueType V) { Underlying.fill(V); } EnumeratedArray(std::initializer_list Init) { assert(Init.size() == Size && "Incorrect initializer size"); - for (IndexType IX = 0; IX < Size; ++IX) { - Underlying[IX] = *(Init.begin() + IX); - } + llvm::copy(Init, Underlying.begin()); } const ValueType &operator[](Enumeration Index) const { @@ -62,23 +60,15 @@ class EnumeratedArray { IndexType size() const { return Size; } bool empty() const { return size() == 0; } - iterator begin() { return Underlying; } - const_iterator begin() const { return Underlying; } - - iterator end() { return begin() + size(); } - const_iterator end() const { return begin() + size(); } - - reverse_iterator rbegin() { return reverse_iterator(end()); } - const_reverse_iterator rbegin() const { - return const_reverse_iterator(end()); - } - reverse_iterator rend() { return reverse_iterator(begin()); } - const_reverse_iterator rend() const { - return const_reverse_iterator(begin()); - } + iterator begin() { return Underlying.begin(); } + const_iterator begin() const { return Underlying.begin(); } + iterator end() { return Underlying.end(); } + const_iterator end() const { return Underlying.end(); } -private: - ValueType Underlying[Size]; + reverse_iterator rbegin() { return Underlying.rbegin(); } + const_reverse_iterator rbegin() const { return Underlying.rbegin(); } + reverse_iterator rend() { return Underlying.rend(); } + const_reverse_iterator rend() const { return 
Underlying.rend(); } }; } // namespace llvm diff --git a/llvm/include/llvm/ADT/Hashing.h b/llvm/include/llvm/ADT/Hashing.h index ec22fe3a28cf9..41a730e24a6b1 100644 --- a/llvm/include/llvm/ADT/Hashing.h +++ b/llvm/include/llvm/ADT/Hashing.h @@ -333,20 +333,21 @@ inline uint64_t get_execution_seed() { // for equality. For all the platforms we care about, this holds for integers // and pointers, but there are platforms where it doesn't and we would like to // support user-defined types which happen to satisfy this property. -template struct is_hashable_data - : std::integral_constant::value || - std::is_pointer::value) && - 64 % sizeof(T) == 0)> {}; +template +struct is_hashable_data : std::bool_constant<((is_integral_or_enum::value || + std::is_pointer::value) && + 64 % sizeof(T) == 0)> {}; // Special case std::pair to detect when both types are viable and when there // is no alignment-derived padding in the pair. This is a bit of a lie because // std::pair isn't truly POD, but it's close enough in all reasonable // implementations for our use case of hashing the underlying data. -template struct is_hashable_data > - : std::integral_constant::value && - is_hashable_data::value && - (sizeof(T) + sizeof(U)) == - sizeof(std::pair))> {}; +template +struct is_hashable_data> + : std::bool_constant<(is_hashable_data::value && + is_hashable_data::value && + (sizeof(T) + sizeof(U)) == sizeof(std::pair))> { +}; /// Helper to get the hashable data representation for a type. 
template auto get_hashable_data(const T &value) { diff --git a/llvm/include/llvm/ADT/IndexedMap.h b/llvm/include/llvm/ADT/IndexedMap.h index b1ebbdd1bfd54..cda0316dc78fa 100644 --- a/llvm/include/llvm/ADT/IndexedMap.h +++ b/llvm/include/llvm/ADT/IndexedMap.h @@ -20,67 +20,56 @@ #ifndef LLVM_ADT_INDEXEDMAP_H #define LLVM_ADT_INDEXEDMAP_H -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/identity.h" #include namespace llvm { -template > - class IndexedMap { - using IndexT = typename ToIndexT::argument_type; - // Prefer SmallVector with zero inline storage over std::vector. IndexedMaps - // can grow very large and SmallVector grows more efficiently as long as T - // is trivially copyable. - using StorageT = SmallVector; - - StorageT storage_; - T nullVal_; - ToIndexT toIndex_; - - public: - IndexedMap() : nullVal_(T()) {} - - explicit IndexedMap(const T& val) : nullVal_(val) {} - - typename StorageT::reference operator[](IndexT n) { - assert(toIndex_(n) < storage_.size() && "index out of bounds!"); - return storage_[toIndex_(n)]; - } - - typename StorageT::const_reference operator[](IndexT n) const { - assert(toIndex_(n) < storage_.size() && "index out of bounds!"); - return storage_[toIndex_(n)]; - } - - void reserve(typename StorageT::size_type s) { - storage_.reserve(s); - } - - void resize(typename StorageT::size_type s) { - storage_.resize(s, nullVal_); - } - - void clear() { - storage_.clear(); - } - - void grow(IndexT n) { - unsigned NewSize = toIndex_(n) + 1; - if (NewSize > storage_.size()) - resize(NewSize); - } - - bool inBounds(IndexT n) const { - return toIndex_(n) < storage_.size(); - } - - typename StorageT::size_type size() const { - return storage_.size(); - } - }; - -} // end namespace llvm +template > class IndexedMap { + using IndexT = typename ToIndexT::argument_type; + // Prefer SmallVector with zero inline storage over std::vector. 
IndexedMaps + // can grow very large and SmallVector grows more efficiently as long as T + // is trivially copyable. + using StorageT = SmallVector; + + StorageT storage_; + T nullVal_; + ToIndexT toIndex_; + +public: + IndexedMap() : nullVal_(T()) {} + + explicit IndexedMap(const T &val) : nullVal_(val) {} + + typename StorageT::reference operator[](IndexT n) { + assert(toIndex_(n) < storage_.size() && "index out of bounds!"); + return storage_[toIndex_(n)]; + } + + typename StorageT::const_reference operator[](IndexT n) const { + assert(toIndex_(n) < storage_.size() && "index out of bounds!"); + return storage_[toIndex_(n)]; + } + + void reserve(typename StorageT::size_type s) { storage_.reserve(s); } + + void resize(typename StorageT::size_type s) { storage_.resize(s, nullVal_); } + + void clear() { storage_.clear(); } + + void grow(IndexT n) { + unsigned NewSize = toIndex_(n) + 1; + if (NewSize > storage_.size()) + resize(NewSize); + } + + bool inBounds(IndexT n) const { return toIndex_(n) < storage_.size(); } + + typename StorageT::size_type size() const { return storage_.size(); } +}; + +} // namespace llvm #endif // LLVM_ADT_INDEXEDMAP_H diff --git a/llvm/include/llvm/ADT/PackedVector.h b/llvm/include/llvm/ADT/PackedVector.h index b6bb6a4738067..1146cc4bd6d23 100644 --- a/llvm/include/llvm/ADT/PackedVector.h +++ b/llvm/include/llvm/ADT/PackedVector.h @@ -31,14 +31,14 @@ class PackedVectorBase { static T getValue(const BitVectorTy &Bits, unsigned Idx) { T val = T(); for (unsigned i = 0; i != BitNum; ++i) - val = T(val | ((Bits[(Idx << (BitNum-1)) + i] ? 1UL : 0UL) << i)); + val = T(val | ((Bits[(Idx * BitNum) + i] ? 
1UL : 0UL) << i)); return val; } static void setValue(BitVectorTy &Bits, unsigned Idx, T val) { assert((val >> BitNum) == 0 && "value is too big"); for (unsigned i = 0; i != BitNum; ++i) - Bits[(Idx << (BitNum-1)) + i] = val & (T(1) << i); + Bits[(Idx * BitNum) + i] = val & (T(1) << i); } }; @@ -48,8 +48,8 @@ class PackedVectorBase { static T getValue(const BitVectorTy &Bits, unsigned Idx) { T val = T(); for (unsigned i = 0; i != BitNum-1; ++i) - val = T(val | ((Bits[(Idx << (BitNum-1)) + i] ? 1UL : 0UL) << i)); - if (Bits[(Idx << (BitNum-1)) + BitNum-1]) + val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i)); + if (Bits[(Idx * BitNum) + BitNum - 1]) val = ~val; return val; } @@ -57,11 +57,11 @@ class PackedVectorBase { static void setValue(BitVectorTy &Bits, unsigned Idx, T val) { if (val < 0) { val = ~val; - Bits.set((Idx << (BitNum-1)) + BitNum-1); + Bits.set((Idx * BitNum) + BitNum - 1); } assert((val >> (BitNum-1)) == 0 && "value is too big"); for (unsigned i = 0; i != BitNum-1; ++i) - Bits[(Idx << (BitNum-1)) + i] = val & (T(1) << i); + Bits[(Idx * BitNum) + i] = val & (T(1) << i); } }; @@ -76,6 +76,10 @@ template class PackedVector : public PackedVectorBase::is_signed> { BitVectorTy Bits; + // Keep track of the number of elements on our own. + // We always maintain Bits.size() == NumElements * BitNum. + // Used to avoid an integer division in size(). 
+ unsigned NumElements = 0; using base = PackedVectorBase::is_signed>; @@ -99,17 +103,24 @@ class PackedVector : public PackedVectorBase> (BitNum - 1); } + unsigned size() const { return NumElements; } - void clear() { Bits.clear(); } + void clear() { + Bits.clear(); + NumElements = 0; + } - void resize(unsigned N) { Bits.resize(N << (BitNum - 1)); } + void resize(unsigned N) { + Bits.resize(N * BitNum); + NumElements = N; + } - void reserve(unsigned N) { Bits.reserve(N << (BitNum-1)); } + void reserve(unsigned N) { Bits.reserve(N * BitNum); } PackedVector &reset() { Bits.reset(); diff --git a/llvm/include/llvm/ADT/PointerIntPair.h b/llvm/include/llvm/ADT/PointerIntPair.h index 9cfc65846d5bf..75e3a58e7ca61 100644 --- a/llvm/include/llvm/ADT/PointerIntPair.h +++ b/llvm/include/llvm/ADT/PointerIntPair.h @@ -173,15 +173,14 @@ struct PointerIntPairInfo { "PointerIntPair with integer size too large for pointer"); enum MaskAndShiftConstants : uintptr_t { /// PointerBitMask - The bits that come from the pointer. - PointerBitMask = - ~(uintptr_t)(((intptr_t)1 << PtrTraits::NumLowBitsAvailable) - 1), + PointerBitMask = (~(uintptr_t)0) << PtrTraits::NumLowBitsAvailable, /// IntShift - The number of low bits that we reserve for other uses, and /// keep zero. IntShift = (uintptr_t)PtrTraits::NumLowBitsAvailable - IntBits, /// IntMask - This is the unshifted mask for valid bits of the int type. - IntMask = (uintptr_t)(((intptr_t)1 << IntBits) - 1), + IntMask = ((uintptr_t)1 << IntBits) - 1, // ShiftedIntMask - This is the bits for the integer shifted in place. 
ShiftedIntMask = (uintptr_t)(IntMask << IntShift) diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index 099bef288b953..4e7e42e9f4a5f 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -113,6 +114,13 @@ using is_one_of = std::disjunction...>; template using are_base_of = std::conjunction...>; +/// traits class for checking whether type `T` is same as all other types in +/// `Ts`. +template +using all_types_equal = std::conjunction...>; +template +constexpr bool all_types_equal_v = all_types_equal::value; + /// Determine if all types in Ts are distinct. /// /// Useful to statically assert when Ts is intended to describe a non-multi set @@ -995,13 +1003,17 @@ class concat_iterator static constexpr bool ReturnsByValue = !(std::is_reference_v())> && ...); - + static constexpr bool ReturnsConvertibleType = + !all_types_equal_v< + std::remove_cv_t, + remove_cvref_t())>...> && + (std::is_convertible_v()), ValueT> && ...); + + // Cannot return a reference type if a conversion takes place, provided that + // the result of dereferencing all `IterTs...` is convertible to `ValueT`. using reference_type = - typename std::conditional_t; - - using handle_type = - typename std::conditional_t, - ValueT *>; + std::conditional_t; /// We store both the current and end iterators for each concatenated /// sequence in a tuple of pairs. @@ -1012,49 +1024,38 @@ class concat_iterator std::tuple Begins; std::tuple Ends; - /// Attempts to increment a specific iterator. - /// - /// Returns true if it was able to increment the iterator. Returns false if - /// the iterator is already at the end iterator. - template bool incrementHelper() { + /// Attempts to increment the `Index`-th iterator. If the iterator is already + /// at end, recurse over iterators in `Others...`. 
+ template void incrementImpl() { auto &Begin = std::get(Begins); auto &End = std::get(Ends); - if (Begin == End) - return false; - + if (Begin == End) { + if constexpr (sizeof...(Others) != 0) + return incrementImpl(); + llvm_unreachable("Attempted to increment an end concat iterator!"); + } ++Begin; - return true; } /// Increments the first non-end iterator. /// /// It is an error to call this with all iterators at the end. template void increment(std::index_sequence) { - // Build a sequence of functions to increment each iterator if possible. - bool (concat_iterator::*IncrementHelperFns[])() = { - &concat_iterator::incrementHelper...}; - - // Loop over them, and stop as soon as we succeed at incrementing one. - for (auto &IncrementHelperFn : IncrementHelperFns) - if ((this->*IncrementHelperFn)()) - return; - - llvm_unreachable("Attempted to increment an end concat iterator!"); + incrementImpl(); } - /// Returns null if the specified iterator is at the end. Otherwise, - /// dereferences the iterator and returns the address of the resulting - /// reference. - template handle_type getHelper() const { + /// Dereferences the `Index`-th iterator and returns the resulting reference. + /// If `Index` is at end, recurse over iterators in `Others...`. + template reference_type getImpl() const { auto &Begin = std::get(Begins); auto &End = std::get(Ends); - if (Begin == End) - return {}; - - if constexpr (ReturnsByValue) - return *Begin; - else - return &*Begin; + if (Begin == End) { + if constexpr (sizeof...(Others) != 0) + return getImpl(); + llvm_unreachable( + "Attempted to get a pointer from an end concat iterator!"); + } + return *Begin; } /// Finds the first non-end iterator, dereferences, and returns the resulting @@ -1062,16 +1063,7 @@ class concat_iterator /// /// It is an error to call this with all iterators at the end. template reference_type get(std::index_sequence) const { - // Build a sequence of functions to get from iterator if possible. 
- handle_type (concat_iterator::*GetHelperFns[])() - const = {&concat_iterator::getHelper...}; - - // Loop over them, and return the first result we find. - for (auto &GetHelperFn : GetHelperFns) - if (auto P = (this->*GetHelperFn)()) - return *P; - - llvm_unreachable("Attempted to get a pointer from an end concat iterator!"); + return getImpl(); } public: @@ -1694,6 +1686,12 @@ template constexpr size_t range_size(R &&Range) { return static_cast(std::distance(adl_begin(Range), adl_end(Range))); } +/// Wrapper for std::accumulate. +template auto accumulate(R &&Range, E &&Init) { + return std::accumulate(adl_begin(Range), adl_end(Range), + std::forward(Init)); +} + /// Provide wrappers to std::for_each which take ranges instead of having to /// pass begin/end explicitly. template diff --git a/llvm/include/llvm/ADT/SparseBitVector.h b/llvm/include/llvm/ADT/SparseBitVector.h index 7151af6146e6e..90e2336f9f488 100644 --- a/llvm/include/llvm/ADT/SparseBitVector.h +++ b/llvm/include/llvm/ADT/SparseBitVector.h @@ -119,8 +119,8 @@ template struct SparseBitVectorElement { size_type count() const { unsigned NumBits = 0; - for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i) - NumBits += llvm::popcount(Bits[i]); + for (BitWord Bit : Bits) + NumBits += llvm::popcount(Bit); return NumBits; } @@ -799,11 +799,8 @@ class SparseBitVector { unsigned count() const { unsigned BitCount = 0; - for (ElementListConstIter Iter = Elements.begin(); - Iter != Elements.end(); - ++Iter) - BitCount += Iter->count(); - + for (const SparseBitVectorElement &Elem : Elements) + BitCount += Elem.count(); return BitCount; } diff --git a/llvm/include/llvm/ADT/SparseMultiSet.h b/llvm/include/llvm/ADT/SparseMultiSet.h index d8dbe4023ea64..cf7603158b28b 100644 --- a/llvm/include/llvm/ADT/SparseMultiSet.h +++ b/llvm/include/llvm/ADT/SparseMultiSet.h @@ -21,9 +21,9 @@ #ifndef LLVM_ADT_SPARSEMULTISET_H #define LLVM_ADT_SPARSEMULTISET_H -#include "llvm/ADT/identity.h" #include "llvm/ADT/SmallVector.h" #include 
"llvm/ADT/SparseSet.h" +#include "llvm/ADT/identity.h" #include #include #include @@ -80,9 +80,8 @@ namespace llvm { /// @tparam KeyFunctorT A functor that computes an unsigned index from KeyT. /// @tparam SparseT An unsigned integer type. See above. /// -template, - typename SparseT = uint8_t> +template , + typename SparseT = uint8_t> class SparseMultiSet { static_assert(std::is_unsigned_v, "SparseT must be an unsigned integer type"); @@ -103,14 +102,10 @@ class SparseMultiSet { SMSNode(ValueT D, unsigned P, unsigned N) : Data(D), Prev(P), Next(N) {} /// List tails have invalid Nexts. - bool isTail() const { - return Next == INVALID; - } + bool isTail() const { return Next == INVALID; } /// Whether this node is a tombstone node, and thus is in our freelist. - bool isTombstone() const { - return Prev == INVALID; - } + bool isTombstone() const { return Prev == INVALID; } /// Since the list is circular in Prev, all non-tombstone nodes have a valid /// Prev. @@ -156,7 +151,7 @@ class SparseMultiSet { /// Add in the given SMSNode. Uses a free entry in our freelist if /// available. Returns the index of the added node. - unsigned addValue(const ValueT& V, unsigned Prev, unsigned Next) { + unsigned addValue(const ValueT &V, unsigned Prev, unsigned Next) { if (NumFree == 0) { Dense.push_back(SMSNode(V, Prev, Next)); return Dense.size() - 1; @@ -204,13 +199,13 @@ class SparseMultiSet { // seem like a likely use case, so we can add that code when we need it. assert(empty() && "Can only resize universe on an empty map"); // Hysteresis prevents needless reallocations. - if (U >= Universe/4 && U <= Universe) + if (U >= Universe / 4 && U <= Universe) return; free(Sparse); // The Sparse array doesn't actually need to be initialized, so malloc // would be enough here, but that will cause tools like valgrind to // complain about branching on uninitialized data. 
- Sparse = static_cast(safe_calloc(U, sizeof(SparseT))); + Sparse = static_cast(safe_calloc(U, sizeof(SparseT))); Universe = U; } @@ -232,7 +227,7 @@ class SparseMultiSet { unsigned SparseIdx; iterator_base(SMSPtrTy P, unsigned I, unsigned SI) - : SMS(P), Idx(I), SparseIdx(SI) {} + : SMS(P), Idx(I), SparseIdx(SI) {} /// Whether our iterator has fallen outside our dense vector. bool isEnd() const { @@ -273,9 +268,7 @@ class SparseMultiSet { return false; } - bool operator!=(const iterator_base &RHS) const { - return !operator==(RHS); - } + bool operator!=(const iterator_base &RHS) const { return !operator==(RHS); } /// Increment and decrement operators iterator_base &operator--() { // predecrement - Back up @@ -372,12 +365,10 @@ class SparseMultiSet { /// @param Key A valid key to find. /// @returns An iterator to the element identified by key, or end(). /// - iterator find(const KeyT &Key) { - return findIndex(KeyIndexOf(Key)); - } + iterator find(const KeyT &Key) { return findIndex(KeyIndexOf(Key)); } const_iterator find(const KeyT &Key) const { - iterator I = const_cast(this)->findIndex(KeyIndexOf(Key)); + iterator I = const_cast(this)->findIndex(KeyIndexOf(Key)); return const_iterator(I.SMS, I.Idx, KeyIndexOf(Key)); } @@ -392,9 +383,7 @@ class SparseMultiSet { } /// Returns true if this set contains an element identified by Key. - bool contains(const KeyT &Key) const { - return find(Key) != end(); - } + bool contains(const KeyT &Key) const { return find(Key) != end(); } /// Return the head and tail of the subset's list, otherwise returns end(). 
iterator getHead(const KeyT &Key) { return find(Key); } @@ -517,6 +506,6 @@ class SparseMultiSet { } }; -} // end namespace llvm +} // namespace llvm #endif // LLVM_ADT_SPARSEMULTISET_H diff --git a/llvm/include/llvm/ADT/SparseSet.h b/llvm/include/llvm/ADT/SparseSet.h index a8ebc9f786486..395cfc3ebfd43 100644 --- a/llvm/include/llvm/ADT/SparseSet.h +++ b/llvm/include/llvm/ADT/SparseSet.h @@ -20,8 +20,8 @@ #ifndef LLVM_ADT_SPARSESET_H #define LLVM_ADT_SPARSESET_H -#include "llvm/ADT/identity.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/identity.h" #include "llvm/Support/AllocatorBase.h" #include #include @@ -53,8 +53,7 @@ namespace llvm { /// /// For best results, ValueT should not require a destructor. /// -template -struct SparseSetValTraits { +template struct SparseSetValTraits { static unsigned getValIndex(const ValueT &Val) { return Val.getSparseSetIndex(); } @@ -64,7 +63,7 @@ struct SparseSetValTraits { /// generic implementation handles ValueT classes which either provide /// getSparseSetIndex() or specialize SparseSetValTraits<>. /// -template +template struct SparseSetValFunctor { unsigned operator()(const ValueT &Val) const { return SparseSetValTraits::getValIndex(Val); @@ -73,11 +72,9 @@ struct SparseSetValFunctor { /// SparseSetValFunctor - Helper class for the common case of /// identity key/value sets. -template +template struct SparseSetValFunctor { - unsigned operator()(const KeyT &Key) const { - return KeyFunctorT()(Key); - } + unsigned operator()(const KeyT &Key) const { return KeyFunctorT()(Key); } }; /// SparseSet - Fast set implementation for objects that can be identified by @@ -118,9 +115,8 @@ struct SparseSetValFunctor { /// @tparam KeyFunctorT A functor that computes an unsigned index from KeyT. /// @tparam SparseT An unsigned integer type. See above. 
/// -template, - typename SparseT = uint8_t> +template , + typename SparseT = uint8_t> class SparseSet { static_assert(std::is_unsigned_v, "SparseT must be an unsigned integer type"); @@ -162,7 +158,7 @@ class SparseSet { // seem like a likely use case, so we can add that code when we need it. assert(empty() && "Can only resize universe on an empty map"); // Hysteresis prevents needless reallocations. - if (U >= Universe/4 && U <= Universe) + if (U >= Universe / 4 && U <= Universe) return; // The Sparse array doesn't actually need to be initialized, so malloc // would be enough here, but that will cause tools like valgrind to @@ -226,12 +222,10 @@ class SparseSet { /// @param Key A valid key to find. /// @returns An iterator to the element identified by key, or end(). /// - iterator find(const KeyT &Key) { - return findIndex(KeyIndexOf(Key)); - } + iterator find(const KeyT &Key) { return findIndex(KeyIndexOf(Key)); } const_iterator find(const KeyT &Key) const { - return const_cast(this)->findIndex(KeyIndexOf(Key)); + return const_cast(this)->findIndex(KeyIndexOf(Key)); } /// Check if the set contains the given \c Key. @@ -267,9 +261,7 @@ class SparseSet { /// array subscript - If an element already exists with this key, return it. /// Otherwise, automatically construct a new value from Key, insert it, /// and return the newly inserted element. - ValueT &operator[](const KeyT &Key) { - return *insert(ValueT(Key)).first; - } + ValueT &operator[](const KeyT &Key) { return *insert(ValueT(Key)).first; } ValueT pop_back_val() { // Sparse does not need to be cleared, see find(). 
@@ -318,6 +310,6 @@ class SparseSet { } }; -} // end namespace llvm +} // namespace llvm #endif // LLVM_ADT_SPARSESET_H diff --git a/llvm/include/llvm/ADT/Statistic.h b/llvm/include/llvm/ADT/Statistic.h index 082e6d50577fa..75d608beb0134 100644 --- a/llvm/include/llvm/ADT/Statistic.h +++ b/llvm/include/llvm/ADT/Statistic.h @@ -131,8 +131,8 @@ class TrackingStatistic { class NoopStatistic { public: - NoopStatistic(const char * /*DebugType*/, const char * /*Name*/, - const char * /*Desc*/) {} + constexpr NoopStatistic(const char * /*DebugType*/, const char * /*Name*/, + const char * /*Desc*/) {} uint64_t getValue() const { return 0; } @@ -164,8 +164,13 @@ using Statistic = NoopStatistic; // STATISTIC - A macro to make definition of statistics really simple. This // automatically passes the DEBUG_TYPE of the file into the statistic. +#if LLVM_ENABLE_STATS #define STATISTIC(VARNAME, DESC) \ static llvm::Statistic VARNAME = {DEBUG_TYPE, #VARNAME, DESC} +#else +#define STATISTIC(VARNAME, DESC) \ + static llvm::Statistic VARNAME [[maybe_unused]] = {DEBUG_TYPE, #VARNAME, DESC} +#endif // ALWAYS_ENABLED_STATISTIC - A macro to define a statistic like STATISTIC but // it is enabled even if LLVM_ENABLE_STATS is off. diff --git a/llvm/include/llvm/ADT/Twine.h b/llvm/include/llvm/ADT/Twine.h index 4ed4898df5459..d9f9c0f0d5d9c 100644 --- a/llvm/include/llvm/ADT/Twine.h +++ b/llvm/include/llvm/ADT/Twine.h @@ -119,22 +119,19 @@ class Twine { /// An int value, to render as a signed decimal integer. DecIKind, - /// A pointer to an unsigned long value, to render as an unsigned decimal - /// integer. + /// An unsigned long value, to render as an unsigned decimal integer. DecULKind, - /// A pointer to a long value, to render as a signed decimal integer. + /// A long value, to render as a signed decimal integer. DecLKind, - /// A pointer to an unsigned long long value, to render as an unsigned - /// decimal integer. 
+ /// An unsigned long long value, to render as an unsigned decimal integer. DecULLKind, - /// A pointer to a long long value, to render as a signed decimal integer. + /// A long long value, to render as a signed decimal integer. DecLLKind, - /// A pointer to a uint64_t value, to render as an unsigned hexadecimal - /// integer. + /// A uint64_t value, to render as an unsigned hexadecimal integer. UHexKind }; @@ -150,11 +147,11 @@ class Twine { char character; unsigned int decUI; int decI; - const unsigned long *decUL; - const long *decL; - const unsigned long long *decULL; - const long long *decLL; - const uint64_t *uHex; + unsigned long decUL; + long decL; + unsigned long long decULL; + long long decLL; + uint64_t uHex; }; /// LHS - The prefix in the concatenation, which may be uninitialized for @@ -336,22 +333,18 @@ class Twine { explicit Twine(int Val) : LHSKind(DecIKind) { LHS.decI = Val; } /// Construct a twine to print \p Val as an unsigned decimal integer. - explicit Twine(const unsigned long &Val) : LHSKind(DecULKind) { - LHS.decUL = &Val; - } + explicit Twine(unsigned long Val) : LHSKind(DecULKind) { LHS.decUL = Val; } /// Construct a twine to print \p Val as a signed decimal integer. - explicit Twine(const long &Val) : LHSKind(DecLKind) { LHS.decL = &Val; } + explicit Twine(long Val) : LHSKind(DecLKind) { LHS.decL = Val; } /// Construct a twine to print \p Val as an unsigned decimal integer. - explicit Twine(const unsigned long long &Val) : LHSKind(DecULLKind) { - LHS.decULL = &Val; + explicit Twine(unsigned long long Val) : LHSKind(DecULLKind) { + LHS.decULL = Val; } /// Construct a twine to print \p Val as a signed decimal integer. - explicit Twine(const long long &Val) : LHSKind(DecLLKind) { - LHS.decLL = &Val; - } + explicit Twine(long long Val) : LHSKind(DecLLKind) { LHS.decLL = Val; } // FIXME: Unfortunately, to make sure this is as efficient as possible we // need extra binary constructors from particular types. 
We can't rely on @@ -389,9 +382,9 @@ class Twine { /// @{ // Construct a twine to print \p Val as an unsigned hexadecimal integer. - static Twine utohexstr(const uint64_t &Val) { + static Twine utohexstr(uint64_t Val) { Child LHS, RHS; - LHS.uHex = &Val; + LHS.uHex = Val; RHS.twine = nullptr; return Twine(LHS, UHexKind, RHS, EmptyKind); } diff --git a/llvm/include/llvm/ADT/bit.h b/llvm/include/llvm/ADT/bit.h index d6e33c3e6133a..67c0a1c3300fa 100644 --- a/llvm/include/llvm/ADT/bit.h +++ b/llvm/include/llvm/ADT/bit.h @@ -148,6 +148,35 @@ template >> return (Value != 0) && ((Value & (Value - 1)) == 0); } +/// Count the number of set bits in a value. +/// Ex. popcount(0xF000F000) = 8 +/// Returns 0 if Value is zero. +template [[nodiscard]] inline int popcount(T Value) noexcept { + static_assert(std::is_unsigned_v, "T must be an unsigned integer type"); + static_assert(sizeof(T) <= 8, "T must be 8 bytes or less"); + + if constexpr (sizeof(T) <= 4) { +#if defined(__GNUC__) + return (int)__builtin_popcount(Value); +#else + uint32_t V = Value; + V = V - ((V >> 1) & 0x55555555); + V = (V & 0x33333333) + ((V >> 2) & 0x33333333); + return int(((V + (V >> 4) & 0xF0F0F0F) * 0x1010101) >> 24); +#endif + } else { +#if defined(__GNUC__) + return (int)__builtin_popcountll(Value); +#else + uint64_t V = Value; + V = V - ((V >> 1) & 0x5555555555555555ULL); + V = (V & 0x3333333333333333ULL) + ((V >> 2) & 0x3333333333333333ULL); + V = (V + (V >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return int((uint64_t)(V * 0x0101010101010101ULL) >> 56); +#endif + } +} + /// Count number of 0's from the least significant bit to the most /// stopping at the first 1. /// @@ -161,7 +190,7 @@ template [[nodiscard]] int countr_zero(T Val) { return std::numeric_limits::digits; // Use the intrinsic if available. 
- if constexpr (sizeof(T) == 4) { + if constexpr (sizeof(T) <= 4) { #if __has_builtin(__builtin_ctz) || defined(__GNUC__) return __builtin_ctz(Val); #elif defined(_MSC_VER) @@ -179,19 +208,9 @@ template [[nodiscard]] int countr_zero(T Val) { #endif } - // Fall back to the bisection method. - unsigned ZeroBits = 0; - T Shift = std::numeric_limits::digits >> 1; - T Mask = std::numeric_limits::max() >> Shift; - while (Shift) { - if ((Val & Mask) == 0) { - Val >>= Shift; - ZeroBits |= Shift; - } - Shift >>= 1; - Mask >>= Shift; - } - return ZeroBits; + // Fallback to popcount. "(Val & -Val) - 1" is a bitmask with all bits below + // the least significant 1 set. + return llvm::popcount(static_cast>((Val & -Val) - 1)); } /// Count number of 0's from the most significant bit to the least @@ -300,35 +319,6 @@ template [[nodiscard]] T bit_ceil(T Value) { return T(1) << llvm::bit_width(Value - 1u); } -/// Count the number of set bits in a value. -/// Ex. popcount(0xF000F000) = 8 -/// Returns 0 if the word is zero. -template >> -[[nodiscard]] inline int popcount(T Value) noexcept { - if constexpr (sizeof(T) <= 4) { -#if defined(__GNUC__) - return (int)__builtin_popcount(Value); -#else - uint32_t v = Value; - v = v - ((v >> 1) & 0x55555555); - v = (v & 0x33333333) + ((v >> 2) & 0x33333333); - return int(((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24); -#endif - } else if constexpr (sizeof(T) <= 8) { -#if defined(__GNUC__) - return (int)__builtin_popcountll(Value); -#else - uint64_t v = Value; - v = v - ((v >> 1) & 0x5555555555555555ULL); - v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); - v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - return int((uint64_t)(v * 0x0101010101010101ULL) >> 56); -#endif - } else { - static_assert(sizeof(T) == 0, "T must be 8 bytes or less"); - } -} - // Forward-declare rotr so that rotl can use it. 
template >> [[nodiscard]] constexpr T rotr(T V, int R); diff --git a/llvm/include/llvm/ADT/identity.h b/llvm/include/llvm/ADT/identity.h index 7309032362077..88d033fc01141 100644 --- a/llvm/include/llvm/ADT/identity.h +++ b/llvm/include/llvm/ADT/identity.h @@ -15,7 +15,6 @@ #ifndef LLVM_ADT_IDENTITY_H #define LLVM_ADT_IDENTITY_H - namespace llvm { // Similar to `std::identity` from C++20. @@ -23,14 +22,10 @@ template struct identity { using is_transparent = void; using argument_type = Ty; - Ty &operator()(Ty &self) const { - return self; - } - const Ty &operator()(const Ty &self) const { - return self; - } + Ty &operator()(Ty &self) const { return self; } + const Ty &operator()(const Ty &self) const { return self; } }; -} // end namespace llvm +} // namespace llvm #endif // LLVM_ADT_IDENTITY_H diff --git a/llvm/include/llvm/ADT/ilist_node.h b/llvm/include/llvm/ADT/ilist_node.h index 67384546a9275..8d78d5dbbda44 100644 --- a/llvm/include/llvm/ADT/ilist_node.h +++ b/llvm/include/llvm/ADT/ilist_node.h @@ -52,14 +52,10 @@ template class ilist_sentinel; // Selector for which iterator type to pick given the iterator-bits node option. template -class ilist_select_iterator_type { -public: - using type = ilist_iterator; -}; -template -class ilist_select_iterator_type { -public: - using type = ilist_iterator_w_bits; +struct ilist_select_iterator_type { + using type = std::conditional_t, + ilist_iterator>; }; /// Implementation for an ilist node. 
diff --git a/llvm/include/llvm/ADT/ilist_node_options.h b/llvm/include/llvm/ADT/ilist_node_options.h index d26e79b925ad1..143195aa9c647 100644 --- a/llvm/include/llvm/ADT/ilist_node_options.h +++ b/llvm/include/llvm/ADT/ilist_node_options.h @@ -82,7 +82,7 @@ template struct extract_sentinel_tracking; template struct extract_sentinel_tracking< ilist_sentinel_tracking, Options...> - : std::integral_constant, is_explicit {}; + : std::bool_constant, is_explicit {}; template struct extract_sentinel_tracking : extract_sentinel_tracking {}; @@ -119,7 +119,7 @@ template struct is_valid_option> : std::true_type {}; template struct extract_iterator_bits; template struct extract_iterator_bits, Options...> - : std::integral_constant {}; + : std::bool_constant {}; template struct extract_iterator_bits : extract_iterator_bits {}; @@ -149,8 +149,8 @@ template struct check_options; template <> struct check_options<> : std::true_type {}; template struct check_options - : std::integral_constant::value && - check_options::value> {}; + : std::bool_constant::value && + check_options::value> {}; /// Traits for options for \a ilist_node. /// diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index b7d1251aeb723..1681079054b8b 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -585,6 +585,7 @@ class AAResults { LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB, AAQueryInfo &AAQI, const Instruction *CtxI = nullptr); + LLVM_ABI AliasResult aliasErrno(const MemoryLocation &Loc, const Module *M); LLVM_ABI ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI, @@ -744,6 +745,11 @@ class LLVM_ABI AAResults::Concept { const MemoryLocation &LocB, AAQueryInfo &AAQI, const Instruction *CtxI) = 0; + /// Returns an AliasResult indicating whether a specific memory location + /// aliases errno. 
+ virtual AliasResult aliasErrno(const MemoryLocation &Loc, + const Module *M) = 0; + /// @} //===--------------------------------------------------------------------===// /// \name Simple mod/ref information @@ -805,6 +811,10 @@ template class AAResults::Model final : public Concept { return Result.alias(LocA, LocB, AAQI, CtxI); } + AliasResult aliasErrno(const MemoryLocation &Loc, const Module *M) override { + return Result.aliasErrno(Loc, M); + } + ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI, bool IgnoreLocals) override { return Result.getModRefInfoMask(Loc, AAQI, IgnoreLocals); @@ -860,6 +870,10 @@ class AAResultBase { return AliasResult::MayAlias; } + AliasResult aliasErrno(const MemoryLocation &Loc, const Module *M) { + return AliasResult::MayAlias; + } + ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI, bool IgnoreLocals) { return ModRefInfo::ModRef; diff --git a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h index 6f37a086e323b..31875e59207b3 100644 --- a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h @@ -73,6 +73,8 @@ class BasicAAResult : public AAResultBase { const MemoryLocation &LocB, AAQueryInfo &AAQI, const Instruction *CtxI); + LLVM_ABI AliasResult aliasErrno(const MemoryLocation &Loc, const Module *M); + LLVM_ABI ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc, AAQueryInfo &AAQI); diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h index f80744e70f7ad..a7a6a2753709c 100644 --- a/llvm/include/llvm/Analysis/LoopInfo.h +++ b/llvm/include/llvm/Analysis/LoopInfo.h @@ -59,12 +59,11 @@ class LLVM_ABI Loop : public LoopBase { }; /// Return true if the specified value is loop invariant. 
- bool isLoopInvariant(const Value *V, bool HasCoroSuspendInst = false) const; + bool isLoopInvariant(const Value *V) const; /// Return true if all the operands of the specified instruction are loop /// invariant. - bool hasLoopInvariantOperands(const Instruction *I, - bool HasCoroSuspendInst = false) const; + bool hasLoopInvariantOperands(const Instruction *I) const; /// If the given value is an instruction inside of the loop and it can be /// hoisted, do so to make it trivially loop-invariant. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index a5e98bb7bc137..41ff54f0781a2 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitmaskEnum.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/IR/FMF.h" #include "llvm/IR/InstrTypes.h" @@ -796,10 +797,13 @@ class TargetTransformInfo { LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const; + /// Which addressing mode Loop Strength Reduction will try to generate. enum AddressingModeKind { - AMK_PreIndexed, - AMK_PostIndexed, - AMK_None + AMK_None = 0x0, ///< Don't prefer any addressing mode + AMK_PreIndexed = 0x1, ///< Prefer pre-indexed addressing mode + AMK_PostIndexed = 0x2, ///< Prefer post-indexed addressing mode + AMK_All = 0x3, ///< Consider all addressing modes + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/AMK_All) }; /// Return the preferred addressing mode LSR should make efforts to generate. @@ -1324,7 +1328,7 @@ class TargetTransformInfo { /// \return The cost of a partial reduction, which is a reduction from a /// vector to another vector with fewer elements of larger size. 
They are - /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// represented by the llvm.vector.partial.reduce.add intrinsic, which /// takes an accumulator of type \p AccumType and a second vector operand to /// be accumulated, whose element count is specified by \p VF. The type of /// reduction is specified by \p Opcode. The second operand passed to the @@ -1847,6 +1851,10 @@ class TargetTransformInfo { /// otherwise scalar epilogue loop. LLVM_ABI bool preferEpilogueVectorization() const; + /// \returns True if the loop vectorizer should discard any VFs where the + /// maximum register pressure exceeds getNumberOfRegisters. + LLVM_ABI bool shouldConsiderVectorizationRegPressure() const; + /// \returns True if the target wants to expand the given reduction intrinsic /// into a shuffle sequence. LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index b58386b94bba4..566e1cf51631a 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1105,6 +1105,8 @@ class TargetTransformInfoImplBase { virtual bool preferEpilogueVectorization() const { return true; } + virtual bool shouldConsiderVectorizationRegPressure() const { return false; } + virtual bool shouldExpandReduction(const IntrinsicInst *II) const { return true; } diff --git a/llvm/include/llvm/Analysis/TypeBasedAliasAnalysis.h b/llvm/include/llvm/Analysis/TypeBasedAliasAnalysis.h index 77edbe8527aae..38f9fc718824f 100644 --- a/llvm/include/llvm/Analysis/TypeBasedAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/TypeBasedAliasAnalysis.h @@ -50,6 +50,7 @@ class TypeBasedAAResult : public AAResultBase { LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB, AAQueryInfo &AAQI, const Instruction *CtxI); + LLVM_ABI AliasResult aliasErrno(const MemoryLocation 
&Loc, const Module *M); LLVM_ABI ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI, bool IgnoreLocals); diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index facd137e9d9dd..c04380667a640 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -228,6 +228,16 @@ enum class SamplerFilter : uint32_t { #include "DXContainerConstants.def" }; +#define FILTER(Val, Enum) \ + case Val: \ + return true; +inline bool isValidSamplerFilter(uint32_t V) { + switch (V) { +#include "DXContainerConstants.def" + } + return false; +} + LLVM_ABI ArrayRef> getSamplerFilters(); #define TEXTURE_ADDRESS_MODE(Val, Enum) Enum = Val, @@ -237,6 +247,16 @@ enum class TextureAddressMode : uint32_t { LLVM_ABI ArrayRef> getTextureAddressModes(); +#define TEXTURE_ADDRESS_MODE(Val, Enum) \ + case Val: \ + return true; +inline bool isValidAddress(uint32_t V) { + switch (V) { +#include "DXContainerConstants.def" + } + return false; +} + #define COMPARISON_FUNC(Val, Enum) Enum = Val, enum class ComparisonFunc : uint32_t { #include "DXContainerConstants.def" @@ -244,11 +264,31 @@ enum class ComparisonFunc : uint32_t { LLVM_ABI ArrayRef> getComparisonFuncs(); +#define COMPARISON_FUNC(Val, Enum) \ + case Val: \ + return true; +inline bool isValidComparisonFunc(uint32_t V) { + switch (V) { +#include "DXContainerConstants.def" + } + return false; +} + #define STATIC_BORDER_COLOR(Val, Enum) Enum = Val, enum class StaticBorderColor : uint32_t { #include "DXContainerConstants.def" }; +#define STATIC_BORDER_COLOR(Val, Enum) \ + case Val: \ + return true; +inline bool isValidBorderColor(uint32_t V) { + switch (V) { +#include "DXContainerConstants.def" + } + return false; +} + LLVM_ABI ArrayRef> getStaticBorderColors(); LLVM_ABI PartType parsePartType(StringRef S); diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.def b/llvm/include/llvm/BinaryFormat/Dwarf.def index 
b561125fe37a4..2c9a3c0f6fb04 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.def +++ b/llvm/include/llvm/BinaryFormat/Dwarf.def @@ -25,7 +25,8 @@ defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT || \ defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX || \ defined HANDLE_DW_END || defined HANDLE_DW_SECT || \ - defined HANDLE_DW_APPLE_ENUM_KIND) + defined HANDLE_DW_APPLE_ENUM_KIND || \ + ( defined HANDLE_DW_ASPACE && defined HANDLE_DW_ASPACE_PRED) ) #error "Missing macro definition of HANDLE_DW*" #endif @@ -151,6 +152,14 @@ #define HANDLE_DW_APPLE_ENUM_KIND(ID, NAME) #endif +#ifndef HANDLE_DW_ASPACE +#define HANDLE_DW_ASPACE(ID, NAME) +#endif + +#ifndef HANDLE_DW_ASPACE_PRED +#define HANDLE_DW_ASPACE_PRED(ID, NAME, PRED) +#endif + HANDLE_DW_TAG(0x0000, null, 2, DWARF, DW_KIND_NONE) HANDLE_DW_TAG(0x0001, array_type, 2, DWARF, DW_KIND_TYPE) HANDLE_DW_TAG(0x0002, class_type, 2, DWARF, DW_KIND_TYPE) @@ -628,6 +637,21 @@ HANDLE_DW_AT(0x3e0d, LLVM_coro_suspend_idx, 0, LLVM) // The DWARF v6 working draft defines DW_AT_alloc_type; use this LLVM-private ID // until that is released as an official standard. HANDLE_DW_AT(0x3e0e, LLVM_alloc_type, 0, LLVM) +// Heterogeneous Debugging Extension defined at +// https://llvm.org/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.html. 
+HANDLE_DW_AT(0x3e0f, LLVM_memory_space, 0, LLVM) +HANDLE_DW_AT(0x3e10, LLVM_address_space, 0, LLVM) +HANDLE_DW_AT(0x3e11, LLVM_lanes, 0, LLVM) +HANDLE_DW_AT(0x3e12, LLVM_lane_pc, 0, LLVM) +HANDLE_DW_AT(0x3e13, LLVM_vector_size, 0, LLVM) + +// https://llvm.org/docs/AMDGPUUsage.html#address-space-identifier +HANDLE_DW_ASPACE(0x0, none) +HANDLE_DW_ASPACE_PRED(AMDGPU::DWARFAS::GENERIC, AMDGPU_generic, SELECT_AMDGPU) +HANDLE_DW_ASPACE_PRED(AMDGPU::DWARFAS::REGION, AMDGPU_region, SELECT_AMDGPU) +HANDLE_DW_ASPACE_PRED(AMDGPU::DWARFAS::LOCAL, AMDGPU_local, SELECT_AMDGPU) +HANDLE_DW_ASPACE_PRED(AMDGPU::DWARFAS::PRIVATE_LANE, AMDGPU_private_lane, SELECT_AMDGPU) +HANDLE_DW_ASPACE_PRED(AMDGPU::DWARFAS::PRIVATE_WAVE, AMDGPU_private_wave, SELECT_AMDGPU) // Apple extensions. @@ -916,6 +940,19 @@ HANDLE_DW_OP(0xe9, LLVM_user, -1, -1, 0, LLVM) // location stack or any of its values. It is defined as a placeholder for // testing purposes. HANDLE_DW_OP_LLVM_USEROP(0x0001, nop) +// Heterogeneous Debugging Extension defined at +// https://llvm.org/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.html. +HANDLE_DW_OP_LLVM_USEROP(0x0002, form_aspace_address) +HANDLE_DW_OP_LLVM_USEROP(0x0003, push_lane) +HANDLE_DW_OP_LLVM_USEROP(0x0004, offset) +HANDLE_DW_OP_LLVM_USEROP(0x0005, offset_uconst) +HANDLE_DW_OP_LLVM_USEROP(0x0006, bit_offset) +HANDLE_DW_OP_LLVM_USEROP(0x0007, call_frame_entry_reg) +HANDLE_DW_OP_LLVM_USEROP(0x0008, undefined) +HANDLE_DW_OP_LLVM_USEROP(0x0009, aspace_bregx) +HANDLE_DW_OP_LLVM_USEROP(0x000a, piece_end) +HANDLE_DW_OP_LLVM_USEROP(0x000b, extend) +HANDLE_DW_OP_LLVM_USEROP(0x000c, select_bit_piece) // DWARF languages. 
HANDLE_DW_LANG(0x0001, C89, 0, 2, DWARF) @@ -1385,3 +1422,5 @@ HANDLE_DW_SECT(8, RNGLISTS) #undef HANDLE_DW_END #undef HANDLE_DW_SECT #undef HANDLE_DW_APPLE_ENUM_KIND +#undef HANDLE_DW_ASPACE +#undef HANDLE_DW_ASPACE_PRED diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h index 231b7ac17d75f..2c5012510a5c3 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.h +++ b/llvm/include/llvm/BinaryFormat/Dwarf.h @@ -19,6 +19,7 @@ #ifndef LLVM_BINARYFORMAT_DWARF_H #define LLVM_BINARYFORMAT_DWARF_H +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" @@ -757,6 +758,12 @@ enum CallingConvention { DW_CC_hi_user = 0xff }; +enum AddressSpace { +#define HANDLE_DW_ASPACE(ID, NAME) DW_ASPACE_LLVM_##NAME = ID, +#define HANDLE_DW_ASPACE_PRED(ID, NAME, PRED) DW_ASPACE_LLVM_##NAME = ID, +#include "llvm/BinaryFormat/Dwarf.def" +}; + enum InlineAttribute { // Inline codes DW_INL_not_inlined = 0x00, @@ -1011,6 +1018,7 @@ LLVM_ABI StringRef IndexString(unsigned Idx); LLVM_ABI StringRef FormatString(DwarfFormat Format); LLVM_ABI StringRef FormatString(bool IsDWARF64); LLVM_ABI StringRef RLEString(unsigned RLE); +LLVM_ABI StringRef AddressSpaceString(unsigned AS, const llvm::Triple &TT); /// @} /// \defgroup DwarfConstantsParsing Dwarf constants parsing functions diff --git a/llvm/include/llvm/BinaryFormat/SFrame.h b/llvm/include/llvm/BinaryFormat/SFrame.h index 095db18b9c254..7b58043c60363 100644 --- a/llvm/include/llvm/BinaryFormat/SFrame.h +++ b/llvm/include/llvm/BinaryFormat/SFrame.h @@ -117,6 +117,7 @@ template struct FDEInfo { Info = ((PAuthKey & 1) << 5) | ((static_cast(FDE) & 1) << 4) | (static_cast(FRE) & 0xf); } + uint8_t getFuncInfo() const { return Info; } }; template struct FuncDescEntry { @@ -155,6 +156,7 @@ template struct FREInfo { Info = ((RA & 1) << 7) | ((static_cast(Sz) & 3) << 5) | ((N & 0xf) << 1) | (static_cast(Reg) & 1); } + 
uint8_t getFREInfo() const { return Info; } }; template struct FrameRowEntry { diff --git a/llvm/include/llvm/CAS/MappedFileRegionArena.h b/llvm/include/llvm/CAS/MappedFileRegionArena.h new file mode 100644 index 0000000000000..ff51f0eb59929 --- /dev/null +++ b/llvm/include/llvm/CAS/MappedFileRegionArena.h @@ -0,0 +1,130 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file declares interface for MappedFileRegionArena, a bump pointer +/// allocator, backed by a memory-mapped file. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_MAPPEDFILEREGIONARENA_H +#define LLVM_CAS_MAPPEDFILEREGIONARENA_H + +#include "llvm/Support/Alignment.h" +#include "llvm/Support/FileSystem.h" +#include + +namespace llvm::cas { + +/// Allocator for an owned mapped file region that supports thread-safe and +/// process-safe bump pointer allocation. +/// +/// This allocator is designed to create a sparse file when supported by the +/// filesystem's \c ftruncate so that it can be used with a large maximum size. +/// It will also attempt to shrink the underlying file down to its current +/// allocation size when the last concurrent mapping is closed. +/// +/// Process-safe. Uses file locks when resizing the file during initialization +/// and destruction. +/// +/// Thread-safe. Requires OS support thread-safe file lock. +/// +/// Provides 8-byte alignment for all allocations. +class MappedFileRegionArena { +public: + using RegionT = sys::fs::mapped_file_region; + + /// Header for MappedFileRegionArena. 
It can be configured to be located + /// at any location within the file and the allocation will be appended after + /// the header. + struct Header { + // BumpPtr for new allocation. + std::atomic BumpPtr; + // Allocated size on disk. + std::atomic AllocatedSize; + // Capacity of the file. + std::atomic Capacity; + // Offset from the beginning of the file to this header (for verification). + std::atomic HeaderOffset; + }; + + /// Create a \c MappedFileRegionArena. + /// + /// \param Path the path to open the mapped region. + /// \param Capacity the maximum size for the mapped file region. + /// \param HeaderOffset the offset at which to store the header. This is so + /// that information can be stored before the header, like a file magic. + /// \param NewFileConstructor is for constructing new files. It has exclusive + /// access to the file. Must call \c initializeBumpPtr. + static Expected + create(const Twine &Path, uint64_t Capacity, uint64_t HeaderOffset, + function_ref NewFileConstructor); + + /// Minimum alignment for allocations, currently hardcoded to 8B. + static constexpr Align getAlign() { + // Trick Align into giving us '8' as a constexpr. + struct alignas(8) T {}; + static_assert(alignof(T) == 8, "Tautology failed?"); + return Align::Of(); + } + + /// Allocate at least \p AllocSize. Rounds up to \a getAlign(). + Expected allocate(uint64_t AllocSize) { + auto Offset = allocateOffset(AllocSize); + if (LLVM_UNLIKELY(!Offset)) + return Offset.takeError(); + return data() + *Offset; + } + /// Allocate, returning the offset from \a data() instead of a pointer. 
+ Expected allocateOffset(uint64_t AllocSize); + + char *data() const { return Region.data(); } + uint64_t size() const { return H->BumpPtr; } + uint64_t capacity() const { return Region.size(); } + + RegionT &getRegion() { return Region; } + + ~MappedFileRegionArena() { destroyImpl(); } + + MappedFileRegionArena() = default; + MappedFileRegionArena(MappedFileRegionArena &&RHS) { moveImpl(RHS); } + MappedFileRegionArena &operator=(MappedFileRegionArena &&RHS) { + destroyImpl(); + moveImpl(RHS); + return *this; + } + + MappedFileRegionArena(const MappedFileRegionArena &) = delete; + MappedFileRegionArena &operator=(const MappedFileRegionArena &) = delete; + +private: + // initialize header from offset. + void initializeHeader(uint64_t HeaderOffset); + + void destroyImpl(); + void moveImpl(MappedFileRegionArena &RHS) { + std::swap(Region, RHS.Region); + std::swap(H, RHS.H); + std::swap(Path, RHS.Path); + std::swap(FD, RHS.FD); + std::swap(SharedLockFD, RHS.SharedLockFD); + } + +private: + RegionT Region; + Header *H = nullptr; + std::string Path; + // File descriptor for the main storage file. + std::optional FD; + // File descriptor for the file used as reader/writer lock. 
+ std::optional SharedLockFD; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_MAPPEDFILEREGIONARENA_H diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h index 2849497f9a43e..fee4bb116bb87 100644 --- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -144,6 +144,8 @@ class DebugHandlerBase : public AsmPrinterHandler { static bool isUnsignedDIType(const DIType *Ty); const InstructionOrdering &getInstOrdering() const { return InstOrdering; } + + const LexicalScopes &getLexicalScopes() const { return LScopes; } }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h index 490d1a34cc846..2db66ba9584a3 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h @@ -37,8 +37,6 @@ class LLVM_ABI GISelValueTracking : public GISelChangeObserver { const TargetLowering &TL; const DataLayout &DL; unsigned MaxDepth; - /// Cache maintained during a computeKnownBits request. 
- SmallDenseMap ComputeKnownBitsCache; void computeKnownBitsMin(Register Src0, Register Src1, KnownBits &Known, const APInt &DemandedElts, unsigned Depth = 0); @@ -60,15 +58,14 @@ class LLVM_ABI GISelValueTracking : public GISelChangeObserver { public: GISelValueTracking(MachineFunction &MF, unsigned MaxDepth = 6); - virtual ~GISelValueTracking() = default; + ~GISelValueTracking() = default; const MachineFunction &getMachineFunction() const { return MF; } const DataLayout &getDataLayout() const { return DL; } - virtual void computeKnownBitsImpl(Register R, KnownBits &Known, - const APInt &DemandedElts, - unsigned Depth = 0); + void computeKnownBitsImpl(Register R, KnownBits &Known, + const APInt &DemandedElts, unsigned Depth = 0); unsigned computeNumSignBits(Register R, const APInt &DemandedElts, unsigned Depth = 0); diff --git a/llvm/include/llvm/CodeGen/LexicalScopes.h b/llvm/include/llvm/CodeGen/LexicalScopes.h index 777a0035a2c59..993df54c05ad5 100644 --- a/llvm/include/llvm/CodeGen/LexicalScopes.h +++ b/llvm/include/llvm/CodeGen/LexicalScopes.h @@ -34,13 +34,12 @@ class MachineInstr; class MDNode; //===----------------------------------------------------------------------===// -/// InsnRange - This is used to track range of instructions with identical -/// lexical scope. +/// This is used to track range of instructions with identical lexical scope. /// using InsnRange = std::pair; //===----------------------------------------------------------------------===// -/// LexicalScope - This class is used to track scope information. +/// This class is used to track scope information. /// class LexicalScope { public: @@ -66,10 +65,10 @@ class LexicalScope { SmallVectorImpl &getChildren() { return Children; } SmallVectorImpl &getRanges() { return Ranges; } - /// addChild - Add a child scope. + /// Add a child scope. void addChild(LexicalScope *S) { Children.push_back(S); } - /// openInsnRange - This scope covers instruction range starting from MI. 
+ /// This scope covers instruction range starting from MI. void openInsnRange(const MachineInstr *MI) { if (!FirstInsn) FirstInsn = MI; @@ -78,8 +77,7 @@ class LexicalScope { Parent->openInsnRange(MI); } - /// extendInsnRange - Extend the current instruction range covered by - /// this scope. + /// Extend the current instruction range covered by this scope. void extendInsnRange(const MachineInstr *MI) { assert(FirstInsn && "MI Range is not open!"); LastInsn = MI; @@ -87,9 +85,9 @@ class LexicalScope { Parent->extendInsnRange(MI); } - /// closeInsnRange - Create a range based on FirstInsn and LastInsn collected - /// until now. This is used when a new scope is encountered while walking - /// machine instructions. + /// Create a range based on FirstInsn and LastInsn collected until now. + /// This is used when a new scope is encountered while walking machine + /// instructions. void closeInsnRange(LexicalScope *NewScope = nullptr) { assert(LastInsn && "Last insn missing!"); Ranges.push_back(InsnRange(FirstInsn, LastInsn)); @@ -101,7 +99,7 @@ class LexicalScope { Parent->closeInsnRange(NewScope); } - /// dominates - Return true if current scope dominates given lexical scope. + /// Return true if current scope dominates given lexical scope. bool dominates(const LexicalScope *S) const { if (S == this) return true; @@ -116,7 +114,7 @@ class LexicalScope { unsigned getDFSIn() const { return DFSIn; } void setDFSIn(unsigned I) { DFSIn = I; } - /// dump - print lexical scope. + /// Print lexical scope. LLVM_ABI void dump(unsigned Indent = 0) const; private: @@ -136,31 +134,36 @@ class LexicalScope { }; //===----------------------------------------------------------------------===// -/// LexicalScopes - This class provides interface to collect and use lexical -/// scoping information from machine instruction. +/// This class provides interface to collect and use lexical scoping information +/// from machine instruction. 
/// class LexicalScopes { public: LexicalScopes() = default; - /// initialize - Scan machine function and constuct lexical scope nest, resets + /// Scan module to build subprogram-to-function map. + LLVM_ABI void initialize(const Module &); + + /// Scan machine function and construct lexical scope nest, resets /// the instance if necessary. - LLVM_ABI void initialize(const MachineFunction &); + LLVM_ABI void scanFunction(const MachineFunction &); + + /// Reset the instance so that it's prepared for another module. + LLVM_ABI void resetModule(); - /// releaseMemory - release memory. - LLVM_ABI void reset(); + /// Reset the instance so that it's prepared for another function. + LLVM_ABI void resetFunction(); - /// empty - Return true if there is any lexical scope information available. + /// Return true if there is any lexical scope information available. bool empty() { return CurrentFnLexicalScope == nullptr; } - /// getCurrentFunctionScope - Return lexical scope for the current function. + /// Return lexical scope for the current function. LexicalScope *getCurrentFunctionScope() const { return CurrentFnLexicalScope; } - /// getMachineBasicBlocks - Populate given set using machine basic blocks - /// which have machine instructions that belong to lexical scope identified by - /// DebugLoc. + /// Populate given set using machine basic blocks which have machine + /// instructions that belong to lexical scope identified by DebugLoc. LLVM_ABI void getMachineBasicBlocks(const DILocation *DL, SmallPtrSetImpl &MBBs); @@ -169,39 +172,44 @@ class LexicalScopes { /// instruction's lexical scope in a given machine basic block. LLVM_ABI bool dominates(const DILocation *DL, MachineBasicBlock *MBB); - /// findLexicalScope - Find lexical scope, either regular or inlined, for the - /// given DebugLoc. Return NULL if not found. + /// Find lexical scope, either regular or inlined, for the given DebugLoc. + /// Return NULL if not found.
LLVM_ABI LexicalScope *findLexicalScope(const DILocation *DL); - /// getAbstractScopesList - Return a reference to list of abstract scopes. + /// Return a reference to list of abstract scopes. ArrayRef getAbstractScopesList() const { return AbstractScopesList; } - /// findAbstractScope - Find an abstract scope or return null. + /// Find an abstract scope or return null. LexicalScope *findAbstractScope(const DILocalScope *N) { auto I = AbstractScopeMap.find(N); return I != AbstractScopeMap.end() ? &I->second : nullptr; } - /// findInlinedScope - Find an inlined scope for the given scope/inlined-at. + /// Find an inlined scope for the given scope/inlined-at. LexicalScope *findInlinedScope(const DILocalScope *N, const DILocation *IA) { auto I = InlinedLexicalScopeMap.find(std::make_pair(N, IA)); return I != InlinedLexicalScopeMap.end() ? &I->second : nullptr; } - /// findLexicalScope - Find regular lexical scope or return null. + /// Find regular lexical scope or return null. LexicalScope *findLexicalScope(const DILocalScope *N) { auto I = LexicalScopeMap.find(N); return I != LexicalScopeMap.end() ? &I->second : nullptr; } - /// getOrCreateAbstractScope - Find or create an abstract lexical scope. + /// Find or create an abstract lexical scope. LLVM_ABI LexicalScope *getOrCreateAbstractScope(const DILocalScope *Scope); + /// Get function to which the given subprogram is attached, if exists. + const Function *getFunction(const DISubprogram *SP) const { + return FunctionMap.lookup(SP); + } + private: - /// getOrCreateLexicalScope - Find lexical scope for the given Scope/IA. If - /// not available then create new lexical scope. + /// Find lexical scope for the given Scope/IA. If not available + /// then create new lexical scope. LLVM_ABI LexicalScope * getOrCreateLexicalScope(const DILocalScope *Scope, const DILocation *IA = nullptr); @@ -210,14 +218,14 @@ class LexicalScopes { : nullptr; } - /// getOrCreateRegularScope - Find or create a regular lexical scope. 
+ /// Find or create a regular lexical scope. LexicalScope *getOrCreateRegularScope(const DILocalScope *Scope); - /// getOrCreateInlinedScope - Find or create an inlined lexical scope. + /// Find or create an inlined lexical scope. LexicalScope *getOrCreateInlinedScope(const DILocalScope *Scope, const DILocation *InlinedAt); - /// extractLexicalScopes - Extract instruction ranges for each lexical scopes + /// Extract instruction ranges for each lexical scope /// for the given machine function. void extractLexicalScopes(SmallVectorImpl &MIRanges, DenseMap &M); @@ -228,27 +236,27 @@ class LexicalScopes { const MachineFunction *MF = nullptr; - /// LexicalScopeMap - Tracks the scopes in the current function. + /// Mapping between DISubprograms and IR functions. + DenseMap FunctionMap; + + /// Tracks the scopes in the current function. // Use an unordered_map to ensure value pointer validity over insertion. std::unordered_map LexicalScopeMap; - /// InlinedLexicalScopeMap - Tracks inlined function scopes in current - /// function. + /// Tracks inlined function scopes in current function. std::unordered_map, LexicalScope, pair_hash> InlinedLexicalScopeMap; - /// AbstractScopeMap - These scopes are not included LexicalScopeMap. + /// These scopes are not included in LexicalScopeMap. // Use an unordered_map to ensure value pointer validity over insertion. std::unordered_map AbstractScopeMap; - /// AbstractScopesList - Tracks abstract scopes constructed while processing - /// a function. + /// Tracks abstract scopes constructed while processing a function. SmallVector AbstractScopesList; - /// CurrentFnLexicalScope - Top level scope for the current function. - /// + /// Top level scope for the current function. LexicalScope *CurrentFnLexicalScope = nullptr; /// Map a location to the set of basic blocks it dominates.
This is a cache diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 087affcfd55ce..6a624a7052cdd 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -135,10 +135,9 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// Given a machine instruction descriptor, returns the register /// class constraint for OpNum, or NULL. - virtual - const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI, - const MachineFunction &MF) const; + virtual const TargetRegisterClass * + getRegClass(const MCInstrDesc &MCID, unsigned OpNum, + const TargetRegisterInfo *TRI) const; /// Returns true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 2ba8b29e775e0..46be271320fdd 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -480,7 +480,7 @@ class LLVM_ABI TargetLoweringBase { return true; } - /// Return true if the @llvm.experimental.vector.partial.reduce.* intrinsic + /// Return true if the @llvm.vector.partial.reduce.* intrinsic /// should be expanded using generic code in SelectionDAGBuilder. 
virtual bool shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const { diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 73ccc8ed5b11d..bf133f0332bcb 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -678,6 +678,20 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; + /// Find a common register class that can accommodate both the source and + /// destination operands of a copy-like instruction: + /// + /// DefRC:DefSubReg = COPY SrcRC:SrcSubReg + /// + /// This is a generalized form of getMatchingSuperRegClass, + /// getCommonSuperRegClass, and getCommonSubClass which handles 0, 1, or 2 + /// subregister indexes. Those utilities should be preferred if the number of + /// non-0 subregister indexes is known. + const TargetRegisterClass * + findCommonRegClass(const TargetRegisterClass *DefRC, unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const; + // For a copy-like instruction that defines a register of class DefRC with // subreg index DefSubReg, reading from another source with class SrcRC and // subregister SrcSubReg return true if this is a preferable copy @@ -685,7 +699,10 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { virtual bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const; + unsigned SrcSubReg) const { + // If this source does not incur a cross register bank copy, use it. + return findCommonRegClass(DefRC, DefSubReg, SrcRC, SrcSubReg) != nullptr; + } /// Returns the largest legal sub-class of RC that /// supports the sub-register index Idx.
@@ -883,7 +900,7 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { /// If a target supports multiple different pointer register classes, /// kind specifies which one is indicated. virtual const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const { + getPointerRegClass(unsigned Kind = 0) const { llvm_unreachable("Target didn't implement getPointerRegClass!"); } diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 39136bc45c292..6488d6c01b5c6 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -146,4 +146,7 @@ coverage bugs, and to 0 otherwise. */ #cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN +/* Define to 1 to enable LLVM OnDisk Content Addressable Storage */ +#cmakedefine01 LLVM_ENABLE_ONDISK_CAS + #endif diff --git a/llvm/include/llvm/Frontend/HLSL/RootSignatureValidations.h b/llvm/include/llvm/Frontend/HLSL/RootSignatureValidations.h index 24e851933949f..ea96094b18300 100644 --- a/llvm/include/llvm/Frontend/HLSL/RootSignatureValidations.h +++ b/llvm/include/llvm/Frontend/HLSL/RootSignatureValidations.h @@ -34,12 +34,8 @@ LLVM_ABI bool verifyDescriptorRangeFlag(uint32_t Version, dxil::ResourceClass Type, dxbc::DescriptorRangeFlags FlagsVal); LLVM_ABI bool verifyNumDescriptors(uint32_t NumDescriptors); -LLVM_ABI bool verifySamplerFilter(uint32_t Value); -LLVM_ABI bool verifyAddress(uint32_t Address); LLVM_ABI bool verifyMipLODBias(float MipLODBias); LLVM_ABI bool verifyMaxAnisotropy(uint32_t MaxAnisotropy); -LLVM_ABI bool verifyComparisonFunc(uint32_t ComparisonFunc); -LLVM_ABI bool verifyBorderColor(uint32_t BorderColor); LLVM_ABI bool verifyLOD(float LOD); LLVM_ABI bool verifyBoundOffset(uint32_t Offset); diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index 56905854f9baa..1ed23eed1571d 100644 --- 
a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -1046,6 +1046,12 @@ struct ReleaseT { using EmptyTrait = std::true_type; }; +// [6.0:440-441] `replayable` clause +template // +struct ReplayableT { + using IncompleteTrait = std::true_type; +}; + // V5.2: [8.2.1] `requirement` clauses template // struct ReverseOffloadT { @@ -1153,6 +1159,12 @@ struct ToT { std::tuple t; }; +// [6.0:440-441] `transparent` clause +template // +struct TransparentT { + using IncompleteTrait = std::true_type; +}; + // V5.2: [8.2.1] `requirement` clauses template // struct UnifiedAddressT { @@ -1279,7 +1291,8 @@ using EmptyClausesT = std::variant< template using IncompleteClausesT = std::variant, AppendArgsT, MatchT, - OtherwiseT, WhenT>; + OtherwiseT, ReplayableT, + TransparentT, WhenT>; template using TupleClausesT = diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index ce136197dd0d7..6a41c24e78149 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -456,6 +456,10 @@ def OMPC_Relaxed : Clause<[Spelling<"relaxed">]> { def OMPC_Release : Clause<[Spelling<"release">]> { let clangClass = "OMPReleaseClause"; } +def OMPC_Replayable : Clause<[Spelling<"replayable">]> { + let flangClass = "OmpReplayableClause"; + let isValueOptional = true; +} def OMPC_ReverseOffload : Clause<[Spelling<"reverse_offload">]> { let clangClass = "OMPReverseOffloadClause"; } @@ -523,6 +527,10 @@ def OMPC_To : Clause<[Spelling<"to">]> { let clangClass = "OMPToClause"; let flangClass = "OmpToClause"; } +def OMPC_Transparent : Clause<[Spelling<"transparent">]> { + let flangClass = "OmpTransparentClause"; + let isValueOptional = true; +} def OMPC_UnifiedAddress : Clause<[Spelling<"unified_address">]> { let clangClass = "OMPUnifiedAddressClause"; } @@ -1128,6 +1136,7 @@ def OMP_Target : Directive<[Spelling<"target">]> { VersionedClause, VersionedClause, VersionedClause, + 
VersionedClause, VersionedClause, ]; let association = AS_Block; @@ -1139,6 +1148,7 @@ def OMP_TargetData : Directive<[Spelling<"target data", 1, 52>, VersionedClause, VersionedClause, VersionedClause, + VersionedClause, ]; let requiredClauses = [ VersionedClause, @@ -1157,6 +1167,7 @@ def OMP_TargetEnterData : Directive<[Spelling<"target enter data", 1, 52>, VersionedClause, VersionedClause, VersionedClause, + VersionedClause, ]; let requiredClauses = [ VersionedClause, @@ -1173,6 +1184,7 @@ def OMP_TargetExitData : Directive<[Spelling<"target exit data", 1, 52>, VersionedClause, VersionedClause, VersionedClause, + VersionedClause, ]; let requiredClauses = [ VersionedClause, @@ -1191,6 +1203,7 @@ def OMP_TargetUpdate : Directive<[Spelling<"target update", 1, 52>, VersionedClause, VersionedClause, VersionedClause, + VersionedClause, ]; let association = AS_None; let category = CA_Executable; @@ -1213,6 +1226,8 @@ def OMP_Task : Directive<[Spelling<"task">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, + VersionedClause, ]; let association = AS_Block; let category = CA_Executable; @@ -1254,6 +1269,8 @@ def OMP_TaskLoop : Directive<[Spelling<"taskloop">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, + VersionedClause, ]; let allowedExclusiveClauses = [ VersionedClause, @@ -1267,6 +1284,9 @@ def OMP_TaskWait : Directive<[Spelling<"taskwait">]> { VersionedClause, VersionedClause, ]; + let allowedOnceClauses = [ + VersionedClause, + ]; let association = AS_None; let category = CA_Executable; } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index cc1177ba3d11c..f43ef932e965a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -125,16 +125,19 @@ class OpenMPIRBuilderConfig { /// First separator used between the initial two parts of a name. 
std::optional FirstSeparator; - /// Separator used between all of the rest consecutive parts of s name + /// Separator used between all of the rest consecutive parts of a name. std::optional Separator; - // Grid Value for the GPU target + // Grid Value for the GPU target. std::optional GridValue; /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice = /// false`), this contains the list of offloading triples associated, if any. SmallVector TargetTriples; + // Default address space for the target. + unsigned DefaultTargetAS = 0; + LLVM_ABI OpenMPIRBuilderConfig(); LLVM_ABI OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory, @@ -165,6 +168,8 @@ class OpenMPIRBuilderConfig { return *GridValue; } + unsigned getDefaultTargetAS() const { return DefaultTargetAS; } + bool hasRequiresFlags() const { return RequiresFlags; } LLVM_ABI bool hasRequiresReverseOffload() const; LLVM_ABI bool hasRequiresUnifiedAddress() const; @@ -202,6 +207,7 @@ class OpenMPIRBuilderConfig { void setFirstSeparator(StringRef FS) { FirstSeparator = FS; } void setSeparator(StringRef S) { Separator = S; } void setGridValue(omp::GV G) { GridValue = G; } + void setDefaultTargetAS(unsigned AS) { DefaultTargetAS = AS; } LLVM_ABI void setHasRequiresReverseOffload(bool Value); LLVM_ABI void setHasRequiresUnifiedAddress(bool Value); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 71f041ac138e3..01ca8da759ef7 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -26,7 +26,7 @@ #endif #define __OMP_TYPE(VarName) OMP_TYPE(VarName, Type::get##VarName##Ty(Ctx)) -#define __OMP_PTR_TYPE(VarName) OMP_TYPE(VarName, PointerType::get(Ctx, 0)) +#define __OMP_PTR_TYPE(VarName) OMP_TYPE(VarName, PointerType::get(Ctx, DefaultTargetAS)) __OMP_TYPE(Void) __OMP_TYPE(Int1) diff --git a/llvm/include/llvm/IR/DataLayout.h
b/llvm/include/llvm/IR/DataLayout.h index 2acae246c0b1e..5653ee7b6837d 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -303,8 +303,6 @@ class DataLayout { llvm_unreachable("invalid mangling mode"); } - LLVM_ABI static const char *getManglingComponent(const Triple &T); - /// Returns true if the specified type fits in a native integer type /// supported by the CPU. /// diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index a4e9d1639bb2b..6652e303a6648 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -4665,6 +4665,7 @@ template <> struct DenseMapInfo { /// information). class DebugVariableAggregate : public DebugVariable { public: + LLVM_ABI DebugVariableAggregate(const DbgVariableRecord *DVR); DebugVariableAggregate(const DebugVariable &V) : DebugVariable(V.getVariable(), std::nullopt, V.getInlinedAt()) {} }; diff --git a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h index 08a02b42bdc14..e273387807cf6 100644 --- a/llvm/include/llvm/IR/GlobalObject.h +++ b/llvm/include/llvm/IR/GlobalObject.h @@ -121,8 +121,10 @@ class GlobalObject : public GlobalValue { /// appropriate default object file section. LLVM_ABI void setSection(StringRef S); - /// Set the section prefix for this global object. - LLVM_ABI void setSectionPrefix(StringRef Prefix); + /// If existing prefix is different from \p Prefix, set it to \p Prefix. If \p + /// Prefix is empty, the set clears the existing metadata. Returns true if + /// section prefix changed and false otherwise. + LLVM_ABI bool setSectionPrefix(StringRef Prefix); /// Get the section prefix for this global object. 
LLVM_ABI std::optional getSectionPrefix() const; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index fb9ea10ac9127..585371a6a4423 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2797,9 +2797,9 @@ foreach n = 2...8 in { //===-------------- Intrinsics to perform partial reduction ---------------===// -def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], - [llvm_anyvector_ty, llvm_anyvector_ty], - [IntrNoMem]>; +def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyvector_ty, llvm_anyvector_ty], + [IntrNoMem]>; //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6d53bf8b172d8..7c9aef52b3acf 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3147,13 +3147,8 @@ let TargetPrefix = "aarch64" in { // Counting elements // - class AdvSIMD_SME_CNTSB_Intrinsic - : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem]>; - - def int_aarch64_sme_cntsb : AdvSIMD_SME_CNTSB_Intrinsic; - def int_aarch64_sme_cntsh : AdvSIMD_SME_CNTSB_Intrinsic; - def int_aarch64_sme_cntsw : AdvSIMD_SME_CNTSB_Intrinsic; - def int_aarch64_sme_cntsd : AdvSIMD_SME_CNTSB_Intrinsic; + def int_aarch64_sme_cntsd + : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem]>; // // PSTATE Functions diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 030d01d7a5f3f..afce1fe6af854 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -663,7 +663,7 @@ def int_amdgcn_cvt_sr_bf8_f16 : DefaultAttrsIntrinsic< [IntrNoMem, IntrSpeculatable, ImmArg>] >, ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f16">; -// llvm.amdgcn.cvt.scale.pk32.f16.bf6 v32f16 vdst, v6i32 src0, i32 scale_sel [0..7] 
+// llvm.amdgcn.cvt.scale.pk32.f16.bf6 v32f16 vdst, v6i32 src0, i32 scale_sel [0..15] class AMDGPUCvtScaleIntrinsic : DefaultAttrsIntrinsic< [DstTy], [Src0Ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, ImmArg>] >, ClangBuiltin<"__builtin_amdgcn_"#name>; @@ -3691,6 +3691,10 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">, // gfx1250 intrinsics // ===----------------------------------------------------------------------===// +// Vanilla cluster sync-barrier +def int_amdgcn_s_cluster_barrier : ClangBuiltin<"__builtin_amdgcn_s_cluster_barrier">, + Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + // Async waits decrement ASYNCcnt and tensor waits decrement TENSORcnt which is // modeled as InaccessibleMem. class AMDGPUWaitAsyncIntrinsic : diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index fb97230e0f8eb..636e88898a55e 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1358,6 +1358,18 @@ def int_ppc_vsx_lxvll : def int_ppc_vsx_lxvp : DefaultAttrsIntrinsic<[llvm_v256i1_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; +def int_ppc_vsx_lxvrl : + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i64_ty], + [IntrReadMem, IntrArgMemOnly]>; +def int_ppc_vsx_lxvrll : + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i64_ty], + [IntrReadMem, IntrArgMemOnly]>; +def int_ppc_vsx_lxvprl : + DefaultAttrsIntrinsic<[llvm_v256i1_ty], [llvm_ptr_ty, llvm_i64_ty], + [IntrReadMem, IntrArgMemOnly]>; +def int_ppc_vsx_lxvprll : + DefaultAttrsIntrinsic<[llvm_v256i1_ty], [llvm_ptr_ty, llvm_i64_ty], + [IntrReadMem, IntrArgMemOnly]>; // Vector store. 
def int_ppc_vsx_stxvw4x : Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], @@ -1377,6 +1389,19 @@ def int_ppc_vsx_stxvll : def int_ppc_vsx_stxvp : Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty], [IntrWriteMem, IntrArgMemOnly]>; +def int_ppc_vsx_stxvrl : + Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrWriteMem, IntrArgMemOnly]>; +def int_ppc_vsx_stxvrll : + Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrWriteMem, IntrArgMemOnly]>; +def int_ppc_vsx_stxvprl : + Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty, llvm_i64_ty], [IntrWriteMem, + IntrArgMemOnly]>; +def int_ppc_vsx_stxvprll : + Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty, llvm_i64_ty], [IntrWriteMem, + IntrArgMemOnly]>; + // Vector and scalar maximum. def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">; def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">; diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index 33203ad85aa32..4ba31b5545cb2 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -919,8 +919,8 @@ class MDOperand { // Check if MDOperand is of type MDString and equals `Str`. 
bool equalsStr(StringRef Str) const { - return isa(this->get()) && - cast(this->get())->getString() == Str; + return isa_and_nonnull(get()) && + cast(get())->getString() == Str; } ~MDOperand() { untrack(); } diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h index 61434735506f9..de9675f48c79b 100644 --- a/llvm/include/llvm/IR/ProfDataUtils.h +++ b/llvm/include/llvm/IR/ProfDataUtils.h @@ -16,7 +16,6 @@ #define LLVM_IR_PROFDATAUTILS_H #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Twine.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Compiler.h" @@ -30,6 +29,10 @@ struct MDProfLabels { LLVM_ABI static const char *UnknownBranchWeightsMarker; }; +/// Profile-based loop metadata that should be accessed only by using +/// \c llvm::getLoopEstimatedTripCount and \c llvm::setLoopEstimatedTripCount. +LLVM_ABI extern const char *LLVMLoopEstimatedTripCount; + /// Checks if an Instruction has MD_prof Metadata LLVM_ABI bool hasProfMD(const Instruction &I); @@ -193,5 +196,33 @@ LLVM_ABI bool hasExplicitlyUnknownBranchWeights(const Instruction &I); /// Scaling the profile data attached to 'I' using the ratio of S/T. LLVM_ABI void scaleProfData(Instruction &I, uint64_t S, uint64_t T); +/// Get the branch weights of a branch conditioned on b1 || b2, where b1 and b2 +/// are 2 booleans that are the conditions of 2 branches for which we have the +/// branch weights B1 and B2, respectively. In both B1 and B2, the first +/// position (index 0) is for the 'true' branch, and the second position (index +/// 1) is for the 'false' branch. +inline SmallVector +getDisjunctionWeights(const SmallVector &B1, + const SmallVector &B2) { + // For the first conditional branch, the probability the "true" case is taken + // is p(b1) = B1[0] / (B1[0] + B1[1]). The "false" case's probability is + // p(not b1) = B1[1] / (B1[0] + B1[1]). + // Similarly for the second conditional branch and B2. 
+ // + // The probability of the new branch NOT being taken is: + // not P = p((not b1) and (not b2)) = + // = B1[1] / (B1[0]+B1[1]) * B2[1] / (B2[0]+B2[1]) = + // = B1[1] * B2[1] / (B1[0] + B1[1]) * (B2[0] + B2[1]) + // Then the probability of it being taken is: P = 1 - (not P). + // The denominator will be the same as above, and the numerator of P will be: + // (B1[0] + B1[1]) * (B2[0] + B2[1]) - B1[1]*B2[1] + // Which then reduces to what's shown below (out of the 4 terms coming out of + // the product of sums, the subtracted one cancels out). + assert(B1.size() == 2); + assert(B2.size() == 2); + auto FalseWeight = B1[1] * B2[1]; + auto TrueWeight = B1[0] * B2[0] + B1[0] * B2[1] + B1[1] * B2[0]; + return {TrueWeight, FalseWeight}; +} } // namespace llvm #endif diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index f2722fd37a4f1..54677ef70244f 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -60,6 +60,22 @@ struct DescriptorTable { } }; +struct StaticSampler { + dxbc::SamplerFilter Filter; + dxbc::TextureAddressMode AddressU; + dxbc::TextureAddressMode AddressV; + dxbc::TextureAddressMode AddressW; + float MipLODBias; + uint32_t MaxAnisotropy; + dxbc::ComparisonFunc ComparisonFunc; + dxbc::StaticBorderColor BorderColor; + float MinLOD; + float MaxLOD; + uint32_t ShaderRegister; + uint32_t RegisterSpace; + dxbc::ShaderVisibility ShaderVisibility; +}; + struct RootParametersContainer { SmallVector ParametersInfo; @@ -125,7 +141,7 @@ struct RootSignatureDesc { uint32_t StaticSamplersOffset = 0u; uint32_t NumStaticSamplers = 0u; mcdxbc::RootParametersContainer ParametersContainer; - SmallVector StaticSamplers; + SmallVector StaticSamplers; LLVM_ABI void write(raw_ostream &OS) const; diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h index 1625355323692..58363f0b671e2 100644 --- 
a/llvm/include/llvm/MC/MCAsmBackend.h +++ b/llvm/include/llvm/MC/MCAsmBackend.h @@ -168,6 +168,7 @@ class LLVM_ABI MCAsmBackend { virtual bool relaxAlign(MCFragment &F, unsigned &Size) { return false; } virtual bool relaxDwarfLineAddr(MCFragment &) const { return false; } virtual bool relaxDwarfCFA(MCFragment &) const { return false; } + virtual bool relaxSFrameCFA(MCFragment &) const { return false; } // Defined by linker relaxation targets to possibly emit LEB128 relocations // and set Value at the relocated location. diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 1316d8669239d..6e1d6421b8d33 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -117,6 +117,7 @@ class MCAssembler { void relaxBoundaryAlign(MCBoundaryAlignFragment &BF); void relaxDwarfLineAddr(MCFragment &F); void relaxDwarfCallFrameFragment(MCFragment &F); + void relaxSFrameFragment(MCFragment &DF); public: /// Construct a new assembler instance. diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h index d96609c0fec21..4a528eecfc900 100644 --- a/llvm/include/llvm/MC/MCContext.h +++ b/llvm/include/llvm/MC/MCContext.h @@ -484,6 +484,10 @@ class MCContext { /// \param Name - The symbol name, which must be unique across all symbols. LLVM_ABI MCSymbol *getOrCreateSymbol(const Twine &Name); + /// Variant of getOrCreateSymbol that handles backslash-escaped symbols. + /// For example, parse "a\"b\\" as a"\. + LLVM_ABI MCSymbol *parseSymbol(const Twine &Name); + /// Gets a symbol that will be defined to the final stack offset of a local /// variable after codegen. 
/// diff --git a/llvm/include/llvm/MC/MCDecoder.h b/llvm/include/llvm/MC/MCDecoder.h index 87df6c10d8bb2..175f6a9591558 100644 --- a/llvm/include/llvm/MC/MCDecoder.h +++ b/llvm/include/llvm/MC/MCDecoder.h @@ -58,20 +58,6 @@ uint64_t fieldFromInstruction(const std::bitset &Insn, unsigned StartBit, return ((Insn >> StartBit) & Mask).to_ullong(); } -// Helper function for inserting bits extracted from an encoded instruction into -// an integer-typed field. -template -static std::enable_if_t, void> -insertBits(IntType &field, IntType bits, unsigned startBit, unsigned numBits) { - // Check that no bit beyond numBits is set, so that a simple bitwise | - // is sufficient. - assert((~(((IntType)1 << numBits) - 1) & bits) == 0 && - "bits has more than numBits bits set"); - assert(startBit + numBits <= sizeof(IntType) * 8); - (void)numBits; - field |= bits << startBit; -} - } // namespace llvm::MCD #endif // LLVM_MC_MCDECODER_H diff --git a/llvm/include/llvm/MC/MCDecoderOps.h b/llvm/include/llvm/MC/MCDecoderOps.h index 790ff3eb4f333..5afc0387f561f 100644 --- a/llvm/include/llvm/MC/MCDecoderOps.h +++ b/llvm/include/llvm/MC/MCDecoderOps.h @@ -24,7 +24,6 @@ enum DecoderOps { // uleb128 Val) OPC_CheckPredicate, // OPC_CheckPredicate(uleb128 PIdx) OPC_Decode, // OPC_Decode(uleb128 Opcode, uleb128 DIdx) - OPC_TryDecode, // OPC_TryDecode(uleb128 Opcode, uleb128 DIdx) OPC_SoftFail, // OPC_SoftFail(uleb128 PMask, uleb128 NMask) }; diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index b9e813b9b0d28..1899cb6331c6f 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -150,6 +150,9 @@ class MCObjectStreamer : public MCStreamer { MCSymbol *EndLabel = nullptr) override; void emitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, const MCSymbol *Label, SMLoc Loc); + void emitSFrameCalculateFuncOffset(const MCSymbol *FunCabsel, + const MCSymbol *FREBegin, + MCFragment *FDEFrag, SMLoc Loc); void 
emitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line, unsigned Column, bool PrologueEnd, bool IsStmt, StringRef FileName, SMLoc Loc) override; diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index cb9bd5c600d52..e3f44a08db641 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -279,6 +279,9 @@ class LLVM_ABI MCAsmParser { /// Res to the identifier contents. virtual bool parseIdentifier(StringRef &Res) = 0; + /// Parse identifier and get or create symbol for it. + bool parseSymbol(MCSymbol *&Res); + /// Parse up to the end of statement and return the contents from the /// current token until the end of the statement; the current token on exit /// will be either the EndOfStatement or EOF. diff --git a/llvm/include/llvm/MC/MCSFrame.h b/llvm/include/llvm/MC/MCSFrame.h index 8f182a86d1ab1..694aec55aefeb 100644 --- a/llvm/include/llvm/MC/MCSFrame.h +++ b/llvm/include/llvm/MC/MCSFrame.h @@ -16,9 +16,14 @@ #ifndef LLVM_MC_MCSFRAME_H #define LLVM_MC_MCSFRAME_H +#include "llvm/ADT/SmallVector.h" +#include + namespace llvm { +class MCContext; class MCObjectStreamer; +class MCFragment; class MCSFrameEmitter { public: @@ -26,6 +31,15 @@ class MCSFrameEmitter { // // \param Streamer - Emit into this stream. static void emit(MCObjectStreamer &Streamer); + + // Encode the FRE's function offset. + // + // \param C - Context. + // \param Offset - Offset to encode. + // \param Out - Destination of the encoding. + // \param FDEFrag - Frag that specifies the encoding format. 
+ static void encodeFuncOffset(MCContext &C, uint64_t Offset, + SmallVectorImpl &Out, MCFragment *FDEFrag); }; } // end namespace llvm diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h index 12389d623e588..a26e6cfb2158a 100644 --- a/llvm/include/llvm/MC/MCSection.h +++ b/llvm/include/llvm/MC/MCSection.h @@ -59,6 +59,7 @@ class MCFragment { FT_Org, FT_Dwarf, FT_DwarfFrame, + FT_SFrame, FT_BoundaryAlign, FT_SymbolId, FT_CVInlineLines, @@ -143,6 +144,12 @@ class MCFragment { // .loc dwarf directives. int64_t LineDelta; } dwarf; + struct { + // This FRE describes unwind info at AddrDelta from function start. + const MCExpr *AddrDelta; + // Fragment that records how many bytes of AddrDelta to emit. + MCFragment *FDEFragment; + } sframe; } u{}; public: @@ -296,6 +303,24 @@ class MCFragment { assert(Kind == FT_Dwarf); u.dwarf.LineDelta = LineDelta; } + + //== FT_SFrame functions + const MCExpr &getSFrameAddrDelta() const { + assert(Kind == FT_SFrame); + return *u.sframe.AddrDelta; + } + void setSFrameAddrDelta(const MCExpr *E) { + assert(Kind == FT_SFrame); + u.sframe.AddrDelta = E; + } + MCFragment *getSFrameFDE() const { + assert(Kind == FT_SFrame); + return u.sframe.FDEFragment; + } + void setSFrameFDE(MCFragment *F) { + assert(Kind == FT_SFrame); + u.sframe.FDEFragment = F; + } }; // MCFragment subclasses do not use the fixed-size part or variable-size tail of diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 359b27761cea3..62bfee7693db1 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -92,7 +92,7 @@ struct RootDescriptorYaml { }; struct DescriptorRangeYaml { - uint32_t RangeType; + dxil::ResourceClass RangeType; uint32_t NumDescriptors; uint32_t BaseShaderRegister; uint32_t RegisterSpace; @@ -111,12 +111,12 @@ struct DescriptorTableYaml { }; struct RootParameterHeaderYaml { - uint32_t Type; - uint32_t 
Visibility; + dxbc::RootParameterType Type; + dxbc::ShaderVisibility Visibility; uint32_t Offset; RootParameterHeaderYaml(){}; - RootParameterHeaderYaml(uint32_t T) : Type(T) {} + RootParameterHeaderYaml(dxbc::RootParameterType T) : Type(T) {} }; struct RootParameterLocationYaml { @@ -165,21 +165,19 @@ struct RootParameterYamlDesc { }; struct StaticSamplerYamlDesc { - uint32_t Filter = llvm::to_underlying(dxbc::SamplerFilter::Anisotropic); - uint32_t AddressU = llvm::to_underlying(dxbc::TextureAddressMode::Wrap); - uint32_t AddressV = llvm::to_underlying(dxbc::TextureAddressMode::Wrap); - uint32_t AddressW = llvm::to_underlying(dxbc::TextureAddressMode::Wrap); + dxbc::SamplerFilter Filter = dxbc::SamplerFilter::Anisotropic; + dxbc::TextureAddressMode AddressU = dxbc::TextureAddressMode::Wrap; + dxbc::TextureAddressMode AddressV = dxbc::TextureAddressMode::Wrap; + dxbc::TextureAddressMode AddressW = dxbc::TextureAddressMode::Wrap; float MipLODBias = 0.f; uint32_t MaxAnisotropy = 16u; - uint32_t ComparisonFunc = - llvm::to_underlying(dxbc::ComparisonFunc::LessEqual); - uint32_t BorderColor = - llvm::to_underlying(dxbc::StaticBorderColor::OpaqueWhite); + dxbc::ComparisonFunc ComparisonFunc = dxbc::ComparisonFunc::LessEqual; + dxbc::StaticBorderColor BorderColor = dxbc::StaticBorderColor::OpaqueWhite; float MinLOD = 0.f; float MaxLOD = std::numeric_limits::max(); uint32_t ShaderRegister; uint32_t RegisterSpace; - uint32_t ShaderVisibility; + dxbc::ShaderVisibility ShaderVisibility; }; struct RootSignatureYamlDesc { @@ -321,6 +319,13 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceKind) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::RootParameterType) +LLVM_YAML_DECLARE_ENUM_TRAITS(dxil::ResourceClass) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SamplerFilter) 
+LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::StaticBorderColor) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::TextureAddressMode) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::ShaderVisibility) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::ComparisonFunc) namespace llvm { diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h index 313164bc29689..3e80574355b87 100644 --- a/llvm/include/llvm/Option/ArgList.h +++ b/llvm/include/llvm/Option/ArgList.h @@ -253,8 +253,8 @@ class ArgList { } /// Return the last argument matching \p Id, or null. - template - Arg *getLastArg(OptSpecifiers ...Ids) const { + template + LLVM_ATTRIBUTE_NOINLINE Arg *getLastArg(OptSpecifiers... Ids) const { Arg *Res = nullptr; for (Arg *A : filtered(Ids...)) { Res = A; @@ -265,8 +265,8 @@ class ArgList { /// Return the last argument matching \p Id, or null. Do not "claim" the /// option (don't mark it as having been used). - template - Arg *getLastArgNoClaim(OptSpecifiers ...Ids) const { + template + LLVM_ATTRIBUTE_NOINLINE Arg *getLastArgNoClaim(OptSpecifiers... Ids) const { for (Arg *A : filtered_reverse(Ids...)) return A; return nullptr; diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 9cdb7ca7dbc9b..2742ec1b71b7e 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -65,6 +65,9 @@ class PipelineTuningOptions { /// false. bool LoopInterchange; + /// Tuning option to enable/disable loop fusion. Its default value is false. + bool LoopFusion; + /// Tuning option to forget all SCEV loops in LoopUnroll. Its default value /// is that of the flag: `-forget-scev-loop-unroll`. 
bool ForgetAllSCEVInLoopUnroll; diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index a626071d23915..3dd34aba2d716 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -62,7 +62,7 @@ enum class sampleprof_error { uncompress_failed, zlib_unavailable, hash_mismatch, - illegal_line_offset + illegal_line_offset, }; inline std::error_code make_error_code(sampleprof_error E) { @@ -91,6 +91,8 @@ struct is_error_code_enum : std::true_type {}; namespace llvm { namespace sampleprof { +constexpr char kVTableProfPrefix[] = "vtables "; + enum SampleProfileFormat { SPF_None = 0, SPF_Text = 0x1, @@ -204,6 +206,9 @@ enum class SecProfSummaryFlags : uint32_t { /// SecFlagIsPreInlined means this profile contains ShouldBeInlined /// contexts thus this is CS preinliner computed. SecFlagIsPreInlined = (1 << 4), + + /// SecFlagHasVTableTypeProf means this profile contains vtable type profiles. + SecFlagHasVTableTypeProf = (1 << 5), }; enum class SecFuncMetadataFlags : uint32_t { @@ -303,7 +308,7 @@ struct LineLocation { } uint64_t getHashCode() const { - return ((uint64_t) Discriminator << 32) | LineOffset; + return ((uint64_t)Discriminator << 32) | LineOffset; } uint32_t LineOffset; @@ -318,16 +323,30 @@ struct LineLocationHash { LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const LineLocation &Loc); +/// Key represents type of a C++ polymorphic class type by its vtable and value +/// represents its counter. +/// TODO: The class name FunctionId should be renamed to SymbolId in a refactor +/// change. +using TypeCountMap = std::map; + +/// Write \p Map to the output stream. Keys are linearized using \p NameTable +/// and written as ULEB128. Values are written as ULEB128 as well. +std::error_code +serializeTypeMap(const TypeCountMap &Map, + const MapVector &NameTable, + raw_ostream &OS); + /// Representation of a single sample record. 
/// /// A sample record is represented by a positive integer value, which /// indicates how frequently was the associated line location executed. /// /// Additionally, if the associated location contains a function call, -/// the record will hold a list of all the possible called targets. For -/// direct calls, this will be the exact function being invoked. For -/// indirect calls (function pointers, virtual table dispatch), this -/// will be a list of one or more functions. +/// the record will hold a list of all the possible called targets and the types +/// for virtual table dispatches. For direct calls, this will be the exact +/// function being invoked. For indirect calls (function pointers, virtual table +/// dispatch), this will be a list of one or more functions. For virtual table +/// dispatches, this record will also hold the type of the object. class SampleRecord { public: using CallTarget = std::pair; @@ -746,6 +765,7 @@ using BodySampleMap = std::map; // memory, which is *very* significant for large profiles. using FunctionSamplesMap = std::map; using CallsiteSampleMap = std::map; +using CallsiteTypeMap = std::map; using LocToLocMap = std::unordered_map; @@ -939,6 +959,14 @@ class FunctionSamples { return &Iter->second; } + /// Returns the TypeCountMap for inlined callsites at the given \p Loc. + const TypeCountMap *findCallsiteTypeSamplesAt(const LineLocation &Loc) const { + auto Iter = VirtualCallsiteTypeCounts.find(mapIRLocToProfileLoc(Loc)); + if (Iter == VirtualCallsiteTypeCounts.end()) + return nullptr; + return &Iter->second; + } + /// Returns a pointer to FunctionSamples at the given callsite location /// \p Loc with callee \p CalleeName. If no callsite can be found, relax /// the restriction to return the FunctionSamples at callsite location @@ -1000,6 +1028,61 @@ class FunctionSamples { return CallsiteSamples; } + /// Returns vtable access samples for the C++ types collected in this + /// function. 
+ const CallsiteTypeMap &getCallsiteTypeCounts() const { + return VirtualCallsiteTypeCounts; + } + + /// Returns the vtable access samples for the C++ types for \p Loc. + /// Under the hood, the caller-specified \p Loc will be un-drifted before the + /// type sample lookup if possible. + TypeCountMap &getTypeSamplesAt(const LineLocation &Loc) { + return VirtualCallsiteTypeCounts[mapIRLocToProfileLoc(Loc)]; + } + + /// At location \p Loc, add a type sample for the given \p Type with + /// \p Count. This function uses saturating add which clamps the result to + /// maximum uint64_t (the counter type), and inserts the saturating add result + /// into the map. Returns counter_overflow to caller if the actual result is larger + /// than maximum uint64_t. + sampleprof_error addTypeSamplesAt(const LineLocation &Loc, FunctionId Type, + uint64_t Count) { + auto &TypeCounts = getTypeSamplesAt(Loc); + bool Overflowed = false; + TypeCounts[Type] = SaturatingMultiplyAdd(Count, /* Weight= */ (uint64_t)1, + TypeCounts[Type], &Overflowed); + return Overflowed ? sampleprof_error::counter_overflow + : sampleprof_error::success; + } + + /// Scale \p Other sample counts by \p Weight and add the scaled result to the + /// type samples for \p Loc. Under the hood, the caller-provided \p Loc will + /// be un-drifted before the type sample lookup if possible. + /// typename T is either a std::map or a DenseMap. 
+ template + sampleprof_error addCallsiteVTableTypeProfAt(const LineLocation &Loc, + const T &Other, + uint64_t Weight = 1) { + static_assert((std::is_same_v || + std::is_same_v) && + std::is_same_v, + "T must be a map with StringRef or FunctionId as key and " + "uint64_t as value"); + TypeCountMap &TypeCounts = getTypeSamplesAt(Loc); + bool Overflowed = false; + + for (const auto [Type, Count] : Other) { + FunctionId TypeId(Type); + bool RowOverflow = false; + TypeCounts[TypeId] = SaturatingMultiplyAdd( + Count, Weight, TypeCounts[TypeId], &RowOverflow); + Overflowed |= RowOverflow; + } + return Overflowed ? sampleprof_error::counter_overflow + : sampleprof_error::success; + } + /// Return the maximum of sample counts in a function body. When SkipCallSite /// is false, which is the default, the return count includes samples in the /// inlined functions. When SkipCallSite is true, the return count only @@ -1054,6 +1137,10 @@ class FunctionSamples { mergeSampleProfErrors(Result, FSMap[Rec.first].merge(Rec.second, Weight)); } + for (const auto &[Loc, OtherTypeMap] : Other.getCallsiteTypeCounts()) + mergeSampleProfErrors( + Result, addCallsiteVTableTypeProfAt(Loc, OtherTypeMap, Weight)); + return Result; } @@ -1297,6 +1384,23 @@ class FunctionSamples { /// collected in the call to baz() at line offset 8. CallsiteSampleMap CallsiteSamples; + /// Map a virtual callsite to the list of accessed vtables and vtable counts. + /// The callsite is referenced by its source location. + /// + /// For example, given: + /// + /// void foo() { + /// ... + /// 5 inlined_vcall_bar(); + /// ... + /// 5 inlined_vcall_baz(); + /// ... + /// 200 inlined_vcall_qux(); + /// } + /// This map will contain two entries. One with two types for line offset 5 + /// and one with one type for line offset 200. + CallsiteTypeMap VirtualCallsiteTypeCounts; + /// IR to profile location map generated by stale profile matching. 
/// /// Each entry is a mapping from the location on current build to the matched diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index bfe079fbe536f..799938ab901c1 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -589,6 +589,10 @@ class SampleProfileReader { /// Whether the function profiles use FS discriminators. bool ProfileIsFS = false; + /// If true, the profile has vtable profiles and reader should decode them + /// to parse profiles correctly. + bool ReadVTableProf = false; + /// \brief The format of sample. SampleProfileFormat Format = SPF_None; @@ -703,6 +707,14 @@ class LLVM_ABI SampleProfileReaderBinary : public SampleProfileReader { /// otherwise same as readStringFromTable, also return its hash value. ErrorOr> readSampleContextFromTable(); + /// Read all virtual functions' vtable access counts for \p FProfile. + std::error_code readCallsiteVTableProf(FunctionSamples &FProfile); + + /// Read bytes from the input buffer pointed by `Data` and decode them into + /// \p M. `Data` will be advanced to the end of the read bytes when this + /// function returns. Returns error if any. + std::error_code readVTableTypeCountMap(TypeCountMap &M); + /// Points to the current location in the buffer. 
const uint8_t *Data = nullptr; diff --git a/llvm/include/llvm/ProfileData/SampleProfWriter.h b/llvm/include/llvm/ProfileData/SampleProfWriter.h index e84b2095efd7b..9dbeaf56509b0 100644 --- a/llvm/include/llvm/ProfileData/SampleProfWriter.h +++ b/llvm/include/llvm/ProfileData/SampleProfWriter.h @@ -217,13 +217,20 @@ class LLVM_ABI SampleProfileWriterBinary : public SampleProfileWriter { std::error_code writeBody(const FunctionSamples &S); inline void stablizeNameTable(MapVector &NameTable, std::set &V); - + MapVector NameTable; - + void addName(FunctionId FName); virtual void addContext(const SampleContext &Context); void addNames(const FunctionSamples &S); + /// Write \p CallsiteTypeMap to the output stream \p OS. + std::error_code + writeCallsiteVTableProf(const CallsiteTypeMap &CallsiteTypeMap, + raw_ostream &OS); + + bool WriteVTableProf = false; + private: LLVM_ABI friend ErrorOr> SampleProfileWriter::create(std::unique_ptr &OS, @@ -412,8 +419,7 @@ class LLVM_ABI SampleProfileWriterExtBinaryBase class LLVM_ABI SampleProfileWriterExtBinary : public SampleProfileWriterExtBinaryBase { public: - SampleProfileWriterExtBinary(std::unique_ptr &OS) - : SampleProfileWriterExtBinaryBase(OS) {} + SampleProfileWriterExtBinary(std::unique_ptr &OS); private: std::error_code writeDefaultLayout(const SampleProfileMap &ProfileMap); diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h index b1b1dd49a2c4a..0c89512310289 100644 --- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h +++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h @@ -120,6 +120,51 @@ inline bool isConstantAddressSpace(unsigned AS) { return false; } } + +namespace DWARFAS { +enum : unsigned { + GLOBAL = 0, + GENERIC = 1, + REGION = 2, + LOCAL = 3, + PRIVATE_LANE = 5, + PRIVATE_WAVE = 6, + DEFAULT = GLOBAL, +}; +} // namespace DWARFAS + +namespace impl { +// TODO: Move this into mapToDWARFAddrSpace when we switch to C++23 +// (see 
https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2647r1.html) +static constexpr unsigned LLVMToDWARFAddrSpaceMapping[] = { + DWARFAS::GENERIC, //< AMDGPUAS::FLAT_ADDRESS + DWARFAS::GLOBAL, //< AMDGPUAS::GLOBAL_ADDRESS + DWARFAS::REGION, //< AMDGPUAS::REGION_ADDRESS + DWARFAS::LOCAL, //< AMDGPUAS::LOCAL_ADDRESS + DWARFAS::GLOBAL, //< AMDGPUAS::CONSTANT_ADDRESS + DWARFAS::PRIVATE_LANE //< AMDGPUAS::PRIVATE_ADDRESS +}; +} // end namespace impl + +/// If @p LLVMAddressSpace has a corresponding DWARF encoding, +/// return it; otherwise return the sentinel value -1 to indicate +/// no such mapping exists. +/// +/// This maps private/scratch to the focused lane view. +/// +/// These mappings must be kept in sync with llvm/docs/AMDGPUUsage.rst +/// table "AMDGPU DWARF Address Space Mapping". +/// +/// Note: This could return std::optional but that would require +/// an extra #include. +constexpr int mapToDWARFAddrSpace(unsigned LLVMAddrSpace) { + constexpr unsigned SizeOfLLVMToDWARFAddrSpaceMapping = + sizeof(impl::LLVMToDWARFAddrSpaceMapping) / + sizeof(impl::LLVMToDWARFAddrSpaceMapping[0]); + if (LLVMAddrSpace < SizeOfLLVMToDWARFAddrSpaceMapping) + return impl::LLVMToDWARFAddrSpaceMapping[LLVMAddrSpace]; + return -1; +} } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h index 418d2b36114c5..3f966acdf12af 100644 --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -25,11 +25,6 @@ #include #include -// Gets offset of specified member in specified type. -#ifndef offsetof -#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE*)0)->MEMBER) -#endif // offsetof - // Creates enumeration entries used for packing bits into integers. Enumeration // entries include bit shift amount, bit width, and bit mask. 
#ifndef AMDHSA_BITS_ENUM_ENTRY diff --git a/llvm/include/llvm/Support/CFGDiff.h b/llvm/include/llvm/Support/CFGDiff.h index 11bb9c0fb8f4d..41004d755a124 100644 --- a/llvm/include/llvm/Support/CFGDiff.h +++ b/llvm/include/llvm/Support/CFGDiff.h @@ -34,18 +34,17 @@ namespace llvm { namespace detail { template -auto reverse_if_helper(Range &&R, std::integral_constant) { +auto reverse_if_helper(Range &&R, std::bool_constant) { return std::forward(R); } template -auto reverse_if_helper(Range &&R, std::integral_constant) { +auto reverse_if_helper(Range &&R, std::bool_constant) { return llvm::reverse(std::forward(R)); } template auto reverse_if(Range &&R) { - return reverse_if_helper(std::forward(R), - std::integral_constant{}); + return reverse_if_helper(std::forward(R), std::bool_constant{}); } } // namespace detail diff --git a/llvm/include/llvm/Support/DXILABI.h b/llvm/include/llvm/Support/DXILABI.h index 307a1d1d43f5c..e6600c3406df5 100644 --- a/llvm/include/llvm/Support/DXILABI.h +++ b/llvm/include/llvm/Support/DXILABI.h @@ -102,7 +102,6 @@ const unsigned MinWaveSize = 4; const unsigned MaxWaveSize = 128; LLVM_ABI StringRef getResourceClassName(ResourceClass RC); - } // namespace dxil } // namespace llvm diff --git a/llvm/include/llvm/Support/Debug.h b/llvm/include/llvm/Support/Debug.h index a7795d403721c..b73f2d7c8b852 100644 --- a/llvm/include/llvm/Support/Debug.h +++ b/llvm/include/llvm/Support/Debug.h @@ -44,11 +44,6 @@ class raw_ostream; /// level, return false. LLVM_ABI bool isCurrentDebugType(const char *Type, int Level = 0); -/// Overload allowing to swap the order of the Type and Level arguments. -LLVM_ABI inline bool isCurrentDebugType(int Level, const char *Type) { - return isCurrentDebugType(Type, Level); -} - /// setCurrentDebugType - Set the current debug type, as if the -debug-only=X /// option were specified. Note that DebugFlag also needs to be set to true for /// debug output to be produced. 
diff --git a/llvm/include/llvm/Support/DebugLog.h b/llvm/include/llvm/Support/DebugLog.h index dce706e196bde..7025ca149ace1 100644 --- a/llvm/include/llvm/Support/DebugLog.h +++ b/llvm/include/llvm/Support/DebugLog.h @@ -19,52 +19,55 @@ namespace llvm { #ifndef NDEBUG -// LDBG() is a macro that can be used as a raw_ostream for debugging. -// It will stream the output to the dbgs() stream, with a prefix of the -// debug type and the file and line number. A trailing newline is added to the -// output automatically. If the streamed content contains a newline, the prefix -// is added to each beginning of a new line. Nothing is printed if the debug -// output is not enabled or the debug type does not match. -// -// E.g., -// LDBG() << "Bitset contains: " << Bitset; -// is somehow equivalent to -// LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] " << __FILE__ << ":" << -// __LINE__ << " " -// << "Bitset contains: " << Bitset << "\n"); -// +/// LDBG() is a macro that can be used as a raw_ostream for debugging. +/// It will stream the output to the dbgs() stream, with a prefix of the +/// debug type and the file and line number. A trailing newline is added to the +/// output automatically. If the streamed content contains a newline, the prefix +/// is added to each beginning of a new line. Nothing is printed if the debug +/// output is not enabled or the debug type does not match. +/// +/// E.g., +/// LDBG() << "Bitset contains: " << Bitset; +/// is equivalent to +/// LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] " << __FILE__ << ":" << +/// __LINE__ << " " +/// << "Bitset contains: " << Bitset << "\n"); +/// // An optional `level` argument can be provided to control the verbosity of the -// output. The default level is 1, and is in increasing level of verbosity. -// -// The `level` argument can be a literal integer, or a macro that evaluates to -// an integer. -// -// An optional `type` argument can be provided to control the debug type. The -// default type is DEBUG_TYPE. 
The `type` argument can be a literal string, or a -// macro that evaluates to a string. +/// output. The default level is 1, and is in increasing level of verbosity. +/// +/// The `level` argument can be a literal integer, or a macro that evaluates to +/// an integer. +/// +/// An optional `type` argument can be provided to control the debug type. The +/// default type is DEBUG_TYPE. The `type` argument can be a literal string, or +/// a macro that evaluates to a string. +/// +/// E.g., +/// LDBG(2) << "Bitset contains: " << Bitset; +/// LDBG("debug_type") << "Bitset contains: " << Bitset; +/// LDBG("debug_type", 2) << "Bitset contains: " << Bitset; #define LDBG(...) _GET_LDBG_MACRO(__VA_ARGS__)(__VA_ARGS__) -// Helper macros to choose the correct macro based on the number of arguments. -#define LDBG_FUNC_CHOOSER(_f1, _f2, _f3, ...) _f3 -#define LDBG_FUNC_RECOMPOSER(argsWithParentheses) \ - LDBG_FUNC_CHOOSER argsWithParentheses -#define LDBG_CHOOSE_FROM_ARG_COUNT(...) \ - LDBG_FUNC_RECOMPOSER( \ - (__VA_ARGS__, LDBG_LOG_LEVEL_WITH_TYPE, LDBG_LOG_LEVEL, )) -#define LDBG_NO_ARG_EXPANDER() , , LDBG_LOG_LEVEL_1 -#define _GET_LDBG_MACRO(...) \ - LDBG_CHOOSE_FROM_ARG_COUNT(LDBG_NO_ARG_EXPANDER __VA_ARGS__()) - -// Dispatch macros to support the `level` argument or none (default to 1) -#define LDBG_LOG_LEVEL(LEVEL) \ - DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), LEVEL, DEBUG_TYPE) -#define LDBG_LOG_LEVEL_1() LDBG_LOG_LEVEL(1) -// This macro is a helper when LDBG() is called with 2 arguments. -// In this case we want to allow the order of the arguments to be swapped. -// We rely on the fact that the `level` argument is an integer, and the `type` -// is a string and dispatch to a C++ API that is overloaded. 
-#define LDBG_LOG_LEVEL_WITH_TYPE(LEVEL_OR_TYPE, TYPE_OR_LEVEL) \ - DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), (LEVEL_OR_TYPE), (TYPE_OR_LEVEL)) +/// LDBG_OS() is a macro that behaves like LDBG() but instead of directly using +/// it to stream the output, it takes a callback function that will be called +/// with a raw_ostream. +/// This is useful when you need to pass a `raw_ostream` to a helper function to +/// be able to print (when the `<<` operator is not available). +/// +/// E.g., +/// LDBG_OS([&] (raw_ostream &Os) { +/// Os << "Pass Manager contains: "; +/// pm.printAsTextual(Os); +/// }); +/// +/// Just like LDBG(), it optionally accepts a `level` and `type` arguments. +/// E.g., +/// LDBG_OS(2, [&] (raw_ostream &Os) { ... }); +/// LDBG_OS("debug_type", [&] (raw_ostream &Os) { ... }); +/// LDBG_OS("debug_type", 2, [&] (raw_ostream &Os) { ... }); +/// +#define LDBG_OS(...) _GET_LDBG_OS_MACRO(__VA_ARGS__)(__VA_ARGS__) // We want the filename without the full path. We are using the __FILE__ macro // and a constexpr function to strip the path prefix. We can avoid the frontend @@ -76,22 +79,156 @@ namespace llvm { #define __LLVM_FILE_NAME__ ::llvm::impl::getShortFileName(__FILE__) #endif -#define DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, \ - LINE) \ - for (bool _c = \ - (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE, LEVEL)); \ +// Everything below are implementation details of the macros above. +namespace impl { + +/// This macro expands to the stream to use for output, we use a macro to allow +/// unit-testing to override. +#define LDBG_STREAM ::llvm::dbgs() + +// ---------------------------------------------------------------------------- +// LDBG() implementation +// ---------------------------------------------------------------------------- + +// Helper macros to choose the correct LDBG() macro based on the number of +// arguments. +#define LDBG_FUNC_CHOOSER(_f1, _f2, _f3, ...) 
_f3 +#define LDBG_FUNC_RECOMPOSER(argsWithParentheses) \ + LDBG_FUNC_CHOOSER argsWithParentheses +#define LDBG_CHOOSE_FROM_ARG_COUNT(...) \ + LDBG_FUNC_RECOMPOSER((__VA_ARGS__, LDBG_TYPE_AND_LEVEL, LDBG_LEVEL_OR_TYPE, )) +#define LDBG_NO_ARG_EXPANDER() , , LDBG_NO_ARG +#define _GET_LDBG_MACRO(...) \ + LDBG_CHOOSE_FROM_ARG_COUNT(LDBG_NO_ARG_EXPANDER __VA_ARGS__()) + +/// This macro is the core of the LDBG() implementation. It is used to print the +/// debug output with the given stream, level, type, file, and line number. +#define LDBG_STREAM_LEVEL_TYPE_FILE_AND_LINE(STREAM, LEVEL_OR_TYPE, \ + TYPE_OR_LEVEL, FILE, LINE) \ + for (bool _c = ::llvm::DebugFlag && ::llvm::impl::ldbgIsCurrentDebugType( \ + TYPE_OR_LEVEL, LEVEL_OR_TYPE); \ _c; _c = false) \ - for (::llvm::impl::raw_ldbg_ostream LdbgOS{ \ - ::llvm::impl::computePrefix(TYPE, FILE, LINE, LEVEL), (STREAM)}; \ - _c; _c = false) \ - ::llvm::impl::RAIINewLineStream{LdbgOS}.asLvalue() + ::llvm::impl::raw_ldbg_ostream{ \ + ::llvm::impl::computePrefix(TYPE_OR_LEVEL, FILE, LINE, LEVEL_OR_TYPE), \ + (STREAM), /*ShouldPrefixNextString=*/true, \ + /*ShouldEmitNewLineOnDestruction=*/true} \ + .asLvalue() -#define DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, FILE) \ - DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, __LINE__) -#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \ - DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, __LLVM_FILE_NAME__) +/// These macros are helpers to implement LDBG() with an increasing amount of +/// optional arguments made explicit. 
+#define LDBG_STREAM_LEVEL_TYPE_AND_FILE(STREAM, LEVEL_OR_TYPE, TYPE_OR_LEVEL, \ + FILE) \ + LDBG_STREAM_LEVEL_TYPE_FILE_AND_LINE(STREAM, LEVEL_OR_TYPE, TYPE_OR_LEVEL, \ + FILE, __LINE__) +#define LDGB_STREAM_LEVEL_AND_TYPE(STREAM, LEVEL_OR_TYPE, TYPE_OR_LEVEL) \ + LDBG_STREAM_LEVEL_TYPE_AND_FILE(STREAM, LEVEL_OR_TYPE, TYPE_OR_LEVEL, \ + __LLVM_FILE_NAME__) +/// This macro is a helper when LDBG() is called with 2 arguments. +/// In this case we want to force the first argument to be the type for +/// consistency in the codebase. +/// We trick this by casting the first argument to a (const char *) which +/// won't compile with an int. +#define LDBG_TYPE_AND_LEVEL(TYPE, LEVEL) \ + LDGB_STREAM_LEVEL_AND_TYPE(LDBG_STREAM, static_cast(TYPE), \ + (LEVEL)) -namespace impl { +/// When a single argument is provided. This can be either a level or the debug +/// type. If a level is provided, we default the debug type to DEBUG_TYPE, if a +/// string is provided, we default the level to 1. +#define LDBG_LEVEL_OR_TYPE(LEVEL_OR_TYPE) \ + LDGB_STREAM_LEVEL_AND_TYPE(LDBG_STREAM, (LEVEL_OR_TYPE), \ + LDBG_GET_DEFAULT_TYPE_OR_LEVEL(LEVEL_OR_TYPE)) +#define LDBG_NO_ARG() LDBG_LEVEL_OR_TYPE(1) + +// ---------------------------------------------------------------------------- +// LDBG_OS() implementation +// ---------------------------------------------------------------------------- + +// Helper macros to choose the correct LDBG_OS() macro based on the number of +// arguments. +#define LDBG_OS_FUNC_CHOOSER(_f1, _f2, _f3, _f4, ...) _f4 +#define LDBG_OS_FUNC_RECOMPOSER(argsWithParentheses) \ + LDBG_OS_FUNC_CHOOSER argsWithParentheses +#define LDBG_OS_CHOOSE_FROM_ARG_COUNT(...) \ + LDBG_OS_FUNC_RECOMPOSER((__VA_ARGS__, LDBG_OS_TYPE_AND_LEVEL_AND_CALLBACK, \ + LDBG_OS_LEVEL_OR_TYPE_AND_CALLBACK, \ + LDBG_OS_CALLBACK, )) +#define LDBG_OS_NO_ARG_EXPANDER() , , , LDBG_OS_CALLBACK +#define _GET_LDBG_OS_MACRO(...) 
\ + LDBG_OS_CHOOSE_FROM_ARG_COUNT(LDBG_OS_NO_ARG_EXPANDER __VA_ARGS__()) + +/// This macro is the core of the LDBG_OS() macros. It is used to print the +/// debug output with the given stream, level, type, file, and line number. +#define LDBG_OS_IMPL(TYPE_OR_LEVEL, LEVEL_OR_TYPE, CALLBACK, STREAM, FILE, \ + LINE) \ + if (::llvm::DebugFlag && \ + ::llvm::impl::ldbgIsCurrentDebugType(TYPE_OR_LEVEL, LEVEL_OR_TYPE)) { \ + ::llvm::impl::raw_ldbg_ostream LdbgOS{ \ + ::llvm::impl::computePrefix(TYPE_OR_LEVEL, FILE, LINE, LEVEL_OR_TYPE), \ + (STREAM), /*ShouldPrefixNextString=*/true, \ + /*ShouldEmitNewLineOnDestruction=*/true}; \ + CALLBACK(LdbgOS); \ + } + +#define LDBG_OS_TYPE_AND_LEVEL_AND_CALLBACK(TYPE, LEVEL, CALLBACK) \ + LDBG_OS_IMPL(static_cast(TYPE), LEVEL, CALLBACK, LDBG_STREAM, \ + __LLVM_FILE_NAME__, __LINE__) +#define LDBG_OS_LEVEL_OR_TYPE_AND_CALLBACK(LEVEL_OR_TYPE, CALLBACK) \ + LDBG_OS_IMPL(LDBG_GET_DEFAULT_TYPE_OR_LEVEL(LEVEL_OR_TYPE), LEVEL_OR_TYPE, \ + CALLBACK, LDBG_STREAM, __LLVM_FILE_NAME__, __LINE__) +#define LDBG_OS_CALLBACK(CALLBACK) \ + LDBG_OS_LEVEL_OR_TYPE_AND_CALLBACK(1, CALLBACK) + +// ---------------------------------------------------------------------------- +// General Helpers for the implementation above +// ---------------------------------------------------------------------------- + +/// Return the stringified macro as a StringRef. +/// Also, strip out potential surrounding quotes: this comes from an artifact of +/// the macro stringification, if DEBUG_TYPE is undefined we get the string +/// "DEBUG_TYPE", however if it is defined we get the string with the quotes. +/// For example if DEBUG_TYPE is "foo", we get "\"foo\"" but we want to return +/// "foo" here. +constexpr ::llvm::StringRef strip_quotes(const char *Str) { + ::llvm::StringRef S(Str); + if (Str[0] == '"' && Str[S.size() - 1] == '"') + return StringRef(Str + 1, S.size() - 2); + return S; +} + +/// Helper to provide the default level (=1) or type (=DEBUG_TYPE). 
This is used +/// when a single argument is passed to LDBG() (or LDBG_OS()), if it is an +/// integer we return DEBUG_TYPE and if it is a string we return 1. +/// When DEBUG_TYPE is not defined, we return the current file name instead. +#define LDBG_GET_DEFAULT_TYPE_OR_LEVEL(LEVEL_OR_TYPE) \ + [](auto LevelOrType) { \ + if constexpr (std::is_integral_v) { \ + constexpr const char *DebugType = LDBG_GET_DEBUG_TYPE_STR(); \ + if constexpr (DebugType[0] == '"') { \ + return ::llvm::impl::strip_quotes(DebugType); \ + } else { \ + return __LLVM_FILE_NAME__; \ + } \ + } else { \ + return 1; \ + } \ + }(LEVEL_OR_TYPE) + +/// Helpers to get DEBUG_TYPE as a StringRef, even when DEBUG_TYPE is not +/// defined (in which case it expands to "DEBUG_TYPE") +#define LDBG_GET_DEBUG_TYPE_STR__(X) #X +#define LDBG_GET_DEBUG_TYPE_STR_(X) LDBG_GET_DEBUG_TYPE_STR__(X) +#define LDBG_GET_DEBUG_TYPE_STR() LDBG_GET_DEBUG_TYPE_STR_(DEBUG_TYPE) + +/// Helper to call isCurrentDebugType with a StringRef. +static LLVM_ATTRIBUTE_UNUSED bool ldbgIsCurrentDebugType(StringRef Type, + int Level) { + return ::llvm::isCurrentDebugType(Type.str().c_str(), Level); +} +static LLVM_ATTRIBUTE_UNUSED bool ldbgIsCurrentDebugType(int Level, + StringRef Type) { + return ::llvm::isCurrentDebugType(Type.str().c_str(), Level); +} /// A raw_ostream that tracks `\n` and print the prefix after each /// newline. @@ -99,6 +236,7 @@ class LLVM_ABI raw_ldbg_ostream final : public raw_ostream { std::string Prefix; raw_ostream &Os; bool ShouldPrefixNextString; + bool ShouldEmitNewLineOnDestruction; /// Split the line on newlines and insert the prefix before each /// newline. Forward everything to the underlying stream. 
@@ -131,12 +269,17 @@ class LLVM_ABI raw_ldbg_ostream final : public raw_ostream { public: explicit raw_ldbg_ostream(std::string Prefix, raw_ostream &Os, - bool ShouldPrefixNextString = true) + bool ShouldPrefixNextString = true, + bool ShouldEmitNewLineOnDestruction = false) : Prefix(std::move(Prefix)), Os(Os), - ShouldPrefixNextString(ShouldPrefixNextString) { + ShouldPrefixNextString(ShouldPrefixNextString), + ShouldEmitNewLineOnDestruction(ShouldEmitNewLineOnDestruction) { SetUnbuffered(); } - ~raw_ldbg_ostream() final {} + ~raw_ldbg_ostream() final { + if (ShouldEmitNewLineOnDestruction) + Os << '\n'; + } /// Forward the current_pos method to the underlying stream. uint64_t current_pos() const final { return Os.tell(); } @@ -173,17 +316,18 @@ getShortFileName(const char *path) { /// "[DebugType] File:Line " /// Where the File is the file name without the path prefix. static LLVM_ATTRIBUTE_UNUSED std::string -computePrefix(const char *DebugType, const char *File, int Line, int Level) { +computePrefix(StringRef DebugType, const char *File, int Line, int Level) { std::string Prefix; raw_string_ostream OsPrefix(Prefix); - if (DebugType) - OsPrefix << "[" << DebugType << ":" << Level << "] "; - OsPrefix << File << ":" << Line << " "; + OsPrefix << "["; + if (!DebugType.empty() && DebugType != File) + OsPrefix << DebugType << " "; + OsPrefix << File << ":" << Line << " " << Level << "] "; return OsPrefix.str(); } /// Overload allowing to swap the order of the DebugType and Level arguments. static LLVM_ATTRIBUTE_UNUSED std::string -computePrefix(int Level, const char *File, int Line, const char *DebugType) { +computePrefix(int Level, const char *File, int Line, StringRef DebugType) { return computePrefix(DebugType, File, Line, Level); } @@ -194,6 +338,7 @@ computePrefix(int Level, const char *File, int Line, const char *DebugType) { #define LDBG(...) \ for (bool _c = false; _c; _c = false) \ ::llvm::nulls() +#define LDBG_OS(...) 
#endif } // end namespace llvm diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h index 02a3194e09784..7eb1d7e8dfe7f 100644 --- a/llvm/include/llvm/Support/Endian.h +++ b/llvm/include/llvm/Support/Endian.h @@ -96,9 +96,8 @@ inline void write(void *memory, value_type value, endianness endian) { &value, sizeof(value_type)); } -template +template +LLVM_DEPRECATED("Pass endian as a function argument instead", "write") inline void write(void *memory, value_type value) { write(memory, value, endian); } @@ -163,7 +162,7 @@ inline void writeAtBitAlignment(void *memory, value_type value, uint64_t startBit) { assert(startBit < 8); if (startBit == 0) - write(memory, value); + write(memory, value, endian); else { // Read two values and shift the result into them. value_type val[2]; @@ -230,8 +229,8 @@ struct packed_endian_specific_integral { operator value_type() const { return value(); } void operator=(value_type newValue) { - endian::write( - (void*)Value.buffer, newValue); + endian::write((void *)Value.buffer, newValue, + endian); } packed_endian_specific_integral &operator+=(value_type newValue) { @@ -268,7 +267,7 @@ struct packed_endian_specific_integral { } void operator=(value_type NewValue) { - endian::write(Ptr, NewValue); + endian::write(Ptr, NewValue, endian); } private: diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index a21b0a272d2b0..c203779307840 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -410,6 +410,11 @@ LLVM_ABI std::error_code copy_file(const Twine &From, int ToFD); /// platform-specific error_code. LLVM_ABI std::error_code resize_file(int FD, uint64_t Size); +/// Resize path to size with sparse files explicitly enabled. It uses +/// FSCTL_SET_SPARSE On Windows. 
This is the same as resize_file on +/// non-Windows +LLVM_ABI std::error_code resize_file_sparse(int FD, uint64_t Size); + /// Resize \p FD to \p Size before mapping \a mapped_file_region::readwrite. On /// non-Windows, this calls \a resize_file(). On Windows, this is a no-op, /// since the subsequent mapping (via \c CreateFileMapping) automatically diff --git a/llvm/include/llvm/Support/FormatProviders.h b/llvm/include/llvm/Support/FormatProviders.h index b7d2e2e45f71f..3e0800e1efe6c 100644 --- a/llvm/include/llvm/Support/FormatProviders.h +++ b/llvm/include/llvm/Support/FormatProviders.h @@ -29,35 +29,31 @@ namespace support { namespace detail { template struct use_integral_formatter - : public std::integral_constant< - bool, is_one_of::value> {}; + : public std::bool_constant< + is_one_of::value> {}; template -struct use_char_formatter - : public std::integral_constant> {}; +struct use_char_formatter : public std::bool_constant> { +}; template struct is_cstring - : public std::integral_constant::value> { -}; + : public std::bool_constant::value> {}; template struct use_string_formatter - : public std::integral_constant> { -}; + : public std::bool_constant> {}; template struct use_pointer_formatter - : public std::integral_constant && - !is_cstring::value> {}; + : public std::bool_constant && !is_cstring::value> { +}; template struct use_double_formatter - : public std::integral_constant> {}; + : public std::bool_constant> {}; class HelperFunctions { protected: @@ -330,8 +326,7 @@ using IterValue = typename std::iterator_traits::value_type; template struct range_item_has_provider - : public std::integral_constant< - bool, + : public std::bool_constant< !support::detail::uses_missing_provider>::value> {}; } // namespace detail } // namespace support diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h index b85a4f6065195..fa11d56fc1ada 100644 --- a/llvm/include/llvm/Support/FormatVariadicDetails.h +++ 
b/llvm/include/llvm/Support/FormatVariadicDetails.h @@ -66,13 +66,10 @@ template class has_FormatProvider { typedef void (*Signature_format)(const Decayed &, llvm::raw_ostream &, StringRef); - template - static char test(SameType *); - - template static double test(...); + template using check = SameType; - static bool const value = - (sizeof(test>(nullptr)) == 1); + static constexpr bool value = + llvm::is_detected>::value; }; // Test if raw_ostream& << T -> raw_ostream& is findable via ADL. @@ -96,26 +93,24 @@ template class has_StreamOperator { // based format() invocation. template struct uses_format_member - : public std::integral_constant< - bool, std::is_base_of_v>> { -}; + : public std::bool_constant< + std::is_base_of_v>> {}; // Simple template that decides whether a type T should use the format_provider // based format() invocation. The member function takes priority, so this test // will only be true if there is not ALSO a format member. template struct uses_format_provider - : public std::integral_constant< - bool, !uses_format_member::value && has_FormatProvider::value> { -}; + : public std::bool_constant::value && + has_FormatProvider::value> {}; // Simple template that decides whether a type T should use the operator<< // based format() invocation. This takes last priority. template struct uses_stream_operator - : public std::integral_constant::value && - !uses_format_provider::value && - has_StreamOperator::value> {}; + : public std::bool_constant::value && + !uses_format_provider::value && + has_StreamOperator::value> {}; // Simple template that decides whether a type T has neither a member-function // nor format_provider based implementation that it can use. Mostly used so @@ -123,10 +118,9 @@ struct uses_stream_operator // implementation can be located. 
template struct uses_missing_provider - : public std::integral_constant::value && - !uses_format_provider::value && - !uses_stream_operator::value> { -}; + : public std::bool_constant::value && + !uses_format_provider::value && + !uses_stream_operator::value> {}; template std::enable_if_t::value, T> diff --git a/llvm/include/llvm/Support/HashBuilder.h b/llvm/include/llvm/Support/HashBuilder.h index 097110874400d..ae266d3f19a1a 100644 --- a/llvm/include/llvm/Support/HashBuilder.h +++ b/llvm/include/llvm/Support/HashBuilder.h @@ -32,8 +32,7 @@ namespace hashbuilder_detail { /// Trait to indicate whether a type's bits can be hashed directly (after /// endianness correction). template -struct IsHashableData - : std::integral_constant::value> {}; +struct IsHashableData : std::bool_constant::value> {}; } // namespace hashbuilder_detail @@ -366,18 +365,16 @@ class HashBuilder : public HashBuilderBase { HashBuilder &addRangeElementsImpl(ForwardIteratorT First, ForwardIteratorT Last, std::forward_iterator_tag) { - for (auto It = First; It != Last; ++It) - add(*It); - return *this; - } - - template - std::enable_if_t::value && - Endianness == llvm::endianness::native, - HashBuilder &> - addRangeElementsImpl(T *First, T *Last, std::forward_iterator_tag) { - this->update(ArrayRef(reinterpret_cast(First), - (Last - First) * sizeof(T))); + using T = typename std::iterator_traits::value_type; + if constexpr (std::is_pointer_v && + hashbuilder_detail::IsHashableData::value && + Endianness == llvm::endianness::native) { + this->update(ArrayRef(reinterpret_cast(First), + (Last - First) * sizeof(T))); + } else { + for (auto It = First; It != Last; ++It) + add(*It); + } return *this; } }; diff --git a/mlir/include/mlir/Tools/lsp-server-support/Logging.h b/llvm/include/llvm/Support/LSP/Logging.h similarity index 55% rename from mlir/include/mlir/Tools/lsp-server-support/Logging.h rename to llvm/include/llvm/Support/LSP/Logging.h index 9b090d05f7fa4..fe65899b1d4ce 100644 --- 
a/mlir/include/mlir/Tools/lsp-server-support/Logging.h +++ b/llvm/include/llvm/Support/LSP/Logging.h @@ -1,4 +1,4 @@ -//===- Logging.h - MLIR LSP Server Logging ----------------------*- C++ -*-===// +//===- Logging.h - LSP Server Logging ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,16 +6,15 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_TOOLS_LSPSERVERSUPPORT_LOGGING_H -#define MLIR_TOOLS_LSPSERVERSUPPORT_LOGGING_H +#ifndef LLVM_SUPPORT_LSP_LOGGING_H +#define LLVM_SUPPORT_LSP_LOGGING_H -#include "mlir/Support/LLVM.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include #include -namespace mlir { +namespace llvm { namespace lsp { /// This class represents the main interface for logging, and allows for @@ -26,21 +25,18 @@ class Logger { enum class Level { Debug, Info, Error }; /// Set the severity level of the logger. - static void setLogLevel(Level logLevel); + static void setLogLevel(Level LogLevel); /// Initiate a log message at various severity levels. These should be called /// after a call to `initialize`. 
- template - static void debug(const char *fmt, Ts &&...vals) { - log(Level::Debug, fmt, llvm::formatv(fmt, std::forward(vals)...)); + template static void debug(const char *Fmt, Ts &&...Vals) { + log(Level::Debug, Fmt, llvm::formatv(Fmt, std::forward(Vals)...)); } - template - static void info(const char *fmt, Ts &&...vals) { - log(Level::Info, fmt, llvm::formatv(fmt, std::forward(vals)...)); + template static void info(const char *Fmt, Ts &&...Vals) { + log(Level::Info, Fmt, llvm::formatv(Fmt, std::forward(Vals)...)); } - template - static void error(const char *fmt, Ts &&...vals) { - log(Level::Error, fmt, llvm::formatv(fmt, std::forward(vals)...)); + template static void error(const char *Fmt, Ts &&...Vals) { + log(Level::Error, Fmt, llvm::formatv(Fmt, std::forward(Vals)...)); } private: @@ -50,16 +46,16 @@ class Logger { static Logger &get(); /// Start a log message with the given severity level. - static void log(Level logLevel, const char *fmt, - const llvm::formatv_object_base &message); + static void log(Level LogLevel, const char *Fmt, + const llvm::formatv_object_base &Message); /// The minimum logging level. Messages with lower level are ignored. - Level logLevel = Level::Error; + Level LogLevel = Level::Error; /// A mutex used to guard logging. 
- std::mutex mutex; + std::mutex Mutex; }; } // namespace lsp -} // namespace mlir +} // namespace llvm -#endif // MLIR_TOOLS_LSPSERVERSUPPORT_LOGGING_H +#endif // LLVM_SUPPORT_LSP_LOGGING_H diff --git a/mlir/include/mlir/Tools/lsp-server-support/Protocol.h b/llvm/include/llvm/Support/LSP/Protocol.h similarity index 98% rename from mlir/include/mlir/Tools/lsp-server-support/Protocol.h rename to llvm/include/llvm/Support/LSP/Protocol.h index cc06dbfedb42a..93b82f1e581f8 100644 --- a/mlir/include/mlir/Tools/lsp-server-support/Protocol.h +++ b/llvm/include/llvm/Support/LSP/Protocol.h @@ -20,20 +20,24 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_TOOLS_LSPSERVERSUPPORT_PROTOCOL_H -#define MLIR_TOOLS_LSPSERVERSUPPORT_PROTOCOL_H +#ifndef LLVM_SUPPORT_LSP_PROTOCOL_H +#define LLVM_SUPPORT_LSP_PROTOCOL_H -#include "mlir/Support/LLVM.h" #include "llvm/Support/JSON.h" +#include "llvm/Support/LogicalResult.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include -#include -namespace mlir { +// This file is using the LSP syntax for identifier names which is different +// from the LLVM coding standard. To avoid the clang-tidy warnings, we're +// disabling one check here. 
+// NOLINTBEGIN(readability-identifier-naming) + +namespace llvm { namespace lsp { enum class ErrorCode { @@ -1241,12 +1245,11 @@ struct CodeAction { llvm::json::Value toJSON(const CodeAction &); } // namespace lsp -} // namespace mlir +} // namespace llvm namespace llvm { -template <> -struct format_provider { - static void format(const mlir::lsp::Position &pos, raw_ostream &os, +template <> struct format_provider { + static void format(const llvm::lsp::Position &pos, raw_ostream &os, StringRef style) { assert(style.empty() && "style modifiers for this type are not supported"); os << pos; @@ -1255,3 +1258,5 @@ struct format_provider { } // namespace llvm #endif + +// NOLINTEND(readability-identifier-naming) diff --git a/llvm/include/llvm/Support/LSP/Transport.h b/llvm/include/llvm/Support/LSP/Transport.h new file mode 100644 index 0000000000000..ccd7f213aa277 --- /dev/null +++ b/llvm/include/llvm/Support/LSP/Transport.h @@ -0,0 +1,289 @@ +//===--- Transport.h - Sending and Receiving LSP messages -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The language server protocol is usually implemented by writing messages as +// JSON-RPC over the stdin/stdout of a subprocess. This file contains a JSON +// transport interface that handles this communication. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_LSP_TRANSPORT_H +#define LLVM_SUPPORT_LSP_TRANSPORT_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatAdapters.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/LSP/Logging.h" +#include "llvm/Support/LSP/Protocol.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm { +// Simple helper function that returns a string as printed from a op. +template static std::string debugString(T &&Op) { + std::string InstrStr; + llvm::raw_string_ostream Os(InstrStr); + Os << Op; + return Os.str(); +} +namespace lsp { +class MessageHandler; + +//===----------------------------------------------------------------------===// +// JSONTransport +//===----------------------------------------------------------------------===// + +/// The encoding style of the JSON-RPC messages (both input and output). +enum JSONStreamStyle { + /// Encoding per the LSP specification, with mandatory Content-Length header. + Standard, + /// Messages are delimited by a '// -----' line. Comment lines start with //. + Delimited +}; + +/// An abstract class used by the JSONTransport to read JSON message. +class JSONTransportInput { +public: + explicit JSONTransportInput(JSONStreamStyle Style = JSONStreamStyle::Standard) + : Style(Style) {} + virtual ~JSONTransportInput() = default; + + virtual bool hasError() const = 0; + virtual bool isEndOfInput() const = 0; + + /// Read in a message from the input stream. + LogicalResult readMessage(std::string &Json) { + return Style == JSONStreamStyle::Delimited ? readDelimitedMessage(Json) + : readStandardMessage(Json); + } + virtual LogicalResult readDelimitedMessage(std::string &Json) = 0; + virtual LogicalResult readStandardMessage(std::string &Json) = 0; + +private: + /// The JSON stream style to use. 
+ JSONStreamStyle Style; +}; + +/// Concrete implementation of the JSONTransportInput that reads from a file. +class JSONTransportInputOverFile : public JSONTransportInput { +public: + explicit JSONTransportInputOverFile( + std::FILE *In, JSONStreamStyle Style = JSONStreamStyle::Standard) + : JSONTransportInput(Style), In(In) {} + + bool hasError() const final { return ferror(In); } + bool isEndOfInput() const final { return feof(In); } + + LogicalResult readDelimitedMessage(std::string &Json) final; + LogicalResult readStandardMessage(std::string &Json) final; + +private: + std::FILE *In; +}; + +/// A transport class that performs the JSON-RPC communication with the LSP +/// client. +class JSONTransport { +public: + JSONTransport(std::unique_ptr In, raw_ostream &Out, + bool PrettyOutput = false) + : In(std::move(In)), Out(Out), PrettyOutput(PrettyOutput) {} + + JSONTransport(std::FILE *In, raw_ostream &Out, + JSONStreamStyle Style = JSONStreamStyle::Standard, + bool PrettyOutput = false) + : In(std::make_unique(In, Style)), Out(Out), + PrettyOutput(PrettyOutput) {} + + /// The following methods are used to send a message to the LSP client. + void notify(StringRef Method, llvm::json::Value Params); + void call(StringRef Method, llvm::json::Value Params, llvm::json::Value Id); + void reply(llvm::json::Value Id, llvm::Expected Result); + + /// Start executing the JSON-RPC transport. + llvm::Error run(MessageHandler &Handler); + +private: + /// Dispatches the given incoming json message to the message handler. + bool handleMessage(llvm::json::Value Msg, MessageHandler &Handler); + /// Writes the given message to the output stream. + void sendMessage(llvm::json::Value Msg); + +private: + /// The input to read a message from. + std::unique_ptr In; + SmallVector OutputBuffer; + /// The output file stream. + raw_ostream &Out; + /// If the output JSON should be formatted for easier readability. 
+ bool PrettyOutput; +}; + +//===----------------------------------------------------------------------===// +// MessageHandler +//===----------------------------------------------------------------------===// + +/// A Callback is a void function that accepts Expected. This is +/// accepted by functions that logically return T. +template +using Callback = llvm::unique_function)>; + +/// An OutgoingNotification is a function used for outgoing notifications +/// send to the client. +template +using OutgoingNotification = llvm::unique_function; + +/// An OutgoingRequest is a function used for outgoing requests to send to +/// the client. +template +using OutgoingRequest = + llvm::unique_function; + +/// An `OutgoingRequestCallback` is invoked when an outgoing request to the +/// client receives a response in turn. It is passed the original request's ID, +/// as well as the response result. +template +using OutgoingRequestCallback = + std::function)>; + +/// A handler used to process the incoming transport messages. +class MessageHandler { +public: + MessageHandler(JSONTransport &Transport) : Transport(Transport) {} + + bool onNotify(StringRef Method, llvm::json::Value Value); + bool onCall(StringRef Method, llvm::json::Value Params, llvm::json::Value Id); + bool onReply(llvm::json::Value Id, llvm::Expected Result); + + template + static llvm::Expected parse(const llvm::json::Value &Raw, + StringRef PayloadName, StringRef PayloadKind) { + T Result; + llvm::json::Path::Root Root; + if (fromJSON(Raw, Result, Root)) + return std::move(Result); + + // Dump the relevant parts of the broken message. + std::string Context; + llvm::raw_string_ostream Os(Context); + Root.printErrorContext(Raw, Os); + + // Report the error (e.g. to the client). 
+ return llvm::make_error( + llvm::formatv("failed to decode {0} {1}: {2}", PayloadName, PayloadKind, + fmt_consume(Root.getError())), + ErrorCode::InvalidParams); + } + + template + void method(llvm::StringLiteral Method, ThisT *ThisPtr, + void (ThisT::*Handler)(const Param &, Callback)) { + MethodHandlers[Method] = [Method, Handler, + ThisPtr](llvm::json::Value RawParams, + Callback Reply) { + llvm::Expected Parameter = + parse(RawParams, Method, "request"); + if (!Parameter) + return Reply(Parameter.takeError()); + (ThisPtr->*Handler)(*Parameter, std::move(Reply)); + }; + } + + template + void notification(llvm::StringLiteral Method, ThisT *ThisPtr, + void (ThisT::*Handler)(const Param &)) { + NotificationHandlers[Method] = [Method, Handler, + ThisPtr](llvm::json::Value RawParams) { + llvm::Expected Parameter = + parse(RawParams, Method, "notification"); + if (!Parameter) { + return llvm::consumeError(llvm::handleErrors( + Parameter.takeError(), [](const LSPError &LspError) { + Logger::error("JSON parsing error: {0}", + LspError.message.c_str()); + })); + } + (ThisPtr->*Handler)(*Parameter); + }; + } + + /// Create an OutgoingNotification object used for the given method. + template + OutgoingNotification outgoingNotification(llvm::StringLiteral Method) { + return [&, Method](const T &Params) { + std::lock_guard TransportLock(TransportOutputMutex); + Logger::info("--> {0}", Method); + Transport.notify(Method, llvm::json::Value(Params)); + }; + } + + /// Create an OutgoingRequest function that, when called, sends a request with + /// the given method via the transport. Should the outgoing request be + /// met with a response, the result JSON is parsed and the response callback + /// is invoked. 
+ template + OutgoingRequest + outgoingRequest(llvm::StringLiteral Method, + OutgoingRequestCallback Callback) { + return [&, Method, Callback](const Param &Parameter, llvm::json::Value Id) { + auto CallbackWrapper = [Method, Callback = std::move(Callback)]( + llvm::json::Value Id, + llvm::Expected Value) { + if (!Value) + return Callback(std::move(Id), Value.takeError()); + + std::string ResponseName = llvm::formatv("reply:{0}({1})", Method, Id); + llvm::Expected ParseResult = + parse(*Value, ResponseName, "response"); + if (!ParseResult) + return Callback(std::move(Id), ParseResult.takeError()); + + return Callback(std::move(Id), *ParseResult); + }; + + { + std::lock_guard Lock(ResponseHandlersMutex); + ResponseHandlers.insert( + {debugString(Id), std::make_pair(Method.str(), CallbackWrapper)}); + } + + std::lock_guard TransportLock(TransportOutputMutex); + Logger::info("--> {0}({1})", Method, Id); + Transport.call(Method, llvm::json::Value(Parameter), Id); + }; + } + +private: + template + using HandlerMap = llvm::StringMap>; + + HandlerMap NotificationHandlers; + HandlerMap)> + MethodHandlers; + + /// A pair of (1) the original request's method name, and (2) the callback + /// function to be invoked for responses. + using ResponseHandlerTy = + std::pair>; + /// A mapping from request/response ID to response handler. + llvm::StringMap ResponseHandlers; + /// Mutex to guard insertion into the response handler map. + std::mutex ResponseHandlersMutex; + + JSONTransport &Transport; + + /// Mutex to guard sending output messages to the transport. 
+ std::mutex TransportOutputMutex; +}; + +} // namespace lsp +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index d97677305a39f..8007f3c853f20 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -1114,14 +1114,10 @@ class LLVM_ABI RedirectingFileSystem }; /// Collect all pairs of entries from the -/// \p YAMLFilePath. This is used by the module dependency collector to forward +/// \p VFS. This is used by the module dependency collector to forward /// the entries into the reproducer output VFS YAML file. -LLVM_ABI void collectVFSFromYAML( - std::unique_ptr Buffer, - llvm::SourceMgr::DiagHandlerTy DiagHandler, StringRef YAMLFilePath, - SmallVectorImpl &CollectedEntries, - void *DiagContext = nullptr, - IntrusiveRefCntPtr ExternalFS = getRealFileSystem()); +void collectVFSEntries(RedirectingFileSystem &VFS, + SmallVectorImpl &CollectedEntries); class YAMLVFSWriter { std::vector Mappings; diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h index 27af2d60c837f..81e3e2e41e86d 100644 --- a/llvm/include/llvm/Support/YAMLTraits.h +++ b/llvm/include/llvm/Support/YAMLTraits.h @@ -442,15 +442,8 @@ template struct has_CustomMappingTraits { is_detected>::value; }; -// has_FlowTraits will cause an error with some compilers because -// it subclasses int. Using this wrapper only instantiates the -// real has_FlowTraits only if the template type is a class. -template > class has_FlowTraits { -public: - static constexpr bool value = false; -}; - -template struct has_FlowTraits { +// Test if flow is defined on type T. 
+template struct has_FlowTraits { template using check = decltype(&U::flow); static constexpr bool value = is_detected::value; @@ -459,8 +452,7 @@ template struct has_FlowTraits { // Test if SequenceTraits is defined on type T template struct has_SequenceTraits - : public std::integral_constant::value> { -}; + : public std::bool_constant::value> {}; // Test if DocumentListTraits is defined on type T template struct has_DocumentListTraits { @@ -669,29 +661,27 @@ inline QuotingType needsQuotes(StringRef S, bool ForcePreserveAsString = true) { template struct missingTraits - : public std::integral_constant::value && - !has_ScalarBitSetTraits::value && - !has_ScalarTraits::value && - !has_BlockScalarTraits::value && - !has_TaggedScalarTraits::value && - !has_MappingTraits::value && - !has_SequenceTraits::value && - !has_CustomMappingTraits::value && - !has_DocumentListTraits::value && - !has_PolymorphicTraits::value> {}; + : public std::bool_constant< + !has_ScalarEnumerationTraits::value && + !has_ScalarBitSetTraits::value && !has_ScalarTraits::value && + !has_BlockScalarTraits::value && + !has_TaggedScalarTraits::value && + !has_MappingTraits::value && + !has_SequenceTraits::value && !has_CustomMappingTraits::value && + !has_DocumentListTraits::value && + !has_PolymorphicTraits::value> {}; template struct validatedMappingTraits - : public std::integral_constant< - bool, has_MappingTraits::value && - has_MappingValidateTraits::value> {}; + : public std::bool_constant::value && + has_MappingValidateTraits::value> { +}; template struct unvalidatedMappingTraits - : public std::integral_constant< - bool, has_MappingTraits::value && - !has_MappingValidateTraits::value> {}; + : public std::bool_constant::value && + !has_MappingValidateTraits::value> { +}; // Base class for Input and Output. 
class LLVM_ABI IO { diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index c8fa482a9a4f4..f9b4fc3aa2010 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -1328,6 +1328,10 @@ class Triple { const VersionTuple &Version); LLVM_ABI ExceptionHandling getDefaultExceptionHandling() const; + + /// Compute the LLVM IR data layout string based on the triple. Some targets + /// customize the layout based on the ABIName string. + LLVM_ABI std::string computeDataLayout(StringRef ABIName = "") const; }; } // End llvm namespace diff --git a/llvm/include/llvm/Transforms/Utils/DebugSSAUpdater.h b/llvm/include/llvm/Transforms/Utils/DebugSSAUpdater.h new file mode 100644 index 0000000000000..2d25ce3245793 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/DebugSSAUpdater.h @@ -0,0 +1,365 @@ +//===- DebugSSAUpdater.h - Debug SSA Update Tool ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the DebugSSAUpdater class, which is used to evaluate the +// live values of debug variables in IR. This uses SSA construction, treating +// debug value records as definitions, to determine at each point in the program +// which definition(s) are live at a given point. This is useful for analysis of +// the state of debug variables, such as measuring the change in values of a +// variable over time, or calculating coverage stats. +// +// NB: This is an expensive analysis that is generally not suitable for use in +// LLVM passes, but may be useful for standalone tools. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_DEBUGSSAUPDATER_H +#define LLVM_TRANSFORMS_UTILS_DEBUGSSAUPDATER_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugProgramInstruction.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/IR/ValueMap.h" +#include + +namespace llvm { + +//////////////////////////////////////// +// SSAUpdater specialization classes + +class DbgSSAPhi; +template class SSAUpdaterTraits; + +/// A definition of a variable; can represent either a debug value, no +/// definition (the variable has not yet been defined), or a phi value*. +/// *Meaning multiple definitions that are live-in to a block from different +/// predecessors, not a debug value that uses an IR PHINode. +struct DbgValueDef { + DbgSSAPhi *Phi; + bool IsUndef; + bool IsMemory; + Metadata *Locations; + DIExpression *Expression; + + DbgValueDef() + : Phi(nullptr), IsUndef(true), IsMemory(false), Locations(nullptr), + Expression(nullptr) {} + DbgValueDef(int) + : Phi(nullptr), IsUndef(true), IsMemory(false), Locations(nullptr), + Expression(nullptr) {} + DbgValueDef(bool IsMemory, Metadata *Locations, DIExpression *Expression) + : Phi(nullptr), IsUndef(false), IsMemory(IsMemory), Locations(Locations), + Expression(Expression) {} + DbgValueDef(DbgVariableRecord *DVR) : Phi(nullptr) { + assert(!DVR->isDbgAssign() && "#dbg_assign not yet supported"); + IsUndef = DVR->isKillLocation(); + IsMemory = DVR->isAddressOfVariable(); + Locations = DVR->getRawLocation(); + Expression = DVR->getExpression(); + } + DbgValueDef(DbgSSAPhi *Phi) + : Phi(Phi), IsUndef(false), IsMemory(false), Locations(nullptr), + Expression(nullptr) {} + + bool agreesWith(DbgValueDef Other) const { + if (IsUndef && Other.IsUndef) + return true; + return std::tie(Phi, IsUndef, IsMemory, Locations, 
Expression) == + std::tie(Other.Phi, Other.IsUndef, Other.IsMemory, Other.Locations, + Other.Expression); + } + + operator bool() const { return !IsUndef; } + bool operator==(DbgValueDef Other) const { return agreesWith(Other); } + bool operator!=(DbgValueDef Other) const { return !agreesWith(Other); } + + void print(raw_ostream &OS) const; +}; + +class DbgSSABlock; +class DebugSSAUpdater; + +/// Represents the live-in definitions of a variable to a block with multiple +/// predecessors. +class DbgSSAPhi { +public: + SmallVector, 4> IncomingValues; + DbgSSABlock *ParentBlock; + DbgSSAPhi(DbgSSABlock *ParentBlock) : ParentBlock(ParentBlock) {} + + DbgSSABlock *getParent() { return ParentBlock; } + unsigned getNumIncomingValues() const { return IncomingValues.size(); } + DbgSSABlock *getIncomingBlock(size_t Idx) { + return IncomingValues[Idx].first; + } + DbgValueDef getIncomingValue(size_t Idx) { + return IncomingValues[Idx].second; + } + void addIncoming(DbgSSABlock *BB, DbgValueDef DV) { + IncomingValues.push_back({BB, DV}); + } + + void print(raw_ostream &OS) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const DbgValueDef &DV) { + DV.print(OS); + return OS; +} +inline raw_ostream &operator<<(raw_ostream &OS, const DbgSSAPhi &PHI) { + PHI.print(OS); + return OS; +} + +/// Thin wrapper around a block successor iterator. +class DbgSSABlockSuccIterator { +public: + succ_iterator SuccIt; + DebugSSAUpdater &Updater; + + DbgSSABlockSuccIterator(succ_iterator SuccIt, DebugSSAUpdater &Updater) + : SuccIt(SuccIt), Updater(Updater) {} + + bool operator!=(const DbgSSABlockSuccIterator &OtherIt) const { + return OtherIt.SuccIt != SuccIt; + } + + DbgSSABlockSuccIterator &operator++() { + ++SuccIt; + return *this; + } + + DbgSSABlock *operator*(); +}; + +/// Thin wrapper around a block successor iterator. 
+class DbgSSABlockPredIterator { +public: + pred_iterator PredIt; + DebugSSAUpdater &Updater; + + DbgSSABlockPredIterator(pred_iterator PredIt, DebugSSAUpdater &Updater) + : PredIt(PredIt), Updater(Updater) {} + + bool operator!=(const DbgSSABlockPredIterator &OtherIt) const { + return OtherIt.PredIt != PredIt; + } + + DbgSSABlockPredIterator &operator++() { + ++PredIt; + return *this; + } + + DbgSSABlock *operator*(); +}; + +class DbgSSABlock { +public: + BasicBlock &BB; + DebugSSAUpdater &Updater; + using PHIListT = SmallVector; + /// List of PHIs in this block. There should only ever be one, but this needs + /// to be a list for the SSAUpdater. + PHIListT PHIList; + + DbgSSABlock(BasicBlock &BB, DebugSSAUpdater &Updater) + : BB(BB), Updater(Updater) {} + + DbgSSABlockPredIterator pred_begin() { + return DbgSSABlockPredIterator(llvm::pred_begin(&BB), Updater); + } + + DbgSSABlockPredIterator pred_end() { + return DbgSSABlockPredIterator(llvm::pred_end(&BB), Updater); + } + + iterator_range predecessors() { + return iterator_range(pred_begin(), pred_end()); + } + + DbgSSABlockSuccIterator succ_begin() { + return DbgSSABlockSuccIterator(llvm::succ_begin(&BB), Updater); + } + + DbgSSABlockSuccIterator succ_end() { + return DbgSSABlockSuccIterator(llvm::succ_end(&BB), Updater); + } + + iterator_range successors() { + return iterator_range(succ_begin(), succ_end()); + } + + /// SSAUpdater has requested a PHI: create that within this block record. + DbgSSAPhi *newPHI() { + assert(PHIList.empty() && + "Only one PHI should exist per-block per-variable"); + PHIList.emplace_back(this); + return &PHIList.back(); + } + + /// SSAUpdater wishes to know what PHIs already exist in this block. + PHIListT &phis() { return PHIList; } +}; + +/// Class used to determine the live ranges of debug variables in IR using +/// SSA construction (via the SSAUpdaterImpl class), used for analysis purposes. 
+class DebugSSAUpdater { + friend class SSAUpdaterTraits; + using AvailableValsTy = DenseMap; + +private: + /// This keeps track of which value to use on a per-block basis. When we + /// insert PHI nodes, we keep track of them here. + AvailableValsTy AV; + + /// Pointer to an optionally-passed vector into which, if it is non-null, + /// the PHIs that describe ambiguous variable locations will be inserted. + SmallVectorImpl *InsertedPHIs; + + DenseMap BlockMap; + +public: + /// If InsertedPHIs is specified, it will be filled + /// in with all PHI Nodes created by rewriting. + explicit DebugSSAUpdater( + SmallVectorImpl *InsertedPHIs = nullptr); + DebugSSAUpdater(const DebugSSAUpdater &) = delete; + DebugSSAUpdater &operator=(const DebugSSAUpdater &) = delete; + + ~DebugSSAUpdater() { + for (auto &Block : BlockMap) + delete Block.second; + } + + void reset() { + for (auto &Block : BlockMap) + delete Block.second; + + if (InsertedPHIs) + InsertedPHIs->clear(); + BlockMap.clear(); + } + + void initialize(); + + /// For a given BB, create a wrapper block for it. Stores it in the + /// DebugSSAUpdater block map. + DbgSSABlock *getDbgSSABlock(BasicBlock *BB) { + auto it = BlockMap.find(BB); + if (it == BlockMap.end()) { + BlockMap[BB] = new DbgSSABlock(*BB, *this); + it = BlockMap.find(BB); + } + return it->second; + } + + /// Indicate that a rewritten value is available in the specified block + /// with the specified value. + void addAvailableValue(DbgSSABlock *BB, DbgValueDef DV); + + /// Return true if the DebugSSAUpdater already has a value for the specified + /// block. + bool hasValueForBlock(DbgSSABlock *BB) const; + + /// Return the value for the specified block if the DebugSSAUpdater has one, + /// otherwise return nullptr. + DbgValueDef findValueForBlock(DbgSSABlock *BB) const; + + /// Construct SSA form, materializing a value that is live at the end + /// of the specified block. 
+ DbgValueDef getValueAtEndOfBlock(DbgSSABlock *BB); + + /// Construct SSA form, materializing a value that is live in the + /// middle of the specified block. + /// + /// \c getValueInMiddleOfBlock is the same as \c GetValueAtEndOfBlock except + /// in one important case: if there is a definition of the rewritten value + /// after the 'use' in BB. Consider code like this: + /// + /// \code + /// X1 = ... + /// SomeBB: + /// use(X) + /// X2 = ... + /// br Cond, SomeBB, OutBB + /// \endcode + /// + /// In this case, there are two values (X1 and X2) added to the AvailableVals + /// set by the client of the rewriter, and those values are both live out of + /// their respective blocks. However, the use of X happens in the *middle* of + /// a block. Because of this, we need to insert a new PHI node in SomeBB to + /// merge the appropriate values, and this value isn't live out of the block. + DbgValueDef getValueInMiddleOfBlock(DbgSSABlock *BB); + +private: + DbgValueDef getValueAtEndOfBlockInternal(DbgSSABlock *BB); +}; + +struct DbgRangeEntry { + BasicBlock::iterator Start; + BasicBlock::iterator End; + // Should be non-PHI. + DbgValueDef Value; +}; + +/// Utility class used to store the names of SSA values after their owning +/// modules have been destroyed. Values are added via \c addValue to receive a +/// corresponding ID, which can then be used to retrieve the name of the SSA +/// value via \c getName at any point. Adding the same value multiple times +/// returns the same ID, making \c addValue idempotent. +class SSAValueNameMap { + struct Config : ValueMapConfig { + enum { FollowRAUW = false }; + }; + +public: + using ValueID = uint64_t; + ValueID addValue(Value *V); + std::string getName(ValueID ID) { return ValueIDToNameMap[ID]; } + +private: + DenseMap ValueIDToNameMap; + ValueMap ValueToIDMap; + ValueID NextID = 0; +}; + +/// Utility class used to find and store the live debug ranges for variables in +/// a module. 
This class uses the DebugSSAUpdater for each variable added with +/// \c addVariable to find either a single-location value, e.g. #dbg_declare, or +/// a set of live value ranges corresponding to the set of #dbg_value records. +class DbgValueRangeTable { + DenseMap> + OrigVariableValueRangeTable; + DenseMap OrigSingleLocVariableValueTable; + +public: + void addVariable(Function *F, DebugVariableAggregate DVA); + bool hasVariableEntry(DebugVariableAggregate DVA) const { + return OrigVariableValueRangeTable.contains(DVA) || + OrigSingleLocVariableValueTable.contains(DVA); + } + bool hasSingleLocEntry(DebugVariableAggregate DVA) const { + return OrigSingleLocVariableValueTable.contains(DVA); + } + ArrayRef getVariableRanges(DebugVariableAggregate DVA) { + return OrigVariableValueRangeTable[DVA]; + } + DbgValueDef getSingleLoc(DebugVariableAggregate DVA) { + return OrigSingleLocVariableValueTable[DVA]; + } + + void printValues(DebugVariableAggregate DVA, raw_ostream &OS); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_DEBUGSSAUPDATER_H diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 96e3d3d47f2d0..c5dbb2bdd1dd8 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -185,8 +185,7 @@ LLVM_ABI bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, TargetLibraryInfo *, Loop *, MemorySSAUpdater &, ScalarEvolution *, ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, - bool, bool AllowSpeculation, - bool HasCoroSuspendInst = false); + bool, bool AllowSpeculation); /// Return true if the induction variable \p IV in a Loop whose latch is /// \p LatchBlock would become dead if the exit test \p Cond were removed. 
@@ -323,22 +322,48 @@ LLVM_ABI TransformationMode hasLICMVersioningTransformation(const Loop *L); LLVM_ABI void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V = 0); -/// Returns a loop's estimated trip count based on branch weight metadata. -/// In addition if \p EstimatedLoopInvocationWeight is not null it is -/// initialized with weight of loop's latch leading to the exit. -/// Returns a valid positive trip count, saturated at UINT_MAX, or std::nullopt -/// when a meaningful estimate cannot be made. +/// Return either: +/// - \c std::nullopt, if the implementation is unable to handle the loop form +/// of \p L (e.g., \p L must have a latch block that controls the loop exit). +/// - The value of \c llvm.loop.estimated_trip_count from the loop metadata of +/// \p L, if that metadata is present. +/// - Else, a new estimate of the trip count from the latch branch weights of +/// \p L. +/// +/// An estimated trip count is always a valid positive trip count, saturated at +/// \c UINT_MAX. +/// +/// In addition, if \p EstimatedLoopInvocationWeight, then either: +/// - Set \c *EstimatedLoopInvocationWeight to the weight of the latch's branch +/// to the loop exit. +/// - Do not set it, and return \c std::nullopt, if the current implementation +/// cannot compute that weight (e.g., if \p L does not have a latch block that +/// controls the loop exit) or the weight is zero (because zero cannot be +/// used to compute new branch weights that reflect the estimated trip count). +/// +/// TODO: Eventually, once all passes have migrated away from setting branch +/// weights to indicate estimated trip counts, this function will drop the +/// \p EstimatedLoopInvocationWeight parameter. 
LLVM_ABI std::optional getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight = nullptr); -/// Set a loop's branch weight metadata to reflect that loop has \p -/// EstimatedTripCount iterations and \p EstimatedLoopInvocationWeight exits -/// through latch. Returns true if metadata is successfully updated, false -/// otherwise. Note that loop must have a latch block which controls loop exit -/// in order to succeed. -LLVM_ABI bool setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount, - unsigned EstimatedLoopInvocationWeight); +/// Set \c llvm.loop.estimated_trip_count with the value \p EstimatedTripCount +/// in the loop metadata of \p L. Return false if the implementation is unable +/// to handle the loop form of \p L (e.g., \p L must have a latch block that +/// controls the loop exit). Otherwise, return true. +/// +/// In addition, if \p EstimatedLoopInvocationWeight, set the branch weight +/// metadata of \p L to reflect that \p L has an estimated +/// \p EstimatedTripCount iterations and has \c *EstimatedLoopInvocationWeight +/// exit weight through the loop's latch. +/// +/// TODO: Eventually, once all passes have migrated away from setting branch +/// weights to indicate estimated trip counts, this function will drop the +/// \p EstimatedLoopInvocationWeight parameter. +LLVM_ABI bool setLoopEstimatedTripCount( + Loop *L, unsigned EstimatedTripCount, + std::optional EstimatedLoopInvocationWeight = std::nullopt); /// Check inner loop (L) backedge count is known to be invariant on all /// iterations of its outer loop. 
If the loop has no parent, this is trivially diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h index deb3d6c44ef09..4e7c97194cc59 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -189,7 +189,7 @@ class LibCallSimplifier { Value *optimizeMemSet(CallInst *CI, IRBuilderBase &B); Value *optimizeRealloc(CallInst *CI, IRBuilderBase &B); Value *optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc &Func); - Value *optimizeExistingHotColdNew(CallInst *CI, IRBuilderBase &B); + Value *maybeOptimizeNoBuiltinOperatorNew(CallInst *CI, IRBuilderBase &B); Value *optimizeWcslen(CallInst *CI, IRBuilderBase &B); Value *optimizeBCopy(CallInst *CI, IRBuilderBase &B); diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 3ec009ca4adde..f2dc25fa5dbf5 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -148,6 +148,18 @@ AliasResult AAResults::alias(const MemoryLocation &LocA, return Result; } +AliasResult AAResults::aliasErrno(const MemoryLocation &Loc, const Module *M) { + AliasResult Result = AliasResult::MayAlias; + + for (const auto &AA : AAs) { + Result = AA->aliasErrno(Loc, M); + if (Result != AliasResult::MayAlias) + break; + } + + return Result; +} + ModRefInfo AAResults::getModRefInfoMask(const MemoryLocation &Loc, bool IgnoreLocals) { SimpleAAQueryInfo AAQIP(*this); diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index de37c391cf254..f812809e5e0b5 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -951,7 +951,8 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call, return ModRefInfo::NoModRef; ModRefInfo ArgMR = ME.getModRef(IRMemLocation::ArgMem); - ModRefInfo OtherMR = ME.getWithoutLoc(IRMemLocation::ArgMem).getModRef(); + ModRefInfo 
ErrnoMR = ME.getModRef(IRMemLocation::ErrnoMem); + ModRefInfo OtherMR = ME.getModRef(IRMemLocation::Other); // An identified function-local object that does not escape can only be // accessed via call arguments. Reduce OtherMR (which includes accesses to @@ -997,6 +998,15 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call, } ModRefInfo Result = ArgMR | OtherMR; + + // Refine accesses to errno memory. + if ((ErrnoMR | Result) != Result) { + if (AAQI.AAR.aliasErrno(Loc, Call->getModule()) != AliasResult::NoAlias) { + // Exclusion conditions do not hold, this memory location may alias errno. + Result |= ErrnoMR; + } + } + if (!isModAndRefSet(Result)) return Result; @@ -1851,6 +1861,20 @@ AliasResult BasicAAResult::aliasCheckRecursive( return AliasResult::MayAlias; } +AliasResult BasicAAResult::aliasErrno(const MemoryLocation &Loc, + const Module *M) { + // There cannot be any alias with errno if the given memory location is an + // identified function-local object, or the size of the memory access is + // larger than the integer size. + if (Loc.Size.hasValue() && + Loc.Size.getValue().getKnownMinValue() * 8 > TLI.getIntSize()) + return AliasResult::NoAlias; + + if (isIdentifiedFunctionLocal(getUnderlyingObject(Loc.Ptr))) + return AliasResult::NoAlias; + return AliasResult::MayAlias; +} + /// Check whether two Values can be considered equivalent. 
/// /// If the values may come from different cycle iterations, this will also diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 71ca5131ee128..a0fe7f9037e47 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -275,13 +275,6 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) { case Instruction::Call: case Instruction::Invoke: { auto *Call = cast(I); - // Not captured if the callee is readonly, doesn't return a copy through - // its return value and doesn't unwind or diverge (a readonly function can - // leak bits by throwing an exception or not depending on the input value). - if (Call->onlyReadsMemory() && Call->doesNotThrow() && Call->willReturn() && - Call->getType()->isVoidTy()) - return CaptureComponents::None; - // The pointer is not captured if returned pointer is not captured. // NOTE: CaptureTracking users should not assume that only functions // marked with nocapture do not capture. This means that places like @@ -305,10 +298,17 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) { if (Call->isCallee(&U)) return CaptureComponents::None; - // Not captured if only passed via 'nocapture' arguments. assert(Call->isDataOperand(&U) && "Non-callee must be data operand"); CaptureInfo CI = Call->getCaptureInfo(Call->getDataOperandNo(&U)); - return UseCaptureInfo(CI.getOtherComponents(), CI.getRetComponents()); + + // If the call is readonly and doesn't return a value, only the address + // may be captured. + CaptureComponents Mask = CaptureComponents::All; + if (Call->onlyReadsMemory() && Call->getType()->isVoidTy()) + Mask = CaptureComponents::Address; + + return UseCaptureInfo(CI.getOtherComponents() & Mask, + CI.getRetComponents()); } case Instruction::Load: // Volatile loads make the address observable. 
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 07c6ba8ae7d9e..a3b2e62a1b8ba 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -4253,9 +4253,9 @@ static Constant *ConstantFoldScalableVectorCall( return ConstantInt::getFalse(SVTy); } case Intrinsic::get_active_lane_mask: { - auto Op0 = cast(Operands[0])->getValue(); - auto Op1 = cast(Operands[1])->getValue(); - if (Op0.uge(Op1)) + auto *Op0 = dyn_cast(Operands[0]); + auto *Op1 = dyn_cast(Operands[1]); + if (Op0 && Op1 && Op0->getValue().uge(Op1->getValue())) return ConstantVector::getNullValue(SVTy); break; } diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index da86a8d2cc9c0..0f77a1410e83b 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -121,6 +121,12 @@ static cl::opt MIVMaxLevelThreshold( cl::desc("Maximum depth allowed for the recursive algorithm used to " "explore MIV direction vectors.")); +static cl::opt RunSIVRoutinesOnly( + "da-run-siv-routines-only", cl::init(false), cl::ReallyHidden, + cl::desc("Run only SIV routines and disable others (ZIV, RDIV, and MIV). 
" + "The purpose is mainly to exclude the influence of those routines " + "in regression tests for SIV routines.")); + //===----------------------------------------------------------------------===// // basics @@ -1980,6 +1986,8 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *SrcLoop, const Loop *DstLoop, FullDependence &Result) const { + if (RunSIVRoutinesOnly) + return false; LLVM_DEBUG(dbgs() << "\tExact RDIV test\n"); LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n"); LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n"); @@ -2124,6 +2132,8 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, const SCEV *C1, const SCEV *C2, const Loop *Loop1, const Loop *Loop2) const { + if (RunSIVRoutinesOnly) + return false; ++SymbolicRDIVapplications; LLVM_DEBUG(dbgs() << "\ttry symbolic RDIV test\n"); LLVM_DEBUG(dbgs() << "\t A1 = " << *A1); @@ -2433,6 +2443,8 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr, // to "a common divisor". 
bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, FullDependence &Result) const { + if (RunSIVRoutinesOnly) + return false; LLVM_DEBUG(dbgs() << "starting gcd\n"); ++GCDapplications; unsigned BitWidth = SE->getTypeSizeInBits(Src->getType()); @@ -2599,6 +2611,8 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, const SmallBitVector &Loops, FullDependence &Result) const { + if (RunSIVRoutinesOnly) + return false; LLVM_DEBUG(dbgs() << "starting Banerjee\n"); ++BanerjeeApplications; LLVM_DEBUG(dbgs() << " Src = " << *Src << '\n'); @@ -3698,8 +3712,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, unsigned Pairs = 1; SmallVector Pair(Pairs); - Pair[0].Src = SrcSCEV; - Pair[0].Dst = DstSCEV; + Pair[0].Src = SrcEv; + Pair[0].Dst = DstEv; if (Delinearize) { if (tryDelinearize(Src, Dst, Pair)) { @@ -3709,6 +3723,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, } for (unsigned P = 0; P < Pairs; ++P) { + assert(Pair[P].Src->getType()->isIntegerTy() && "Src must be an integer"); + assert(Pair[P].Dst->getType()->isIntegerTy() && "Dst must be an integer"); Pair[P].Loops.resize(MaxLevels + 1); Pair[P].GroupLoops.resize(MaxLevels + 1); Pair[P].Group.resize(Pairs); @@ -4111,8 +4127,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, SmallVector Pair(Pairs); const SCEV *SrcSCEV = SE->getSCEV(SrcPtr); const SCEV *DstSCEV = SE->getSCEV(DstPtr); - Pair[0].Src = SrcSCEV; - Pair[0].Dst = DstSCEV; + Pair[0].Src = SE->removePointerBase(SrcSCEV); + Pair[0].Dst = SE->removePointerBase(DstSCEV); if (Delinearize) { if (tryDelinearize(Src, Dst, Pair)) { @@ -4122,6 +4138,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, } for (unsigned P = 0; P < Pairs; ++P) { + assert(Pair[P].Src->getType()->isIntegerTy() && "Src must be an integer"); + assert(Pair[P].Dst->getType()->isIntegerTy() && "Dst must be an integer"); 
Pair[P].Loops.resize(MaxLevels + 1); Pair[P].GroupLoops.resize(MaxLevels + 1); Pair[P].Group.resize(Pairs); diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index ebe329aa1d5fe..100fa428cb842 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -1850,6 +1850,35 @@ static Value *simplifyAndOrOfFCmps(const SimplifyQuery &Q, FCmpInst *LHS, : ConstantInt::getBool(LHS->getType(), !IsAnd); } + Value *V0; + const APFloat *V0Op1, *V1Op1; + // (fcmp olt V0, V0Op1) || (fcmp olt V0, V1Op1) + // --> fcmp olt V0, max(V0Op1, V1Op1) + // (fcmp ogt V0, V0Op1) || (fcmp ogt V0, V1Op1) + // --> fcmp ogt V0, max(V0Op1, V1Op1) + // + // (fcmp olt V0, V0Op1) && (fcmp olt V0, V1Op1) + // --> fcmp olt V0, min(V0Op1, V1Op1) + // (fcmp ogt V0, V0Op1) && (fcmp ogt V0, V1Op1) + // --> fcmp ogt V0, min(V0Op1, V1Op1) + if (match(LHS, m_SpecificFCmp(FCmpInst::FCMP_OLT, m_Value(V0), + m_APFloat(V0Op1))) && + match(RHS, m_SpecificFCmp(FCmpInst::FCMP_OLT, m_Specific(V0), + m_APFloat(V1Op1)))) { + if (*V0Op1 > *V1Op1) + return IsAnd ? RHS : LHS; + if (*V1Op1 > *V0Op1) + return IsAnd ? LHS : RHS; + } else if (match(LHS, m_SpecificFCmp(FCmpInst::FCMP_OGT, m_Value(V0), + m_APFloat(V0Op1))) && + match(RHS, m_SpecificFCmp(FCmpInst::FCMP_OGT, m_Specific(V0), + m_APFloat(V1Op1)))) { + if (*V0Op1 < *V1Op1) + return IsAnd ? RHS : LHS; + if (*V1Op1 < *V0Op1) + return IsAnd ? LHS : RHS; + } + return nullptr; } @@ -6474,6 +6503,10 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, const CallBase *Call) { unsigned BitWidth = ReturnType->getScalarSizeInBits(); switch (IID) { + case Intrinsic::get_active_lane_mask: + if (match(Op1, m_Zero())) + return ConstantInt::getFalse(ReturnType); + break; case Intrinsic::abs: // abs(abs(x)) -> abs(x). We don't need to worry about the nsw arg here. // It is always ok to pick the earlier abs. 
We'll just lose nsw if its only diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 6ba6073cce950..a8c3173bb1794 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -58,26 +58,14 @@ static cl::opt // Loop implementation // -bool Loop::isLoopInvariant(const Value *V, bool HasCoroSuspendInst) const { - if (const Instruction *I = dyn_cast(V)) { - // FIXME: this is semantically inconsistent. We're tracking a proper fix in - // issue #149604. - // If V is a pointer to stack object and L contains a coro.suspend function - // call, then V may not be loop invariant because the ramp function and - // resume function have different stack frames. - if (HasCoroSuspendInst && isa(I)) - return false; - else - return !contains(I); - } +bool Loop::isLoopInvariant(const Value *V) const { + if (const Instruction *I = dyn_cast(V)) + return !contains(I); return true; // All non-instructions are loop invariant } -bool Loop::hasLoopInvariantOperands(const Instruction *I, - bool HasCoroSuspendInst) const { - return all_of(I->operands(), [&](Value *V) { - return isLoopInvariant(V, HasCoroSuspendInst); - }); +bool Loop::hasLoopInvariantOperands(const Instruction *I) const { + return all_of(I->operands(), [&](Value *V) { return isLoopInvariant(V); }); } bool Loop::makeLoopInvariant(Value *V, bool &Changed, Instruction *InsertPt, diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index ebb863076d2c5..079d7da5750b6 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -3230,14 +3230,15 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, match(Ops[1], m_scev_UDiv(m_SCEV(D), m_SCEVConstant(C2))) && C2->getAPInt().isPowerOf2() && C1V.logBase2() <= getMinTrailingZeros(D)) { - const SCEV *NewMul; + const SCEV *NewMul = nullptr; if (C1V.uge(C2->getAPInt())) { NewMul = getMulExpr(getUDivExpr(getConstant(C1V), C2), D); - } else { + } else if 
(C2->getAPInt().logBase2() <= getMinTrailingZeros(D)) { assert(C1V.ugt(1) && "C1 <= 1 should have been folded earlier"); NewMul = getUDivExpr(D, getUDivExpr(C2, getConstant(C1V))); } - return C1V == LHSC->getAPInt() ? NewMul : getNegativeSCEV(NewMul); + if (NewMul) + return C1V == LHSC->getAPInt() ? NewMul : getNegativeSCEV(NewMul); } } } @@ -15184,15 +15185,20 @@ void SCEVUnionPredicate::add(const SCEVPredicate *N, ScalarEvolution &SE) { return; } + // Implication checks are quadratic in the number of predicates. Stop doing + // them if there are many predicates, as they should be too expensive to use + // anyway at that point. + bool CheckImplies = Preds.size() < 16; + // Only add predicate if it is not already implied by this union predicate. - if (implies(N, SE)) + if (CheckImplies && implies(N, SE)) return; // Build a new vector containing the current predicates, except the ones that // are implied by the new predicate N. SmallVector PrunedPreds; for (auto *P : Preds) { - if (N->implies(P, SE)) + if (CheckImplies && N->implies(P, SE)) continue; PrunedPreds.push_back(P); } @@ -15457,6 +15463,12 @@ void ScalarEvolution::LoopGuards::collectFromPHI( const BasicBlock *InBlock = Phi.getIncomingBlock(IncomingIdx); if (!VisitedBlocks.insert(InBlock).second) return {nullptr, scCouldNotCompute}; + + // Avoid analyzing unreachable blocks so that we don't get trapped + // traversing cycles with ill-formed dominance or infinite cycles + if (!SE.DT.isReachableFromEntry(InBlock)) + return {nullptr, scCouldNotCompute}; + auto [G, Inserted] = IncomingGuards.try_emplace(InBlock, LoopGuards(SE)); if (Inserted) collectFromBlock(SE, G->second, Phi.getParent(), InBlock, VisitedBlocks, @@ -15511,6 +15523,9 @@ void ScalarEvolution::LoopGuards::collectFromBlock( ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, const BasicBlock *Block, const BasicBlock *Pred, SmallPtrSetImpl &VisitedBlocks, unsigned Depth) { + + assert(SE.DT.isReachableFromEntry(Block) && 
SE.DT.isReachableFromEntry(Pred)); + SmallVector ExprsToRewrite; auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 899806bf37348..09b50c5270e57 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1425,6 +1425,10 @@ bool TargetTransformInfo::preferEpilogueVectorization() const { return TTIImpl->preferEpilogueVectorization(); } +bool TargetTransformInfo::shouldConsiderVectorizationRegPressure() const { + return TTIImpl->shouldConsiderVectorizationRegPressure(); +} + TargetTransformInfo::VPLegalization TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const { return TTIImpl->getVPLegalizationStrategy(VPI); diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp index 7025b8354564a..c7d263a75b33a 100644 --- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -115,6 +115,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -385,6 +386,25 @@ AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA, return AliasResult::NoAlias; } +AliasResult TypeBasedAAResult::aliasErrno(const MemoryLocation &Loc, + const Module *M) { + if (!shouldUseTBAA()) + return AliasResult::MayAlias; + + const auto *N = Loc.AATags.TBAA; + if (!N) + return AliasResult::MayAlias; + + // There cannot be any alias with errno if TBAA proves the given memory + // location does not alias errno. 
+ const auto *ErrnoTBAAMD = M->getNamedMetadata("llvm.errno.tbaa"); + if (!ErrnoTBAAMD || any_of(ErrnoTBAAMD->operands(), [&](const auto *Node) { + return Aliases(N, Node); + })) + return AliasResult::MayAlias; + return AliasResult::NoAlias; +} + ModRefInfo TypeBasedAAResult::getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI, bool IgnoreLocals) { diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 129823e0e98a3..73192a75fa507 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -5070,6 +5070,11 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, KnownRHS.isKnownNeverPosZero()) && (KnownLHS.isKnownNeverPosZero() || KnownRHS.isKnownNeverNegZero()))) { + // Don't take sign bit from NaN operands. + if (!KnownLHS.isKnownNeverNaN()) + KnownLHS.SignBit = std::nullopt; + if (!KnownRHS.isKnownNeverNaN()) + KnownRHS.SignBit = std::nullopt; if ((IID == Intrinsic::maximum || IID == Intrinsic::maximumnum || IID == Intrinsic::maxnum) && (KnownLHS.SignBit == false || KnownRHS.SignBit == false)) diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index 8737dc0fc7459..f2ada27cac01d 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -281,7 +281,14 @@ bool MetadataVerifier::verifyKernel(msgpack::DocNode &Node) { return false; if (!verifyIntegerEntry(KernelMap, ".uniform_work_group_size", false)) return false; - + if (!verifyEntry( + KernelMap, ".cluster_dims", false, [this](msgpack::DocNode &Node) { + return verifyArray( + Node, + [this](msgpack::DocNode &Node) { return verifyInteger(Node); }, + 3); + })) + return false; return true; } diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp index 0d17dc175fed9..8b24044e19e50 100644 --- a/llvm/lib/BinaryFormat/Dwarf.cpp +++ b/llvm/lib/BinaryFormat/Dwarf.cpp @@ -911,6 
+911,27 @@ StringRef llvm::dwarf::RLEString(unsigned RLE) { } } +StringRef llvm::dwarf::AddressSpaceString(unsigned AS, const llvm::Triple &TT) { + switch (AS) { +#define HANDLE_DW_ASPACE(ID, NAME) \ + case DW_ASPACE_LLVM_##NAME: \ + return "DW_ASPACE_LLVM_" #NAME; +#define HANDLE_DW_ASPACE_PRED(ID, NAME, PRED) +#include "llvm/BinaryFormat/Dwarf.def" + default: + break; + } + + bool SELECT_AMDGPU = TT.isAMDGPU(); +#define HANDLE_DW_ASPACE(ID, NAME) +#define HANDLE_DW_ASPACE_PRED(ID, NAME, PRED) \ + if (DW_ASPACE_LLVM_##NAME == AS && PRED) \ + return "DW_ASPACE_LLVM_" #NAME; +#include "llvm/BinaryFormat/Dwarf.def" + + return ""; +} + StringRef (*const llvm::dwarf::EnumTraits::StringFn)(unsigned) = TagString; StringRef (*const llvm::dwarf::EnumTraits::StringFn)(unsigned) = AttributeString; diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index a5cedadd30981..22c7fa5f515ee 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -25,6 +25,7 @@ #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/Bitstream/BitstreamReader.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -58,9 +59,6 @@ #include #include #include -namespace llvm { -class Argument; -} using namespace llvm; @@ -83,8 +81,6 @@ static cl::opt DisableLazyLoading( namespace { -static int64_t unrotateSign(uint64_t U) { return (U & 1) ? ~(U >> 1) : U >> 1; } - class BitcodeReaderMetadataList { /// Array of metadata references. 
/// @@ -129,10 +125,7 @@ class BitcodeReaderMetadataList { void pop_back() { MetadataPtrs.pop_back(); } bool empty() const { return MetadataPtrs.empty(); } - Metadata *operator[](unsigned i) const { - assert(i < MetadataPtrs.size()); - return MetadataPtrs[i]; - } + Metadata *operator[](unsigned i) const { return MetadataPtrs[i]; } Metadata *lookup(unsigned I) const { if (I < MetadataPtrs.size()) @@ -178,6 +171,9 @@ class BitcodeReaderMetadataList { private: Metadata *resolveTypeRefArray(Metadata *MaybeTuple); }; +} // namespace + +static int64_t unrotateSign(uint64_t U) { return (U & 1) ? ~(U >> 1) : U >> 1; } void BitcodeReaderMetadataList::assignValue(Metadata *MD, unsigned Idx) { if (auto *MDN = dyn_cast(MD)) @@ -392,8 +388,6 @@ void PlaceholderQueue::flush(BitcodeReaderMetadataList &MetadataList) { } } -} // anonymous namespace - static Error error(const Twine &Message) { return make_error( Message, make_error_code(BitcodeError::CorruptedBitcode)); diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index f3d2b41c704bc..6ed724bc2fd76 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -3,7 +3,9 @@ add_llvm_component_library(LLVMCAS ActionCaches.cpp BuiltinCAS.cpp InMemoryCAS.cpp + MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCommon.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/MappedFileRegionArena.cpp b/llvm/lib/CAS/MappedFileRegionArena.cpp new file mode 100644 index 0000000000000..472843d78af6e --- /dev/null +++ b/llvm/lib/CAS/MappedFileRegionArena.cpp @@ -0,0 +1,389 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file Implements MappedFileRegionArena. +/// +/// A bump pointer allocator, backed by a memory-mapped file. +/// +/// The effect we want is: +/// +/// Step 1. If it doesn't exist, create the file with an initial size. +/// Step 2. Reserve virtual memory large enough for the max file size. +/// Step 3. Map the file into memory in the reserved region. +/// Step 4. Increase the file size and update the mapping when necessary. +/// +/// However, updating the mapping is challenging when it needs to work portably, +/// and across multiple processes without locking for every read. Our current +/// implementation handles the steps above in following ways: +/// +/// Step 1. Use \ref sys::fs::resize_file_sparse to grow the file to its max +/// size (typically several GB). If the file system doesn't support +/// sparse file, this may return a fully allocated file. +/// Step 2. Call \ref sys::fs::mapped_file_region to map the entire file. +/// Step 3. [Automatic as part of step 2.] +/// Step 4. If supported, use \c fallocate or similiar APIs to ensure the file +/// system storage for the sparse file so we won't end up with partial +/// file if the disk is out of space. +/// +/// Additionally, we attempt to resize the file to its actual data size when +/// closing the mapping, if this is the only concurrent instance. This is done +/// using file locks. Shrinking the file mitigates problems with having large +/// files: on filesystems without sparse files it avoids unnecessary space use; +/// it also avoids allocating the full size if another process copies the file, +/// which typically loses sparseness. These mitigations only work while the file +/// is not in use. 
+/// +/// The capacity and the header offset is determined by the first user of the +/// MappedFileRegionArena instance and any future mismatched value from the +/// original will result in error on creation. +/// +/// To support resizing, we use two separate file locks: +/// 1. We use a shared reader lock on a ".shared" file until destruction. +/// 2. We use a lock on the main file during initialization - shared to check +/// the status, upgraded to exclusive to resize/initialize the file. +/// +/// Then during destruction we attempt to get exclusive access on (1), which +/// requires no concurrent readers. If so, we shrink the file. Using two +/// separate locks simplifies the implementation and enables it to work on +/// platforms (e.g. Windows) where a shared/reader lock prevents writing. +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/MappedFileRegionArena.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/StringExtras.h" + +#if LLVM_ON_UNIX +#include +#if __has_include() +#include +#endif +#ifdef DEV_BSIZE +#define MAPPED_FILE_BSIZE DEV_BSIZE +#elif __linux__ +#define MAPPED_FILE_BSIZE 512 +#endif +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +namespace { +struct FileWithLock { + std::string Path; + int FD = -1; + std::optional Locked; + +private: + FileWithLock(std::string PathStr, Error &E) : Path(std::move(PathStr)) { + ErrorAsOutParameter EOP(&E); + if (std::error_code EC = sys::fs::openFileForReadWrite( + Path, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + E = createFileError(Path, EC); + } + +public: + FileWithLock(FileWithLock &) = delete; + FileWithLock(FileWithLock &&Other) { + Path = std::move(Other.Path); + FD = Other.FD; + Other.FD = -1; + Locked = Other.Locked; + Other.Locked = std::nullopt; + } + + ~FileWithLock() { consumeError(unlock()); } + + static Expected open(StringRef Path) { + Error E = Error::success(); + FileWithLock 
Result(Path.str(), E); + if (E) + return std::move(E); + return std::move(Result); + } + + Error lock(sys::fs::LockKind LK) { + assert(!Locked && "already locked"); + if (std::error_code EC = lockFileThreadSafe(FD, LK)) + return createFileError(Path, EC); + Locked = LK; + return Error::success(); + } + + Error switchLock(sys::fs::LockKind LK) { + assert(Locked && "not locked"); + if (auto E = unlock()) + return E; + + return lock(LK); + } + + Error unlock() { + if (Locked) { + Locked = std::nullopt; + if (std::error_code EC = unlockFileThreadSafe(FD)) + return createFileError(Path, EC); + } + return Error::success(); + } + + // Return true if succeed to lock the file exclusively. + bool tryLockExclusive() { + assert(!Locked && "can only try to lock if not locked"); + if (tryLockFileThreadSafe(FD) == std::error_code()) { + Locked = sys::fs::LockKind::Exclusive; + return true; + } + + return false; + } + + // Release the lock so it will not be unlocked on destruction. + void release() { + Locked = std::nullopt; + FD = -1; + } +}; + +struct FileSizeInfo { + uint64_t Size; + uint64_t AllocatedSize; + + static ErrorOr get(sys::fs::file_t File); +}; +} // end anonymous namespace + +Expected MappedFileRegionArena::create( + const Twine &Path, uint64_t Capacity, uint64_t HeaderOffset, + function_ref NewFileConstructor) { + uint64_t MinCapacity = HeaderOffset + sizeof(Header); + if (Capacity < MinCapacity) + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "capacity is too small to hold MappedFileRegionArena"); + + MappedFileRegionArena Result; + Result.Path = Path.str(); + + // Open the shared lock file. See file comment for details of locking scheme. 
+ SmallString<128> SharedFilePath(Result.Path); + SharedFilePath.append(".shared"); + + auto SharedFileLock = FileWithLock::open(SharedFilePath); + if (!SharedFileLock) + return SharedFileLock.takeError(); + Result.SharedLockFD = SharedFileLock->FD; + + // Take shared/reader lock that will be held until destroyImpl if construction + // is successful. + if (auto E = SharedFileLock->lock(sys::fs::LockKind::Shared)) + return std::move(E); + + // Take shared/reader lock for initialization. + auto MainFile = FileWithLock::open(Result.Path); + if (!MainFile) + return MainFile.takeError(); + if (Error E = MainFile->lock(sys::fs::LockKind::Shared)) + return std::move(E); + Result.FD = MainFile->FD; + + sys::fs::file_t File = sys::fs::convertFDToNativeFile(MainFile->FD); + auto FileSize = FileSizeInfo::get(File); + if (!FileSize) + return createFileError(Result.Path, FileSize.getError()); + + // If the size is smaller than the capacity, we need to initialize the file. + // It maybe empty, or may have been shrunk during a previous close. + if (FileSize->Size < Capacity) { + // Lock the file exclusively so only one process will do the initialization. + if (Error E = MainFile->switchLock(sys::fs::LockKind::Exclusive)) + return std::move(E); + // Retrieve the current size now that we have exclusive access. + FileSize = FileSizeInfo::get(File); + if (!FileSize) + return createFileError(Result.Path, FileSize.getError()); + } + + if (FileSize->Size >= MinCapacity) { + // File is initialized. Read out the header to check for capacity and + // offset. 
+ SmallVector HeaderContent(sizeof(Header)); + auto Size = sys::fs::readNativeFileSlice(File, HeaderContent, HeaderOffset); + if (!Size) + return Size.takeError(); + + Header H; + memcpy(&H, HeaderContent.data(), sizeof(H)); + if (H.HeaderOffset != HeaderOffset) + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "specified header offset (" + utostr(HeaderOffset) + + ") does not match existing config (" + utostr(H.HeaderOffset) + + ")"); + + // If the capacity doesn't match, use the existing capacity instead. + if (H.Capacity != Capacity) + Capacity = H.Capacity; + } + + // If the size is smaller than capacity, we need to resize the file. + if (FileSize->Size < Capacity) { + assert(MainFile->Locked == sys::fs::LockKind::Exclusive); + if (std::error_code EC = + sys::fs::resize_file_sparse(MainFile->FD, Capacity)) + return createFileError(Result.Path, EC); + } + + // Create the mapped region. + { + std::error_code EC; + sys::fs::mapped_file_region Map( + File, sys::fs::mapped_file_region::readwrite, Capacity, 0, EC); + if (EC) + return createFileError(Result.Path, EC); + Result.Region = std::move(Map); + } + + // Initialize the header. + Result.initializeHeader(HeaderOffset); + if (FileSize->Size < MinCapacity) { + assert(MainFile->Locked == sys::fs::LockKind::Exclusive); + // If we need to fully initialize the file, call NewFileConstructor. + if (Error E = NewFileConstructor(Result)) + return std::move(E); + + Result.H->HeaderOffset.exchange(HeaderOffset); + Result.H->Capacity.exchange(Capacity); + } + + if (MainFile->Locked == sys::fs::LockKind::Exclusive) { + // If holding an exclusive lock, we might have resized the file and + // performed some read/write to the file. Query the file size again to make + // sure everything is up-to-date. Otherwise, FileSize info is already + // up-to-date. 
+ FileSize = FileSizeInfo::get(File); + if (!FileSize) + return createFileError(Result.Path, FileSize.getError()); + Result.H->AllocatedSize.exchange(FileSize->AllocatedSize); + } + + // Release the shared lock so it can be closed in destoryImpl(). + SharedFileLock->release(); + return std::move(Result); +} + +void MappedFileRegionArena::destroyImpl() { + if (!FD) + return; + + // Drop the shared lock indicating we are no longer accessing the file. + if (SharedLockFD) + (void)unlockFileThreadSafe(*SharedLockFD); + + // Attempt to truncate the file if we can get exclusive access. Ignore any + // errors. + if (H) { + assert(SharedLockFD && "Must have shared lock file open"); + if (tryLockFileThreadSafe(*SharedLockFD) == std::error_code()) { + size_t Size = size(); + // sync to file system to make sure all contents are up-to-date. + (void)Region.sync(); + // unmap the file before resizing since that is the requirement for + // some platforms. + Region.unmap(); + (void)sys::fs::resize_file(*FD, Size); + (void)unlockFileThreadSafe(*SharedLockFD); + } + } + + auto Close = [](std::optional &FD) { + if (FD) { + sys::fs::file_t File = sys::fs::convertFDToNativeFile(*FD); + sys::fs::closeFile(File); + FD = std::nullopt; + } + }; + + // Close the file and shared lock. 
+ Close(FD); + Close(SharedLockFD); +} + +void MappedFileRegionArena::initializeHeader(uint64_t HeaderOffset) { + assert(capacity() < (uint64_t)INT64_MAX && "capacity must fit in int64_t"); + uint64_t HeaderEndOffset = HeaderOffset + sizeof(decltype(*H)); + assert(HeaderEndOffset <= capacity() && + "Expected end offset to be pre-allocated"); + assert(isAligned(Align::Of(), HeaderOffset) && + "Expected end offset to be aligned"); + H = reinterpret_cast(data() + HeaderOffset); + + uint64_t ExistingValue = 0; + if (!H->BumpPtr.compare_exchange_strong(ExistingValue, HeaderEndOffset)) + assert(ExistingValue >= HeaderEndOffset && + "Expected 0, or past the end of the header itself"); +} + +static Error createAllocatorOutOfSpaceError() { + return createStringError(std::make_error_code(std::errc::not_enough_memory), + "memory mapped file allocator is out of space"); +} + +Expected MappedFileRegionArena::allocateOffset(uint64_t AllocSize) { + AllocSize = alignTo(AllocSize, getAlign()); + uint64_t OldEnd = H->BumpPtr.fetch_add(AllocSize); + uint64_t NewEnd = OldEnd + AllocSize; + if (LLVM_UNLIKELY(NewEnd > capacity())) { + // Return the allocation. If the start already passed the end, that means + // some other concurrent allocations already consumed all the capacity. + // There is no need to return the original value. If the start was not + // passed the end, current allocation certainly bumped it passed the end. + // All other allocation afterwards must have failed and current allocation + // is in charge of return the allocation back to a valid value. + if (OldEnd <= capacity()) + (void)H->BumpPtr.exchange(OldEnd); + + return createAllocatorOutOfSpaceError(); + } + + uint64_t DiskSize = H->AllocatedSize; + if (LLVM_UNLIKELY(NewEnd > DiskSize)) { + uint64_t NewSize; + // The minimum increment is a page, but allocate more to amortize the cost. 
+ constexpr uint64_t Increment = 1 * 1024 * 1024; // 1 MB + if (Error E = preallocateFileTail(*FD, DiskSize, DiskSize + Increment) + .moveInto(NewSize)) + return std::move(E); + assert(NewSize >= DiskSize + Increment); + // FIXME: on Darwin this can under-count the size if there is a race to + // preallocate disk, because the semantics of F_PREALLOCATE are to add bytes + // to the end of the file, not to allocate up to a fixed size. + // Any discrepancy will be resolved the next time the file is truncated and + // then reopend. + while (DiskSize < NewSize) + H->AllocatedSize.compare_exchange_strong(DiskSize, NewSize); + } + return OldEnd; +} + +ErrorOr FileSizeInfo::get(sys::fs::file_t File) { +#if LLVM_ON_UNIX && defined(MAPPED_FILE_BSIZE) + struct stat Status; + int StatRet = ::fstat(File, &Status); + if (StatRet) + return errnoAsErrorCode(); + uint64_t AllocatedSize = uint64_t(Status.st_blksize) * MAPPED_FILE_BSIZE; + return FileSizeInfo{uint64_t(Status.st_size), AllocatedSize}; +#else + // Fallback: assume the file is fully allocated. Note: this may result in + // data loss on out-of-space. + sys::fs::file_status Status; + if (std::error_code EC = sys::fs::status(File, Status)) + return EC; + return FileSizeInfo{Status.getSize(), Status.getSize()}; +#endif +} diff --git a/llvm/lib/CAS/OnDiskCommon.cpp b/llvm/lib/CAS/OnDiskCommon.cpp new file mode 100644 index 0000000000000..25aa06bfe64da --- /dev/null +++ b/llvm/lib/CAS/OnDiskCommon.cpp @@ -0,0 +1,127 @@ +//===- OnDiskCommon.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OnDiskCommon.h" +#include "llvm/Config/config.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include + +#if __has_include() +#include +#ifdef LOCK_SH +#define HAVE_FLOCK 1 +#else +#define HAVE_FLOCK 0 +#endif +#endif + +#if __has_include() +#include +#endif + +using namespace llvm; + +std::error_code cas::ondisk::lockFileThreadSafe(int FD, + sys::fs::LockKind Kind) { +#if HAVE_FLOCK + if (flock(FD, Kind == sys::fs::LockKind::Exclusive ? LOCK_EX : LOCK_SH) == 0) + return std::error_code(); + return std::error_code(errno, std::generic_category()); +#elif defined(_WIN32) + // On Windows this implementation is thread-safe. + return sys::fs::lockFile(FD, Kind); +#else + return make_error_code(std::errc::no_lock_available); +#endif +} + +std::error_code cas::ondisk::unlockFileThreadSafe(int FD) { +#if HAVE_FLOCK + if (flock(FD, LOCK_UN) == 0) + return std::error_code(); + return std::error_code(errno, std::generic_category()); +#elif defined(_WIN32) + // On Windows this implementation is thread-safe. + return sys::fs::unlockFile(FD); +#else + return make_error_code(std::errc::no_lock_available); +#endif +} + +std::error_code +cas::ondisk::tryLockFileThreadSafe(int FD, std::chrono::milliseconds Timeout, + sys::fs::LockKind Kind) { +#if HAVE_FLOCK + auto Start = std::chrono::steady_clock::now(); + auto End = Start + Timeout; + do { + if (flock(FD, (Kind == sys::fs::LockKind::Exclusive ? LOCK_EX : LOCK_SH) | + LOCK_NB) == 0) + return std::error_code(); + int Error = errno; + if (Error == EWOULDBLOCK) { + // Match sys::fs::tryLockFile, which sleeps for 1 ms per attempt. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(1)); + continue; + } + return std::error_code(Error, std::generic_category()); + } while (std::chrono::steady_clock::now() < End); + return make_error_code(std::errc::no_lock_available); +#elif defined(_WIN32) + // On Windows this implementation is thread-safe. + return sys::fs::tryLockFile(FD, Timeout, Kind); +#else + return make_error_code(std::errc::no_lock_available); +#endif +} + +Expected cas::ondisk::preallocateFileTail(int FD, size_t CurrentSize, + size_t NewSize) { + auto CreateError = [&](std::error_code EC) -> Expected { + if (EC == std::errc::not_supported) + // Ignore ENOTSUP in case the filesystem cannot preallocate. + return NewSize; +#if defined(HAVE_POSIX_FALLOCATE) + if (EC == std::errc::invalid_argument && CurrentSize < NewSize && // len > 0 + NewSize < std::numeric_limits::max()) // 0 <= offset, len < max + // Prior to 2024, POSIX required EINVAL for cases that should be ENOTSUP, + // so handle it the same as above if it is not one of the other ways to + // get EINVAL. + return NewSize; +#endif + return createStringError(EC, + "failed to allocate to CAS file: " + EC.message()); + }; +#if defined(HAVE_POSIX_FALLOCATE) + // Note: posix_fallocate returns its error directly, not via errno. + if (int Err = posix_fallocate(FD, CurrentSize, NewSize - CurrentSize)) + return CreateError(std::error_code(Err, std::generic_category())); + return NewSize; +#elif defined(__APPLE__) + fstore_t FAlloc; + FAlloc.fst_flags = F_ALLOCATEALL; +#if defined(F_ALLOCATEPERSIST) && \ + defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \ + __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= 130000 + // F_ALLOCATEPERSIST is introduced in macOS 13. 
+ FAlloc.fst_flags |= F_ALLOCATEPERSIST; +#endif + FAlloc.fst_posmode = F_PEOFPOSMODE; + FAlloc.fst_offset = 0; + FAlloc.fst_length = NewSize - CurrentSize; + FAlloc.fst_bytesalloc = 0; + if (fcntl(FD, F_PREALLOCATE, &FAlloc)) + return CreateError(errnoAsErrorCode()); + assert(CurrentSize + FAlloc.fst_bytesalloc >= NewSize); + return CurrentSize + FAlloc.fst_bytesalloc; +#else + (void)CreateError; // Silence unused variable. + return NewSize; // Pretend it worked. +#endif +} diff --git a/llvm/lib/CAS/OnDiskCommon.h b/llvm/lib/CAS/OnDiskCommon.h new file mode 100644 index 0000000000000..8b79ffe5c3158 --- /dev/null +++ b/llvm/lib/CAS/OnDiskCommon.h @@ -0,0 +1,46 @@ +//===- OnDiskCommon.h -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CAS_ONDISKCOMMON_H +#define LLVM_LIB_CAS_ONDISKCOMMON_H + +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include + +namespace llvm::cas::ondisk { + +/// Thread-safe alternative to \c sys::fs::lockFile. This does not support all +/// the platforms that \c sys::fs::lockFile does, so keep it in the CAS library +/// for now. +std::error_code lockFileThreadSafe(int FD, llvm::sys::fs::LockKind Kind); + +/// Thread-safe alternative to \c sys::fs::unlockFile. This does not support all +/// the platforms that \c sys::fs::lockFile does, so keep it in the CAS library +/// for now. +std::error_code unlockFileThreadSafe(int FD); + +/// Thread-safe alternative to \c sys::fs::tryLockFile. This does not support +/// all the platforms that \c sys::fs::lockFile does, so keep it in the CAS +/// library for now. 
+std::error_code tryLockFileThreadSafe( + int FD, std::chrono::milliseconds Timeout = std::chrono::milliseconds(0), + llvm::sys::fs::LockKind Kind = llvm::sys::fs::LockKind::Exclusive); + +/// Allocate space for the file \p FD on disk, if the filesystem supports it. +/// +/// On filesystems that support this operation, this ensures errors such as +/// \c std::errc::no_space_on_device are detected before we write data. +/// +/// \returns the new size of the file, or an \c Error. +Expected preallocateFileTail(int FD, size_t CurrentSize, + size_t NewSize); + +} // namespace llvm::cas::ondisk + +#endif // LLVM_LIB_CAS_ONDISKCOMMON_H diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 755be089709a5..e0f80b0a57f2b 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -395,7 +395,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // Note register reference... const TargetRegisterClass *RC = nullptr; if (i < MI.getDesc().getNumOperands()) - RC = TII->getRegClass(MI.getDesc(), i, TRI, MF); + RC = TII->getRegClass(MI.getDesc(), i, TRI); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.emplace(Reg.asMCReg(), RR); } @@ -479,7 +479,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI, // Note register reference... 
const TargetRegisterClass *RC = nullptr; if (i < MI.getDesc().getNumOperands()) - RC = TII->getRegClass(MI.getDesc(), i, TRI, MF); + RC = TII->getRegClass(MI.getDesc(), i, TRI); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.emplace(Reg.asMCReg(), RR); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 0f3ff985974ce..d98d18035ac6d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -105,6 +105,8 @@ DebugHandlerBase::~DebugHandlerBase() = default; void DebugHandlerBase::beginModule(Module *M) { if (M->debug_compile_units().empty()) Asm = nullptr; + else + LScopes.initialize(*M); } // Each LexicalScope has first instruction and last instruction to mark @@ -269,7 +271,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) { // Grab the lexical scopes for the function, if we don't have any of those // then we're not going to be able to do anything. - LScopes.initialize(*MF); + LScopes.scanFunction(*MF); if (LScopes.empty()) { beginFunctionImpl(MF); return; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 67f526fe91464..7ce014e9fac9a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -537,8 +537,9 @@ void DwarfCompileUnit::addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName, // and DW_AT_high_pc attributes. If there are global variables in this // scope then create and insert DIEs for these variables. 
DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP, + const Function &F, MCSymbol *LineTableSym) { - DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes()); + DIE *SPDie = getOrCreateSubprogramDIE(SP, &F, includeMinimalInlineScopes()); SmallVector BB_List; // If basic block sections are on, ranges for each basic block section has // to be emitted separately. @@ -1122,9 +1123,10 @@ sortLocalVars(SmallVectorImpl &Input) { } DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub, + const Function &F, LexicalScope *Scope, MCSymbol *LineTableSym) { - DIE &ScopeDIE = updateSubprogramScopeDIE(Sub, LineTableSym); + DIE &ScopeDIE = updateSubprogramScopeDIE(Sub, F, LineTableSym); if (Scope) { assert(!Scope->getInlinedAt()); @@ -1198,32 +1200,17 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, return ObjectPointer; } -void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( - LexicalScope *Scope) { - auto *SP = cast(Scope->getScopeNode()); - if (getAbstractScopeDIEs().count(SP)) - return; +DIE &DwarfCompileUnit::getOrCreateAbstractSubprogramDIE( + const DISubprogram *SP) { + if (auto *AbsDef = getAbstractScopeDIEs().lookup(SP)) + return *AbsDef; - DIE *ContextDIE; - DwarfCompileUnit *ContextCU = this; - - if (includeMinimalInlineScopes()) - ContextDIE = &getUnitDie(); - // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with - // the important distinction that the debug node is not associated with the - // DIE (since the debug node will be associated with the concrete DIE, if - // any). It could be refactored to some common utility function. 
- else if (auto *SPDecl = SP->getDeclaration()) { - ContextDIE = &getUnitDie(); - getOrCreateSubprogramDIE(SPDecl); - } else { - ContextDIE = getOrCreateContextDIE(SP->getScope()); - // The scope may be shared with a subprogram that has already been - // constructed in another CU, in which case we need to construct this - // subprogram in the same CU. - ContextCU = DD->lookupCU(ContextDIE->getUnitDie()); - } + auto [ContextDIE, ContextCU] = getOrCreateAbstractSubprogramContextDIE(SP); + return createAbstractSubprogramDIE(SP, ContextDIE, ContextCU); +} +DIE &DwarfCompileUnit::createAbstractSubprogramDIE( + const DISubprogram *SP, DIE *ContextDIE, DwarfCompileUnit *ContextCU) { // Passing null as the associated node because the abstract definition // shouldn't be found by lookup. DIE &AbsDef = ContextCU->createAndAddDIE(dwarf::DW_TAG_subprogram, @@ -1237,8 +1224,45 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( DD->getDwarfVersion() <= 4 ? std::optional() : dwarf::DW_FORM_implicit_const, dwarf::DW_INL_inlined); - if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, AbsDef)) - ContextCU->addDIEEntry(AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer); + + return AbsDef; +} + +std::pair +DwarfCompileUnit::getOrCreateAbstractSubprogramContextDIE( + const DISubprogram *SP) { + bool Minimal = includeMinimalInlineScopes(); + bool IgnoreScope = shouldPlaceInUnitDIE(SP, Minimal); + DIE *ContextDIE = getOrCreateSubprogramContextDIE(SP, IgnoreScope); + + if (auto *SPDecl = SP->getDeclaration()) + if (!Minimal) + getOrCreateSubprogramDIE(SPDecl, nullptr); + + // The scope may be shared with a subprogram that has already been + // constructed in another CU, in which case we need to construct this + // subprogram in the same CU. + auto *ContextCU = IgnoreScope ? 
this : DD->lookupCU(ContextDIE->getUnitDie()); + + return std::make_pair(ContextDIE, ContextCU); +} + +void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( + LexicalScope *Scope) { + auto *SP = cast(Scope->getScopeNode()); + + // Populate subprogram DIE only once. + if (!getFinalizedAbstractSubprograms().insert(SP).second) + return; + + auto [ContextDIE, ContextCU] = getOrCreateAbstractSubprogramContextDIE(SP); + DIE *AbsDef = getAbstractScopeDIEs().lookup(SP); + if (!AbsDef) + AbsDef = &createAbstractSubprogramDIE(SP, ContextDIE, ContextCU); + + if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, *AbsDef)) + ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, + *ObjectPointer); } bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const { @@ -1293,9 +1317,9 @@ DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const { } DIE &DwarfCompileUnit::constructCallSiteEntryDIE( - DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail, - const MCSymbol *PCAddr, const MCSymbol *CallAddr, unsigned CallReg, - DIType *AllocSiteTy) { + DIE &ScopeDIE, const DISubprogram *CalleeSP, const Function *CalleeF, + bool IsTail, const MCSymbol *PCAddr, const MCSymbol *CallAddr, + unsigned CallReg, DIType *AllocSiteTy) { // Insert a call site entry DIE within ScopeDIE. 
DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site), ScopeDIE, nullptr); @@ -1305,7 +1329,7 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE( addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target), MachineLocation(CallReg)); } else if (CalleeSP) { - DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP); + DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP, CalleeF); assert(CalleeDIE && "Could not create DIE for call site entry origin"); if (AddLinkageNamesToDeclCallOriginsForTuning(DD) && !CalleeSP->isDefinition() && @@ -1396,7 +1420,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( if (auto *AbsSPDie = getAbstractScopeDIEs().lookup(SP)) EntityDie = AbsSPDie; else - EntityDie = getOrCreateSubprogramDIE(SP); + EntityDie = getOrCreateSubprogramDIE(SP, nullptr); } else if (auto *T = dyn_cast(Entity)) EntityDie = getOrCreateTypeDIE(T); else if (auto *GV = dyn_cast(Entity)) @@ -1805,3 +1829,16 @@ DIE *DwarfCompileUnit::getOrCreateContextDIE(const DIScope *Context) { } return DwarfUnit::getOrCreateContextDIE(Context); } + +DIE *DwarfCompileUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, + const Function *F, + bool Minimal) { + if (!F && SP->isDefinition()) { + F = DD->getLexicalScopes().getFunction(SP); + + if (!F) + return &getCU().getOrCreateAbstractSubprogramDIE(SP); + } + + return DwarfUnit::getOrCreateSubprogramDIE(SP, F, Minimal); +} diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index c2f6ca0913818..a3bbc8364599d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -81,6 +81,7 @@ class DwarfCompileUnit final : public DwarfUnit { // List of abstract local scopes (either DISubprogram or DILexicalBlock). DenseMap AbstractLocalScopeDIEs; + SmallPtrSet FinalizedAbstractSubprograms; // List of inlined lexical block scopes that belong to subprograms within this // CU. 
@@ -137,12 +138,28 @@ class DwarfCompileUnit final : public DwarfUnit { return DU->getAbstractEntities(); } + auto &getFinalizedAbstractSubprograms() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return FinalizedAbstractSubprograms; + return DU->getFinalizedAbstractSubprograms(); + } + void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) override; /// Add info for Wasm-global-based relocation. void addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName, uint64_t GlobalIndex); + /// Create context DIE for abstract subprogram. + /// \returns The context DIE and the compile unit where abstract + /// DIE should be constructed. + std::pair + getOrCreateAbstractSubprogramContextDIE(const DISubprogram *SP); + + /// Create new DIE for abstract subprogram. + DIE &createAbstractSubprogramDIE(const DISubprogram *SP, DIE *ContextDIE, + DwarfCompileUnit *ContextCU); + public: DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, @@ -216,7 +233,8 @@ class DwarfCompileUnit final : public DwarfUnit { /// DW_AT_low_pc, DW_AT_high_pc and DW_AT_LLVM_stmt_sequence attributes. /// If there are global variables in this scope then create and insert DIEs /// for these variables. - DIE &updateSubprogramScopeDIE(const DISubprogram *SP, MCSymbol *LineTableSym); + DIE &updateSubprogramScopeDIE(const DISubprogram *SP, const Function &F, + MCSymbol *LineTableSym); void constructScopeDIE(LexicalScope *Scope, DIE &ParentScopeDIE); @@ -259,12 +277,18 @@ class DwarfCompileUnit final : public DwarfUnit { /// This instance of 'getOrCreateContextDIE()' can handle DILocalScope. DIE *getOrCreateContextDIE(const DIScope *Ty) override; + DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, const Function *F, + bool Minimal = false) override; + /// Construct a DIE for this subprogram scope. 
- DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, LexicalScope *Scope, - MCSymbol *LineTableSym); + DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, const Function &F, + LexicalScope *Scope, MCSymbol *LineTableSym); DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE); + /// Create an abstract subprogram DIE, that should later be populated + /// by \ref constructAbstractSubprogramScopeDIE. + DIE &getOrCreateAbstractSubprogramDIE(const DISubprogram *SP); void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); /// Whether to use the GNU analog for a DWARF5 tag, attribute, or location @@ -281,14 +305,15 @@ class DwarfCompileUnit final : public DwarfUnit { dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const; /// Construct a call site entry DIE describing a call within \p Scope to a - /// callee described by \p CalleeSP. + /// callee described by \p CalleeSP and \p CalleeF. /// \p IsTail specifies whether the call is a tail call. /// \p PCAddr points to the PC value after the call instruction. /// \p CallAddr points to the PC value at the call instruction (or is null). /// \p CallReg is a register location for an indirect call. For direct calls /// the \p CallReg is set to 0. DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP, - bool IsTail, const MCSymbol *PCAddr, + const Function *CalleeF, bool IsTail, + const MCSymbol *PCAddr, const MCSymbol *CallAddr, unsigned CallReg, DIType *AllocSiteTy); /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 2090157a1a91c..25e291c53ea6a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1001,8 +1001,9 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, ->getName(CallReg))) << (IsTail ? 
" [IsTail]" : "") << "\n"); - DIE &CallSiteDIE = CU.constructCallSiteEntryDIE( - ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg, AllocSiteTy); + DIE &CallSiteDIE = + CU.constructCallSiteEntryDIE(ScopeDIE, CalleeSP, CalleeDecl, IsTail, + PCAddr, CallAddr, CallReg, AllocSiteTy); // Optionally emit call-site-param debug info. if (emitDebugEntryValues()) { @@ -2711,7 +2712,8 @@ void DwarfDebug::skippedNonDebugFunction() { // Gather and emit post-function debug information. void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { - const DISubprogram *SP = MF->getFunction().getSubprogram(); + const Function &F = MF->getFunction(); + const DISubprogram *SP = F.getSubprogram(); assert(CurFn == MF && "endFunction should be called with the same function as beginFunction"); @@ -2780,11 +2782,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { ProcessedSPNodes.insert(SP); DIE &ScopeDIE = - TheCU.constructSubprogramScopeDIE(SP, FnScope, FunctionLineTableLabel); + TheCU.constructSubprogramScopeDIE(SP, F, FnScope, FunctionLineTableLabel); if (auto *SkelCU = TheCU.getSkeleton()) if (!LScopes.getAbstractScopesList().empty() && TheCU.getCUNode()->getSplitDebugInlining()) - SkelCU->constructSubprogramScopeDIE(SP, FnScope, FunctionLineTableLabel); + SkelCU->constructSubprogramScopeDIE(SP, F, FnScope, + FunctionLineTableLabel); FunctionLineTableLabel = nullptr; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index 0fc2b91ddfa91..ef1524d875c84 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -11,6 +11,7 @@ #include "DwarfStringPool.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DIE.h" @@ -27,6 +28,7 @@ class DbgVariable; class DbgLabel; class DINode; class DILocalScope; +class DISubprogram; class DwarfCompileUnit; class DwarfUnit; class LexicalScope; 
@@ -94,6 +96,9 @@ class DwarfFile { // Collection of abstract subprogram DIEs. DenseMap AbstractLocalScopeDIEs; DenseMap> AbstractEntities; + /// Keeps track of abstract subprograms to populate them only once. + // FIXME: merge creation and population of abstract scopes. + SmallPtrSet FinalizedAbstractSubprograms; /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can /// be shared across CUs, that is why we keep the map here instead @@ -174,6 +179,10 @@ class DwarfFile { return AbstractEntities; } + auto &getFinalizedAbstractSubprograms() { + return FinalizedAbstractSubprograms; + } + void insertDIE(const MDNode *TypeMD, DIE *Die) { DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die)); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index d76fd0c010209..62fb5eb011cf2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -573,7 +573,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { if (auto *NS = dyn_cast(Context)) return getOrCreateNameSpace(NS); if (auto *SP = dyn_cast(Context)) - return getOrCreateSubprogramDIE(SP); + return getOrCreateSubprogramDIE(SP, nullptr); if (auto *M = dyn_cast(Context)) return getOrCreateModule(M); return getDIE(Context); @@ -1066,7 +1066,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (!Element) continue; if (auto *SP = dyn_cast(Element)) - getOrCreateSubprogramDIE(SP); + getOrCreateSubprogramDIE(SP, nullptr); else if (auto *DDTy = dyn_cast(Element)) { if (DDTy->getTag() == dwarf::DW_TAG_friend) { DIE &ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer); @@ -1335,22 +1335,21 @@ DIE *DwarfUnit::getOrCreateModule(const DIModule *M) { return &MDie; } -DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) { +DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, + const Function *FnHint, bool Minimal) { // Construct the 
context before querying for the existence of the DIE in case // such construction creates the DIE (as is the case for member function // declarations). DIE *ContextDIE = - Minimal ? &getUnitDie() : getOrCreateContextDIE(SP->getScope()); + getOrCreateSubprogramContextDIE(SP, shouldPlaceInUnitDIE(SP, Minimal)); if (DIE *SPDie = getDIE(SP)) return SPDie; if (auto *SPDecl = SP->getDeclaration()) { if (!Minimal) { - // Add subprogram definitions to the CU die directly. - ContextDIE = &getUnitDie(); // Build the decl now to ensure it precedes the definition. - getOrCreateSubprogramDIE(SPDecl); + getOrCreateSubprogramDIE(SPDecl, nullptr); // Check whether the DIE for SP has already been created after the call // above. // FIXME: Should the creation of definition subprogram DIE during diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index fe05766cf36e1..bb00ec3af9782 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -256,7 +256,9 @@ class DwarfUnit : public DIEUnit { DIE *getOrCreateNameSpace(const DINamespace *NS); DIE *getOrCreateModule(const DIModule *M); - DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false); + virtual DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, + const Function *FnHint, + bool Minimal = false); void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, bool SkipSPAttributes = false); @@ -343,6 +345,18 @@ class DwarfUnit : public DIEUnit { /// Emit the common part of the header for this unit. void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT); + bool shouldPlaceInUnitDIE(const DISubprogram *SP, bool Minimal) { + // Add subprogram declarations to the CU die directly. 
+ return Minimal || SP->getDeclaration(); + } + + DIE *getOrCreateSubprogramContextDIE(const DISubprogram *SP, + bool IgnoreScope) { + if (IgnoreScope) + return &getUnitDie(); + return getOrCreateContextDIE(SP->getScope()); + } + private: /// A helper to add a wide integer constant to a DIE using a block /// form. diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp index 7eef4a9d12b16..205020af1b30d 100644 --- a/llvm/lib/CodeGen/BreakFalseDeps.cpp +++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp @@ -133,8 +133,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, } // Get the undef operand's register class - const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF); + const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx, TRI); assert(OpRC && "Not a valid register class"); // If the instruction has a true dependency, we can hide the false depdency diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9db4c9e5e2807..a190f0dac1379 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -583,23 +583,23 @@ bool CodeGenPrepare::_run(Function &F) { // if requested. if (BBSectionsGuidedSectionPrefix && BBSectionsProfileReader && BBSectionsProfileReader->isFunctionHot(F.getName())) { - F.setSectionPrefix("hot"); + (void)F.setSectionPrefix("hot"); } else if (ProfileGuidedSectionPrefix) { // The hot attribute overwrites profile count based hotness while profile // counts based hotness overwrite the cold attribute. // This is a conservative behabvior. if (F.hasFnAttribute(Attribute::Hot) || PSI->isFunctionHotInCallGraph(&F, *BFI)) - F.setSectionPrefix("hot"); + (void)F.setSectionPrefix("hot"); // If PSI shows this function is not hot, we will placed the function // into unlikely section if (1) PSI shows this is a cold function, or // (2) the function has a attribute of cold. 
else if (PSI->isFunctionColdInCallGraph(&F, *BFI) || F.hasFnAttribute(Attribute::Cold)) - F.setSectionPrefix("unlikely"); + (void)F.setSectionPrefix("unlikely"); else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() && PSI->isFunctionHotnessUnknown(F)) - F.setSectionPrefix("unknown"); + (void)F.setSectionPrefix("unknown"); } /// This optimization identifies DIV instructions that can be diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 7d355e6e365d3..6c2a5a7da84d3 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -1022,8 +1022,7 @@ ComplexDeinterleavingGraph::identifyDotProduct(Value *V) { CompositeNode *ANode = nullptr; - const Intrinsic::ID PartialReduceInt = - Intrinsic::experimental_vector_partial_reduce_add; + const Intrinsic::ID PartialReduceInt = Intrinsic::vector_partial_reduce_add; Value *AReal = nullptr; Value *AImag = nullptr; @@ -1139,8 +1138,7 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { return nullptr; auto *IInst = dyn_cast(*CommonUser); - if (!IInst || IInst->getIntrinsicID() != - Intrinsic::experimental_vector_partial_reduce_add) + if (!IInst || IInst->getIntrinsicID() != Intrinsic::vector_partial_reduce_add) return nullptr; if (CompositeNode *CN = identifyDotProduct(IInst)) diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index e8581f632f8ee..f873616cfedea 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -187,7 +187,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) { const TargetRegisterClass *NewRC = nullptr; if (i < MI.getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI.getDesc(), i, TRI, MF); + NewRC = TII->getRegClass(MI.getDesc(), i, TRI); // For now, only allow the register to be changed if its register // class is 
consistent across all uses. @@ -316,7 +316,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { const TargetRegisterClass *NewRC = nullptr; if (i < MI.getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI.getDesc(), i, TRI, MF); + NewRC = TII->getRegClass(MI.getDesc(), i, TRI); // For now, only allow the register to be changed if its register // class is consistent across all uses. diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index 59c62cf106482..3a47d59a9029c 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -339,8 +339,10 @@ MachineInstrBuilder CSEMIRBuilder::buildConstant(const DstOp &Res, // For vectors, CSE the element only for now. LLT Ty = Res.getLLTTy(*getMRI()); - if (Ty.isVector()) + if (Ty.isFixedVector()) return buildSplatBuildVector(Res, buildConstant(Ty.getElementType(), Val)); + if (Ty.isScalableVector()) + return buildSplatVector(Res, buildConstant(Ty.getElementType(), Val)); FoldingSetNodeID ID; GISelInstProfileBuilder ProfBuilder(ID, *getMRI()); diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 0cf44e02254de..9b4c103763d74 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -93,12 +93,8 @@ KnownBits GISelValueTracking::getKnownBits(Register R) { KnownBits GISelValueTracking::getKnownBits(Register R, const APInt &DemandedElts, unsigned Depth) { - // For now, we only maintain the cache during one request. 
- assert(ComputeKnownBitsCache.empty() && "Cache should have been cleared"); - KnownBits Known; computeKnownBitsImpl(R, Known, DemandedElts, Depth); - ComputeKnownBitsCache.clear(); return Known; } @@ -187,14 +183,6 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, #endif unsigned BitWidth = DstTy.getScalarSizeInBits(); - auto CacheEntry = ComputeKnownBitsCache.find(R); - if (CacheEntry != ComputeKnownBitsCache.end()) { - Known = CacheEntry->second; - LLVM_DEBUG(dbgs() << "Cache hit at "); - LLVM_DEBUG(dumpResult(MI, Known, Depth)); - assert(Known.getBitWidth() == BitWidth && "Cache entry size doesn't match"); - return; - } Known = KnownBits(BitWidth); // Don't know anything // Depth may get bigger than max depth if it gets passed to a different @@ -254,16 +242,6 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, // point of the pipeline, otherwise the main live-range will be // defined more than once, which is against SSA. assert(MI.getOperand(0).getSubReg() == 0 && "Is this code in SSA?"); - // Record in the cache that we know nothing for MI. - // This will get updated later and in the meantime, if we reach that - // phi again, because of a loop, we will cut the search thanks to this - // cache entry. - // We could actually build up more information on the phi by not cutting - // the search, but that additional information is more a side effect - // than an intended choice. - // Therefore, for now, save on compile time until we derive a proper way - // to derive known bits for PHIs within loops. - ComputeKnownBitsCache[R] = KnownBits(BitWidth); // PHI's operand are a mix of registers and basic blocks interleaved. // We only care about the register ones. for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) { @@ -700,9 +678,6 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, } LLVM_DEBUG(dumpResult(MI, Known, Depth)); - - // Update the cache. 
- ComputeKnownBitsCache[R] = Known; } static bool outputDenormalIsIEEEOrPosZero(const MachineFunction &MF, LLT Ty) { diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 541269ab6bfce..768e3713f78e2 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1863,7 +1863,7 @@ bool IRTranslator::translateVectorDeinterleave2Intrinsic( void IRTranslator::getStackGuard(Register DstReg, MachineIRBuilder &MIRBuilder) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF)); + MRI->setRegClass(DstReg, TRI->getPointerRegClass()); auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {}); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 1ccc549e0ec60..055fdc6ad7213 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -114,7 +114,7 @@ Register llvm::constrainOperandRegClass( // Assume physical registers are properly constrained. 
assert(Reg.isVirtual() && "PhysReg not implemented"); - const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx, &TRI, MF); + const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx, &TRI); // Some of the target independent instructions, like COPY, may not impose any // register class constraints on some of their operands: If it's a use, we can // skip constraining as the instruction defining the register would constrain diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index 500a73be7c0f5..e07e598019709 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -232,7 +232,7 @@ bool InitUndef::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB, MachineOperand &UseMO = MI.getOperand(UseOpIdx); if (UseMO.getReg() == MCRegister::NoRegister) { const TargetRegisterClass *RC = - TII->getRegClass(MI.getDesc(), UseOpIdx, TRI, MF); + TII->getRegClass(MI.getDesc(), UseOpIdx, TRI); Register NewDest = MRI->createVirtualRegister(RC); // We don't have a way to update dead lanes, so keep track of the // new register so that we avoid querying it later. diff --git a/llvm/lib/CodeGen/LexicalScopes.cpp b/llvm/lib/CodeGen/LexicalScopes.cpp index 5916f619537a4..9fc9ac9a66d41 100644 --- a/llvm/lib/CodeGen/LexicalScopes.cpp +++ b/llvm/lib/CodeGen/LexicalScopes.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -36,8 +37,16 @@ using namespace llvm; #define DEBUG_TYPE "lexicalscopes" -/// reset - Reset the instance so that it's prepared for another function. 
-void LexicalScopes::reset() { +static bool skipUnit(const DICompileUnit *CU) { + return CU->getEmissionKind() == DICompileUnit::NoDebug; +} + +void LexicalScopes::resetModule() { + FunctionMap.clear(); + resetFunction(); +} + +void LexicalScopes::resetFunction() { MF = nullptr; CurrentFnLexicalScope = nullptr; LexicalScopeMap.clear(); @@ -47,12 +56,19 @@ void LexicalScopes::reset() { DominatedBlocks.clear(); } -/// initialize - Scan machine function and constuct lexical scope nest. -void LexicalScopes::initialize(const MachineFunction &Fn) { - reset(); +void LexicalScopes::initialize(const Module &M) { + resetModule(); + for (const Function &F : M) { + DISubprogram *SP = F.getSubprogram(); + if (SP && (!SP->getUnit() || !skipUnit(SP->getUnit()))) + FunctionMap[SP] = &F; + } +} + +void LexicalScopes::scanFunction(const MachineFunction &Fn) { + resetFunction(); // Don't attempt any lexical scope creation for a NoDebug compile unit. - if (Fn.getFunction().getSubprogram()->getUnit()->getEmissionKind() == - DICompileUnit::NoDebug) + if (skipUnit(Fn.getFunction().getSubprogram()->getUnit())) return; MF = &Fn; SmallVector MIRanges; @@ -143,8 +159,7 @@ LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope, const DILocation *IA) { if (IA) { // Skip scopes inlined from a NoDebug compile unit. - if (Scope->getSubprogram()->getUnit()->getEmissionKind() == - DICompileUnit::NoDebug) + if (skipUnit(Scope->getSubprogram()->getUnit())) return getOrCreateLexicalScope(IA); // Create an abstract scope for inlined function. 
getOrCreateAbstractScope(Scope); diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index a8143bd8f4273..0037bdd270ff3 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -3721,7 +3721,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, TFI = MF.getSubtarget().getFrameLowering(); TFI->getCalleeSaves(MF, CalleeSavedRegs); MFI = &MF.getFrameInfo(); - LS.initialize(MF); + LS.scanFunction(MF); const auto &STI = MF.getSubtarget(); AdjustsStackInCalls = MFI->adjustsStack() && diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index 82e0c28f2f26c..b9ea03f949ef8 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -2231,7 +2231,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TFI->getCalleeSaves(MF, CalleeSavedRegs); this->ShouldEmitDebugEntryValues = ShouldEmitDebugEntryValues; - LS.initialize(MF); + LS.scanFunction(MF); bool Changed = false; bool OLChanged = false; diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index 9d98e6c085fe3..b049491b531fe 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -1263,7 +1263,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI, void LiveDebugVariables::LDVImpl::computeIntervals() { LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); for (const auto &UV : userValues) { UV->computeIntervals(MF->getRegInfo(), *TRI, *LIS, LS); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 79047f732808a..2c06c5ad4a5e4 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -976,11 +976,9 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, const 
TargetRegisterInfo *TRI) const { assert(getParent() && "Can't have an MBB reference here!"); assert(getMF() && "Can't have an MF reference here!"); - const MachineFunction &MF = *getMF(); - // Most opcodes have fixed constraints in their MCInstrDesc. if (!isInlineAsm()) - return TII->getRegClass(getDesc(), OpIdx, TRI, MF); + return TII->getRegClass(getDesc(), OpIdx, TRI); if (!getOperand(OpIdx).isReg()) return nullptr; @@ -1003,7 +1001,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, // Assume that all registers in a memory operand are pointers. if (F.isMemKind()) - return TRI->getPointerRegClass(MF); + return TRI->getPointerRegClass(); return nullptr; } diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 286fbfd373b59..4f164e2d53460 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -1420,7 +1420,7 @@ MachineInstr *MachineLICMImpl::ExtractHoistableLoad(MachineInstr *MI, if (NewOpc == 0) return nullptr; const MCInstrDesc &MID = TII->get(NewOpc); MachineFunction &MF = *MI->getMF(); - const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF); + const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI); // Ok, we're unfolding. Create a temporary register and do the unfold. Register Reg = MRI->createVirtualRegister(RC); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 01703fe09b79a..e911ce8a75828 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2376,20 +2376,24 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { // If we have only one valid type, this is likely a copy between a virtual // and physical register. 
- TypeSize SrcSize = TRI->getRegSizeInBits(SrcReg, *MRI); - TypeSize DstSize = TRI->getRegSizeInBits(DstReg, *MRI); + TypeSize SrcSize = TypeSize::getZero(); + TypeSize DstSize = TypeSize::getZero(); if (SrcReg.isPhysical() && DstTy.isValid()) { const TargetRegisterClass *SrcRC = TRI->getMinimalPhysRegClassLLT(SrcReg, DstTy); - if (SrcRC) - SrcSize = TRI->getRegSizeInBits(*SrcRC); + if (!SrcRC) + SrcSize = TRI->getRegSizeInBits(SrcReg, *MRI); + } else { + SrcSize = TRI->getRegSizeInBits(SrcReg, *MRI); } if (DstReg.isPhysical() && SrcTy.isValid()) { const TargetRegisterClass *DstRC = TRI->getMinimalPhysRegClassLLT(DstReg, SrcTy); - if (DstRC) - DstSize = TRI->getRegSizeInBits(*DstRC); + if (!DstRC) + DstSize = TRI->getRegSizeInBits(DstReg, *MRI); + } else { + DstSize = TRI->getRegSizeInBits(DstReg, *MRI); } // The next two checks allow COPY between physical and virtual registers, @@ -2636,7 +2640,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } if (MONum < MCID.getNumOperands()) { if (const TargetRegisterClass *DRC = - TII->getRegClass(MCID, MONum, TRI, *MF)) { + TII->getRegClass(MCID, MONum, TRI)) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); OS << printReg(Reg, TRI) << " is not a " @@ -2721,11 +2725,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { // comply to it. 
if (!isPreISelGenericOpcode(MCID.getOpcode()) && MONum < MCID.getNumOperands() && - TII->getRegClass(MCID, MONum, TRI, *MF)) { + TII->getRegClass(MCID, MONum, TRI)) { report("Virtual register does not match instruction constraint", MO, MONum); OS << "Expect register class " - << TRI->getRegClassName(TII->getRegClass(MCID, MONum, TRI, *MF)) + << TRI->getRegClassName(TII->getRegClass(MCID, MONum, TRI)) << " but got nothing\n"; return; } @@ -2752,7 +2756,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } if (MONum < MCID.getNumOperands()) { if (const TargetRegisterClass *DRC = - TII->getRegClass(MCID, MONum, TRI, *MF)) { + TII->getRegClass(MCID, MONum, TRI)) { if (SubIdx) { const TargetRegisterClass *SuperRC = TRI->getLargestLegalSuperClass(RC, *MF); diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 514f2f02d6425..b8486f6560c5f 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1374,7 +1374,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, } const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); - const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF); + const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI); if (!DefMI->isImplicitDef()) { if (DstReg.isPhysical()) { Register NewDstReg = DstReg; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d130efe96b56b..4b20b756f8a15 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10092,6 +10092,55 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N)) return Combined; + // fold (xor (smin(x, C), C)) -> select (x < C), xor(x, C), 0 + // fold (xor (smax(x, C), C)) -> select (x > C), xor(x, C), 0 + // fold (xor (umin(x, C), C)) -> select (x < C), xor(x, 
C), 0 + // fold (xor (umax(x, C), C)) -> select (x > C), xor(x, C), 0 + SDValue Op0; + if (sd_match(N0, m_OneUse(m_AnyOf(m_SMin(m_Value(Op0), m_Specific(N1)), + m_SMax(m_Value(Op0), m_Specific(N1)), + m_UMin(m_Value(Op0), m_Specific(N1)), + m_UMax(m_Value(Op0), m_Specific(N1)))))) { + + if (isa(N1) || + ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) { + // For vectors, only optimize when the constant is zero or all-ones to + // avoid generating more instructions + if (VT.isVector()) { + ConstantSDNode *N1C = isConstOrConstSplat(N1); + if (!N1C || (!N1C->isZero() && !N1C->isAllOnes())) + return SDValue(); + } + + // Avoid the fold if the minmax operation is legal and select is expensive + if (TLI.isOperationLegal(N0.getOpcode(), VT) && + TLI.isPredictableSelectExpensive()) + return SDValue(); + + EVT CCVT = getSetCCResultType(VT); + ISD::CondCode CC; + switch (N0.getOpcode()) { + case ISD::SMIN: + CC = ISD::SETLT; + break; + case ISD::SMAX: + CC = ISD::SETGT; + break; + case ISD::UMIN: + CC = ISD::SETULT; + break; + case ISD::UMAX: + CC = ISD::SETUGT; + break; + } + SDValue FN1 = DAG.getFreeze(N1); + SDValue Cmp = DAG.getSetCC(DL, CCVT, Op0, FN1, CC); + SDValue XorXC = DAG.getNode(ISD::XOR, DL, VT, Op0, FN1); + SDValue Zero = DAG.getConstant(0, DL, VT); + return DAG.getSelect(DL, VT, Cmp, XorXC, Zero); + } + } + return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 9467ba14cf895..851d445f75fa8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1964,8 +1964,7 @@ Register FastISel::createResultReg(const TargetRegisterClass *RC) { Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op, unsigned OpNum) { if (Op.isVirtual()) { - const TargetRegisterClass *RegClass = - TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF); + const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum, &TRI); if 
(!MRI.constrainRegClass(Op, RegClass)) { // If it's not legal to COPY between the register classes, something // has gone very wrong before we got here. diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 861f76e93f2ce..11bc64c626421 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -125,7 +125,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg, const TargetRegisterClass *RC = nullptr; if (i + II.getNumDefs() < II.getNumOperands()) { RC = TRI->getAllocatableClass( - TII->getRegClass(II, i + II.getNumDefs(), TRI, *MF)); + TII->getRegClass(II, i + II.getNumDefs(), TRI)); } if (!UseRC) UseRC = RC; @@ -197,7 +197,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, // register instead of creating a new vreg. Register VRBase; const TargetRegisterClass *RC = - TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF)); + TRI->getAllocatableClass(TII->getRegClass(II, i, TRI)); // Always let the value type influence the used register class. The // constraints on the instruction may be too lax to represent the value // type correctly. For example, a 64-bit float (X86::FR64) can't live in @@ -330,7 +330,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, if (II) { const TargetRegisterClass *OpRC = nullptr; if (IIOpNum < II->getNumOperands()) - OpRC = TII->getRegClass(*II, IIOpNum, TRI, *MF); + OpRC = TII->getRegClass(*II, IIOpNum, TRI); if (OpRC) { unsigned MinNumRegs = MinRCSize; @@ -409,7 +409,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, SDValue Op, Register VReg = R->getReg(); MVT OpVT = Op.getSimpleValueType(); const TargetRegisterClass *IIRC = - II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF)) + II ? 
TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI)) : nullptr; const TargetRegisterClass *OpRC = TLI->isTypeLegal(OpVT) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index bcfc2c5dc9f83..5fb7e63cfb605 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -585,8 +585,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(IncrementSize), dl); Hi = DAG.getNode( ISD::SRL, dl, Value.getValueType(), Value, - DAG.getConstant(RoundWidth, dl, - TLI.getShiftAmountTy(Value.getValueType(), DL))); + DAG.getShiftAmountConstant(RoundWidth, Value.getValueType(), dl)); Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT, ST->getBaseAlign(), MMOFlags, AAInfo); @@ -596,8 +595,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { // Store the top RoundWidth bits. Hi = DAG.getNode( ISD::SRL, dl, Value.getValueType(), Value, - DAG.getConstant(ExtraWidth, dl, - TLI.getShiftAmountTy(Value.getValueType(), DL))); + DAG.getShiftAmountConstant(ExtraWidth, Value.getValueType(), dl)); Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), RoundVT, ST->getBaseAlign(), MMOFlags, AAInfo); @@ -816,8 +814,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // Move the top bits to the right place. Hi = DAG.getNode( ISD::SHL, dl, Hi.getValueType(), Hi, - DAG.getConstant(RoundWidth, dl, - TLI.getShiftAmountTy(Hi.getValueType(), DL))); + DAG.getShiftAmountConstant(RoundWidth, Hi.getValueType(), dl)); // Join the hi and lo parts. Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); @@ -845,8 +842,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // Move the top bits to the right place. 
Hi = DAG.getNode( ISD::SHL, dl, Hi.getValueType(), Hi, - DAG.getConstant(ExtraWidth, dl, - TLI.getShiftAmountTy(Hi.getValueType(), DL))); + DAG.getShiftAmountConstant(ExtraWidth, Hi.getValueType(), dl)); // Join the hi and lo parts. Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); @@ -2767,8 +2763,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node, SDValue SignBitTest = DAG.getSetCC( dl, SetCCVT, Op0, DAG.getConstant(0, dl, SrcVT), ISD::SETLT); - EVT ShiftVT = TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout()); - SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT); + SDValue ShiftConst = DAG.getShiftAmountConstant(1, SrcVT, dl); SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, ShiftConst); SDValue AndConst = DAG.getConstant(1, dl, SrcVT); SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Op0, AndConst); @@ -3350,10 +3345,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } else { Op = DAG.getAnyExtOrTrunc(Op, dl, MVT::i32); } - Op = DAG.getNode( - ISD::SHL, dl, MVT::i32, Op, - DAG.getConstant(16, dl, - TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout()))); + Op = DAG.getNode(ISD::SHL, dl, MVT::i32, Op, + DAG.getShiftAmountConstant(16, MVT::i32, dl)); Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op); // Add fp_extend in case the output is bigger than f32. if (Node->getValueType(0) != MVT::f32) @@ -3370,10 +3363,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if (!DAG.isKnownNeverSNaN(Op)) { Op = DAG.getNode(ISD::FCANONICALIZE, dl, MVT::f32, Op, Node->getFlags()); } - Op = DAG.getNode( - ISD::SRL, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op), - DAG.getConstant(16, dl, - TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout()))); + Op = DAG.getNode(ISD::SRL, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op), + DAG.getShiftAmountConstant(16, MVT::i32, dl)); // The result of this node can be bf16 or an integer type in case bf16 is // not supported on the target and was softened to i16 for storage. 
if (Node->getValueType(0) == MVT::bf16) { @@ -3431,13 +3423,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // NOTE: we could fall back on load/store here too for targets without // SRA. However, it is doubtful that any exist. - EVT ShiftAmountTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); unsigned BitsDiff = VT.getScalarSizeInBits() - ExtraVT.getScalarSizeInBits(); - SDValue ShiftCst = DAG.getConstant(BitsDiff, dl, ShiftAmountTy); - Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0), - Node->getOperand(0), ShiftCst); - Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst); + SDValue ShiftCst = DAG.getShiftAmountConstant(BitsDiff, VT, dl); + Tmp1 = DAG.getNode(ISD::SHL, dl, VT, Node->getOperand(0), ShiftCst); + Tmp1 = DAG.getNode(ISD::SRA, dl, VT, Tmp1, ShiftCst); Results.push_back(Tmp1); break; } @@ -3666,11 +3656,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT OpTy = Node->getOperand(0).getValueType(); if (Node->getConstantOperandVal(1)) { // 1 -> Hi - Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0), - DAG.getConstant(OpTy.getSizeInBits() / 2, dl, - TLI.getShiftAmountTy( - Node->getOperand(0).getValueType(), - DAG.getDataLayout()))); + Tmp1 = DAG.getNode( + ISD::SRL, dl, OpTy, Node->getOperand(0), + DAG.getShiftAmountConstant(OpTy.getSizeInBits() / 2, OpTy, dl)); Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1); } else { // 0 -> Lo @@ -3950,9 +3938,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { for (unsigned i = 0; i < 2; ++i) { SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Halves[2 * i]); SDValue Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Halves[2 * i + 1]); - SDValue Shift = DAG.getConstant( - HalfType.getScalarSizeInBits(), dl, - TLI.getShiftAmountTy(HalfType, DAG.getDataLayout())); + SDValue Shift = + DAG.getShiftAmountConstant(HalfType.getScalarSizeInBits(), VT, dl); Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift); Results.push_back(DAG.getNode(ISD::OR, dl, VT, 
Lo, Hi)); } @@ -3999,8 +3986,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo); Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi); SDValue Shift = - DAG.getConstant(HalfType.getSizeInBits(), dl, - TLI.getShiftAmountTy(HalfType, DAG.getDataLayout())); + DAG.getShiftAmountConstant(HalfType.getSizeInBits(), VT, dl); Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift); Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi)); } @@ -4130,8 +4116,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1)); Tmp2 = DAG.getNode( ISD::SHL, dl, PairTy, Tmp2, - DAG.getConstant(PairTy.getSizeInBits() / 2, dl, - TLI.getShiftAmountTy(PairTy, DAG.getDataLayout()))); + DAG.getShiftAmountConstant(PairTy.getSizeInBits() / 2, PairTy, dl)); Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2)); break; } @@ -5368,10 +5353,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits(); Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0)); Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1); - Tmp1 = DAG.getNode( - ISD::SRL, dl, NVT, Tmp1, - DAG.getConstant(DiffBits, dl, - TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); + Tmp1 = DAG.getNode(ISD::SRL, dl, NVT, Tmp1, + DAG.getShiftAmountConstant(DiffBits, NVT, dl)); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1)); break; @@ -5483,11 +5466,9 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1)); Tmp1 = DAG.getNode(ISD::MUL, dl, NVT, Tmp1, Tmp2); - auto &DL = DAG.getDataLayout(); unsigned OriginalSize = OVT.getScalarSizeInBits(); - Tmp2 = DAG.getNode( - ISD::SRL, dl, NVT, Tmp1, - DAG.getConstant(OriginalSize, dl, TLI.getScalarShiftAmountTy(DL, NVT))); + Tmp2 = DAG.getNode(ISD::SRL, dl, NVT, Tmp1, + DAG.getShiftAmountConstant(OriginalSize, NVT, dl)); 
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1)); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2)); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 9e85f08abb766..88a4a8b16373b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1613,7 +1613,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z % bw)). if (NewBits >= (2 * OldBits) && !isa(Amt) && !TLI.isOperationLegalOrCustom(Opcode, VT)) { - SDValue HiShift = DAG.getConstant(OldBits, DL, VT); + SDValue HiShift = DAG.getShiftAmountConstant(OldBits, VT, DL); Hi = DAG.getNode(ISD::SHL, DL, VT, Hi, HiShift); Lo = DAG.getZeroExtendInReg(Lo, DL, OldVT); SDValue Res = DAG.getNode(ISD::OR, DL, VT, Hi, Lo); @@ -1624,13 +1624,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { } // Shift Lo up to occupy the upper bits of the promoted type. - SDValue ShiftOffset = DAG.getConstant(NewBits - OldBits, DL, AmtVT); - Lo = DAG.getNode(ISD::SHL, DL, VT, Lo, ShiftOffset); + Lo = DAG.getNode(ISD::SHL, DL, VT, Lo, + DAG.getShiftAmountConstant(NewBits - OldBits, VT, DL)); // Increase Amount to shift the result into the lower bits of the promoted // type. if (IsFSHR) - Amt = DAG.getNode(ISD::ADD, DL, AmtVT, Amt, ShiftOffset); + Amt = DAG.getNode(ISD::ADD, DL, AmtVT, Amt, + DAG.getConstant(NewBits - OldBits, DL, AmtVT)); return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amt); } @@ -1938,9 +1939,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) { for (unsigned i = 1; i < NumRegs; ++i) { SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]); // Shift it to the right position and "or" it in. 
- Part = DAG.getNode(ISD::SHL, dl, NVT, Part, - DAG.getConstant(i * RegVT.getSizeInBits(), dl, - TLI.getPointerTy(DAG.getDataLayout()))); + Part = DAG.getNode( + ISD::SHL, dl, NVT, Part, + DAG.getShiftAmountConstant(i * RegVT.getSizeInBits(), NVT, dl)); Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part); } @@ -2293,9 +2294,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) { assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?"); SDLoc dl(N); - Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi, - DAG.getConstant(OVT.getSizeInBits(), dl, - TLI.getPointerTy(DAG.getDataLayout()))); + Hi = DAG.getNode( + ISD::SHL, dl, N->getValueType(0), Hi, + DAG.getShiftAmountConstant(OVT.getSizeInBits(), N->getValueType(0), dl)); return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi); } @@ -3943,8 +3944,7 @@ void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N, Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT)); // The high part replicates the sign bit of Lo, make it explicit. Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, - DAG.getConstant(NVTBits - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + DAG.getShiftAmountConstant(NVTBits - 1, NVT, dl)); } } @@ -4329,8 +4329,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, // lo part. unsigned LoSize = Lo.getValueSizeInBits(); Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + DAG.getShiftAmountConstant(LoSize - 1, NVT, dl)); } else if (ExtType == ISD::ZEXTLOAD) { // The high part is just a zero. Hi = DAG.getConstant(0, dl, NVT); @@ -4391,13 +4390,12 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, Lo = DAG.getNode( ISD::OR, dl, NVT, Lo, DAG.getNode(ISD::SHL, dl, NVT, Hi, - DAG.getConstant(ExcessBits, dl, - TLI.getPointerTy(DAG.getDataLayout())))); + DAG.getShiftAmountConstant(ExcessBits, NVT, dl))); // Move high bits to the right position in Hi. Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? 
ISD::SRA : ISD::SRL, dl, NVT, Hi, - DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + DAG.getShiftAmountConstant( + NVT.getSizeInBits() - ExcessBits, NVT, dl)); } } @@ -5088,9 +5086,8 @@ void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N, Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0)); // The high part is obtained by SRA'ing all but one of the bits of low part. unsigned LoSize = NVT.getSizeInBits(); - Hi = DAG.getNode( - ISD::SRA, dl, NVT, Lo, - DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); + Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, + DAG.getShiftAmountConstant(LoSize - 1, NVT, dl)); } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. @@ -5123,8 +5120,8 @@ ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) { // The high part gets the sign extension from the lo-part. This handles // things like sextinreg V:i64 from i8. Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo, - DAG.getConstant(Hi.getValueSizeInBits() - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + DAG.getShiftAmountConstant(Hi.getValueSizeInBits() - 1, + Hi.getValueType(), dl)); } else { // For example, extension of an i48 to an i64. Leave the low part alone, // sext_inreg the high part. 
@@ -5166,12 +5163,12 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue InOp = N->getOperand(0); + EVT InVT = InOp.getValueType(); SDLoc dl(N); - Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0)); - Hi = DAG.getNode(ISD::SRL, dl, N->getOperand(0).getValueType(), - N->getOperand(0), - DAG.getConstant(NVT.getSizeInBits(), dl, - TLI.getPointerTy(DAG.getDataLayout()))); + Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, InOp); + Hi = DAG.getNode(ISD::SRL, dl, InVT, InOp, + DAG.getShiftAmountConstant(NVT.getSizeInBits(), InVT, dl)); Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi); } @@ -5254,9 +5251,9 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue MulLo, MulHi; TLI.forceExpandWideMUL(DAG, dl, /*Signed=*/true, N->getOperand(0), N->getOperand(1), MulLo, MulHi); - SDValue SRA = - DAG.getNode(ISD::SRA, dl, VT, MulLo, - DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT)); + SDValue SRA = DAG.getNode( + ISD::SRA, dl, VT, MulLo, + DAG.getShiftAmountConstant(VT.getScalarSizeInBits() - 1, VT, dl)); SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), MulHi, SRA, ISD::SETNE); SplitInteger(MulLo, Lo, Hi); @@ -5929,14 +5926,13 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { if (ExcessBits < NVT.getSizeInBits()) { // Transfer high bits from the top of Lo to the bottom of Hi. 
- Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi, - DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + Hi = DAG.getNode( + ISD::SHL, dl, NVT, Hi, + DAG.getShiftAmountConstant(NVT.getSizeInBits() - ExcessBits, NVT, dl)); Hi = DAG.getNode( ISD::OR, dl, NVT, Hi, DAG.getNode(ISD::SRL, dl, NVT, Lo, - DAG.getConstant(ExcessBits, dl, - TLI.getPointerTy(DAG.getDataLayout())))); + DAG.getShiftAmountConstant(ExcessBits, NVT, dl))); } // Store both the high bits and maybe some of the low bits. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 83fade45d1892..f14eeda639e71 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -1001,11 +1001,10 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { EVT NVT = EVT::getIntegerVT(*DAG.getContext(), LVT.getSizeInBits() + HVT.getSizeInBits()); - EVT ShiftAmtVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo); Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi); Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi, - DAG.getConstant(LVT.getSizeInBits(), dlHi, ShiftAmtVT)); + DAG.getShiftAmountConstant(LVT.getSizeInBits(), NVT, dlHi)); return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi); } @@ -1026,14 +1025,9 @@ void DAGTypeLegalizer::SplitInteger(SDValue Op, assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() == Op.getValueSizeInBits() && "Invalid integer splitting!"); Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op); - unsigned ReqShiftAmountInBits = - Log2_32_Ceil(Op.getValueType().getSizeInBits()); - MVT ShiftAmountTy = - TLI.getScalarShiftAmountTy(DAG.getDataLayout(), Op.getValueType()); - if (ReqShiftAmountInBits > ShiftAmountTy.getSizeInBits()) - ShiftAmountTy = MVT::getIntegerVT(NextPowerOf2(ReqShiftAmountInBits)); - Hi = DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op, - DAG.getConstant(LoVT.getSizeInBits(), dl, 
ShiftAmountTy)); + Hi = DAG.getNode( + ISD::SRL, dl, Op.getValueType(), Op, + DAG.getShiftAmountConstant(LoVT.getSizeInBits(), Op.getValueType(), dl)); Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 118fd8418f787..ff7cd665446cc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -5945,10 +5945,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { // interesting bits will end up at the wrong place. if (DAG.getDataLayout().isBigEndian()) { unsigned ShiftAmt = NInVT.getSizeInBits() - InVT.getSizeInBits(); - EVT ShiftAmtTy = TLI.getShiftAmountTy(NInVT, DAG.getDataLayout()); - assert(ShiftAmt < WidenVT.getSizeInBits() && "Too large shift amount!"); NInOp = DAG.getNode(ISD::SHL, dl, NInVT, NInOp, - DAG.getConstant(ShiftAmt, dl, ShiftAmtTy)); + DAG.getShiftAmountConstant(ShiftAmt, NInVT, dl)); } return DAG.getNode(ISD::BITCAST, dl, WidenVT, NInOp); } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index a570b71ecd28d..f70b6cddcc099 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -340,7 +340,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, unsigned Idx = RegDefPos.GetIdx(); const MCInstrDesc &Desc = TII->get(Opcode); - const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI, MF); + const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI); assert(RC && "Not a valid register class"); RegClass = RC->getID(); // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index bcf25958d0982..5b2c09ffecbe2 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8247,8 +8247,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except // for scalable vectors where we will generate appropriate code to // deal with out-of-bounds cases correctly. - if (N3C && N1.getValueType().isFixedLengthVector() && - N3C->getZExtValue() >= N1.getValueType().getVectorNumElements()) + if (N3C && VT.isFixedLengthVector() && + N3C->getZExtValue() >= VT.getVectorNumElements()) return getUNDEF(VT); // Undefined index can be assumed out-of-bounds, so that's UNDEF too. @@ -9053,6 +9053,18 @@ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, } } +static bool isInTailCallPositionWrapper(const CallInst *CI, + const SelectionDAG *SelDAG, + bool AllowReturnsFirstArg) { + if (!CI || !CI->isTailCall()) + return false; + // TODO: Fix "returns-first-arg" determination so it doesn't depend on which + // helper symbol we lower to. 
+ return isInTailCallPosition(*CI, SelDAG->getTarget(), + AllowReturnsFirstArg && + funcReturnsFirstArgOfCall(*CI)); +} + std::pair SelectionDAG::getMemcmp(SDValue Chain, const SDLoc &dl, SDValue Mem0, SDValue Mem1, SDValue Size, const CallInst *CI) { @@ -9067,10 +9079,8 @@ SelectionDAG::getMemcmp(SDValue Chain, const SDLoc &dl, SDValue Mem0, {Size, getDataLayout().getIntPtrType(*getContext())}}; TargetLowering::CallLoweringInfo CLI(*this); - bool IsTailCall = false; - bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI); - IsTailCall = CI && CI->isTailCall() && - isInTailCallPosition(*CI, getTarget(), ReturnsFirstArg); + bool IsTailCall = + isInTailCallPositionWrapper(CI, this, /*AllowReturnsFirstArg*/ true); CLI.setDebugLoc(dl) .setChain(Chain) @@ -9148,10 +9158,7 @@ SDValue SelectionDAG::getMemcpy( IsTailCall = *OverrideTailCall; } else { bool LowersToMemcpy = StringRef(MemCpyName) == StringRef("memcpy"); - bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI); - IsTailCall = CI && CI->isTailCall() && - isInTailCallPosition(*CI, getTarget(), - ReturnsFirstArg && LowersToMemcpy); + IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemcpy); } CLI.setDebugLoc(dl) @@ -9255,10 +9262,7 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, } else { bool LowersToMemmove = TLI->getLibcallName(RTLIB::MEMMOVE) == StringRef("memmove"); - bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI); - IsTailCall = CI && CI->isTailCall() && - isInTailCallPosition(*CI, getTarget(), - ReturnsFirstArg && LowersToMemmove); + IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemmove); } CLI.setDebugLoc(dl) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 430e47451fd49..070d7978ce48f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -223,10 +223,9 @@ 
getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, std::swap(Lo, Hi); EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi); - Hi = DAG.getNode(ISD::SHL, DL, TotalVT, Hi, - DAG.getConstant(Lo.getValueSizeInBits(), DL, - TLI.getShiftAmountTy( - TotalVT, DAG.getDataLayout()))); + Hi = DAG.getNode( + ISD::SHL, DL, TotalVT, Hi, + DAG.getShiftAmountConstant(Lo.getValueSizeInBits(), TotalVT, DL)); Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo); Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi); } @@ -4469,9 +4468,10 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (ElementMul != 1) { if (ElementMul.isPowerOf2()) { unsigned Amt = ElementMul.logBase2(); - IdxN = DAG.getNode(ISD::SHL, dl, N.getValueType(), IdxN, - DAG.getConstant(Amt, dl, IdxN.getValueType()), - ScaleFlags); + IdxN = DAG.getNode( + ISD::SHL, dl, N.getValueType(), IdxN, + DAG.getShiftAmountConstant(Amt, N.getValueType(), dl), + ScaleFlags); } else { SDValue Scale = DAG.getConstant(ElementMul.getZExtValue(), dl, IdxN.getValueType()); @@ -5460,10 +5460,8 @@ static SDValue GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI, const SDLoc &dl) { SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op, DAG.getConstant(0x7f800000, dl, MVT::i32)); - SDValue t1 = DAG.getNode( - ISD::SRL, dl, MVT::i32, t0, - DAG.getConstant(23, dl, - TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout()))); + SDValue t1 = DAG.getNode(ISD::SRL, dl, MVT::i32, t0, + DAG.getShiftAmountConstant(23, MVT::i32, dl)); SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1, DAG.getConstant(127, dl, MVT::i32)); return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2); @@ -5488,11 +5486,8 @@ static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl, SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1); // IntegerPartOfX <<= 23; - IntegerPartOfX = - DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX, - 
DAG.getConstant(23, dl, - DAG.getTargetLoweringInfo().getShiftAmountTy( - MVT::i32, DAG.getDataLayout()))); + IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX, + DAG.getShiftAmountConstant(23, MVT::i32, dl)); SDValue TwoToFractionalPartOfX; if (LimitFloatPrecision <= 6) { @@ -8107,7 +8102,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, Trunc); return; } - case Intrinsic::experimental_vector_partial_reduce_add: { + case Intrinsic::vector_partial_reduce_add: { if (!TLI.shouldExpandPartialReductionIntrinsic(cast(&I))) { visitTargetIntrinsic(I, Intrinsic); return; diff --git a/llvm/lib/CodeGen/StaticDataAnnotator.cpp b/llvm/lib/CodeGen/StaticDataAnnotator.cpp index 2d9b489a80acb..53a9ab4dbda02 100644 --- a/llvm/lib/CodeGen/StaticDataAnnotator.cpp +++ b/llvm/lib/CodeGen/StaticDataAnnotator.cpp @@ -91,8 +91,7 @@ bool StaticDataAnnotator::runOnModule(Module &M) { if (SectionPrefix.empty()) continue; - GV.setSectionPrefix(SectionPrefix); - Changed = true; + Changed |= GV.setSectionPrefix(SectionPrefix); } return Changed; diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 5d720fbbf1c61..9b1420a94142d 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -375,9 +375,14 @@ void TailDuplicator::processPHI( if (!Remove) return; - // Remove PredBB from the PHI node. - MI->removeOperand(SrcOpIdx + 1); - MI->removeOperand(SrcOpIdx); + // MI might have multiple entries for PredBB. Need to remove them all. 
+ for (unsigned N = MI->getNumOperands(); N > 2; N -= 2) { + if (MI->getOperand(N - 1).getMBB() == PredBB) { + MI->removeOperand(N - 1); + MI->removeOperand(N - 2); + } + } + if (MI->getNumOperands() == 1 && !TailBB->hasAddressTaken()) MI->eraseFromParent(); else if (MI->getNumOperands() == 1) diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 0d7b128fc736e..b0009560d3fcb 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -58,16 +58,15 @@ static cl::opt MaxAccumulatorWidth( TargetInstrInfo::~TargetInstrInfo() = default; -const TargetRegisterClass* +const TargetRegisterClass * TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI, - const MachineFunction &MF) const { + const TargetRegisterInfo *TRI) const { if (OpNum >= MCID.getNumOperands()) return nullptr; short RegClass = MCID.operands()[OpNum].RegClass; if (MCID.operands()[OpNum].isLookupPtrRegClass()) - return TRI->getPointerRegClass(MF, RegClass); + return TRI->getPointerRegClass(RegClass); // Instructions like INSERT_SUBREG do not have fixed register classes. if (RegClass < 0) diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 701a9f8d72a65..c9e46182decc2 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -412,25 +412,21 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, return BestRC; } -/// Check if the registers defined by the pair (RegisterClass, SubReg) -/// share the same register file. 
-static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, - const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) { +const TargetRegisterClass *TargetRegisterInfo::findCommonRegClass( + const TargetRegisterClass *DefRC, unsigned DefSubReg, + const TargetRegisterClass *SrcRC, unsigned SrcSubReg) const { // Same register class. // // When processing uncoalescable copies / bitcasts, it is possible we reach // here with the same register class, but mismatched subregister indices. if (DefRC == SrcRC && DefSubReg == SrcSubReg) - return true; + return DefRC; // Both operands are sub registers. Check if they share a register class. unsigned SrcIdx, DefIdx; if (SrcSubReg && DefSubReg) { - return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, - SrcIdx, DefIdx) != nullptr; + return getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, SrcIdx, + DefIdx); } // At most one of the register is a sub register, make it Src to avoid @@ -442,18 +438,10 @@ static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, // One of the register is a sub register, check if we can get a superclass. if (SrcSubReg) - return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; + return getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg); // Plain copy. - return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; -} - -bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const { - // If this source does not incur a cross register bank copy, use it. 
- return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg); + return getCommonSubClass(DefRC, SrcRC); } float TargetRegisterInfo::getSpillWeightScaleFactor( diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 8d94b40a41bea..414e414738b71 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1401,9 +1401,8 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( if (UnfoldMCID.getNumDefs() == 1) { // Unfold the load. LLVM_DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); - const TargetRegisterClass *RC = - TRI->getAllocatableClass( - TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF)); + const TargetRegisterClass *RC = TRI->getAllocatableClass( + TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI)); Register Reg = MRI->createVirtualRegister(RC); SmallVector NewMIs; if (!TII->unfoldMemoryOperand(*MF, MI, Reg, diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp index 078ebf4e7c032..fcd2316c30aef 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp @@ -53,6 +53,8 @@ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS, return false; } + std::optional SubOpcode = Op->getSubCode(); + // In "register-only" mode, still show simple constant-valued locations. // This lets clients print annotations like "i = 0" when the location is // a constant (e.g. DW_OP_constu/consts ... DW_OP_stack_value). 
@@ -63,7 +65,9 @@ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS, if ((Op->getCode() >= DW_OP_breg0 && Op->getCode() <= DW_OP_breg31) || (Op->getCode() >= DW_OP_reg0 && Op->getCode() <= DW_OP_reg31) || Op->getCode() == DW_OP_bregx || Op->getCode() == DW_OP_regx || - Op->getCode() == DW_OP_regval_type) { + Op->getCode() == DW_OP_regval_type || + SubOpcode == DW_OP_LLVM_call_frame_entry_reg || + SubOpcode == DW_OP_LLVM_aspace_bregx) { if (prettyPrintRegisterOp(U, OS, DumpOpts, Op->getCode(), Op->getRawOperands())) return true; @@ -93,12 +97,20 @@ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS, StringRef Name = OperationEncodingString(Op->getCode()); assert(!Name.empty() && "DW_OP has no name!"); OS << Name; + + if (SubOpcode) { + StringRef SubName = SubOperationEncodingString(Op->getCode(), *SubOpcode); + assert(!SubName.empty() && "DW_OP SubOp has no name!"); + OS << ' ' << SubName; + } } if ((Op->getCode() >= DW_OP_breg0 && Op->getCode() <= DW_OP_breg31) || (Op->getCode() >= DW_OP_reg0 && Op->getCode() <= DW_OP_reg31) || Op->getCode() == DW_OP_bregx || Op->getCode() == DW_OP_regx || - Op->getCode() == DW_OP_regval_type) + Op->getCode() == DW_OP_regval_type || + SubOpcode == DW_OP_LLVM_call_frame_entry_reg || + SubOpcode == DW_OP_LLVM_aspace_bregx) if (prettyPrintRegisterOp(U, OS, DumpOpts, Op->getCode(), Op->getRawOperands())) return true; @@ -110,10 +122,8 @@ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS, unsigned Signed = Size & DWARFExpression::Operation::SignBit; if (Size == DWARFExpression::Operation::SizeSubOpLEB) { - StringRef SubName = SubOperationEncodingString( - Op->getCode(), Op->getRawOperand(Operand)); - assert(!SubName.empty() && "DW_OP SubOp has no name!"); - OS << " " << SubName; + assert(Operand == 0 && "DW_OP SubOp must be the first operand"); + assert(SubOpcode && "DW_OP SubOp description is inconsistent"); } else if (Size == DWARFExpression::Operation::BaseTypeRef 
&& U) { // For DW_OP_convert the operand may be 0 to indicate that conversion to // the generic type should be done. The same holds for @@ -210,6 +220,19 @@ static bool printCompactDWARFExpr( nullptr) { SmallVector Stack; + auto UnknownOpcode = [](raw_ostream &OS, uint8_t Opcode, + std::optional SubOpcode) -> bool { + // If we hit an unknown operand, we don't know its effect on the stack, + // so bail out on the whole expression. + OS << ""; + return false; + }; + while (I != E) { const DWARFExpression::Operation &Op = *I; uint8_t Opcode = Op.getCode(); @@ -262,8 +285,10 @@ static bool printCompactDWARFExpr( break; } case dwarf::DW_OP_LLVM_user: { - assert(Op.getSubCode() == dwarf::DW_OP_LLVM_nop); - break; + std::optional SubOpcode = Op.getSubCode(); + if (SubOpcode == dwarf::DW_OP_LLVM_nop) + break; + return UnknownOpcode(OS, Opcode, SubOpcode); } default: if (Opcode >= dwarf::DW_OP_reg0 && Opcode <= dwarf::DW_OP_reg31) { @@ -287,11 +312,7 @@ static bool printCompactDWARFExpr( if (Offset) S << format("%+" PRId64, Offset); } else { - // If we hit an unknown operand, we don't know its effect on the stack, - // so bail out on the whole expression. 
- OS << ""; - return false; + return UnknownOpcode(OS, Opcode, std::nullopt); } break; } @@ -326,8 +347,13 @@ bool prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS, uint64_t DwarfRegNum; unsigned OpNum = 0; + std::optional SubOpcode; + if (Opcode == DW_OP_LLVM_user) + SubOpcode = Operands[OpNum++]; + if (Opcode == DW_OP_bregx || Opcode == DW_OP_regx || - Opcode == DW_OP_regval_type) + Opcode == DW_OP_regval_type || SubOpcode == DW_OP_LLVM_aspace_bregx || + SubOpcode == DW_OP_LLVM_call_frame_entry_reg) DwarfRegNum = Operands[OpNum++]; else if (Opcode >= DW_OP_breg0 && Opcode < DW_OP_bregx) DwarfRegNum = Opcode - DW_OP_breg0; @@ -337,7 +363,7 @@ bool prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS, auto RegName = DumpOpts.GetNameForDWARFReg(DwarfRegNum, DumpOpts.IsEH); if (!RegName.empty()) { if ((Opcode >= DW_OP_breg0 && Opcode <= DW_OP_breg31) || - Opcode == DW_OP_bregx) + Opcode == DW_OP_bregx || SubOpcode == DW_OP_LLVM_aspace_bregx) OS << ' ' << RegName << format("%+" PRId64, Operands[OpNum]); else OS << ' ' << RegName.data(); diff --git a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp index 9a7f7d1ca2a67..61bd6fcb65bfa 100644 --- a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp +++ b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp @@ -130,6 +130,23 @@ static std::vector getSubOpDescriptions() { std::vector Descriptions; Descriptions.resize(LlvmUserDescriptionsSize); Descriptions[DW_OP_LLVM_nop] = Desc(Op::Dwarf5, Op::SizeSubOpLEB); + Descriptions[DW_OP_LLVM_form_aspace_address] = + Desc(Op::Dwarf5, Op::SizeSubOpLEB); + Descriptions[DW_OP_LLVM_push_lane] = Desc(Op::Dwarf5, Op::SizeSubOpLEB); + Descriptions[DW_OP_LLVM_offset] = Desc(Op::Dwarf5, Op::SizeSubOpLEB); + Descriptions[DW_OP_LLVM_offset_uconst] = + Desc(Op::Dwarf5, Op::SizeSubOpLEB, Op::SizeLEB); + Descriptions[DW_OP_LLVM_bit_offset] = Desc(Op::Dwarf5, Op::SizeSubOpLEB); + Descriptions[DW_OP_LLVM_call_frame_entry_reg] = + 
Desc(Op::Dwarf5, Op::SizeSubOpLEB, Op::SizeLEB); + Descriptions[DW_OP_LLVM_undefined] = Desc(Op::Dwarf5, Op::SizeSubOpLEB); + Descriptions[DW_OP_LLVM_aspace_bregx] = + Desc(Op::Dwarf5, Op::SizeSubOpLEB, Op::SizeLEB, Op::SizeLEB); + Descriptions[DW_OP_LLVM_piece_end] = Desc(Op::Dwarf5, Op::SizeSubOpLEB); + Descriptions[DW_OP_LLVM_extend] = + Desc(Op::Dwarf5, Op::SizeSubOpLEB, Op::SizeLEB, Op::SizeLEB); + Descriptions[DW_OP_LLVM_select_bit_piece] = + Desc(Op::Dwarf5, Op::SizeSubOpLEB, Op::SizeLEB, Op::SizeLEB); return Descriptions; } @@ -164,6 +181,8 @@ bool DWARFExpression::Operation::extract(DataExtractor Data, return false; assert(Desc.Op[Operand] == Operation::SizeSubOpLEB && "SizeSubOpLEB Description must begin with SizeSubOpLEB operand"); + Operands.resize(Desc.Op.size()); + OperandEndOffsets.resize(Desc.Op.size()); break; case Operation::Size1: Operands[Operand] = Data.getU8(&Offset); diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index 31605e3900341..f29f2c7602fc6 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -52,13 +52,15 @@ static std::optional extractMdStringValue(MDNode *Node, return NodeText->getString(); } -static Expected -extractShaderVisibility(MDNode *Node, unsigned int OpId) { +template && + std::is_same_v, uint32_t>>> +Expected extractEnumValue(MDNode *Node, unsigned int OpId, StringRef ErrText, + llvm::function_ref VerifyFn) { if (std::optional Val = extractMdIntValue(Node, OpId)) { - if (!dxbc::isValidShaderVisibility(*Val)) - return make_error>( - "ShaderVisibility", *Val); - return dxbc::ShaderVisibility(*Val); + if (!VerifyFn(*Val)) + return make_error>(ErrText, *Val); + return static_cast(*Val); } return make_error("ShaderVisibility"); } @@ -233,7 +235,9 @@ Error MetadataParser::parseRootConstants(mcdxbc::RootSignatureDesc &RSD, return make_error("RootConstants Element"); Expected Visibility = - 
extractShaderVisibility(RootConstantNode, 1); + extractEnumValue(RootConstantNode, 1, + "ShaderVisibility", + dxbc::isValidShaderVisibility); if (auto E = Visibility.takeError()) return Error(std::move(E)); @@ -287,7 +291,9 @@ Error MetadataParser::parseRootDescriptors( } Expected Visibility = - extractShaderVisibility(RootDescriptorNode, 1); + extractEnumValue(RootDescriptorNode, 1, + "ShaderVisibility", + dxbc::isValidShaderVisibility); if (auto E = Visibility.takeError()) return Error(std::move(E)); @@ -380,7 +386,9 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD, return make_error("Descriptor Table"); Expected Visibility = - extractShaderVisibility(DescriptorTableNode, 1); + extractEnumValue(DescriptorTableNode, 1, + "ShaderVisibility", + dxbc::isValidShaderVisibility); if (auto E = Visibility.takeError()) return Error(std::move(E)); @@ -406,26 +414,34 @@ Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD, if (StaticSamplerNode->getNumOperands() != 14) return make_error("Static Sampler"); - dxbc::RTS0::v1::StaticSampler Sampler; - if (std::optional Val = extractMdIntValue(StaticSamplerNode, 1)) - Sampler.Filter = *Val; - else - return make_error("Filter"); + mcdxbc::StaticSampler Sampler; - if (std::optional Val = extractMdIntValue(StaticSamplerNode, 2)) - Sampler.AddressU = *Val; - else - return make_error("AddressU"); + Expected Filter = extractEnumValue( + StaticSamplerNode, 1, "Filter", dxbc::isValidSamplerFilter); + if (auto E = Filter.takeError()) + return Error(std::move(E)); + Sampler.Filter = *Filter; - if (std::optional Val = extractMdIntValue(StaticSamplerNode, 3)) - Sampler.AddressV = *Val; - else - return make_error("AddressV"); + Expected AddressU = + extractEnumValue( + StaticSamplerNode, 2, "AddressU", dxbc::isValidAddress); + if (auto E = AddressU.takeError()) + return Error(std::move(E)); + Sampler.AddressU = *AddressU; - if (std::optional Val = extractMdIntValue(StaticSamplerNode, 4)) - 
Sampler.AddressW = *Val; - else - return make_error("AddressW"); + Expected AddressV = + extractEnumValue( + StaticSamplerNode, 3, "AddressV", dxbc::isValidAddress); + if (auto E = AddressV.takeError()) + return Error(std::move(E)); + Sampler.AddressV = *AddressV; + + Expected AddressW = + extractEnumValue( + StaticSamplerNode, 4, "AddressW", dxbc::isValidAddress); + if (auto E = AddressW.takeError()) + return Error(std::move(E)); + Sampler.AddressW = *AddressW; if (std::optional Val = extractMdFloatValue(StaticSamplerNode, 5)) Sampler.MipLODBias = *Val; @@ -437,15 +453,19 @@ Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD, else return make_error("MaxAnisotropy"); - if (std::optional Val = extractMdIntValue(StaticSamplerNode, 7)) - Sampler.ComparisonFunc = *Val; - else - return make_error("ComparisonFunc"); + Expected ComparisonFunc = + extractEnumValue( + StaticSamplerNode, 7, "ComparisonFunc", dxbc::isValidComparisonFunc); + if (auto E = ComparisonFunc.takeError()) + return Error(std::move(E)); + Sampler.ComparisonFunc = *ComparisonFunc; - if (std::optional Val = extractMdIntValue(StaticSamplerNode, 8)) - Sampler.BorderColor = *Val; - else - return make_error("ComparisonFunc"); + Expected BorderColor = + extractEnumValue( + StaticSamplerNode, 8, "BorderColor", dxbc::isValidBorderColor); + if (auto E = BorderColor.takeError()) + return Error(std::move(E)); + Sampler.BorderColor = *BorderColor; if (std::optional Val = extractMdFloatValue(StaticSamplerNode, 9)) Sampler.MinLOD = *Val; @@ -467,10 +487,13 @@ Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD, else return make_error("RegisterSpace"); - if (std::optional Val = extractMdIntValue(StaticSamplerNode, 13)) - Sampler.ShaderVisibility = *Val; - else - return make_error("ShaderVisibility"); + Expected Visibility = + extractEnumValue(StaticSamplerNode, 13, + "ShaderVisibility", + dxbc::isValidShaderVisibility); + if (auto E = Visibility.takeError()) + return 
Error(std::move(E)); + Sampler.ShaderVisibility = *Visibility; RSD.StaticSamplers.push_back(Sampler); return Error::success(); @@ -594,30 +617,7 @@ Error MetadataParser::validateRootSignature( } } - for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) { - if (!hlsl::rootsig::verifySamplerFilter(Sampler.Filter)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error>( - "Filter", Sampler.Filter)); - - if (!hlsl::rootsig::verifyAddress(Sampler.AddressU)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error>( - "AddressU", Sampler.AddressU)); - - if (!hlsl::rootsig::verifyAddress(Sampler.AddressV)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error>( - "AddressV", Sampler.AddressV)); - - if (!hlsl::rootsig::verifyAddress(Sampler.AddressW)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error>( - "AddressW", Sampler.AddressW)); + for (const mcdxbc::StaticSampler &Sampler : RSD.StaticSamplers) { if (!hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias)) DeferredErrs = joinErrors(std::move(DeferredErrs), @@ -630,18 +630,6 @@ Error MetadataParser::validateRootSignature( make_error>( "MaxAnisotropy", Sampler.MaxAnisotropy)); - if (!hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error>( - "ComparisonFunc", Sampler.ComparisonFunc)); - - if (!hlsl::rootsig::verifyBorderColor(Sampler.BorderColor)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error>( - "BorderColor", Sampler.BorderColor)); - if (!hlsl::rootsig::verifyLOD(Sampler.MinLOD)) DeferredErrs = joinErrors(std::move(DeferredErrs), make_error>( @@ -663,12 +651,6 @@ Error MetadataParser::validateRootSignature( joinErrors(std::move(DeferredErrs), make_error>( "RegisterSpace", Sampler.RegisterSpace)); - - if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error>( - "ShaderVisibility", 
Sampler.ShaderVisibility)); } return DeferredErrs; diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp index d682dda0bab26..0970977b5064f 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp @@ -115,27 +115,6 @@ bool verifyNumDescriptors(uint32_t NumDescriptors) { return NumDescriptors > 0; } -bool verifySamplerFilter(uint32_t Value) { - switch (Value) { -#define FILTER(Num, Val) case llvm::to_underlying(dxbc::SamplerFilter::Val): -#include "llvm/BinaryFormat/DXContainerConstants.def" - return true; - } - return false; -} - -// Values allowed here: -// https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_texture_address_mode#syntax -bool verifyAddress(uint32_t Address) { - switch (Address) { -#define TEXTURE_ADDRESS_MODE(Num, Val) \ - case llvm::to_underlying(dxbc::TextureAddressMode::Val): -#include "llvm/BinaryFormat/DXContainerConstants.def" - return true; - } - return false; -} - bool verifyMipLODBias(float MipLODBias) { return MipLODBias >= -16.f && MipLODBias <= 15.99f; } @@ -144,26 +123,6 @@ bool verifyMaxAnisotropy(uint32_t MaxAnisotropy) { return MaxAnisotropy <= 16u; } -bool verifyComparisonFunc(uint32_t ComparisonFunc) { - switch (ComparisonFunc) { -#define COMPARISON_FUNC(Num, Val) \ - case llvm::to_underlying(dxbc::ComparisonFunc::Val): -#include "llvm/BinaryFormat/DXContainerConstants.def" - return true; - } - return false; -} - -bool verifyBorderColor(uint32_t BorderColor) { - switch (BorderColor) { -#define STATIC_BORDER_COLOR(Num, Val) \ - case llvm::to_underlying(dxbc::StaticBorderColor::Val): -#include "llvm/BinaryFormat/DXContainerConstants.def" - return true; - } - return false; -} - bool verifyLOD(float LOD) { return !std::isnan(LOD); } bool verifyBoundOffset(uint32_t Offset) { diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 
c955ecd403633..220eee3cb8b08 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -7412,6 +7412,12 @@ static void FixupDebugInfoForOutlinedFunction( } } +static Value *removeASCastIfPresent(Value *V) { + if (Operator::getOpcode(V) == Instruction::AddrSpaceCast) + return cast(V)->getOperand(0); + return V; +} + static Expected createOutlinedFunction( OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, @@ -7575,7 +7581,8 @@ static Expected createOutlinedFunction( // preceding mapped arguments that refer to the same global that may be // seperate segments. To prevent this, we defer global processing until all // other processing has been performed. - if (isa(Input)) { + if (llvm::isa( + removeASCastIfPresent(Input))) { DeferredReplacement.push_back(std::make_pair(Input, InputCopy)); continue; } @@ -10056,19 +10063,20 @@ OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl &Names, void OpenMPIRBuilder::initializeTypes(Module &M) { LLVMContext &Ctx = M.getContext(); StructType *T; + unsigned DefaultTargetAS = Config.getDefaultTargetAS(); #define OMP_TYPE(VarName, InitValue) VarName = InitValue; #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ VarName##Ty = ArrayType::get(ElemTy, ArraySize); \ - VarName##PtrTy = PointerType::getUnqual(Ctx); + VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS); #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \ VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \ - VarName##Ptr = PointerType::getUnqual(Ctx); + VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS); #define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) 
\ T = StructType::getTypeByName(Ctx, StructName); \ if (!T) \ T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \ VarName = T; \ - VarName##Ptr = PointerType::getUnqual(Ctx); + VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS); #include "llvm/Frontend/OpenMP/OMPKinds.def" } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 8d8120ac9ed90..5385b1f8cac0b 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1259,6 +1259,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .StartsWith("reverse.", Intrinsic::vector_reverse) .StartsWith("interleave2.", Intrinsic::vector_interleave2) .StartsWith("deinterleave2.", Intrinsic::vector_deinterleave2) + .StartsWith("partial.reduce.add", + Intrinsic::vector_partial_reduce_add) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { const auto *FT = F->getFunctionType(); @@ -1269,7 +1271,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, Tys.push_back(FT->getReturnType()); if (ID != Intrinsic::vector_interleave2) Tys.push_back(FT->getParamType(0)); - if (ID == Intrinsic::vector_insert) + if (ID == Intrinsic::vector_insert || + ID == Intrinsic::vector_partial_reduce_add) // Inserting overloads the inserted type. Tys.push_back(FT->getParamType(1)); rename(F); diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index ed629d4e5ea22..77f9b997a2ebf 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -172,18 +172,6 @@ struct LessPointerAddrSpace { }; } // namespace -const char *DataLayout::getManglingComponent(const Triple &T) { - if (T.isOSBinFormatGOFF()) - return "-m:l"; - if (T.isOSBinFormatMachO()) - return "-m:o"; - if ((T.isOSWindows() || T.isUEFI()) && T.isOSBinFormatCOFF()) - return T.getArch() == Triple::x86 ? "-m:x" : "-m:w"; - if (T.isOSBinFormatXCOFF()) - return "-m:a"; - return "-m:e"; -} - // Default primitive type specifications. 
// NOTE: These arrays must be sorted by type bit width. constexpr DataLayout::PrimitiveSpec DefaultIntSpecs[] = { diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index 96065edca9b51..1ededb9e7b3e2 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -54,6 +54,10 @@ DebugVariable::DebugVariable(const DbgVariableRecord *DVR) Fragment(DVR->getExpression()->getFragmentInfo()), InlinedAt(DVR->getDebugLoc().getInlinedAt()) {} +DebugVariableAggregate::DebugVariableAggregate(const DbgVariableRecord *DVR) + : DebugVariable(DVR->getVariable(), std::nullopt, + DVR->getDebugLoc()->getInlinedAt()) {} + DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, uint64_t AtomGroup, uint8_t AtomRank, ArrayRef MDs, bool ImplicitCode) diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 11d33e262fecb..1a7a5c5fbad6b 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -288,10 +288,22 @@ void GlobalObject::setSection(StringRef S) { setGlobalObjectFlag(HasSectionHashEntryBit, !S.empty()); } -void GlobalObject::setSectionPrefix(StringRef Prefix) { +bool GlobalObject::setSectionPrefix(StringRef Prefix) { + StringRef ExistingPrefix; + if (std::optional MaybePrefix = getSectionPrefix()) + ExistingPrefix = *MaybePrefix; + + if (ExistingPrefix == Prefix) + return false; + + if (Prefix.empty()) { + setMetadata(LLVMContext::MD_section_prefix, nullptr); + return true; + } MDBuilder MDB(getContext()); setMetadata(LLVMContext::MD_section_prefix, MDB.createGlobalObjectSectionPrefix(Prefix)); + return true; } std::optional GlobalObject::getSectionPrefix() const { diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 5e87b5ff941ad..c1fafd759b5ab 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -553,16 +553,17 @@ void Instruction::dropUBImplyingAttrsAndUnknownMetadata( } void 
Instruction::dropUBImplyingAttrsAndMetadata(ArrayRef Keep) { - // !annotation metadata does not impact semantics. + // !annotation and !prof metadata does not impact semantics. // !range, !nonnull and !align produce poison, so they are safe to speculate. // !noundef and various AA metadata must be dropped, as it generally produces // immediate undefined behavior. static const unsigned KnownIDs[] = { LLVMContext::MD_annotation, LLVMContext::MD_range, - LLVMContext::MD_nonnull, LLVMContext::MD_align}; + LLVMContext::MD_nonnull, LLVMContext::MD_align, LLVMContext::MD_prof}; SmallVector KeepIDs; KeepIDs.reserve(Keep.size() + std::size(KnownIDs)); - append_range(KeepIDs, KnownIDs); + append_range(KeepIDs, (!ProfcheckDisableMetadataFixes ? KnownIDs + : drop_end(KnownIDs))); append_range(KeepIDs, Keep); dropUBImplyingAttrsAndUnknownMetadata(KeepIDs); } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index a1751c0ee3e48..601f2e5192d0d 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -720,6 +720,10 @@ CaptureInfo CallBase::getCaptureInfo(unsigned OpNo) const { return CI; } + // Bundles on assumes are captures(none). 
+ if (getIntrinsicID() == Intrinsic::assume) + return CaptureInfo::none(); + // deopt operand bundles are captures(none) auto &BOI = getBundleOpInfoForOperand(OpNo); auto OBU = operandBundleFromBundleOpInfo(BOI); diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 58a1f745a7122..4d2e8fadff4f7 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -740,14 +740,6 @@ Intrinsic::ID Intrinsic::lookupIntrinsicID(StringRef Name) { #include "llvm/IR/IntrinsicImpl.inc" #undef GET_INTRINSIC_ATTRIBUTES -AttributeSet Intrinsic::getFnAttributes(LLVMContext &C, ID id) { - if (id == 0) - return AttributeSet(); - uint16_t PackedID = IntrinsicsToAttributesMap[id - 1]; - uint8_t FnAttrID = PackedID >> 8; - return getIntrinsicFnAttributeSet(C, FnAttrID); -} - Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id, ArrayRef Tys) { // There can never be multiple globals with the same name of different types, diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 1157cbe6bbc1b..fc78a5b299f49 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1007,8 +1007,7 @@ MDNode *MDNode::uniquify() { #define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ case CLASS##Kind: { \ CLASS *SubclassThis = cast(this); \ - std::integral_constant::value> \ - ShouldRecalculateHash; \ + std::bool_constant::value> ShouldRecalculateHash; \ dispatchRecalculateHash(SubclassThis, ShouldRecalculateHash); \ return uniquifyImpl(SubclassThis, getContext().pImpl->CLASS##s); \ } @@ -1065,7 +1064,7 @@ void MDNode::storeDistinctInContext() { llvm_unreachable("Invalid subclass of MDNode"); #define HANDLE_MDNODE_LEAF(CLASS) \ case CLASS##Kind: { \ - std::integral_constant::value> ShouldResetHash; \ + std::bool_constant::value> ShouldResetHash; \ dispatchResetHash(cast(this), ShouldResetHash); \ break; \ } diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index d0b91d9356613..5827292cee39b 100644 --- 
a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -95,6 +95,7 @@ const char *MDProfLabels::FunctionEntryCount = "function_entry_count"; const char *MDProfLabels::SyntheticFunctionEntryCount = "synthetic_function_entry_count"; const char *MDProfLabels::UnknownBranchWeightsMarker = "unknown"; +const char *LLVMLoopEstimatedTripCount = "llvm.loop.estimated_trip_count"; bool hasProfMD(const Instruction &I) { return I.hasMetadata(LLVMContext::MD_prof); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 7d362ce308812..e9ee130dd5e91 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1076,6 +1076,18 @@ void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { } } + // Check llvm.loop.estimated_trip_count. + if (MD.getNumOperands() > 0 && + MD.getOperand(0).equalsStr(LLVMLoopEstimatedTripCount)) { + Check(MD.getNumOperands() == 2, "Expected two operands", &MD); + auto *Count = dyn_cast_or_null(MD.getOperand(1)); + Check(Count && Count->getType()->isIntegerTy() && + cast(Count->getType())->getBitWidth() <= 32, + "Expected second operand to be an integer constant of type i32 or " + "smaller", + &MD); + } + // Check these last, so we diagnose problems in operands first. 
Check(!MD.isTemporary(), "Expected no forward declarations!", &MD); Check(MD.isResolved(), "All nodes should be resolved!", &MD); @@ -6518,7 +6530,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } - case Intrinsic::experimental_vector_partial_reduce_add: { + case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index b1031d7822604..cee281597cfed 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSFrame.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" @@ -199,6 +200,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const { case MCFragment::FT_LEB: case MCFragment::FT_Dwarf: case MCFragment::FT_DwarfFrame: + case MCFragment::FT_SFrame: case MCFragment::FT_CVInlineLines: case MCFragment::FT_CVDefRange: return F.getSize(); @@ -399,6 +401,7 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, case MCFragment::FT_LEB: case MCFragment::FT_Dwarf: case MCFragment::FT_DwarfFrame: + case MCFragment::FT_SFrame: case MCFragment::FT_CVInlineLines: case MCFragment::FT_CVDefRange: { if (F.getKind() == MCFragment::FT_Data) @@ -914,6 +917,24 @@ void MCAssembler::relaxDwarfCallFrameFragment(MCFragment &F) { F.clearVarFixups(); } +void MCAssembler::relaxSFrameFragment(MCFragment &F) { + assert(F.getKind() == MCFragment::FT_SFrame); + MCContext &C = getContext(); + int64_t Value; + bool Abs = F.getSFrameAddrDelta().evaluateAsAbsolute(Value, *this); + if (!Abs) { + C.reportError(F.getSFrameAddrDelta().getLoc(), + "invalid CFI advance_loc expression in sframe"); + F.setSFrameAddrDelta(MCConstantExpr::create(0, C)); + return; + } + + 
SmallVector Data; + MCSFrameEmitter::encodeFuncOffset(Context, Value, Data, F.getSFrameFDE()); + F.setVarContents(Data); + F.clearVarFixups(); +} + bool MCAssembler::relaxFragment(MCFragment &F) { auto Size = computeFragmentSize(F); switch (F.getKind()) { @@ -932,6 +953,9 @@ bool MCAssembler::relaxFragment(MCFragment &F) { case MCFragment::FT_DwarfFrame: relaxDwarfCallFrameFragment(F); break; + case MCFragment::FT_SFrame: + relaxSFrameFragment(F); + break; case MCFragment::FT_BoundaryAlign: relaxBoundaryAlign(static_cast(F)); break; diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 5d9ddc2f1306c..5df8692d2e7ba 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -203,27 +203,6 @@ MCInst *MCContext::createMCInst() { MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) { SmallString<128> NameSV; StringRef NameRef = Name.toStringRef(NameSV); - if (NameRef.contains('\\')) { - NameSV = NameRef; - size_t S = 0; - // Support escaped \\ and \" as in GNU Assembler. GAS issues a warning for - // other characters following \\, which we do not implement due to code - // structure. - for (size_t I = 0, E = NameSV.size(); I != E; ++I) { - char C = NameSV[I]; - if (C == '\\' && I + 1 != E) { - switch (NameSV[I + 1]) { - case '"': - case '\\': - C = NameSV[++I]; - break; - } - } - NameSV[S++] = C; - } - NameSV.resize(S); - NameRef = NameSV; - } assert(!NameRef.empty() && "Normal symbols cannot be unnamed!"); @@ -244,6 +223,34 @@ MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) { return Entry.second.Symbol; } +MCSymbol *MCContext::parseSymbol(const Twine &Name) { + SmallString<128> SV; + StringRef NameRef = Name.toStringRef(SV); + if (NameRef.contains('\\')) { + SV = NameRef; + size_t S = 0; + // Support escaped \\ and \" as in GNU Assembler. GAS issues a warning for + // other characters following \\, which we do not implement due to code + // structure. 
+ for (size_t I = 0, E = SV.size(); I != E; ++I) { + char C = SV[I]; + if (C == '\\' && I + 1 != E) { + switch (SV[I + 1]) { + case '"': + case '\\': + C = SV[++I]; + break; + } + } + SV[S++] = C; + } + SV.resize(S); + NameRef = SV; + } + + return getOrCreateSymbol(NameRef); +} + MCSymbol *MCContext::getOrCreateFrameAllocSymbol(const Twine &FuncName, unsigned Idx) { return getOrCreateSymbol(MAI->getPrivateGlobalPrefix() + FuncName + diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp index e7c0d37e8f99b..e8f000a584839 100644 --- a/llvm/lib/MC/MCDwarf.cpp +++ b/llvm/lib/MC/MCDwarf.cpp @@ -181,7 +181,7 @@ void MCDwarfLineTable::emitOne( unsigned FileNum, LastLine, Column, Flags, Isa, Discriminator; bool IsAtStartSeq; - MCSymbol *LastLabel; + MCSymbol *PrevLabel; auto init = [&]() { FileNum = 1; LastLine = 1; @@ -189,21 +189,31 @@ void MCDwarfLineTable::emitOne( Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0; Isa = 0; Discriminator = 0; - LastLabel = nullptr; + PrevLabel = nullptr; IsAtStartSeq = true; }; init(); // Loop through each MCDwarfLineEntry and encode the dwarf line number table. bool EndEntryEmitted = false; - for (const MCDwarfLineEntry &LineEntry : LineEntries) { - MCSymbol *Label = LineEntry.getLabel(); + for (auto It = LineEntries.begin(); It != LineEntries.end(); ++It) { + auto LineEntry = *It; + MCSymbol *CurrLabel = LineEntry.getLabel(); const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo(); if (LineEntry.LineStreamLabel) { if (!IsAtStartSeq) { - MCOS->emitDwarfLineEndEntry(Section, LastLabel, - /*EndLabel =*/LastLabel); + auto *Label = CurrLabel; + auto NextIt = It + 1; + // LineEntry with a null Label is probably a fake LineEntry we added + // when `-emit-func-debug-line-table-offsets` in order to terminate the + // sequence. Look for the next Label if possible, otherwise we will set + // the PC to the end of the section. 
+ if (!Label && NextIt != LineEntries.end()) { + Label = NextIt->getLabel(); + } + MCOS->emitDwarfLineEndEntry(Section, PrevLabel, + /*EndLabel =*/Label); init(); } MCOS->emitLabel(LineEntry.LineStreamLabel, LineEntry.StreamLabelDefLoc); @@ -211,7 +221,7 @@ void MCDwarfLineTable::emitOne( } if (LineEntry.IsEndEntry) { - MCOS->emitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, Label, + MCOS->emitDwarfAdvanceLineAddr(INT64_MAX, PrevLabel, CurrLabel, asmInfo->getCodePointerSize()); init(); EndEntryEmitted = true; @@ -258,12 +268,12 @@ void MCDwarfLineTable::emitOne( // At this point we want to emit/create the sequence to encode the delta in // line numbers and the increment of the address from the previous Label // and the current Label. - MCOS->emitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label, + MCOS->emitDwarfAdvanceLineAddr(LineDelta, PrevLabel, CurrLabel, asmInfo->getCodePointerSize()); Discriminator = 0; LastLine = LineEntry.getLine(); - LastLabel = Label; + PrevLabel = CurrLabel; IsAtStartSeq = false; } @@ -273,7 +283,7 @@ void MCDwarfLineTable::emitOne( // does not track ranges nor terminate the line table. In that case, // conservatively use the section end symbol to end the line table. 
if (!EndEntryEmitted && !IsAtStartSeq) - MCOS->emitDwarfLineEndEntry(Section, LastLabel); + MCOS->emitDwarfLineEndEntry(Section, PrevLabel); } void MCDwarfLineTable::endCurrentSeqAndEmitLineStreamLabel(MCStreamer *MCOS, diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 21da79bb0aa30..85d1c5888f1da 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -53,6 +53,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { case MCFragment::FT_Org: OS << "Org"; break; case MCFragment::FT_Dwarf: OS << "Dwarf"; break; case MCFragment::FT_DwarfFrame: OS << "DwarfCallFrame"; break; + case MCFragment::FT_SFrame: OS << "SFrame"; break; case MCFragment::FT_LEB: OS << "LEB"; break; case MCFragment::FT_BoundaryAlign: OS<<"BoundaryAlign"; break; case MCFragment::FT_SymbolId: OS << "SymbolId"; break; @@ -79,7 +80,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { case MCFragment::FT_Align: case MCFragment::FT_LEB: case MCFragment::FT_Dwarf: - case MCFragment::FT_DwarfFrame: { + case MCFragment::FT_DwarfFrame: + case MCFragment::FT_SFrame: { if (isLinkerRelaxable()) OS << " LinkerRelaxable"; auto Fixed = getContents(); @@ -129,6 +131,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << " LineDelta:" << getDwarfLineDelta(); break; case MCFragment::FT_DwarfFrame: + case MCFragment::FT_SFrame: OS << " AddrDelta:"; getDwarfAddrDelta().print(OS, nullptr); break; diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 59265bc8595ba..701a0836d2c70 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -583,6 +583,19 @@ void MCObjectStreamer::emitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, newFragment(); } +void MCObjectStreamer::emitSFrameCalculateFuncOffset(const MCSymbol *FuncBase, + const MCSymbol *FREBegin, + MCFragment *FDEFrag, + SMLoc Loc) { + assert(FuncBase && "No function base address"); + assert(FREBegin && "FRE doesn't describe a location"); + auto *F = 
getCurrentFragment(); + F->Kind = MCFragment::FT_SFrame; + F->setSFrameAddrDelta(buildSymbolDiff(*this, FREBegin, FuncBase, Loc)); + F->setSFrameFDE(FDEFrag); + newFragment(); +} + void MCObjectStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line, unsigned Column, bool PrologueEnd, bool IsStmt, diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index fb183a10b3d37..acea3ab23680a 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -1213,8 +1213,8 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName); if (!Sym) - Sym = getContext().getOrCreateSymbol(MAI.isHLASM() ? SymbolName.upper() - : SymbolName); + Sym = getContext().parseSymbol(MAI.isHLASM() ? SymbolName.upper() + : SymbolName); // If this is an absolute variable reference, substitute it now to preserve // semantics in the face of reassignment. @@ -1845,7 +1845,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, RewrittenLabel); IDVal = RewrittenLabel; } - Sym = getContext().getOrCreateSymbol(IDVal); + Sym = getContext().parseSymbol(IDVal); } else Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal); // End of Labels should be treated as end of line for lexing @@ -3876,20 +3876,15 @@ bool AsmParser::parseDirectiveCVLoc() { /// ::= .cv_linetable FunctionId, FnStart, FnEnd bool AsmParser::parseDirectiveCVLinetable() { int64_t FunctionId; - StringRef FnStartName, FnEndName; + MCSymbol *FnStartSym, *FnEndSym; SMLoc Loc = getTok().getLoc(); if (parseCVFunctionId(FunctionId, ".cv_linetable") || parseComma() || parseTokenLoc(Loc) || - check(parseIdentifier(FnStartName), Loc, - "expected identifier in directive") || + check(parseSymbol(FnStartSym), Loc, "expected identifier in directive") || parseComma() || parseTokenLoc(Loc) || - check(parseIdentifier(FnEndName), Loc, - "expected identifier in directive")) + 
check(parseSymbol(FnEndSym), Loc, "expected identifier in directive")) return true; - MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName); - MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName); - getStreamer().emitCVLinetableDirective(FunctionId, FnStartSym, FnEndSym); return false; } @@ -3898,7 +3893,7 @@ bool AsmParser::parseDirectiveCVLinetable() { /// ::= .cv_inline_linetable PrimaryFunctionId FileId LineNum FnStart FnEnd bool AsmParser::parseDirectiveCVInlineLinetable() { int64_t PrimaryFunctionId, SourceFileId, SourceLineNum; - StringRef FnStartName, FnEndName; + MCSymbol *FnStartSym, *FnEndSym; SMLoc Loc = getTok().getLoc(); if (parseCVFunctionId(PrimaryFunctionId, ".cv_inline_linetable") || parseTokenLoc(Loc) || @@ -3908,16 +3903,14 @@ bool AsmParser::parseDirectiveCVInlineLinetable() { parseIntToken(SourceLineNum, "expected SourceLineNum") || check(SourceLineNum < 0, Loc, "Line number less than zero") || parseTokenLoc(Loc) || - check(parseIdentifier(FnStartName), Loc, "expected identifier") || + check(parseSymbol(FnStartSym), Loc, "expected identifier") || parseTokenLoc(Loc) || - check(parseIdentifier(FnEndName), Loc, "expected identifier")) + check(parseSymbol(FnEndSym), Loc, "expected identifier")) return true; if (parseEOL()) return true; - MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName); - MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName); getStreamer().emitCVInlineLinetableDirective(PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym, FnEndSym); @@ -3938,16 +3931,14 @@ bool AsmParser::parseDirectiveCVDefRange() { std::vector> Ranges; while (getLexer().is(AsmToken::Identifier)) { Loc = getLexer().getLoc(); - StringRef GapStartName; - if (parseIdentifier(GapStartName)) + MCSymbol *GapStartSym; + if (parseSymbol(GapStartSym)) return Error(Loc, "expected identifier in directive"); - MCSymbol *GapStartSym = getContext().getOrCreateSymbol(GapStartName); Loc = getLexer().getLoc(); - StringRef 
GapEndName; - if (parseIdentifier(GapEndName)) + MCSymbol *GapEndSym; + if (parseSymbol(GapEndSym)) return Error(Loc, "expected identifier in directive"); - MCSymbol *GapEndSym = getContext().getOrCreateSymbol(GapEndName); Ranges.push_back({GapStartSym, GapEndSym}); } @@ -4084,12 +4075,11 @@ bool AsmParser::parseDirectiveCVFileChecksumOffset() { /// ::= .cv_fpo_data procsym bool AsmParser::parseDirectiveCVFPOData() { SMLoc DirLoc = getLexer().getLoc(); - StringRef ProcName; - if (parseIdentifier(ProcName)) + MCSymbol *ProcSym; + if (parseSymbol(ProcSym)) return TokError("expected symbol name"); if (parseEOL()) return true; - MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName); getStreamer().emitCVFPOData(ProcSym, DirLoc); return false; } @@ -4311,15 +4301,12 @@ bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) { if (Encoding == dwarf::DW_EH_PE_omit) return false; - StringRef Name; + MCSymbol *Sym; if (check(!isValidEncoding(Encoding), "unsupported encoding.") || parseComma() || - check(parseIdentifier(Name), "expected identifier in directive") || - parseEOL()) + check(parseSymbol(Sym), "expected identifier in directive") || parseEOL()) return true; - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - if (IsPersonality) getStreamer().emitCFIPersonality(Sym, Encoding); else @@ -4898,7 +4885,7 @@ bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) { if (discardLTOSymbol(Name)) return false; - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); // Assembler local symbols don't make any sense here, except for directives // that the symbol should be tagged. @@ -4920,13 +4907,10 @@ bool AsmParser::parseDirectiveComm(bool IsLocal) { return true; SMLoc IDLoc = getLexer().getLoc(); - StringRef Name; - if (parseIdentifier(Name)) + MCSymbol *Sym; + if (parseSymbol(Sym)) return TokError("expected identifier in directive"); - // Handle the identifier as the key symbol. 
- MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - if (parseComma()) return true; @@ -5756,10 +5740,9 @@ bool AsmParser::parseDirectiveAddrsig() { } bool AsmParser::parseDirectiveAddrsigSym() { - StringRef Name; - if (check(parseIdentifier(Name), "expected identifier") || parseEOL()) + MCSymbol *Sym; + if (check(parseSymbol(Sym), "expected identifier") || parseEOL()) return true; - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().emitAddrsigSym(Sym); return false; } @@ -6159,7 +6142,7 @@ bool HLASMAsmParser::parseAsHLASMLabel(ParseStatementInfo &Info, return Error(LabelLoc, "Cannot have just a label for an HLASM inline asm statement"); - MCSymbol *Sym = getContext().getOrCreateSymbol( + MCSymbol *Sym = getContext().parseSymbol( getContext().getAsmInfo()->isHLASM() ? LabelVal.upper() : LabelVal); // Emit the label. @@ -6287,7 +6270,7 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef, Parser.getStreamer().emitValueToOffset(Value, 0, EqualLoc); return false; } else - Sym = Parser.getContext().getOrCreateSymbol(Name); + Sym = Parser.getContext().parseSymbol(Name); Sym->setRedefinable(allow_redef); diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp index 9fb17488a9e9c..5dd79946d8779 100644 --- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -293,13 +293,11 @@ bool COFFAsmParser::parseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!"); if (getLexer().isNot(AsmToken::EndOfStatement)) { while (true) { - StringRef Name; + MCSymbol *Sym; - if (getParser().parseIdentifier(Name)) + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in directive"); - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - getStreamer().emitSymbolAttribute(Sym, Attr); if (getLexer().is(AsmToken::EndOfStatement)) @@ -450,13 +448,11 @@ bool 
COFFAsmParser::parseDirectivePopSection(StringRef, SMLoc) { } bool COFFAsmParser::parseDirectiveDef(StringRef, SMLoc) { - StringRef SymbolName; + MCSymbol *Sym; - if (getParser().parseIdentifier(SymbolName)) + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in directive"); - MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName); - getStreamer().beginCOFFSymbolDef(Sym); Lex(); @@ -496,8 +492,8 @@ bool COFFAsmParser::parseDirectiveEndef(StringRef, SMLoc) { } bool COFFAsmParser::parseDirectiveSecRel32(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *Symbol; + if (getParser().parseSymbol(Symbol)) return TokError("expected identifier in directive"); int64_t Offset = 0; @@ -517,8 +513,6 @@ bool COFFAsmParser::parseDirectiveSecRel32(StringRef, SMLoc) { "invalid '.secrel32' directive offset, can't be less " "than zero or greater than std::numeric_limits::max()"); - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - Lex(); getStreamer().emitCOFFSecRel32(Symbol, Offset); return false; @@ -526,8 +520,8 @@ bool COFFAsmParser::parseDirectiveSecRel32(StringRef, SMLoc) { bool COFFAsmParser::parseDirectiveRVA(StringRef, SMLoc) { auto parseOp = [&]() -> bool { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *Symbol; + if (getParser().parseSymbol(Symbol)) return TokError("expected identifier in directive"); int64_t Offset = 0; @@ -544,8 +538,6 @@ bool COFFAsmParser::parseDirectiveRVA(StringRef, SMLoc) { "than -2147483648 or greater than " "2147483647"); - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - getStreamer().emitCOFFImgRel32(Symbol, Offset); return false; }; @@ -556,75 +548,65 @@ bool COFFAsmParser::parseDirectiveRVA(StringRef, SMLoc) { } bool COFFAsmParser::parseDirectiveSafeSEH(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *Symbol; + if (getParser().parseSymbol(Symbol)) return 
TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - Lex(); getStreamer().emitCOFFSafeSEH(Symbol); return false; } bool COFFAsmParser::parseDirectiveSecIdx(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *Symbol; + if (getParser().parseSymbol(Symbol)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - Lex(); getStreamer().emitCOFFSectionIndex(Symbol); return false; } bool COFFAsmParser::parseDirectiveSymIdx(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *Symbol; + if (getParser().parseSymbol(Symbol)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - Lex(); getStreamer().emitCOFFSymbolIndex(Symbol); return false; } bool COFFAsmParser::parseDirectiveSecNum(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *Symbol; + if (getParser().parseSymbol(Symbol)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - Lex(); getStreamer().emitCOFFSecNumber(Symbol); return false; } bool COFFAsmParser::parseDirectiveSecOffset(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *Symbol; + if (getParser().parseSymbol(Symbol)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected 
token in directive"); - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - Lex(); getStreamer().emitCOFFSecOffset(Symbol); return false; @@ -679,15 +661,13 @@ bool COFFAsmParser::parseDirectiveLinkOnce(StringRef, SMLoc Loc) { } bool COFFAsmParser::parseSEHDirectiveStartProc(StringRef, SMLoc Loc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *Symbol; + if (getParser().parseSymbol(Symbol)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - Lex(); getStreamer().emitWinCFIStartProc(Symbol, Loc); return false; @@ -718,8 +698,8 @@ bool COFFAsmParser::parseSEHDirectiveEndChained(StringRef, SMLoc Loc) { } bool COFFAsmParser::parseSEHDirectiveHandler(StringRef, SMLoc Loc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) + MCSymbol *handler; + if (getParser().parseSymbol(handler)) return true; if (getLexer().isNot(AsmToken::Comma)) @@ -736,8 +716,6 @@ bool COFFAsmParser::parseSEHDirectiveHandler(StringRef, SMLoc Loc) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); - MCSymbol *handler = getContext().getOrCreateSymbol(SymbolID); - Lex(); getStreamer().emitWinEHHandler(handler, unwind, except, Loc); return false; diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index 1bb617b327f1e..04e12e56c4262 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -443,8 +443,8 @@ bool COFFMasmParser::parseDirectiveProc(StringRef Directive, SMLoc Loc) { if (!getStreamer().getCurrentFragment()) return Error(getTok().getLoc(), "expected section directive"); - StringRef Label; - if (getParser().parseIdentifier(Label)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return Error(Loc, "expected identifier for procedure"); if (getLexer().is(AsmToken::Identifier)) 
{ StringRef nextVal = getTok().getString(); @@ -459,12 +459,12 @@ bool COFFMasmParser::parseDirectiveProc(StringRef Directive, SMLoc Loc) { nextLoc = getTok().getLoc(); } } - auto *Sym = - static_cast(getContext().getOrCreateSymbol(Label)); // Define symbol as simple external function - Sym->setExternal(true); - Sym->setType(COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT); + auto *COFFSym = static_cast(Sym); + COFFSym->setExternal(true); + COFFSym->setType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); bool Framed = false; if (getLexer().is(AsmToken::Identifier) && @@ -475,7 +475,7 @@ bool COFFMasmParser::parseDirectiveProc(StringRef Directive, SMLoc Loc) { } getStreamer().emitLabel(Sym, Loc); - CurrentProcedures.push_back(Label); + CurrentProcedures.push_back(Sym->getName()); CurrentProceduresFramed.push_back(Framed); return false; } @@ -510,8 +510,8 @@ bool COFFMasmParser::parseDirectiveAlias(StringRef Directive, SMLoc Loc) { getParser().parseAngleBracketString(ActualName)) return Error(getTok().getLoc(), "expected "); - MCSymbol *Alias = getContext().getOrCreateSymbol(AliasName); - MCSymbol *Actual = getContext().getOrCreateSymbol(ActualName); + MCSymbol *Alias = getContext().parseSymbol(AliasName); + MCSymbol *Actual = getContext().parseSymbol(ActualName); getStreamer().emitWeakReference(Alias, Actual); diff --git a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp index a9095b3298f5e..fceb718d091c9 100644 --- a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp @@ -501,13 +501,10 @@ bool DarwinAsmParser::parseSectionSwitch(StringRef Segment, StringRef Section, /// parseDirectiveAltEntry /// ::= .alt_entry identifier bool DarwinAsmParser::parseDirectiveAltEntry(StringRef, SMLoc) { - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in directive"); - // Look up 
symbol. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - if (Sym->isDefined()) return TokError(".alt_entry must preceed symbol definition"); @@ -521,13 +518,10 @@ bool DarwinAsmParser::parseDirectiveAltEntry(StringRef, SMLoc) { /// parseDirectiveDesc /// ::= .desc identifier , expression bool DarwinAsmParser::parseDirectiveDesc(StringRef, SMLoc) { - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in directive"); - // Handle the identifier as the key symbol. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.desc' directive"); Lex(); @@ -560,18 +554,17 @@ bool DarwinAsmParser::parseDirectiveIndirectSymbol(StringRef, SMLoc Loc) { return Error(Loc, "indirect symbol not in a symbol pointer or stub " "section"); - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in .indirect_symbol directive"); - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - // Assembler local symbols don't make any sense here. Complain loudly. 
if (Sym->isTemporary()) return TokError("non-local symbol required in directive"); if (!getStreamer().emitSymbolAttribute(Sym, MCSA_IndirectSymbol)) - return TokError("unable to emit indirect symbol attribute for: " + Name); + return TokError("unable to emit indirect symbol attribute for: " + + Sym->getName()); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.indirect_symbol' directive"); @@ -633,13 +626,10 @@ bool DarwinAsmParser::parseDirectiveLinkerOption(StringRef IDVal, SMLoc) { /// parseDirectiveLsym /// ::= .lsym identifier , expression bool DarwinAsmParser::parseDirectiveLsym(StringRef, SMLoc) { - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in directive"); - // Handle the identifier as the key symbol. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.lsym' directive"); Lex(); @@ -826,13 +816,10 @@ bool DarwinAsmParser::parseDirectiveSubsectionsViaSymbols(StringRef, SMLoc) { /// ::= .tbss identifier, size, align bool DarwinAsmParser::parseDirectiveTBSS(StringRef, SMLoc) { SMLoc IDLoc = getLexer().getLoc(); - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in directive"); - // Handle the identifier as the key symbol. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); @@ -911,13 +898,10 @@ bool DarwinAsmParser::parseDirectiveZerofill(StringRef, SMLoc) { Lex(); SMLoc IDLoc = getLexer().getLoc(); - StringRef IDStr; - if (getParser().parseIdentifier(IDStr)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in directive"); - // handle the identifier as the key symbol. 
- MCSymbol *Sym = getContext().getOrCreateSymbol(IDStr); - if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 513f3b3da7813..6195355626fd5 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -163,7 +163,7 @@ bool ELFAsmParser::parseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { continue; } - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); getStreamer().emitSymbolAttribute(Sym, Attr); @@ -196,10 +196,9 @@ bool ELFAsmParser::parseSectionSwitch(StringRef Section, unsigned Type, } bool ELFAsmParser::parseDirectiveSize(StringRef, SMLoc) { - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier"); - auto *Sym = static_cast(getContext().getOrCreateSymbol(Name)); if (getLexer().isNot(AsmToken::Comma)) return TokError("expected comma"); @@ -712,13 +711,10 @@ static MCSymbolAttr MCAttrForString(StringRef Type) { /// ::= .type identifier , %attribute /// ::= .type identifier , "attribute" bool ELFAsmParser::parseDirectiveType(StringRef, SMLoc) { - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier"); - // Handle the identifier as the key symbol. 
- MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - bool AllowAt = getLexer().getAllowAtInIdentifier(); if (!AllowAt && !getContext().getAsmInfo()->getCommentString().starts_with("@")) @@ -790,8 +786,9 @@ bool ELFAsmParser::parseDirectiveIdent(StringRef, SMLoc) { /// parseDirectiveSymver /// ::= .symver foo, bar2@zed bool ELFAsmParser::parseDirectiveSymver(StringRef, SMLoc) { - StringRef OriginalName, Name, Action; - if (getParser().parseIdentifier(OriginalName)) + MCSymbol *OriginalSym; + StringRef Name, Action; + if (getParser().parseSymbol(OriginalSym)) return TokError("expected identifier"); if (getLexer().isNot(AsmToken::Comma)) @@ -819,8 +816,7 @@ bool ELFAsmParser::parseDirectiveSymver(StringRef, SMLoc) { } (void)parseOptionalToken(AsmToken::EndOfStatement); - getStreamer().emitELFSymverDirective( - getContext().getOrCreateSymbol(OriginalName), Name, KeepOriginalSym); + getStreamer().emitELFSymverDirective(OriginalSym, Name, KeepOriginalSym); return false; } @@ -853,8 +849,8 @@ bool ELFAsmParser::parseDirectiveVersion(StringRef, SMLoc) { bool ELFAsmParser::parseDirectiveWeakref(StringRef, SMLoc) { // FIXME: Share code with the other alias building directives. 
- StringRef AliasName; - if (getParser().parseIdentifier(AliasName)) + MCSymbol *Alias; + if (getParser().parseSymbol(Alias)) return TokError("expected identifier"); if (getLexer().isNot(AsmToken::Comma)) @@ -862,14 +858,10 @@ bool ELFAsmParser::parseDirectiveWeakref(StringRef, SMLoc) { Lex(); - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier"); - MCSymbol *Alias = getContext().getOrCreateSymbol(AliasName); - - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - getStreamer().emitWeakReference(Alias, Sym); return false; } diff --git a/llvm/lib/MC/MCParser/MCAsmParser.cpp b/llvm/lib/MC/MCParser/MCAsmParser.cpp index 68b9cab2492f5..c1b7e57184de1 100644 --- a/llvm/lib/MC/MCParser/MCAsmParser.cpp +++ b/llvm/lib/MC/MCParser/MCAsmParser.cpp @@ -163,6 +163,15 @@ bool MCAsmParser::parseGNUAttribute(SMLoc L, int64_t &Tag, return true; } +bool MCAsmParser::parseSymbol(MCSymbol *&Res) { + StringRef Name; + if (parseIdentifier(Name)) + return true; + + Res = getContext().parseSymbol(Name); + return false; +} + void MCParsedAsmOperand::dump() const { // Cannot completely remove virtual function even in release mode. 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp b/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp index 7fa05088c9725..299d4b46a8a84 100644 --- a/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp +++ b/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp @@ -50,8 +50,8 @@ bool MCAsmParserExtension::parseDirectiveCGProfile(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); - MCSymbol *FromSym = getContext().getOrCreateSymbol(From); - MCSymbol *ToSym = getContext().getOrCreateSymbol(To); + MCSymbol *FromSym = getContext().parseSymbol(From); + MCSymbol *ToSym = getContext().parseSymbol(To); getStreamer().emitCGProfileEntry( MCSymbolRefExpr::create(FromSym, getContext(), FromLoc), diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 2dcfe0f3a420a..7f0ea7830b495 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -1480,7 +1480,7 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, auto VarIt = Variables.find(SymbolName.lower()); if (VarIt != Variables.end()) SymbolName = VarIt->second.Name; - Sym = getContext().getOrCreateSymbol(SymbolName); + Sym = getContext().parseSymbol(SymbolName); } // If this is an absolute variable reference, substitute it now to preserve @@ -1965,7 +1965,7 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info, if (IDVal == "@@") { Sym = Ctx.createDirectionalLocalSymbol(0); } else { - Sym = getContext().getOrCreateSymbol(IDVal); + Sym = getContext().parseSymbol(IDVal); } // End of Labels should be treated as end of line for lexing @@ -3009,8 +3009,7 @@ bool MasmParser::parseDirectiveEquate(StringRef IDVal, StringRef Name, return false; } - auto *Sym = - static_cast(getContext().getOrCreateSymbol(Var.Name)); + auto *Sym = static_cast(getContext().parseSymbol(Var.Name)); const MCConstantExpr *PrevValue = Sym->isVariable() ? 
dyn_cast_or_null(Sym->getVariableValue()) @@ -3318,7 +3317,7 @@ bool MasmParser::parseDirectiveNamedValue(StringRef TypeName, unsigned Size, StringRef Name, SMLoc NameLoc) { if (StructInProgress.empty()) { // Initialize named data value. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); getStreamer().emitLabel(Sym); unsigned Count; if (emitIntegralValues(Size, &Count)) @@ -3509,7 +3508,7 @@ bool MasmParser::parseDirectiveNamedRealValue(StringRef TypeName, SMLoc NameLoc) { if (StructInProgress.empty()) { // Initialize named data value. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); getStreamer().emitLabel(Sym); unsigned Count; if (emitRealValues(Semantics, &Count)) @@ -4003,7 +4002,7 @@ bool MasmParser::parseDirectiveNamedStructValue(const StructInfo &Structure, SMLoc DirLoc, StringRef Name) { if (StructInProgress.empty()) { // Initialize named data value. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); getStreamer().emitLabel(Sym); unsigned Count; if (emitStructValues(Structure, &Count)) @@ -4503,9 +4502,9 @@ bool MasmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) { bool MasmParser::parseDirectiveExtern() { // .extern is the default - but we still need to take any provided type info. 
auto parseOp = [&]() -> bool { - StringRef Name; + MCSymbol *Sym; SMLoc NameLoc = getTok().getLoc(); - if (parseIdentifier(Name)) + if (parseSymbol(Sym)) return Error(NameLoc, "expected name"); if (parseToken(AsmToken::Colon)) return true; @@ -4518,12 +4517,10 @@ bool MasmParser::parseDirectiveExtern() { AsmTypeInfo Type; if (lookUpType(TypeName, Type)) return Error(TypeLoc, "unrecognized type"); - KnownType[Name.lower()] = Type; + KnownType[Sym->getName().lower()] = Type; } - auto *Sym = - static_cast(getContext().getOrCreateSymbol(Name)); - Sym->setExternal(true); + static_cast(Sym)->setExternal(true); getStreamer().emitSymbolAttribute(Sym, MCSA_Extern); return false; @@ -4538,11 +4535,10 @@ bool MasmParser::parseDirectiveExtern() { /// ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ] bool MasmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) { auto parseOp = [&]() -> bool { - StringRef Name; SMLoc Loc = getTok().getLoc(); - if (parseIdentifier(Name)) + MCSymbol *Sym; + if (parseSymbol(Sym)) return Error(Loc, "expected identifier"); - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); // Assembler local symbols don't make any sense here. Complain loudly. if (Sym->isTemporary()) @@ -4565,13 +4561,10 @@ bool MasmParser::parseDirectiveComm(bool IsLocal) { return true; SMLoc IDLoc = getLexer().getLoc(); - StringRef Name; - if (parseIdentifier(Name)) + MCSymbol *Sym; + if (parseSymbol(Sym)) return TokError("expected identifier in directive"); - // Handle the identifier as the key symbol. 
- MCSymbol *Sym = getContext().getOrCreateSymbol(Name); - if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index ddfe1e10d9d0a..75e8948f8ce9e 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -212,10 +212,9 @@ class WasmAsmParser : public MCAsmParserExtension { // TODO: This function is almost the same as ELFAsmParser::ParseDirectiveSize // so maybe could be shared somehow. bool parseDirectiveSize(StringRef, SMLoc Loc) { - StringRef Name; - if (Parser->parseIdentifier(Name)) + MCSymbol *Sym; + if (Parser->parseSymbol(Sym)) return TokError("expected identifier in directive"); - auto Sym = getContext().getOrCreateSymbol(Name); if (expect(AsmToken::Comma, ",")) return true; const MCExpr *Expr; @@ -241,8 +240,7 @@ class WasmAsmParser : public MCAsmParserExtension { return error("Expected label after .type directive, got: ", Lexer->getTok()); auto *WasmSym = static_cast( - getStreamer().getContext().getOrCreateSymbol( - Lexer->getTok().getString())); + getStreamer().getContext().parseSymbol(Lexer->getTok().getString())); Lex(); if (!(isNext(AsmToken::Comma) && isNext(AsmToken::At) && Lexer->is(AsmToken::Identifier))) @@ -293,10 +291,9 @@ class WasmAsmParser : public MCAsmParserExtension { assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!"); if (getLexer().isNot(AsmToken::EndOfStatement)) { while (true) { - StringRef Name; - if (getParser().parseIdentifier(Name)) + MCSymbol *Sym; + if (getParser().parseSymbol(Sym)) return TokError("expected identifier in directive"); - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().emitSymbolAttribute(Sym, Attr); if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp index a0d6c80ab72ea..e8252d4d76b61 100644 --- a/llvm/lib/MC/MCSFrame.cpp +++ 
b/llvm/lib/MC/MCSFrame.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" using namespace llvm; @@ -33,10 +34,70 @@ struct SFrameFRE { size_t CFAOffset = 0; size_t FPOffset = 0; size_t RAOffset = 0; - bool FromFP = false; + FREInfo Info; bool CFARegSet = false; SFrameFRE(const MCSymbol *Start) : Label(Start) {} + + void emitOffset(MCObjectStreamer &S, FREOffset OffsetSize, size_t Offset) { + switch (OffsetSize) { + case (FREOffset::B1): + S.emitInt8(Offset); + return; + case (FREOffset::B2): + S.emitInt16(Offset); + return; + case (FREOffset::B4): + S.emitInt32(Offset); + return; + } + } + + void emit(MCObjectStreamer &S, const MCSymbol *FuncBegin, + MCFragment *FDEFrag) { + S.emitSFrameCalculateFuncOffset(FuncBegin, Label, FDEFrag, SMLoc()); + + // fre_cfa_base_reg_id already set during parsing + + // fre_offset_count + unsigned RegsTracked = 1; // always track the cfa. + if (FPOffset != 0) + RegsTracked++; + if (RAOffset != 0) + RegsTracked++; + Info.setOffsetCount(RegsTracked); + + // fre_offset_size + if (isInt<8>(CFAOffset) && isInt<8>(FPOffset) && isInt<8>(RAOffset)) + Info.setOffsetSize(FREOffset::B1); + else if (isInt<16>(CFAOffset) && isInt<16>(FPOffset) && isInt<16>(RAOffset)) + Info.setOffsetSize(FREOffset::B2); + else { + assert(isInt<32>(CFAOffset) && isInt<32>(FPOffset) && + isInt<32>(RAOffset) && "Offset too big for sframe"); + Info.setOffsetSize(FREOffset::B4); + } + + // No support for fre_mangled_ra_p yet. 
+ Info.setReturnAddressSigned(false); + + // sframe_fre_info_word + S.emitInt8(Info.getFREInfo()); + + // FRE Offsets + [[maybe_unused]] unsigned OffsetsEmitted = 1; + emitOffset(S, Info.getOffsetSize(), CFAOffset); + if (FPOffset) { + OffsetsEmitted++; + emitOffset(S, Info.getOffsetSize(), FPOffset); + } + if (RAOffset) { + OffsetsEmitted++; + emitOffset(S, Info.getOffsetSize(), RAOffset); + } + assert(OffsetsEmitted == RegsTracked && + "Didn't emit the right number of offsets"); + } }; // High-level structure to track info needed to emit a sframe_func_desc_entry @@ -46,11 +107,13 @@ struct SFrameFDE { const MCDwarfFrameInfo &DFrame; // Label where this FDE's FREs start. MCSymbol *FREStart; + // Frag where this FDE is emitted. + MCFragment *Frag; // Unwinding fres SmallVector FREs; SFrameFDE(const MCDwarfFrameInfo &DF, MCSymbol *FRES) - : DFrame(DF), FREStart(FRES) {} + : DFrame(DF), FREStart(FRES), Frag(nullptr) {} void emit(MCObjectStreamer &S, const MCSymbol *FRESubSectionStart) { MCContext &C = S.getContext(); @@ -74,13 +137,21 @@ struct SFrameFDE { S.emitInt32(0); // sfde_func_num_fres - // TODO: When we actually emit fres, replace 0 with FREs.size() - S.emitInt32(0); + S.emitInt32(FREs.size()); // sfde_func_info word - FDEInfo I; - I.setFuncInfo(0 /* No pauth key */, FDEType::PCInc, FREType::Addr1); - S.emitInt8(I.Info); + + // All FREs within an FDE share the same sframe::FREType::AddrX. The value + // of 'X' is determined by the FRE with the largest offset, which is the + // last. This offset isn't known until relax time, so emit a frag which can + // calculate that now. + // + // At relax time, this FDE frag calculates the proper AddrX value (as well + // as the rest of the FDE FuncInfo word). Subsequent FRE frags will read it + // from this frag and emit the proper number of bytes. + Frag = S.getCurrentFragment(); + S.emitSFrameCalculateFuncOffset(DFrame.Begin, FREs.back().Label, nullptr, + SMLoc()); // sfde_func_rep_size. 
Not relevant in non-PCMASK fdes. S.emitInt8(0); @@ -96,13 +167,16 @@ struct SFrameFDE { class SFrameEmitterImpl { MCObjectStreamer &Streamer; SmallVector FDEs; + uint32_t TotalFREs; ABI SFrameABI; // Target-specific convenience variables to detect when a CFI instruction // references these registers. Unlike in dwarf frame descriptions, they never - // escape into the sframe section itself. + // escape into the sframe section itself. TODO: These should be retrieved from + // the target. unsigned SPReg; unsigned FPReg; unsigned RAReg; + int8_t FixedRAOffset; MCSymbol *FDESubSectionStart; MCSymbol *FRESubSectionStart; MCSymbol *FRESubSectionEnd; @@ -110,12 +184,12 @@ class SFrameEmitterImpl { bool setCFARegister(SFrameFRE &FRE, const MCCFIInstruction &I) { if (I.getRegister() == SPReg) { FRE.CFARegSet = true; - FRE.FromFP = false; + FRE.Info.setBaseRegister(BaseReg::SP); return true; } if (I.getRegister() == FPReg) { FRE.CFARegSet = true; - FRE.FromFP = true; + FRE.Info.setBaseRegister(BaseReg::FP); return true; } Streamer.getContext().reportWarning( @@ -182,7 +256,8 @@ class SFrameEmitterImpl { } public: - SFrameEmitterImpl(MCObjectStreamer &Streamer) : Streamer(Streamer) { + SFrameEmitterImpl(MCObjectStreamer &Streamer) + : Streamer(Streamer), TotalFREs(0) { assert(Streamer.getContext() .getObjectFileInfo() ->getSFrameABIArch() @@ -195,6 +270,7 @@ class SFrameEmitterImpl { SPReg = 31; RAReg = 29; FPReg = 30; + FixedRAOffset = 0; break; case ABI::AMD64EndianLittle: SPReg = 7; @@ -202,6 +278,7 @@ class SFrameEmitterImpl { // MCDwarfFrameInfo constructor. 
RAReg = static_cast(INT_MAX); FPReg = 6; + FixedRAOffset = -8; break; } @@ -219,10 +296,16 @@ class SFrameEmitterImpl { bool equalIgnoringLocation(const SFrameFRE &Left, const SFrameFRE &Right) { return Left.CFAOffset == Right.CFAOffset && Left.FPOffset == Right.FPOffset && Left.RAOffset == Right.RAOffset && - Left.FromFP == Right.FromFP && Left.CFARegSet == Right.CFARegSet; + Left.Info.getFREInfo() == Right.Info.getFREInfo() && + Left.CFARegSet == Right.CFARegSet; } void buildSFDE(const MCDwarfFrameInfo &DF) { + // Functions with zero size can happen with assembler macros and + // machine-generated code. They don't need unwind info at all, so + // no need to warn. + if (atSameLocation(DF.Begin, DF.End)) + return; bool Valid = true; SFrameFDE FDE(DF, Streamer.getContext().createTempSymbol()); // This would have been set via ".cfi_return_column", but @@ -277,8 +360,11 @@ class SFrameEmitterImpl { LastLabel = L; } } - if (Valid) + + if (Valid) { FDEs.push_back(FDE); + TotalFREs += FDE.FREs.size(); + } } void emitPreamble() { @@ -294,13 +380,12 @@ class SFrameEmitterImpl { // sfh_cfa_fixed_fp_offset Streamer.emitInt8(0); // sfh_cfa_fixed_ra_offset - Streamer.emitInt8(0); + Streamer.emitInt8(FixedRAOffset); // sfh_auxhdr_len Streamer.emitInt8(0); // shf_num_fdes Streamer.emitInt32(FDEs.size()); // shf_num_fres - uint32_t TotalFREs = 0; Streamer.emitInt32(TotalFREs); // shf_fre_len @@ -322,8 +407,11 @@ class SFrameEmitterImpl { void emitFREs() { Streamer.emitLabel(FRESubSectionStart); - for (auto &FDE : FDEs) + for (auto &FDE : FDEs) { Streamer.emitLabel(FDE.FREStart); + for (auto &FRE : FDE.FREs) + FRE.emit(Streamer, FDE.DFrame.Begin, FDE.Frag); + } Streamer.emitLabel(FRESubSectionEnd); } }; @@ -359,3 +447,55 @@ void MCSFrameEmitter::emit(MCObjectStreamer &Streamer) { Emitter.emitFDEs(); Emitter.emitFREs(); } + +void MCSFrameEmitter::encodeFuncOffset(MCContext &C, uint64_t Offset, + SmallVectorImpl &Out, + MCFragment *FDEFrag) { + // If encoding into the FDE Frag 
itself, generate the sfde_func_info. + if (FDEFrag == nullptr) { + // sfde_func_info + + // Offset is the difference between the function start label and the final + // FRE's offset, which is the max offset for this FDE. + FDEInfo I; + I.Info = 0; + if (isUInt<8>(Offset)) + I.setFREType(FREType::Addr1); + else if (isUInt<16>(Offset)) + I.setFREType(FREType::Addr2); + else { + assert(isUInt<32>(Offset)); + I.setFREType(FREType::Addr4); + } + I.setFDEType(FDEType::PCInc); + // TODO: When we support pauth keys, this will need to be retrieved + // from the frag itself. + I.setPAuthKey(0); + + Out.push_back(I.getFuncInfo()); + return; + } + + const auto &FDEData = FDEFrag->getVarContents(); + FDEInfo I; + I.Info = FDEData.back(); + FREType T = I.getFREType(); + llvm::endianness E = C.getAsmInfo()->isLittleEndian() + ? llvm::endianness::little + : llvm::endianness::big; + // sfre_start_address + switch (T) { + case FREType::Addr1: + assert(isUInt<8>(Offset) && "Miscalculated Sframe FREType"); + support::endian::write(Out, Offset, E); + break; + case FREType::Addr2: + assert(isUInt<16>(Offset) && "Miscalculated Sframe FREType"); + support::endian::write(Out, Offset, E); + break; + case FREType::Addr4: + assert(isUInt<32>(Offset) && "Miscalculated Sframe FREType"); + support::endian::write(Out, Offset, E); + break; + } +} diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp index 92f31c909efd4..753f805824cea 100644 --- a/llvm/lib/Object/Archive.cpp +++ b/llvm/lib/Object/Archive.cpp @@ -473,9 +473,7 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err) } Header = Parent->createArchiveMemberHeader( - Start, - Parent ? 
Parent->getData().size() - (Start - Parent->getData().data()) - : 0, + Start, Parent->getData().size() - (Start - Parent->getData().data()), Err); // If we are pointed to real data, Start is not a nullptr, then there must be diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 1078b1188bb66..910383816f43b 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -274,13 +274,8 @@ Error DXContainerWriter::writeParts(raw_ostream &OS) { for (DXContainerYAML::RootParameterLocationYaml &L : P.RootSignature->Parameters.Locations) { - assert(dxbc::isValidParameterType(L.Header.Type) && - "invalid DXContainer YAML"); - assert(dxbc::isValidShaderVisibility(L.Header.Visibility) && - "invalid DXContainer YAML"); - dxbc::RootParameterType Type = dxbc::RootParameterType(L.Header.Type); - dxbc::ShaderVisibility Visibility = - dxbc::ShaderVisibility(L.Header.Visibility); + const dxbc::RootParameterType Type = L.Header.Type; + const dxbc::ShaderVisibility Visibility = L.Header.Visibility; switch (Type) { case dxbc::RootParameterType::Constants32Bit: { @@ -313,10 +308,8 @@ Error DXContainerWriter::writeParts(raw_ostream &OS) { P.RootSignature->Parameters.getOrInsertTable(L); mcdxbc::DescriptorTable Table; for (const auto &R : TableYaml.Ranges) { - assert(dxbc::isValidRangeType(R.RangeType) && - "Invalid Descriptor Range Type"); mcdxbc::DescriptorRange Range; - Range.RangeType = dxil::ResourceClass(R.RangeType); + Range.RangeType = R.RangeType; Range.NumDescriptors = R.NumDescriptors; Range.BaseShaderRegister = R.BaseShaderRegister; Range.RegisterSpace = R.RegisterSpace; @@ -335,7 +328,7 @@ Error DXContainerWriter::writeParts(raw_ostream &OS) { } for (const auto &Param : P.RootSignature->samplers()) { - dxbc::RTS0::v1::StaticSampler NewSampler; + mcdxbc::StaticSampler NewSampler; NewSampler.Filter = Param.Filter; NewSampler.AddressU = Param.AddressU; NewSampler.AddressV = 
Param.AddressV; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 32b502ed4e21f..22674b1ceb734 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -60,7 +60,10 @@ readDescriptorRanges(DXContainerYAML::RootParameterHeaderYaml &Header, NewR.NumDescriptors = R.NumDescriptors; NewR.BaseShaderRegister = R.BaseShaderRegister; NewR.RegisterSpace = R.RegisterSpace; - NewR.RangeType = R.RangeType; + if (!dxbc::isValidRangeType(R.RangeType)) + return createStringError(std::errc::invalid_argument, + "Invalid value for descriptor range type"); + NewR.RangeType = dxil::ResourceClass(R.RangeType); if constexpr (std::is_same_v) { // Set all flag fields for v2 #define DESCRIPTOR_RANGE_FLAG(Num, Enum, Flag) \ @@ -94,15 +97,14 @@ DXContainerYAML::RootSignatureYamlDesc::create( return createStringError(std::errc::invalid_argument, "Invalid value for parameter type"); - RootParameterHeaderYaml Header(PH.ParameterType); + RootParameterHeaderYaml Header(dxbc::RootParameterType(PH.ParameterType)); Header.Offset = PH.ParameterOffset; - Header.Type = PH.ParameterType; if (!dxbc::isValidShaderVisibility(PH.ShaderVisibility)) return createStringError(std::errc::invalid_argument, "Invalid value for shader visibility"); - Header.Visibility = PH.ShaderVisibility; + Header.Visibility = dxbc::ShaderVisibility(PH.ShaderVisibility); llvm::Expected ParamViewOrErr = Data.getParameter(PH); @@ -162,20 +164,50 @@ DXContainerYAML::RootSignatureYamlDesc::create( } for (const auto &S : Data.samplers()) { + if (!dxbc::isValidSamplerFilter(S.Filter)) + return createStringError(std::errc::invalid_argument, + "Invalid value for static sampler filter"); + + if (!dxbc::isValidAddress(S.AddressU)) + return createStringError(std::errc::invalid_argument, + "Invalid value for static sampler AddressU"); + + if (!dxbc::isValidAddress(S.AddressV)) + return createStringError(std::errc::invalid_argument, + "Invalid value 
for static sampler AddressV"); + + if (!dxbc::isValidAddress(S.AddressW)) + return createStringError(std::errc::invalid_argument, + "Invalid value for static sampler AddressW"); + + if (!dxbc::isValidComparisonFunc(S.ComparisonFunc)) + return createStringError( + std::errc::invalid_argument, + "Invalid value for static sampler ComparisonFunc"); + + if (!dxbc::isValidBorderColor(S.BorderColor)) + return createStringError(std::errc::invalid_argument, + "Invalid value for static sampler BorderColor"); + + if (!dxbc::isValidShaderVisibility(S.ShaderVisibility)) + return createStringError( + std::errc::invalid_argument, + "Invalid value for static sampler ShaderVisibility"); + StaticSamplerYamlDesc NewS; - NewS.Filter = S.Filter; - NewS.AddressU = S.AddressU; - NewS.AddressV = S.AddressV; - NewS.AddressW = S.AddressW; + NewS.Filter = dxbc::SamplerFilter(S.Filter); + NewS.AddressU = dxbc::TextureAddressMode(S.AddressU); + NewS.AddressV = dxbc::TextureAddressMode(S.AddressV); + NewS.AddressW = dxbc::TextureAddressMode(S.AddressW); NewS.MipLODBias = S.MipLODBias; NewS.MaxAnisotropy = S.MaxAnisotropy; - NewS.ComparisonFunc = S.ComparisonFunc; - NewS.BorderColor = S.BorderColor; + NewS.ComparisonFunc = dxbc::ComparisonFunc(S.ComparisonFunc); + NewS.BorderColor = dxbc::StaticBorderColor(S.BorderColor); NewS.MinLOD = S.MinLOD; NewS.MaxLOD = S.MaxLOD; NewS.ShaderRegister = S.ShaderRegister; NewS.RegisterSpace = S.RegisterSpace; - NewS.ShaderVisibility = S.ShaderVisibility; + NewS.ShaderVisibility = dxbc::ShaderVisibility(S.ShaderVisibility); RootSigDesc.StaticSamplers.push_back(NewS); } @@ -425,21 +457,21 @@ void MappingContextTraits::enumeration( IO.enumCase(Value, E.Name.str().c_str(), E.Value); } +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::RootParameterType &Value) { + for (const auto &E : dxbc::getRootParameterTypes()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxil::ResourceClass &Value) { 
+ const EnumEntry ResourceClasses[] = { + {"CBuffer", dxil::ResourceClass::CBuffer}, + {"SRV", dxil::ResourceClass::SRV}, + {"UAV", dxil::ResourceClass::UAV}, + {"Sampler", dxil::ResourceClass::Sampler}, + }; + + for (const auto &E : ResourceClasses) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::SamplerFilter &Value) { + for (const auto &E : dxbc::getSamplerFilters()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::StaticBorderColor &Value) { + for (const auto &E : dxbc::getStaticBorderColors()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::TextureAddressMode &Value) { + for (const auto &E : dxbc::getTextureAddressModes()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::ShaderVisibility &Value) { + for (const auto &E : dxbc::getShaderVisibility()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::ComparisonFunc &Value) { + for (const auto &E : dxbc::getComparisonFuncs()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + } // namespace yaml void DXContainerYAML::PSVInfo::mapInfoForVersion(yaml::IO &IO) { diff --git a/llvm/lib/ObjectYAML/GOFFEmitter.cpp b/llvm/lib/ObjectYAML/GOFFEmitter.cpp index 7e94ac609a030..c26893cfaa720 100644 --- a/llvm/lib/ObjectYAML/GOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/GOFFEmitter.cpp @@ -38,8 +38,8 @@ template struct BinaryBeImpl { template raw_ostream &operator<<(raw_ostream &OS, const BinaryBeImpl &BBE) { char Buffer[sizeof(BBE.Value)]; - support::endian::write( - Buffer, BBE.Value); + support::endian::write(Buffer, BBE.Value, + llvm::endianness::big); OS.write(Buffer, sizeof(BBE.Value)); return OS; } diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp 
b/llvm/lib/Passes/PassBuilderPipelines.cpp index 98821bb1408a7..c3f35f0f5e7fa 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -104,6 +104,7 @@ #include "llvm/Transforms/Scalar/LoopDeletion.h" #include "llvm/Transforms/Scalar/LoopDistribute.h" #include "llvm/Transforms/Scalar/LoopFlatten.h" +#include "llvm/Transforms/Scalar/LoopFuse.h" #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/Transforms/Scalar/LoopInstSimplify.h" #include "llvm/Transforms/Scalar/LoopInterchange.h" @@ -313,6 +314,7 @@ PipelineTuningOptions::PipelineTuningOptions() { SLPVectorization = false; LoopUnrolling = true; LoopInterchange = EnableLoopInterchange; + LoopFusion = false; ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; @@ -1551,6 +1553,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, OptimizePM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); + // FIXME: This may not be the right place in the pipeline. + // We need to have the data to support the right place. + if (PTO.LoopFusion) + OptimizePM.addPass(LoopFusePass()); + // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. 
This is // currently only performed for loops marked with the metadata @@ -2355,4 +2362,4 @@ AAManager PassBuilder::buildDefaultAAPipeline() { bool PassBuilder::isInstrumentedPGOUse() const { return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) || !UseCtxProfile.empty(); -} \ No newline at end of file +} diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index 60c1393616713..ac7513ef2cb49 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -47,6 +47,24 @@ bool FunctionSamples::ProfileIsPreInlined = false; bool FunctionSamples::UseMD5 = false; bool FunctionSamples::HasUniqSuffix = true; bool FunctionSamples::ProfileIsFS = false; + +std::error_code +serializeTypeMap(const TypeCountMap &Map, + const MapVector &NameTable, + raw_ostream &OS) { + encodeULEB128(Map.size(), OS); + for (const auto &[TypeName, SampleCount] : Map) { + if (auto NameIndexIter = NameTable.find(TypeName); + NameIndexIter != NameTable.end()) { + encodeULEB128(NameIndexIter->second, OS); + } else { + // If the type is not in the name table, we cannot serialize it. + return sampleprof_error::truncated_name_table; + } + encodeULEB128(SampleCount, OS); + } + return sampleprof_error::success; +} } // namespace sampleprof } // namespace llvm @@ -178,6 +196,17 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, return OS; } +static void printTypeCountMap(raw_ostream &OS, LineLocation Loc, + const TypeCountMap &TypeCountMap) { + if (TypeCountMap.empty()) { + return; + } + OS << Loc << ": vtables: "; + for (const auto &[Type, Count] : TypeCountMap) + OS << Type << ":" << Count << " "; + OS << "\n"; +} + /// Print the samples collected for a function on stream \p OS. 
void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const { if (getFunctionHash()) @@ -192,7 +221,13 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const { SampleSorter SortedBodySamples(BodySamples); for (const auto &SI : SortedBodySamples.get()) { OS.indent(Indent + 2); + const auto &Loc = SI->first; OS << SI->first << ": " << SI->second; + if (const TypeCountMap *TypeCountMap = + this->findCallsiteTypeSamplesAt(Loc)) { + OS.indent(Indent + 2); + printTypeCountMap(OS, Loc, *TypeCountMap); + } } OS.indent(Indent); OS << "}\n"; @@ -214,6 +249,11 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const { OS << Loc << ": inlined callee: " << FuncSample.getFunction() << ": "; FuncSample.print(OS, Indent + 4); } + auto TypeSamplesIter = VirtualCallsiteTypeCounts.find(Loc); + if (TypeSamplesIter != VirtualCallsiteTypeCounts.end()) { + OS.indent(Indent + 2); + printTypeCountMap(OS, Loc, TypeSamplesIter->second); + } } OS.indent(Indent); OS << "}\n"; diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 12769a391286c..81ae792e70b99 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -197,8 +197,37 @@ enum class LineType { CallSiteProfile, BodyProfile, Metadata, + VirtualCallTypeProfile, }; +// Parse `Input` as a white-space separated list of `vtable:count` pairs. An +// example input line is `_ZTVbar:1471 _ZTVfoo:630`. +static bool parseTypeCountMap(StringRef Input, + DenseMap &TypeCountMap) { + for (size_t Index = Input.find_first_not_of(' '); Index != StringRef::npos;) { + size_t ColonIndex = Input.find(':', Index); + if (ColonIndex == StringRef::npos) + return false; // No colon found, invalid format. + StringRef TypeName = Input.substr(Index, ColonIndex - Index); + // CountIndex is the start index of count. + size_t CountStartIndex = ColonIndex + 1; + // NextIndex is the start index after the 'target:count' pair. 
+ size_t NextIndex = Input.find_first_of(' ', CountStartIndex); + uint64_t Count; + if (Input.substr(CountStartIndex, NextIndex - CountStartIndex) + .getAsInteger(10, Count)) + return false; // Invalid count. + // Error on duplicated type names in one line of input. + auto [Iter, Inserted] = TypeCountMap.insert({TypeName, Count}); + if (!Inserted) + return false; + Index = (NextIndex == StringRef::npos) + ? StringRef::npos + : Input.find_first_not_of(' ', NextIndex); + } + return true; +} + /// Parse \p Input as line sample. /// /// \param Input input line. @@ -215,6 +244,7 @@ static bool ParseLine(const StringRef &Input, LineType &LineTy, uint32_t &Depth, uint64_t &NumSamples, uint32_t &LineOffset, uint32_t &Discriminator, StringRef &CalleeName, DenseMap &TargetCountMap, + DenseMap &TypeCountMap, uint64_t &FunctionHash, uint32_t &Attributes, bool &IsFlat) { for (Depth = 0; Input[Depth] == ' '; Depth++) @@ -306,6 +336,10 @@ static bool ParseLine(const StringRef &Input, LineType &LineTy, uint32_t &Depth, // Change n3 to the next blank space after colon + integer pair. n3 = n4; } + } else if (Rest.starts_with(kVTableProfPrefix)) { + LineTy = LineType::VirtualCallTypeProfile; + return parseTypeCountMap(Rest.substr(strlen(kVTableProfPrefix)), + TypeCountMap); } else { LineTy = LineType::CallSiteProfile; size_t n3 = Rest.find_last_of(':'); @@ -374,19 +408,27 @@ std::error_code SampleProfileReaderText::readImpl() { uint64_t NumSamples; StringRef FName; DenseMap TargetCountMap; + DenseMap TypeCountMap; uint32_t Depth, LineOffset, Discriminator; LineType LineTy = LineType::BodyProfile; uint64_t FunctionHash = 0; uint32_t Attributes = 0; bool IsFlat = false; + // TODO: Update ParseLine to return an error code instead of a bool and + // report it. 
if (!ParseLine(*LineIt, LineTy, Depth, NumSamples, LineOffset, - Discriminator, FName, TargetCountMap, FunctionHash, - Attributes, IsFlat)) { + Discriminator, FName, TargetCountMap, TypeCountMap, + FunctionHash, Attributes, IsFlat)) { switch (LineTy) { case LineType::Metadata: reportError(LineIt.line_number(), "Cannot parse metadata: " + *LineIt); break; + case LineType::VirtualCallTypeProfile: + reportError(LineIt.line_number(), + "Expected 'vtables [mangled_vtable:NUM]+', found " + + *LineIt); + break; default: reportError(LineIt.line_number(), "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + @@ -417,6 +459,14 @@ std::error_code SampleProfileReaderText::readImpl() { DepthMetadata = 0; break; } + + case LineType::VirtualCallTypeProfile: { + mergeSampleProfErrors( + Result, InlineStack.back()->addCallsiteVTableTypeProfAt( + LineLocation(LineOffset, Discriminator), TypeCountMap)); + break; + } + case LineType::BodyProfile: { FunctionSamples &FProfile = *InlineStack.back(); for (const auto &name_count : TargetCountMap) { @@ -598,6 +648,67 @@ SampleProfileReaderBinary::readSampleContextFromTable() { return std::make_pair(Context, Hash); } +std::error_code +SampleProfileReaderBinary::readVTableTypeCountMap(TypeCountMap &M) { + auto NumVTableTypes = readNumber(); + if (std::error_code EC = NumVTableTypes.getError()) + return EC; + + for (uint32_t I = 0; I < *NumVTableTypes; ++I) { + auto VTableType(readStringFromTable()); + if (std::error_code EC = VTableType.getError()) + return EC; + + auto VTableSamples = readNumber(); + if (std::error_code EC = VTableSamples.getError()) + return EC; + // The source profile should not have duplicate vtable records at the same + // location. In case duplicate vtables are found, reader can emit a warning + // but continue processing the profile. 
+ if (!M.insert(std::make_pair(*VTableType, *VTableSamples)).second) { + Ctx.diagnose(DiagnosticInfoSampleProfile( + Buffer->getBufferIdentifier(), 0, + "Duplicate vtable type " + VTableType->str() + + " at the same location. Additional counters will be ignored.", + DS_Warning)); + continue; + } + } + return sampleprof_error::success; +} + +std::error_code +SampleProfileReaderBinary::readCallsiteVTableProf(FunctionSamples &FProfile) { + assert(ReadVTableProf && + "Cannot read vtable profiles if ReadVTableProf is false"); + + // Read the vtable type profile for the callsite. + auto NumCallsites = readNumber(); + if (std::error_code EC = NumCallsites.getError()) + return EC; + + for (uint32_t I = 0; I < *NumCallsites; ++I) { + auto LineOffset = readNumber(); + if (std::error_code EC = LineOffset.getError()) + return EC; + + if (!isOffsetLegal(*LineOffset)) + return sampleprof_error::illegal_line_offset; + + auto Discriminator = readNumber(); + if (std::error_code EC = Discriminator.getError()) + return EC; + + // Here we handle FS discriminators: + const uint32_t DiscriminatorVal = (*Discriminator) & getDiscriminatorMask(); + + if (std::error_code EC = readVTableTypeCountMap(FProfile.getTypeSamplesAt( + LineLocation(*LineOffset, DiscriminatorVal)))) + return EC; + } + return sampleprof_error::success; +} + std::error_code SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) { auto NumSamples = readNumber(); @@ -678,6 +789,9 @@ SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) { return EC; } + if (ReadVTableProf) + return readCallsiteVTableProf(FProfile); + return sampleprof_error::success; } @@ -740,6 +854,8 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection( FunctionSamples::ProfileIsPreInlined = ProfileIsPreInlined = true; if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator)) FunctionSamples::ProfileIsFS = ProfileIsFS = true; + if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagHasVTableTypeProf)) + 
ReadVTableProf = true; break; case SecNameTable: { bool FixedLengthMD5 = diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp index 9173a0f94f69d..e5f31348578b8 100644 --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -41,6 +41,11 @@ using namespace llvm; using namespace sampleprof; +// To begin with, make this option off by default. +static cl::opt ExtBinaryWriteVTableTypeProf( + "extbinary-write-vtable-type-prof", cl::init(false), cl::Hidden, + cl::desc("Write vtable type profile in ext-binary sample profile writer")); + namespace llvm { namespace support { namespace endian { @@ -435,6 +440,9 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection( addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagIsPreInlined); if (Type == SecProfSummary && FunctionSamples::ProfileIsFS) addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFSDiscriminator); + if (Type == SecProfSummary && ExtBinaryWriteVTableTypeProf) + addSectionFlag(SecProfSummary, + SecProfSummaryFlags::SecFlagHasVTableTypeProf); uint64_t SectionStart = markSectionStart(Type, LayoutIdx); switch (Type) { @@ -478,6 +486,12 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection( return sampleprof_error::success; } +SampleProfileWriterExtBinary::SampleProfileWriterExtBinary( + std::unique_ptr &OS) + : SampleProfileWriterExtBinaryBase(OS) { + WriteVTableProf = ExtBinaryWriteVTableTypeProf; +} + std::error_code SampleProfileWriterExtBinary::writeDefaultLayout( const SampleProfileMap &ProfileMap) { // The const indices passed to writeOneSection below are specifying the @@ -587,6 +601,19 @@ std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { OS << " " << J.first << ":" << J.second; OS << "\n"; LineCount++; + + if (const TypeCountMap *Map = S.findCallsiteTypeSamplesAt(Loc); + Map && !Map->empty()) { + OS.indent(Indent + 1); + Loc.print(OS); + OS << ": "; + 
OS << kVTableProfPrefix; + for (const auto [TypeName, Count] : *Map) { + OS << TypeName << ":" << Count << " "; + } + OS << "\n"; + LineCount++; + } } SampleSorter SortedCallsiteSamples( @@ -603,7 +630,21 @@ std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { if (std::error_code EC = writeSample(CalleeSamples)) return EC; } + + if (const TypeCountMap *Map = S.findCallsiteTypeSamplesAt(Loc); + Map && !Map->empty()) { + OS.indent(Indent); + Loc.print(OS); + OS << ": "; + OS << kVTableProfPrefix; + for (const auto [TypeId, Count] : *Map) { + OS << TypeId << ":" << Count << " "; + } + OS << "\n"; + LineCount++; + } } + Indent -= 1; if (FunctionSamples::ProfileIsProbeBased) { @@ -663,6 +704,17 @@ void SampleProfileWriterBinary::addNames(const FunctionSamples &S) { addName(CalleeSamples.getFunction()); addNames(CalleeSamples); } + + if (!WriteVTableProf) + return; + // Add all the vtable names to NameTable. + for (const auto &VTableAccessCountMap : + llvm::make_second_range(S.getCallsiteTypeCounts())) { + // Add type name to NameTable. 
+ for (const auto Type : llvm::make_first_range(VTableAccessCountMap)) { + addName(Type); + } + } } void SampleProfileWriterExtBinaryBase::addContext( @@ -801,6 +853,22 @@ std::error_code SampleProfileWriterExtBinaryBase::writeHeader( return sampleprof_error::success; } +std::error_code SampleProfileWriterBinary::writeCallsiteVTableProf( + const CallsiteTypeMap &CallsiteTypeMap, raw_ostream &OS) { + assert(WriteVTableProf && + "writeCallsiteVTableProf should not be called if WriteVTableProf is " + "false"); + + encodeULEB128(CallsiteTypeMap.size(), OS); + for (const auto &[Loc, TypeMap] : CallsiteTypeMap) { + Loc.serialize(OS); + if (std::error_code EC = serializeTypeMap(TypeMap, getNameTable(), OS)) + return EC; + } + + return sampleprof_error::success; +} + std::error_code SampleProfileWriterBinary::writeSummary() { auto &OS = *OutputStream; encodeULEB128(Summary->getTotalCount(), OS); @@ -838,14 +906,16 @@ std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) { for (const auto &J : S.getCallsiteSamples()) NumCallsites += J.second.size(); encodeULEB128(NumCallsites, OS); - for (const auto &[Loc, CalleeFunctionSampleMap] : S.getCallsiteSamples()) - for (const auto &FunctionSample : - llvm::make_second_range(CalleeFunctionSampleMap)) { - Loc.serialize(OS); - if (std::error_code EC = writeBody(FunctionSample)) + for (const auto &J : S.getCallsiteSamples()) + for (const auto &FS : J.second) { + J.first.serialize(OS); + if (std::error_code EC = writeBody(FS.second)) return EC; } + if (WriteVTableProf) + return writeCallsiteVTableProf(S.getCallsiteTypeCounts(), OS); + return sampleprof_error::success; } diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.cpp b/llvm/lib/Remarks/BitstreamRemarkParser.cpp index 86a6c6dffb187..d40b40dfb2ba0 100644 --- a/llvm/lib/Remarks/BitstreamRemarkParser.cpp +++ b/llvm/lib/Remarks/BitstreamRemarkParser.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// 
#include "BitstreamRemarkParser.h" -#include "llvm/Remarks/Remark.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include @@ -20,27 +19,68 @@ using namespace llvm; using namespace llvm::remarks; -static Error unknownRecord(const char *BlockName, unsigned RecordID) { - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing %s: unknown record entry (%lu).", BlockName, - RecordID); +namespace { + +template Error error(char const *Fmt, const Ts &...Vals) { + std::string Buffer; + raw_string_ostream OS(Buffer); + OS << formatv(Fmt, Vals...); + return make_error( + std::move(Buffer), + std::make_error_code(std::errc::illegal_byte_sequence)); +} + +} // namespace + +Error BitstreamBlockParserHelperBase::unknownRecord(unsigned AbbrevID) { + return error("Unknown record entry ({}).", AbbrevID); +} + +Error BitstreamBlockParserHelperBase::unexpectedRecord(StringRef RecordName) { + return error("Unexpected record entry ({}).", RecordName); +} + +Error BitstreamBlockParserHelperBase::malformedRecord(StringRef RecordName) { + return error("Malformed record entry ({}).", RecordName); +} + +Error BitstreamBlockParserHelperBase::unexpectedBlock(unsigned Code) { + return error("Unexpected subblock ({}).", Code); } -static Error malformedRecord(const char *BlockName, const char *RecordName) { - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing %s: malformed record entry (%s).", BlockName, - RecordName); +static Expected expectSubBlock(BitstreamCursor &Stream) { + Expected Next = Stream.advance(); + if (!Next) + return Next.takeError(); + switch (Next->Kind) { + case BitstreamEntry::SubBlock: + return Next->ID; + case BitstreamEntry::Record: + case BitstreamEntry::EndBlock: + return error("Expected subblock, but got unexpected record."); + case BitstreamEntry::Error: + return error("Expected subblock, but got unexpected end of bitstream."); + } + 
llvm_unreachable("Unexpected BitstreamEntry"); } -BitstreamMetaParserHelper::BitstreamMetaParserHelper( - BitstreamCursor &Stream, BitstreamBlockInfo &BlockInfo) - : Stream(Stream), BlockInfo(BlockInfo) {} +Error BitstreamBlockParserHelperBase::expectBlock() { + auto MaybeBlockID = expectSubBlock(Stream); + if (!MaybeBlockID) + return MaybeBlockID.takeError(); + if (*MaybeBlockID != BlockID) + return error("Expected {} block, but got unexpected block ({}).", BlockName, + *MaybeBlockID); + return Error::success(); +} -/// Parse a record and fill in the fields in the parser. -static Error parseRecord(BitstreamMetaParserHelper &Parser, unsigned Code) { - BitstreamCursor &Stream = Parser.Stream; +Error BitstreamBlockParserHelperBase::enterBlock() { + if (Stream.EnterSubBlock(BlockID)) + return error("Error while entering {} block.", BlockName); + return Error::success(); +} + +Error BitstreamMetaParserHelper::parseRecord(unsigned Code) { // Note: 2 is used here because it's the max number of fields we have per // record. SmallVector Record; @@ -52,171 +92,132 @@ static Error parseRecord(BitstreamMetaParserHelper &Parser, unsigned Code) { switch (*RecordID) { case RECORD_META_CONTAINER_INFO: { if (Record.size() != 2) - return malformedRecord("BLOCK_META", "RECORD_META_CONTAINER_INFO"); - Parser.ContainerVersion = Record[0]; - Parser.ContainerType = Record[1]; + return malformedRecord(MetaContainerInfoName); + Container = {Record[0], Record[1]}; + // Error immediately if container version is outdated, so the user sees an + // explanation instead of a parser error. + if (Container->Version != CurrentContainerVersion) { + return ::error( + "Unsupported remark container version (expected: {}, read: {}). 
" + "Please upgrade/downgrade your toolchain to read this container.", + CurrentContainerVersion, Container->Version); + } break; } case RECORD_META_REMARK_VERSION: { if (Record.size() != 1) - return malformedRecord("BLOCK_META", "RECORD_META_REMARK_VERSION"); - Parser.RemarkVersion = Record[0]; + return malformedRecord(MetaRemarkVersionName); + RemarkVersion = Record[0]; + // Error immediately if remark version is outdated, so the user sees an + // explanation instead of a parser error. + if (*RemarkVersion != CurrentRemarkVersion) { + return ::error( + "Unsupported remark version in container (expected: {}, read: {}). " + "Please upgrade/downgrade your toolchain to read this container.", + CurrentRemarkVersion, *RemarkVersion); + } break; } case RECORD_META_STRTAB: { if (Record.size() != 0) - return malformedRecord("BLOCK_META", "RECORD_META_STRTAB"); - Parser.StrTabBuf = Blob; + return malformedRecord(MetaStrTabName); + StrTabBuf = Blob; break; } case RECORD_META_EXTERNAL_FILE: { if (Record.size() != 0) - return malformedRecord("BLOCK_META", "RECORD_META_EXTERNAL_FILE"); - Parser.ExternalFilePath = Blob; + return malformedRecord(MetaExternalFileName); + ExternalFilePath = Blob; break; } default: - return unknownRecord("BLOCK_META", *RecordID); + return unknownRecord(*RecordID); } return Error::success(); } -BitstreamRemarkParserHelper::BitstreamRemarkParserHelper( - BitstreamCursor &Stream) - : Stream(Stream) {} - -/// Parse a record and fill in the fields in the parser. -static Error parseRecord(BitstreamRemarkParserHelper &Parser, unsigned Code) { - BitstreamCursor &Stream = Parser.Stream; - // Note: 5 is used here because it's the max number of fields we have per - // record. 
- SmallVector Record; - StringRef Blob; - Expected RecordID = Stream.readRecord(Code, Record, &Blob); - if (!RecordID) - return RecordID.takeError(); +Error BitstreamRemarkParserHelper::parseRecord(unsigned Code) { + Record.clear(); + Expected MaybeRecordID = + Stream.readRecord(Code, Record, &RecordBlob); + if (!MaybeRecordID) + return MaybeRecordID.takeError(); + RecordID = *MaybeRecordID; + return handleRecord(); +} - switch (*RecordID) { +Error BitstreamRemarkParserHelper::handleRecord() { + switch (RecordID) { case RECORD_REMARK_HEADER: { if (Record.size() != 4) - return malformedRecord("BLOCK_REMARK", "RECORD_REMARK_HEADER"); - Parser.Type = Record[0]; - Parser.RemarkNameIdx = Record[1]; - Parser.PassNameIdx = Record[2]; - Parser.FunctionNameIdx = Record[3]; + return malformedRecord(RemarkHeaderName); + Type = Record[0]; + RemarkNameIdx = Record[1]; + PassNameIdx = Record[2]; + FunctionNameIdx = Record[3]; break; } case RECORD_REMARK_DEBUG_LOC: { if (Record.size() != 3) - return malformedRecord("BLOCK_REMARK", "RECORD_REMARK_DEBUG_LOC"); - Parser.SourceFileNameIdx = Record[0]; - Parser.SourceLine = Record[1]; - Parser.SourceColumn = Record[2]; + return malformedRecord(RemarkDebugLocName); + Loc = {Record[0], Record[1], Record[2]}; break; } case RECORD_REMARK_HOTNESS: { if (Record.size() != 1) - return malformedRecord("BLOCK_REMARK", "RECORD_REMARK_HOTNESS"); - Parser.Hotness = Record[0]; + return malformedRecord(RemarkHotnessName); + Hotness = Record[0]; break; } case RECORD_REMARK_ARG_WITH_DEBUGLOC: { if (Record.size() != 5) - return malformedRecord("BLOCK_REMARK", "RECORD_REMARK_ARG_WITH_DEBUGLOC"); - // Create a temporary argument. Use that as a valid memory location for this - // argument entry. 
- Parser.TmpArgs.emplace_back(); - Parser.TmpArgs.back().KeyIdx = Record[0]; - Parser.TmpArgs.back().ValueIdx = Record[1]; - Parser.TmpArgs.back().SourceFileNameIdx = Record[2]; - Parser.TmpArgs.back().SourceLine = Record[3]; - Parser.TmpArgs.back().SourceColumn = Record[4]; - Parser.Args = - ArrayRef(Parser.TmpArgs); + return malformedRecord(RemarkArgWithDebugLocName); + auto &Arg = Args.emplace_back(Record[0], Record[1]); + Arg.Loc = {Record[2], Record[3], Record[4]}; break; } case RECORD_REMARK_ARG_WITHOUT_DEBUGLOC: { if (Record.size() != 2) - return malformedRecord("BLOCK_REMARK", - "RECORD_REMARK_ARG_WITHOUT_DEBUGLOC"); - // Create a temporary argument. Use that as a valid memory location for this - // argument entry. - Parser.TmpArgs.emplace_back(); - Parser.TmpArgs.back().KeyIdx = Record[0]; - Parser.TmpArgs.back().ValueIdx = Record[1]; - Parser.Args = - ArrayRef(Parser.TmpArgs); + return malformedRecord(RemarkArgWithoutDebugLocName); + Args.emplace_back(Record[0], Record[1]); break; } default: - return unknownRecord("BLOCK_REMARK", *RecordID); + return unknownRecord(RecordID); } return Error::success(); } -template -static Error parseBlock(T &ParserHelper, unsigned BlockID, - const char *BlockName) { - BitstreamCursor &Stream = ParserHelper.Stream; - Expected Next = Stream.advance(); - if (!Next) - return Next.takeError(); - if (Next->Kind != BitstreamEntry::SubBlock || Next->ID != BlockID) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing %s: expecting [ENTER_SUBBLOCK, %s, ...].", - BlockName, BlockName); - if (Stream.EnterSubBlock(BlockID)) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while entering %s.", BlockName); - - // Stop when there is nothing to read anymore or when we encounter an - // END_BLOCK. 
- while (!Stream.AtEndOfStream()) { - Next = Stream.advance(); - if (!Next) - return Next.takeError(); - switch (Next->Kind) { - case BitstreamEntry::EndBlock: - return Error::success(); - case BitstreamEntry::Error: - case BitstreamEntry::SubBlock: - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing %s: expecting records.", BlockName); - case BitstreamEntry::Record: - if (Error E = parseRecord(ParserHelper, Next->ID)) - return E; - continue; - } - } - // If we're here, it means we didn't get an END_BLOCK yet, but we're at the - // end of the stream. In this case, error. - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing %s: unterminated block.", BlockName); -} - -Error BitstreamMetaParserHelper::parse() { - return parseBlock(*this, META_BLOCK_ID, "META_BLOCK"); -} +Error BitstreamRemarkParserHelper::parseNext() { + Type.reset(); + RemarkNameIdx.reset(); + PassNameIdx.reset(); + FunctionNameIdx.reset(); + Hotness.reset(); + Loc.reset(); + Args.clear(); -Error BitstreamRemarkParserHelper::parse() { - return parseBlock(*this, REMARK_BLOCK_ID, "REMARK_BLOCK"); + if (Error E = expectBlock()) + return E; + return parseBlock(); } BitstreamParserHelper::BitstreamParserHelper(StringRef Buffer) : Stream(Buffer) {} -Expected> BitstreamParserHelper::parseMagic() { +Error BitstreamParserHelper::expectMagic() { std::array Result; - for (unsigned i = 0; i < 4; ++i) + for (unsigned I = 0; I < 4; ++I) if (Expected R = Stream.Read(8)) - Result[i] = *R; + Result[I] = *R; else return R.takeError(); - return Result; + + StringRef MagicNumber{Result.data(), Result.size()}; + if (MagicNumber != remarks::ContainerMagic) + return error("Unknown magic number: expecting {}, got {}.", + remarks::ContainerMagic, MagicNumber); + return Error::success(); } Error BitstreamParserHelper::parseBlockInfoBlock() { @@ -225,8 +226,7 @@ Error BitstreamParserHelper::parseBlockInfoBlock() { 
return Next.takeError(); if (Next->Kind != BitstreamEntry::SubBlock || Next->ID != llvm::bitc::BLOCKINFO_BLOCK_ID) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), + return error( "Error while parsing BLOCKINFO_BLOCK: expecting [ENTER_SUBBLOCK, " "BLOCKINFO_BLOCK, ...]."); @@ -236,9 +236,7 @@ Error BitstreamParserHelper::parseBlockInfoBlock() { return MaybeBlockInfo.takeError(); if (!*MaybeBlockInfo) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCKINFO_BLOCK."); + return error("Missing BLOCKINFO_BLOCK."); BlockInfo = **MaybeBlockInfo; @@ -246,77 +244,17 @@ Error BitstreamParserHelper::parseBlockInfoBlock() { return Error::success(); } -static Expected isBlock(BitstreamCursor &Stream, unsigned BlockID) { - bool Result = false; - uint64_t PreviousBitNo = Stream.GetCurrentBitNo(); - Expected Next = Stream.advance(); - if (!Next) - return Next.takeError(); - switch (Next->Kind) { - case BitstreamEntry::SubBlock: - // Check for the block id. 
- Result = Next->ID == BlockID; - break; - case BitstreamEntry::Error: - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Unexpected error while parsing bitstream."); - default: - Result = false; - break; - } - if (Error E = Stream.JumpToBit(PreviousBitNo)) - return std::move(E); - return Result; -} - -Expected BitstreamParserHelper::isMetaBlock() { - return isBlock(Stream, META_BLOCK_ID); -} - -Expected BitstreamParserHelper::isRemarkBlock() { - return isBlock(Stream, META_BLOCK_ID); -} - -static Error validateMagicNumber(StringRef MagicNumber) { - if (MagicNumber != remarks::ContainerMagic) - return createStringError(std::make_error_code(std::errc::invalid_argument), - "Unknown magic number: expecting %s, got %.4s.", - remarks::ContainerMagic.data(), MagicNumber.data()); - return Error::success(); -} - -static Error advanceToMetaBlock(BitstreamParserHelper &Helper) { - Expected> MagicNumber = Helper.parseMagic(); - if (!MagicNumber) - return MagicNumber.takeError(); - if (Error E = validateMagicNumber( - StringRef(MagicNumber->data(), MagicNumber->size()))) +Error BitstreamParserHelper::advanceToMetaBlock() { + if (Error E = expectMagic()) return E; - if (Error E = Helper.parseBlockInfoBlock()) + if (Error E = parseBlockInfoBlock()) return E; - Expected isMetaBlock = Helper.isMetaBlock(); - if (!isMetaBlock) - return isMetaBlock.takeError(); - if (!*isMetaBlock) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Expecting META_BLOCK after the BLOCKINFO_BLOCK."); return Error::success(); } Expected> remarks::createBitstreamParserFromMeta( StringRef Buf, std::optional ExternalFilePrependPath) { - BitstreamParserHelper Helper(Buf); - Expected> MagicNumber = Helper.parseMagic(); - if (!MagicNumber) - return MagicNumber.takeError(); - - if (Error E = validateMagicNumber( - StringRef(MagicNumber->data(), MagicNumber->size()))) - return std::move(E); - auto Parser = std::make_unique(Buf); if 
(ExternalFilePrependPath) @@ -339,13 +277,13 @@ Expected> BitstreamRemarkParser::next() { } Error BitstreamRemarkParser::parseMeta() { - // Advance and to the meta block. - if (Error E = advanceToMetaBlock(ParserHelper)) + if (Error E = ParserHelper.advanceToMetaBlock()) return E; - BitstreamMetaParserHelper MetaHelper(ParserHelper.Stream, - ParserHelper.BlockInfo); - if (Error E = MetaHelper.parse()) + BitstreamMetaParserHelper MetaHelper(ParserHelper.Stream); + if (Error E = MetaHelper.expectBlock()) + return E; + if (Error E = MetaHelper.parseBlock()) return E; if (Error E = processCommonMeta(MetaHelper)) @@ -364,59 +302,41 @@ Error BitstreamRemarkParser::parseMeta() { Error BitstreamRemarkParser::processCommonMeta( BitstreamMetaParserHelper &Helper) { - if (std::optional Version = Helper.ContainerVersion) - ContainerVersion = *Version; - else - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_META: missing container version."); - - if (std::optional Type = Helper.ContainerType) { - // Always >= BitstreamRemarkContainerType::First since it's unsigned. - if (*Type > static_cast(BitstreamRemarkContainerType::Last)) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_META: invalid container type."); - - ContainerType = static_cast(*Type); - } else - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_META: missing container type."); - + if (!Helper.Container) + return Helper.error("Missing container info."); + auto &Container = *Helper.Container; + ContainerVersion = Container.Version; + // Always >= BitstreamRemarkContainerType::First since it's unsigned. 
+ if (Container.Type > static_cast(BitstreamRemarkContainerType::Last)) + return Helper.error("Invalid container type."); + ContainerType = static_cast(Container.Type); return Error::success(); } -static Error processStrTab(BitstreamRemarkParser &P, - std::optional StrTabBuf) { - if (!StrTabBuf) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_META: missing string table."); +Error BitstreamRemarkParser::processStrTab(BitstreamMetaParserHelper &Helper) { + if (!Helper.StrTabBuf) + return Helper.error("Missing string table."); // Parse and assign the string table. - P.StrTab.emplace(*StrTabBuf); + StrTab.emplace(*Helper.StrTabBuf); return Error::success(); } -static Error processRemarkVersion(BitstreamRemarkParser &P, - std::optional RemarkVersion) { - if (!RemarkVersion) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_META: missing remark version."); - P.RemarkVersion = *RemarkVersion; +Error BitstreamRemarkParser::processRemarkVersion( + BitstreamMetaParserHelper &Helper) { + if (!Helper.RemarkVersion) + return Helper.error("Missing remark version."); + RemarkVersion = *Helper.RemarkVersion; return Error::success(); } Error BitstreamRemarkParser::processExternalFilePath( - std::optional ExternalFilePath) { - if (!ExternalFilePath) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_META: missing external file path."); + BitstreamMetaParserHelper &Helper) { + if (!Helper.ExternalFilePath) + return Helper.error("Missing external file path."); + StringRef ExternalFilePath = *Helper.ExternalFilePath; SmallString<80> FullPath(ExternalFilePrependPath); - sys::path::append(FullPath, *ExternalFilePath); + sys::path::append(FullPath, ExternalFilePath); // External file: open the external file, parse it, check if its metadata // matches the one from the separate metadata, then 
replace the current parser @@ -435,32 +355,22 @@ Error BitstreamRemarkParser::processExternalFilePath( // Create a separate parser used for parsing the separate file. ParserHelper = BitstreamParserHelper(TmpRemarkBuffer->getBuffer()); // Advance and check until we can parse the meta block. - if (Error E = advanceToMetaBlock(ParserHelper)) + if (Error E = ParserHelper.advanceToMetaBlock()) return E; // Parse the meta from the separate file. // Note: here we overwrite the BlockInfo with the one from the file. This will // be used to parse the rest of the file. - BitstreamMetaParserHelper SeparateMetaHelper(ParserHelper.Stream, - ParserHelper.BlockInfo); - if (Error E = SeparateMetaHelper.parse()) + BitstreamMetaParserHelper SeparateMetaHelper(ParserHelper.Stream); + if (Error E = SeparateMetaHelper.expectBlock()) + return E; + if (Error E = SeparateMetaHelper.parseBlock()) return E; - uint64_t PreviousContainerVersion = ContainerVersion; if (Error E = processCommonMeta(SeparateMetaHelper)) return E; if (ContainerType != BitstreamRemarkContainerType::SeparateRemarksFile) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing external file's BLOCK_META: wrong container " - "type."); - - if (PreviousContainerVersion != ContainerVersion) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing external file's BLOCK_META: mismatching versions: " - "original meta: %lu, external file meta: %lu.", - PreviousContainerVersion, ContainerVersion); + return SeparateMetaHelper.error("Wrong container type in external file."); // Process the meta from the separate file. 
return processSeparateRemarksFileMeta(SeparateMetaHelper); @@ -468,26 +378,26 @@ Error BitstreamRemarkParser::processExternalFilePath( Error BitstreamRemarkParser::processStandaloneMeta( BitstreamMetaParserHelper &Helper) { - if (Error E = processStrTab(*this, Helper.StrTabBuf)) + if (Error E = processStrTab(Helper)) return E; - return processRemarkVersion(*this, Helper.RemarkVersion); + return processRemarkVersion(Helper); } Error BitstreamRemarkParser::processSeparateRemarksFileMeta( BitstreamMetaParserHelper &Helper) { - return processRemarkVersion(*this, Helper.RemarkVersion); + return processRemarkVersion(Helper); } Error BitstreamRemarkParser::processSeparateRemarksMetaMeta( BitstreamMetaParserHelper &Helper) { - if (Error E = processStrTab(*this, Helper.StrTabBuf)) + if (Error E = processStrTab(Helper)) return E; - return processExternalFilePath(Helper.ExternalFilePath); + return processExternalFilePath(Helper); } Expected> BitstreamRemarkParser::parseRemark() { BitstreamRemarkParserHelper RemarkHelper(ParserHelper.Stream); - if (Error E = RemarkHelper.parse()) + if (Error E = RemarkHelper.parseNext()) return std::move(E); return processRemark(RemarkHelper); @@ -498,28 +408,20 @@ BitstreamRemarkParser::processRemark(BitstreamRemarkParserHelper &Helper) { std::unique_ptr Result = std::make_unique(); Remark &R = *Result; - if (StrTab == std::nullopt) - return createStringError( - std::make_error_code(std::errc::invalid_argument), - "Error while parsing BLOCK_REMARK: missing string table."); + if (!StrTab) + return Helper.error("Missing string table."); if (!Helper.Type) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_REMARK: missing remark type."); + return Helper.error("Missing remark type."); // Always >= Type::First since it's unsigned. 
if (*Helper.Type > static_cast(Type::Last)) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_REMARK: unknown remark type."); + return Helper.error("Unknown remark type."); R.RemarkType = static_cast(*Helper.Type); if (!Helper.RemarkNameIdx) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_REMARK: missing remark name."); + return Helper.error("Missing remark name."); if (Expected RemarkName = (*StrTab)[*Helper.RemarkNameIdx]) R.RemarkName = *RemarkName; @@ -527,9 +429,7 @@ BitstreamRemarkParser::processRemark(BitstreamRemarkParserHelper &Helper) { return RemarkName.takeError(); if (!Helper.PassNameIdx) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_REMARK: missing remark pass."); + return Helper.error("Missing remark pass."); if (Expected PassName = (*StrTab)[*Helper.PassNameIdx]) R.PassName = *PassName; @@ -537,61 +437,53 @@ BitstreamRemarkParser::processRemark(BitstreamRemarkParserHelper &Helper) { return PassName.takeError(); if (!Helper.FunctionNameIdx) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_REMARK: missing remark function name."); + return Helper.error("Missing remark function name."); + if (Expected FunctionName = (*StrTab)[*Helper.FunctionNameIdx]) R.FunctionName = *FunctionName; else return FunctionName.takeError(); - if (Helper.SourceFileNameIdx && Helper.SourceLine && Helper.SourceColumn) { - Expected SourceFileName = (*StrTab)[*Helper.SourceFileNameIdx]; + if (Helper.Loc) { + Expected SourceFileName = + (*StrTab)[Helper.Loc->SourceFileNameIdx]; if (!SourceFileName) return SourceFileName.takeError(); R.Loc.emplace(); R.Loc->SourceFilePath = *SourceFileName; - R.Loc->SourceLine = *Helper.SourceLine; - R.Loc->SourceColumn = *Helper.SourceColumn; + R.Loc->SourceLine = 
Helper.Loc->SourceLine; + R.Loc->SourceColumn = Helper.Loc->SourceColumn; } if (Helper.Hotness) R.Hotness = *Helper.Hotness; - if (!Helper.Args) - return std::move(Result); - - for (const BitstreamRemarkParserHelper::Argument &Arg : *Helper.Args) { + for (const BitstreamRemarkParserHelper::Argument &Arg : Helper.Args) { if (!Arg.KeyIdx) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_REMARK: missing key in remark argument."); + return Helper.error("Missing key in remark argument."); if (!Arg.ValueIdx) - return createStringError( - std::make_error_code(std::errc::illegal_byte_sequence), - "Error while parsing BLOCK_REMARK: missing value in remark " - "argument."); + return Helper.error("Missing value in remark argument."); // We have at least a key and a value, create an entry. - R.Args.emplace_back(); + auto &RArg = R.Args.emplace_back(); if (Expected Key = (*StrTab)[*Arg.KeyIdx]) - R.Args.back().Key = *Key; + RArg.Key = *Key; else return Key.takeError(); if (Expected Value = (*StrTab)[*Arg.ValueIdx]) - R.Args.back().Val = *Value; + RArg.Val = *Value; else return Value.takeError(); - if (Arg.SourceFileNameIdx && Arg.SourceLine && Arg.SourceColumn) { + if (Arg.Loc) { if (Expected SourceFileName = - (*StrTab)[*Arg.SourceFileNameIdx]) { - R.Args.back().Loc.emplace(); - R.Args.back().Loc->SourceFilePath = *SourceFileName; - R.Args.back().Loc->SourceLine = *Arg.SourceLine; - R.Args.back().Loc->SourceColumn = *Arg.SourceColumn; + (*StrTab)[Arg.Loc->SourceFileNameIdx]) { + RArg.Loc.emplace(); + RArg.Loc->SourceFilePath = *SourceFileName; + RArg.Loc->SourceLine = Arg.Loc->SourceLine; + RArg.Loc->SourceColumn = Arg.Loc->SourceColumn; } else return SourceFileName.takeError(); } diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.h b/llvm/lib/Remarks/BitstreamRemarkParser.h index cba805dc24b59..d756e3296a871 100644 --- a/llvm/lib/Remarks/BitstreamRemarkParser.h +++ b/llvm/lib/Remarks/BitstreamRemarkParser.h @@ 
-13,14 +13,15 @@ #ifndef LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H #define LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Remarks/BitstreamRemarkContainer.h" +#include "llvm/Remarks/Remark.h" #include "llvm/Remarks/RemarkFormat.h" #include "llvm/Remarks/RemarkParser.h" +#include "llvm/Remarks/RemarkStringTable.h" #include "llvm/Support/Error.h" -#include +#include "llvm/Support/FormatVariadic.h" #include #include #include @@ -28,66 +29,156 @@ namespace llvm { namespace remarks { -struct Remark; +class BitstreamBlockParserHelperBase { +protected: + BitstreamCursor &Stream; + + StringRef BlockName; + unsigned BlockID; + +public: + BitstreamBlockParserHelperBase(BitstreamCursor &Stream, unsigned BlockID, + StringRef BlockName) + : Stream(Stream), BlockName(BlockName), BlockID(BlockID) {} + + template Error error(char const *Fmt, const Ts &...Vals) { + std::string Buffer; + raw_string_ostream OS(Buffer); + OS << "Error while parsing " << BlockName << " block: "; + OS << formatv(Fmt, Vals...); + return make_error( + std::move(Buffer), + std::make_error_code(std::errc::illegal_byte_sequence)); + } + + Error expectBlock(); + +protected: + Error enterBlock(); + + Error unknownRecord(unsigned AbbrevID); + Error unexpectedRecord(StringRef RecordName); + Error malformedRecord(StringRef RecordName); + Error unexpectedBlock(unsigned Code); +}; + +template +class BitstreamBlockParserHelper : public BitstreamBlockParserHelperBase { +protected: + using BitstreamBlockParserHelperBase::BitstreamBlockParserHelperBase; + Derived &derived() { return *static_cast(this); } + + /// Parse a record and fill in the fields in the parser. + /// The subclass must statically override this method. + Error parseRecord(unsigned Code) = delete; + + /// Parse a subblock and fill in the fields in the parser. + /// The subclass can statically override this method. 
+ Error parseSubBlock(unsigned Code) { return unexpectedBlock(Code); } + +public: + /// Enter, parse, and leave this bitstream block. This expects the + /// BitstreamCursor to be right after the SubBlock entry (i.e. after calling + /// expectBlock). + Error parseBlock() { + if (Error E = enterBlock()) + return E; + + // Stop when there is nothing to read anymore or when we encounter an + // END_BLOCK. + while (true) { + Expected Next = Stream.advance(); + if (!Next) + return Next.takeError(); + switch (Next->Kind) { + case BitstreamEntry::SubBlock: + if (Error E = derived().parseSubBlock(Next->ID)) + return E; + continue; + case BitstreamEntry::EndBlock: + return Error::success(); + case BitstreamEntry::Record: + if (Error E = derived().parseRecord(Next->ID)) + return E; + continue; + case BitstreamEntry::Error: + return error("Unexpected end of bitstream."); + } + llvm_unreachable("Unexpected BitstreamEntry"); + } + } +}; /// Helper to parse a META_BLOCK for a bitstream remark container. -struct BitstreamMetaParserHelper { - /// The Bitstream reader. - BitstreamCursor &Stream; - /// Reference to the storage for the block info. - BitstreamBlockInfo &BlockInfo; - /// The parsed content: depending on the container type, some fields might be - /// empty. - std::optional ContainerVersion; - std::optional ContainerType; - std::optional StrTabBuf; - std::optional ExternalFilePath; +class BitstreamMetaParserHelper + : public BitstreamBlockParserHelper { + friend class BitstreamBlockParserHelper; + +public: + struct ContainerInfo { + uint64_t Version; + uint64_t Type; + }; + + /// The parsed content: depending on the container type, some fields might + /// be empty. + std::optional Container; std::optional RemarkVersion; + std::optional ExternalFilePath; + std::optional StrTabBuf; - /// Continue parsing with \p Stream. \p Stream is expected to contain a - /// ENTER_SUBBLOCK to the META_BLOCK at the current position. 
- /// \p Stream is expected to have a BLOCKINFO_BLOCK set. - BitstreamMetaParserHelper(BitstreamCursor &Stream, - BitstreamBlockInfo &BlockInfo); + BitstreamMetaParserHelper(BitstreamCursor &Stream) + : BitstreamBlockParserHelper(Stream, META_BLOCK_ID, MetaBlockName) {} - /// Parse the META_BLOCK and fill the available entries. - /// This helper does not check for the validity of the fields. - Error parse(); +protected: + Error parseRecord(unsigned Code); }; /// Helper to parse a REMARK_BLOCK for a bitstream remark container. -struct BitstreamRemarkParserHelper { - /// The Bitstream reader. - BitstreamCursor &Stream; +class BitstreamRemarkParserHelper + : public BitstreamBlockParserHelper { + friend class BitstreamBlockParserHelper; + +protected: + SmallVector Record; + StringRef RecordBlob; + unsigned RecordID; + +public: + struct RemarkLoc { + uint64_t SourceFileNameIdx; + uint64_t SourceLine; + uint64_t SourceColumn; + }; + + struct Argument { + std::optional KeyIdx; + std::optional ValueIdx; + std::optional Loc; + + Argument(std::optional KeyIdx, std::optional ValueIdx) + : KeyIdx(KeyIdx), ValueIdx(ValueIdx) {} + }; + /// The parsed content: depending on the remark, some fields might be empty. std::optional Type; std::optional RemarkNameIdx; std::optional PassNameIdx; std::optional FunctionNameIdx; - std::optional SourceFileNameIdx; - std::optional SourceLine; - std::optional SourceColumn; std::optional Hotness; - struct Argument { - std::optional KeyIdx; - std::optional ValueIdx; - std::optional SourceFileNameIdx; - std::optional SourceLine; - std::optional SourceColumn; - }; - std::optional> Args; - /// Avoid re-allocating a vector every time. - SmallVector TmpArgs; - - /// Continue parsing with \p Stream. \p Stream is expected to contain a - /// ENTER_SUBBLOCK to the REMARK_BLOCK at the current position. - /// \p Stream is expected to have a BLOCKINFO_BLOCK set and to have already - /// parsed the META_BLOCK. 
- BitstreamRemarkParserHelper(BitstreamCursor &Stream); - - /// Parse the REMARK_BLOCK and fill the available entries. - /// This helper does not check for the validity of the fields. - Error parse(); + std::optional Loc; + + SmallVector Args; + + BitstreamRemarkParserHelper(BitstreamCursor &Stream) + : BitstreamBlockParserHelper(Stream, REMARK_BLOCK_ID, RemarkBlockName) {} + + /// Clear helper state and parse next remark block. + Error parseNext(); + +protected: + Error parseRecord(unsigned Code); + Error handleRecord(); }; /// Helper to parse any bitstream remark container. @@ -98,21 +189,15 @@ struct BitstreamParserHelper { BitstreamBlockInfo BlockInfo; /// Start parsing at \p Buffer. BitstreamParserHelper(StringRef Buffer); - /// Parse the magic number. - Expected> parseMagic(); + /// Parse and validate the magic number. + Error expectMagic(); + /// Advance to the meta block + Error advanceToMetaBlock(); /// Parse the block info block containing all the abbrevs. /// This needs to be called before calling any other parsing function. Error parseBlockInfoBlock(); - /// Return true if the next block is a META_BLOCK. This function does not move - /// the cursor. - Expected isMetaBlock(); - /// Return true if the next block is a REMARK_BLOCK. This function does not - /// move the cursor. - Expected isRemarkBlock(); /// Return true if the parser reached the end of the stream. bool atEndOfStream() { return Stream.AtEndOfStream(); } - /// Jump to the end of the stream, skipping everything. - void skipToEnd() { return Stream.skipToEnd(); } }; /// Parses and holds the state of the latest parsed remark. @@ -149,14 +234,16 @@ struct BitstreamRemarkParser : public RemarkParser { Expected> parseRemark(); private: - /// Helper functions. 
Error processCommonMeta(BitstreamMetaParserHelper &Helper); Error processStandaloneMeta(BitstreamMetaParserHelper &Helper); Error processSeparateRemarksFileMeta(BitstreamMetaParserHelper &Helper); Error processSeparateRemarksMetaMeta(BitstreamMetaParserHelper &Helper); + Error processExternalFilePath(BitstreamMetaParserHelper &Helper); + Error processStrTab(BitstreamMetaParserHelper &Helper); + Error processRemarkVersion(BitstreamMetaParserHelper &Helper); + Expected> processRemark(BitstreamRemarkParserHelper &Helper); - Error processExternalFilePath(std::optional ExternalFilePath); }; Expected> createBitstreamParserFromMeta( diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 2528e8bd1142a..7da972f372c5b 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -135,6 +135,7 @@ if (UNIX AND "${CMAKE_SYSTEM_NAME}" MATCHES "AIX") endif() add_subdirectory(BLAKE3) +add_subdirectory(LSP) add_llvm_component_library(LLVMSupport ABIBreak.cpp diff --git a/llvm/lib/Support/LSP/CMakeLists.txt b/llvm/lib/Support/LSP/CMakeLists.txt new file mode 100644 index 0000000000000..6bc9d636fbdfe --- /dev/null +++ b/llvm/lib/Support/LSP/CMakeLists.txt @@ -0,0 +1,8 @@ +add_llvm_component_library(LLVMSupportLSP + Protocol.cpp + Transport.cpp + Logging.cpp + + LINK_COMPONENTS + Support +) diff --git a/mlir/lib/Tools/lsp-server-support/Logging.cpp b/llvm/lib/Support/LSP/Logging.cpp similarity index 55% rename from mlir/lib/Tools/lsp-server-support/Logging.cpp rename to llvm/lib/Support/LSP/Logging.cpp index 373e2165c244d..b36621ae1c6c6 100644 --- a/mlir/lib/Tools/lsp-server-support/Logging.cpp +++ b/llvm/lib/Support/LSP/Logging.cpp @@ -6,36 +6,36 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Tools/lsp-server-support/Logging.h" +#include "llvm/Support/LSP/Logging.h" #include "llvm/Support/Chrono.h" #include "llvm/Support/raw_ostream.h" -using namespace mlir; -using namespace 
mlir::lsp; +using namespace llvm; +using namespace llvm::lsp; -void Logger::setLogLevel(Level logLevel) { get().logLevel = logLevel; } +void Logger::setLogLevel(Level LogLevel) { get().LogLevel = LogLevel; } Logger &Logger::get() { - static Logger logger; - return logger; + static Logger Logger; + return Logger; } -void Logger::log(Level logLevel, const char *fmt, - const llvm::formatv_object_base &message) { - Logger &logger = get(); +void Logger::log(Level LogLevel, const char *Fmt, + const llvm::formatv_object_base &Message) { + Logger &Logger = get(); // Ignore messages with log levels below the current setting in the logger. - if (logLevel < logger.logLevel) + if (LogLevel < Logger.LogLevel) return; // An indicator character for each log level. - const char *logLevelIndicators = "DIE"; + const char *LogLevelIndicators = "DIE"; // Format the message and print to errs. - llvm::sys::TimePoint<> timestamp = std::chrono::system_clock::now(); - std::lock_guard logGuard(logger.mutex); + llvm::sys::TimePoint<> Timestamp = std::chrono::system_clock::now(); + std::lock_guard LogGuard(Logger.Mutex); llvm::errs() << llvm::formatv( "{0}[{1:%H:%M:%S.%L}] {2}\n", - logLevelIndicators[static_cast(logLevel)], timestamp, message); + LogLevelIndicators[static_cast(LogLevel)], Timestamp, Message); llvm::errs().flush(); } diff --git a/llvm/lib/Support/LSP/Protocol.cpp b/llvm/lib/Support/LSP/Protocol.cpp new file mode 100644 index 0000000000000..f22126345a435 --- /dev/null +++ b/llvm/lib/Support/LSP/Protocol.cpp @@ -0,0 +1,1043 @@ +//===--- Protocol.cpp - Language Server Protocol Implementation -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the serialization code for the LSP structs. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/LSP/Protocol.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::lsp; + +// Helper that doesn't treat `null` and absent fields as failures. +template +static bool mapOptOrNull(const llvm::json::Value &Params, + llvm::StringLiteral Prop, T &Out, + llvm::json::Path Path) { + const llvm::json::Object *O = Params.getAsObject(); + assert(O); + + // Field is missing or null. + auto *V = O->get(Prop); + if (!V || V->getAsNull()) + return true; + return fromJSON(*V, Out, Path.field(Prop)); +} + +//===----------------------------------------------------------------------===// +// LSPError +//===----------------------------------------------------------------------===// + +char LSPError::ID; + +//===----------------------------------------------------------------------===// +// URIForFile +//===----------------------------------------------------------------------===// + +static bool isWindowsPath(StringRef Path) { + return Path.size() > 1 && llvm::isAlpha(Path[0]) && Path[1] == ':'; +} + +static bool isNetworkPath(StringRef Path) { + return Path.size() > 2 && Path[0] == Path[1] && + llvm::sys::path::is_separator(Path[0]); +} + +static bool shouldEscapeInURI(unsigned char C) { + // Unreserved characters. + if ((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') || + (C >= '0' && C <= '9')) + return false; + + switch (C) { + case '-': + case '_': + case '.': + case '~': + // '/' is only reserved when parsing. + case '/': + // ':' is only reserved for relative URI paths, which we doesn't produce. + case ':': + return false; + } + return true; +} + +/// Encodes a string according to percent-encoding. 
+/// - Unreserved characters are not escaped. +/// - Reserved characters always escaped with exceptions like '/'. +/// - All other characters are escaped. +static void percentEncode(StringRef Content, std::string &Out) { + for (unsigned char C : Content) { + if (shouldEscapeInURI(C)) { + Out.push_back('%'); + Out.push_back(llvm::hexdigit(C / 16)); + Out.push_back(llvm::hexdigit(C % 16)); + } else { + Out.push_back(C); + } + } +} + +/// Decodes a string according to percent-encoding. +static std::string percentDecode(StringRef Content) { + std::string Result; + for (auto I = Content.begin(), E = Content.end(); I != E; ++I) { + if (*I != '%') { + Result += *I; + continue; + } + if (*I == '%' && I + 2 < Content.end() && llvm::isHexDigit(*(I + 1)) && + llvm::isHexDigit(*(I + 2))) { + Result.push_back(llvm::hexFromNibbles(*(I + 1), *(I + 2))); + I += 2; + } else { + Result.push_back(*I); + } + } + return Result; +} + +/// Return the set containing the supported URI schemes. +static StringSet<> &getSupportedSchemes() { + static StringSet<> Schemes({"file", "test"}); + return Schemes; +} + +/// Returns true if the given scheme is structurally valid, i.e. it does not +/// contain any invalid scheme characters. This does not check that the scheme +/// is actually supported. +static bool isStructurallyValidScheme(StringRef Scheme) { + if (Scheme.empty()) + return false; + if (!llvm::isAlpha(Scheme[0])) + return false; + return llvm::all_of(llvm::drop_begin(Scheme), [](char C) { + return llvm::isAlnum(C) || C == '+' || C == '.' || C == '-'; + }); +} + +static llvm::Expected uriFromAbsolutePath(StringRef AbsolutePath, + StringRef Scheme) { + std::string Body; + StringRef Authority; + StringRef Root = llvm::sys::path::root_name(AbsolutePath); + if (isNetworkPath(Root)) { + // Windows UNC paths e.g. \\server\share => file://server/share + Authority = Root.drop_front(2); + AbsolutePath.consume_front(Root); + } else if (isWindowsPath(Root)) { + // Windows paths e.g. 
X:\path => file:///X:/path + Body = "/"; + } + Body += llvm::sys::path::convert_to_slash(AbsolutePath); + + std::string Uri = Scheme.str() + ":"; + if (Authority.empty() && Body.empty()) + return Uri; + + // If authority if empty, we only print body if it starts with "/"; otherwise, + // the URI is invalid. + if (!Authority.empty() || StringRef(Body).starts_with("/")) { + Uri.append("//"); + percentEncode(Authority, Uri); + } + percentEncode(Body, Uri); + return Uri; +} + +static llvm::Expected getAbsolutePath(StringRef Authority, + StringRef Body) { + if (!Body.starts_with("/")) + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "File scheme: expect body to be an absolute path starting " + "with '/': " + + Body); + SmallString<128> Path; + if (!Authority.empty()) { + // Windows UNC paths e.g. file://server/share => \\server\share + ("//" + Authority).toVector(Path); + } else if (isWindowsPath(Body.substr(1))) { + // Windows paths e.g. file:///X:/path => X:\path + Body.consume_front("/"); + } + Path.append(Body); + llvm::sys::path::native(Path); + return std::string(Path); +} + +static llvm::Expected parseFilePathFromURI(StringRef OrigUri) { + StringRef Uri = OrigUri; + + // Decode the scheme of the URI. + size_t Pos = Uri.find(':'); + if (Pos == StringRef::npos) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "Scheme must be provided in URI: " + + OrigUri); + StringRef SchemeStr = Uri.substr(0, Pos); + std::string UriScheme = percentDecode(SchemeStr); + if (!isStructurallyValidScheme(UriScheme)) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "Invalid scheme: " + SchemeStr + + " (decoded: " + UriScheme + ")"); + Uri = Uri.substr(Pos + 1); + + // Decode the authority of the URI. + std::string UriAuthority; + if (Uri.consume_front("//")) { + Pos = Uri.find('/'); + UriAuthority = percentDecode(Uri.substr(0, Pos)); + Uri = Uri.substr(Pos); + } + + // Decode the body of the URI. 
+ std::string UriBody = percentDecode(Uri); + + // Compute the absolute path for this uri. + if (!getSupportedSchemes().contains(UriScheme)) { + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "unsupported URI scheme `" + UriScheme + + "' for workspace files"); + } + return getAbsolutePath(UriAuthority, UriBody); +} + +llvm::Expected URIForFile::fromURI(StringRef Uri) { + llvm::Expected FilePath = parseFilePathFromURI(Uri); + if (!FilePath) + return FilePath.takeError(); + return URIForFile(std::move(*FilePath), Uri.str()); +} + +llvm::Expected URIForFile::fromFile(StringRef AbsoluteFilepath, + StringRef Scheme) { + llvm::Expected Uri = + uriFromAbsolutePath(AbsoluteFilepath, Scheme); + if (!Uri) + return Uri.takeError(); + return fromURI(*Uri); +} + +StringRef URIForFile::scheme() const { return uri().split(':').first; } + +void URIForFile::registerSupportedScheme(StringRef Scheme) { + getSupportedSchemes().insert(Scheme); +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, URIForFile &Result, + llvm::json::Path Path) { + if (std::optional Str = Value.getAsString()) { + llvm::Expected ExpectedUri = URIForFile::fromURI(*Str); + if (!ExpectedUri) { + Path.report("unresolvable URI"); + consumeError(ExpectedUri.takeError()); + return false; + } + Result = std::move(*ExpectedUri); + return true; + } + return false; +} + +llvm::json::Value llvm::lsp::toJSON(const URIForFile &Value) { + return Value.uri(); +} + +raw_ostream &llvm::lsp::operator<<(raw_ostream &Os, const URIForFile &Value) { + return Os << Value.uri(); +} + +//===----------------------------------------------------------------------===// +// ClientCapabilities +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + ClientCapabilities &Result, llvm::json::Path Path) { + const llvm::json::Object *O = Value.getAsObject(); + if (!O) { + Path.report("expected object"); + return false; + } + if 
(const llvm::json::Object *TextDocument = O->getObject("textDocument")) { + if (const llvm::json::Object *DocumentSymbol = + TextDocument->getObject("documentSymbol")) { + if (std::optional HierarchicalSupport = + DocumentSymbol->getBoolean("hierarchicalDocumentSymbolSupport")) + Result.hierarchicalDocumentSymbol = *HierarchicalSupport; + } + if (auto *CodeAction = TextDocument->getObject("codeAction")) { + if (CodeAction->getObject("codeActionLiteralSupport")) + Result.codeActionStructure = true; + } + } + if (auto *Window = O->getObject("window")) { + if (std::optional WorkDoneProgressSupport = + Window->getBoolean("workDoneProgress")) + Result.workDoneProgress = *WorkDoneProgressSupport; + } + return true; +} + +//===----------------------------------------------------------------------===// +// ClientInfo +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, ClientInfo &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + if (!O || !O.map("name", Result.name)) + return false; + + // Don't fail if we can't parse version. 
+ O.map("version", Result.version); + return true; +} + +//===----------------------------------------------------------------------===// +// InitializeParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, TraceLevel &Result, + llvm::json::Path Path) { + if (std::optional Str = Value.getAsString()) { + if (*Str == "off") { + Result = TraceLevel::Off; + return true; + } + if (*Str == "messages") { + Result = TraceLevel::Messages; + return true; + } + if (*Str == "verbose") { + Result = TraceLevel::Verbose; + return true; + } + } + return false; +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + InitializeParams &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + if (!O) + return false; + // We deliberately don't fail if we can't parse individual fields. + O.map("capabilities", Result.capabilities); + O.map("trace", Result.trace); + mapOptOrNull(Value, "clientInfo", Result.clientInfo, Path); + + return true; +} + +//===----------------------------------------------------------------------===// +// TextDocumentItem +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + TextDocumentItem &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("uri", Result.uri) && + O.map("languageId", Result.languageId) && O.map("text", Result.text) && + O.map("version", Result.version); +} + +//===----------------------------------------------------------------------===// +// TextDocumentIdentifier +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const TextDocumentIdentifier &Value) { + return llvm::json::Object{{"uri", Value.uri}}; +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + TextDocumentIdentifier &Result, + llvm::json::Path Path) 
{ + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("uri", Result.uri); +} + +//===----------------------------------------------------------------------===// +// VersionedTextDocumentIdentifier +//===----------------------------------------------------------------------===// + +llvm::json::Value +llvm::lsp::toJSON(const VersionedTextDocumentIdentifier &Value) { + return llvm::json::Object{ + {"uri", Value.uri}, + {"version", Value.version}, + }; +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + VersionedTextDocumentIdentifier &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("uri", Result.uri) && O.map("version", Result.version); +} + +//===----------------------------------------------------------------------===// +// Position +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, Position &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("line", Result.line) && + O.map("character", Result.character); +} + +llvm::json::Value llvm::lsp::toJSON(const Position &Value) { + return llvm::json::Object{ + {"line", Value.line}, + {"character", Value.character}, + }; +} + +raw_ostream &llvm::lsp::operator<<(raw_ostream &Os, const Position &Value) { + return Os << Value.line << ':' << Value.character; +} + +//===----------------------------------------------------------------------===// +// Range +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, Range &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("start", Result.start) && O.map("end", Result.end); +} + +llvm::json::Value llvm::lsp::toJSON(const Range &Value) { + return llvm::json::Object{ + {"start", Value.start}, + {"end", Value.end}, + }; +} + +raw_ostream 
&llvm::lsp::operator<<(raw_ostream &Os, const Range &Value) { + return Os << Value.start << '-' << Value.end; +} + +//===----------------------------------------------------------------------===// +// Location +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, Location &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("uri", Result.uri) && O.map("range", Result.range); +} + +llvm::json::Value llvm::lsp::toJSON(const Location &Value) { + return llvm::json::Object{ + {"uri", Value.uri}, + {"range", Value.range}, + }; +} + +raw_ostream &llvm::lsp::operator<<(raw_ostream &Os, const Location &Value) { + return Os << Value.range << '@' << Value.uri; +} + +//===----------------------------------------------------------------------===// +// TextDocumentPositionParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + TextDocumentPositionParams &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("textDocument", Result.textDocument) && + O.map("position", Result.position); +} + +//===----------------------------------------------------------------------===// +// ReferenceParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + ReferenceContext &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.mapOptional("includeDeclaration", Result.includeDeclaration); +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + ReferenceParams &Result, llvm::json::Path Path) { + TextDocumentPositionParams &Base = Result; + llvm::json::ObjectMapper O(Value, Path); + return fromJSON(Value, Base, Path) && O && + O.mapOptional("context", Result.context); +} + 
+//===----------------------------------------------------------------------===// +// DidOpenTextDocumentParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + DidOpenTextDocumentParams &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("textDocument", Result.textDocument); +} + +//===----------------------------------------------------------------------===// +// DidCloseTextDocumentParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + DidCloseTextDocumentParams &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("textDocument", Result.textDocument); +} + +//===----------------------------------------------------------------------===// +// DidChangeTextDocumentParams +//===----------------------------------------------------------------------===// + +LogicalResult +TextDocumentContentChangeEvent::applyTo(std::string &Contents) const { + // If there is no range, the full document changed. + if (!range) { + Contents = text; + return success(); + } + + // Try to map the replacement range to the content. 
+ llvm::SourceMgr TmpScrMgr; + TmpScrMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(Contents), + SMLoc()); + SMRange RangeLoc = range->getAsSMRange(TmpScrMgr); + if (!RangeLoc.isValid()) + return failure(); + + Contents.replace(RangeLoc.Start.getPointer() - Contents.data(), + RangeLoc.End.getPointer() - RangeLoc.Start.getPointer(), + text); + return success(); +} + +LogicalResult TextDocumentContentChangeEvent::applyTo( + ArrayRef Changes, std::string &Contents) { + for (const auto &Change : Changes) + if (failed(Change.applyTo(Contents))) + return failure(); + return success(); +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + TextDocumentContentChangeEvent &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("range", Result.range) && + O.map("rangeLength", Result.rangeLength) && O.map("text", Result.text); +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + DidChangeTextDocumentParams &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("textDocument", Result.textDocument) && + O.map("contentChanges", Result.contentChanges); +} + +//===----------------------------------------------------------------------===// +// MarkupContent +//===----------------------------------------------------------------------===// + +static llvm::StringRef toTextKind(MarkupKind Kind) { + switch (Kind) { + case MarkupKind::PlainText: + return "plaintext"; + case MarkupKind::Markdown: + return "markdown"; + } + llvm_unreachable("Invalid MarkupKind"); +} + +raw_ostream &llvm::lsp::operator<<(raw_ostream &Os, MarkupKind Kind) { + return Os << toTextKind(Kind); +} + +llvm::json::Value llvm::lsp::toJSON(const MarkupContent &Mc) { + if (Mc.value.empty()) + return nullptr; + + return llvm::json::Object{ + {"kind", toTextKind(Mc.kind)}, + {"value", Mc.value}, + }; +} + +//===----------------------------------------------------------------------===// +// Hover 
+//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const Hover &Hover) { + llvm::json::Object Result{{"contents", toJSON(Hover.contents)}}; + if (Hover.range) + Result["range"] = toJSON(*Hover.range); + return std::move(Result); +} + +//===----------------------------------------------------------------------===// +// DocumentSymbol +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const DocumentSymbol &Symbol) { + llvm::json::Object Result{{"name", Symbol.name}, + {"kind", static_cast(Symbol.kind)}, + {"range", Symbol.range}, + {"selectionRange", Symbol.selectionRange}}; + + if (!Symbol.detail.empty()) + Result["detail"] = Symbol.detail; + if (!Symbol.children.empty()) + Result["children"] = Symbol.children; + return std::move(Result); +} + +//===----------------------------------------------------------------------===// +// DocumentSymbolParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + DocumentSymbolParams &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("textDocument", Result.textDocument); +} + +//===----------------------------------------------------------------------===// +// DiagnosticRelatedInformation +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + DiagnosticRelatedInformation &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("location", Result.location) && + O.map("message", Result.message); +} + +llvm::json::Value llvm::lsp::toJSON(const DiagnosticRelatedInformation &Info) { + return llvm::json::Object{ + {"location", Info.location}, + {"message", Info.message}, + }; +} + 
+//===----------------------------------------------------------------------===// +// Diagnostic +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(DiagnosticTag Tag) { + return static_cast(Tag); +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, DiagnosticTag &Result, + llvm::json::Path Path) { + if (std::optional I = Value.getAsInteger()) { + Result = (DiagnosticTag)*I; + return true; + } + + return false; +} + +llvm::json::Value llvm::lsp::toJSON(const Diagnostic &Diag) { + llvm::json::Object Result{ + {"range", Diag.range}, + {"severity", (int)Diag.severity}, + {"message", Diag.message}, + }; + if (Diag.category) + Result["category"] = *Diag.category; + if (!Diag.source.empty()) + Result["source"] = Diag.source; + if (Diag.relatedInformation) + Result["relatedInformation"] = *Diag.relatedInformation; + if (!Diag.tags.empty()) + Result["tags"] = Diag.tags; + return std::move(Result); +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, Diagnostic &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + if (!O) + return false; + int Severity = 0; + if (!mapOptOrNull(Value, "severity", Severity, Path)) + return false; + Result.severity = (DiagnosticSeverity)Severity; + + return O.map("range", Result.range) && O.map("message", Result.message) && + mapOptOrNull(Value, "category", Result.category, Path) && + mapOptOrNull(Value, "source", Result.source, Path) && + mapOptOrNull(Value, "relatedInformation", Result.relatedInformation, + Path) && + mapOptOrNull(Value, "tags", Result.tags, Path); +} + +//===----------------------------------------------------------------------===// +// PublishDiagnosticsParams +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const PublishDiagnosticsParams &Params) { + return llvm::json::Object{ + {"uri", Params.uri}, + {"diagnostics", Params.diagnostics}, 
+ {"version", Params.version}, + }; +} + +//===----------------------------------------------------------------------===// +// TextEdit +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, TextEdit &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("range", Result.range) && O.map("newText", Result.newText); +} + +llvm::json::Value llvm::lsp::toJSON(const TextEdit &Value) { + return llvm::json::Object{ + {"range", Value.range}, + {"newText", Value.newText}, + }; +} + +raw_ostream &llvm::lsp::operator<<(raw_ostream &Os, const TextEdit &Value) { + Os << Value.range << " => \""; + llvm::printEscapedString(Value.newText, Os); + return Os << '"'; +} + +//===----------------------------------------------------------------------===// +// CompletionItemKind +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + CompletionItemKind &Result, llvm::json::Path Path) { + if (std::optional IntValue = Value.getAsInteger()) { + if (*IntValue < static_cast(CompletionItemKind::Text) || + *IntValue > static_cast(CompletionItemKind::TypeParameter)) + return false; + Result = static_cast(*IntValue); + return true; + } + return false; +} + +CompletionItemKind llvm::lsp::adjustKindToCapability( + CompletionItemKind Kind, + CompletionItemKindBitset &SupportedCompletionItemKinds) { + size_t KindVal = static_cast(Kind); + if (KindVal >= kCompletionItemKindMin && + KindVal <= SupportedCompletionItemKinds.size() && + SupportedCompletionItemKinds[KindVal]) + return Kind; + + // Provide some fall backs for common kinds that are close enough. 
+ switch (Kind) { + case CompletionItemKind::Folder: + return CompletionItemKind::File; + case CompletionItemKind::EnumMember: + return CompletionItemKind::Enum; + case CompletionItemKind::Struct: + return CompletionItemKind::Class; + default: + return CompletionItemKind::Text; + } +} + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + CompletionItemKindBitset &Result, + llvm::json::Path Path) { + if (const llvm::json::Array *ArrayValue = Value.getAsArray()) { + for (size_t I = 0, E = ArrayValue->size(); I < E; ++I) { + CompletionItemKind KindOut; + if (fromJSON((*ArrayValue)[I], KindOut, Path.index(I))) + Result.set(size_t(KindOut)); + } + return true; + } + return false; +} + +//===----------------------------------------------------------------------===// +// CompletionItem +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const CompletionItem &Value) { + assert(!Value.label.empty() && "completion item label is required"); + llvm::json::Object Result{{"label", Value.label}}; + if (Value.kind != CompletionItemKind::Missing) + Result["kind"] = static_cast(Value.kind); + if (!Value.detail.empty()) + Result["detail"] = Value.detail; + if (Value.documentation) + Result["documentation"] = Value.documentation; + if (!Value.sortText.empty()) + Result["sortText"] = Value.sortText; + if (!Value.filterText.empty()) + Result["filterText"] = Value.filterText; + if (!Value.insertText.empty()) + Result["insertText"] = Value.insertText; + if (Value.insertTextFormat != InsertTextFormat::Missing) + Result["insertTextFormat"] = static_cast(Value.insertTextFormat); + if (Value.textEdit) + Result["textEdit"] = *Value.textEdit; + if (!Value.additionalTextEdits.empty()) { + Result["additionalTextEdits"] = + llvm::json::Array(Value.additionalTextEdits); + } + if (Value.deprecated) + Result["deprecated"] = Value.deprecated; + return std::move(Result); +} + +raw_ostream &llvm::lsp::operator<<(raw_ostream 
&Os, + const CompletionItem &Value) { + return Os << Value.label << " - " << toJSON(Value); +} + +bool llvm::lsp::operator<(const CompletionItem &Lhs, + const CompletionItem &Rhs) { + return (Lhs.sortText.empty() ? Lhs.label : Lhs.sortText) < + (Rhs.sortText.empty() ? Rhs.label : Rhs.sortText); +} + +//===----------------------------------------------------------------------===// +// CompletionList +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const CompletionList &Value) { + return llvm::json::Object{ + {"isIncomplete", Value.isIncomplete}, + {"items", llvm::json::Array(Value.items)}, + }; +} + +//===----------------------------------------------------------------------===// +// CompletionContext +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + CompletionContext &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + int TriggerKind; + if (!O || !O.map("triggerKind", TriggerKind) || + !mapOptOrNull(Value, "triggerCharacter", Result.triggerCharacter, Path)) + return false; + Result.triggerKind = static_cast(TriggerKind); + return true; +} + +//===----------------------------------------------------------------------===// +// CompletionParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + CompletionParams &Result, llvm::json::Path Path) { + if (!fromJSON(Value, static_cast(Result), Path)) + return false; + if (const llvm::json::Value *Context = Value.getAsObject()->get("context")) + return fromJSON(*Context, Result.context, Path.field("context")); + return true; +} + +//===----------------------------------------------------------------------===// +// ParameterInformation +//===----------------------------------------------------------------------===// + +llvm::json::Value 
llvm::lsp::toJSON(const ParameterInformation &Value) { + assert((Value.labelOffsets || !Value.labelString.empty()) && + "parameter information label is required"); + llvm::json::Object Result; + if (Value.labelOffsets) + Result["label"] = llvm::json::Array( + {Value.labelOffsets->first, Value.labelOffsets->second}); + else + Result["label"] = Value.labelString; + if (!Value.documentation.empty()) + Result["documentation"] = Value.documentation; + return std::move(Result); +} + +//===----------------------------------------------------------------------===// +// SignatureInformation +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const SignatureInformation &Value) { + assert(!Value.label.empty() && "signature information label is required"); + llvm::json::Object Result{ + {"label", Value.label}, + {"parameters", llvm::json::Array(Value.parameters)}, + }; + if (!Value.documentation.empty()) + Result["documentation"] = Value.documentation; + return std::move(Result); +} + +raw_ostream &llvm::lsp::operator<<(raw_ostream &Os, + const SignatureInformation &Value) { + return Os << Value.label << " - " << toJSON(Value); +} + +//===----------------------------------------------------------------------===// +// SignatureHelp +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const SignatureHelp &Value) { + assert(Value.activeSignature >= 0 && + "Unexpected negative value for number of active signatures."); + assert(Value.activeParameter >= 0 && + "Unexpected negative value for active parameter index"); + return llvm::json::Object{ + {"activeSignature", Value.activeSignature}, + {"activeParameter", Value.activeParameter}, + {"signatures", llvm::json::Array(Value.signatures)}, + }; +} + +//===----------------------------------------------------------------------===// +// DocumentLinkParams 
+//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + DocumentLinkParams &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("textDocument", Result.textDocument); +} + +//===----------------------------------------------------------------------===// +// DocumentLink +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const DocumentLink &Value) { + return llvm::json::Object{ + {"range", Value.range}, + {"target", Value.target}, + }; +} + +//===----------------------------------------------------------------------===// +// InlayHintsParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + InlayHintsParams &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("textDocument", Result.textDocument) && + O.map("range", Result.range); +} + +//===----------------------------------------------------------------------===// +// InlayHint +//===----------------------------------------------------------------------===// + +llvm::json::Value llvm::lsp::toJSON(const InlayHint &Value) { + return llvm::json::Object{{"position", Value.position}, + {"kind", (int)Value.kind}, + {"label", Value.label}, + {"paddingLeft", Value.paddingLeft}, + {"paddingRight", Value.paddingRight}}; +} +bool llvm::lsp::operator==(const InlayHint &Lhs, const InlayHint &Rhs) { + return std::tie(Lhs.position, Lhs.kind, Lhs.label) == + std::tie(Rhs.position, Rhs.kind, Rhs.label); +} +bool llvm::lsp::operator<(const InlayHint &Lhs, const InlayHint &Rhs) { + return std::tie(Lhs.position, Lhs.kind, Lhs.label) < + std::tie(Rhs.position, Rhs.kind, Rhs.label); +} + +llvm::raw_ostream &llvm::lsp::operator<<(llvm::raw_ostream &Os, + InlayHintKind Value) { + switch (Value) { + case 
InlayHintKind::Parameter: + return Os << "parameter"; + case InlayHintKind::Type: + return Os << "type"; + } + llvm_unreachable("Unknown InlayHintKind"); +} + +//===----------------------------------------------------------------------===// +// CodeActionContext +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + CodeActionContext &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + if (!O || !O.map("diagnostics", Result.diagnostics)) + return false; + O.map("only", Result.only); + return true; +} + +//===----------------------------------------------------------------------===// +// CodeActionParams +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, + CodeActionParams &Result, llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("textDocument", Result.textDocument) && + O.map("range", Result.range) && O.map("context", Result.context); +} + +//===----------------------------------------------------------------------===// +// WorkspaceEdit +//===----------------------------------------------------------------------===// + +bool llvm::lsp::fromJSON(const llvm::json::Value &Value, WorkspaceEdit &Result, + llvm::json::Path Path) { + llvm::json::ObjectMapper O(Value, Path); + return O && O.map("changes", Result.changes); +} + +llvm::json::Value llvm::lsp::toJSON(const WorkspaceEdit &Value) { + llvm::json::Object FileChanges; + for (auto &Change : Value.changes) + FileChanges[Change.first] = llvm::json::Array(Change.second); + return llvm::json::Object{{"changes", std::move(FileChanges)}}; +} + +//===----------------------------------------------------------------------===// +// CodeAction +//===----------------------------------------------------------------------===// + +const llvm::StringLiteral CodeAction::kQuickFix = 
"quickfix"; +const llvm::StringLiteral CodeAction::kRefactor = "refactor"; +const llvm::StringLiteral CodeAction::kInfo = "info"; + +llvm::json::Value llvm::lsp::toJSON(const CodeAction &Value) { + llvm::json::Object CodeAction{{"title", Value.title}}; + if (Value.kind) + CodeAction["kind"] = *Value.kind; + if (Value.diagnostics) + CodeAction["diagnostics"] = llvm::json::Array(*Value.diagnostics); + if (Value.isPreferred) + CodeAction["isPreferred"] = true; + if (Value.edit) + CodeAction["edit"] = *Value.edit; + return std::move(CodeAction); +} diff --git a/llvm/lib/Support/LSP/Transport.cpp b/llvm/lib/Support/LSP/Transport.cpp new file mode 100644 index 0000000000000..e71f17701636b --- /dev/null +++ b/llvm/lib/Support/LSP/Transport.cpp @@ -0,0 +1,369 @@ +//===--- JSONTransport.cpp - sending and receiving LSP messages over JSON -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/LSP/Transport.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/LSP/Logging.h" +#include "llvm/Support/LSP/Protocol.h" +#include +#include +#include +#include + +using namespace llvm; +using namespace llvm::lsp; + +//===----------------------------------------------------------------------===// +// Reply +//===----------------------------------------------------------------------===// + +namespace { +/// Function object to reply to an LSP call. 
+/// Each instance must be called exactly once, otherwise: +/// - if there was no reply, an error reply is sent +/// - if there were multiple replies, only the first is sent +class Reply { +public: + Reply(const llvm::json::Value &Id, StringRef Method, JSONTransport &Transport, + std::mutex &TransportOutputMutex); + Reply(Reply &&Other); + Reply &operator=(Reply &&) = delete; + Reply(const Reply &) = delete; + Reply &operator=(const Reply &) = delete; + + void operator()(llvm::Expected Reply); + +private: + std::string Method; + std::atomic Replied = {false}; + llvm::json::Value Id; + JSONTransport *Transport; + std::mutex &TransportOutputMutex; +}; +} // namespace + +Reply::Reply(const llvm::json::Value &Id, llvm::StringRef Method, + JSONTransport &Transport, std::mutex &TransportOutputMutex) + : Method(Method), Id(Id), Transport(&Transport), + TransportOutputMutex(TransportOutputMutex) {} + +Reply::Reply(Reply &&Other) + : Method(Other.Method), Replied(Other.Replied.load()), + Id(std::move(Other.Id)), Transport(Other.Transport), + TransportOutputMutex(Other.TransportOutputMutex) { + Other.Transport = nullptr; +} + +void Reply::operator()(llvm::Expected Reply) { + if (Replied.exchange(true)) { + Logger::error("Replied twice to message {0}({1})", Method, Id); + assert(false && "must reply to each call only once!"); + return; + } + assert(Transport && "expected valid transport to reply to"); + + std::lock_guard TransportLock(TransportOutputMutex); + if (Reply) { + Logger::info("--> reply:{0}({1})", Method, Id); + Transport->reply(std::move(Id), std::move(Reply)); + } else { + llvm::Error Error = Reply.takeError(); + Logger::info("--> reply:{0}({1}): {2}", Method, Id, Error); + Transport->reply(std::move(Id), std::move(Error)); + } +} + +//===----------------------------------------------------------------------===// +// MessageHandler +//===----------------------------------------------------------------------===// + +bool MessageHandler::onNotify(llvm::StringRef 
Method, llvm::json::Value Value) { + Logger::info("--> {0}", Method); + + if (Method == "exit") + return false; + if (Method == "$cancel") { + // TODO: Add support for cancelling requests. + } else { + auto It = NotificationHandlers.find(Method); + if (It != NotificationHandlers.end()) + It->second(std::move(Value)); + } + return true; +} + +bool MessageHandler::onCall(llvm::StringRef Method, llvm::json::Value Params, + llvm::json::Value Id) { + Logger::info("--> {0}({1})", Method, Id); + + Reply Reply(Id, Method, Transport, TransportOutputMutex); + + auto It = MethodHandlers.find(Method); + if (It != MethodHandlers.end()) { + It->second(std::move(Params), std::move(Reply)); + } else { + Reply(llvm::make_error("method not found: " + Method.str(), + ErrorCode::MethodNotFound)); + } + return true; +} + +bool MessageHandler::onReply(llvm::json::Value Id, + llvm::Expected Result) { + // Find the response handler in the mapping. If it exists, move it out of the + // mapping and erase it. + ResponseHandlerTy ResponseHandler; + { + std::lock_guard responseHandlersLock(ResponseHandlerTy); + auto It = ResponseHandlers.find(debugString(Id)); + if (It != ResponseHandlers.end()) { + ResponseHandler = std::move(It->second); + ResponseHandlers.erase(It); + } + } + + // If we found a response handler, invoke it. Otherwise, log an error. + if (ResponseHandler.second) { + Logger::info("--> reply:{0}({1})", ResponseHandler.first, Id); + ResponseHandler.second(std::move(Id), std::move(Result)); + } else { + Logger::error( + "received a reply with ID {0}, but there was no such outgoing request", + Id); + if (!Result) + llvm::consumeError(Result.takeError()); + } + return true; +} + +//===----------------------------------------------------------------------===// +// JSONTransport +//===----------------------------------------------------------------------===// + +/// Encode the given error as a JSON object. 
+static llvm::json::Object encodeError(llvm::Error Error) { + std::string Message; + ErrorCode Code = ErrorCode::UnknownErrorCode; + auto HandlerFn = [&](const LSPError &LspError) -> llvm::Error { + Message = LspError.message; + Code = LspError.code; + return llvm::Error::success(); + }; + if (llvm::Error Unhandled = llvm::handleErrors(std::move(Error), HandlerFn)) + Message = llvm::toString(std::move(Unhandled)); + + return llvm::json::Object{ + {"message", std::move(Message)}, + {"code", int64_t(Code)}, + }; +} + +/// Decode the given JSON object into an error. +llvm::Error decodeError(const llvm::json::Object &O) { + StringRef Msg = O.getString("message").value_or("Unspecified error"); + if (std::optional Code = O.getInteger("code")) + return llvm::make_error(Msg.str(), ErrorCode(*Code)); + return llvm::make_error(llvm::inconvertibleErrorCode(), + Msg.str()); +} + +void JSONTransport::notify(StringRef Method, llvm::json::Value Params) { + sendMessage(llvm::json::Object{ + {"jsonrpc", "2.0"}, + {"method", Method}, + {"params", std::move(Params)}, + }); +} +void JSONTransport::call(StringRef Method, llvm::json::Value Params, + llvm::json::Value Id) { + sendMessage(llvm::json::Object{ + {"jsonrpc", "2.0"}, + {"id", std::move(Id)}, + {"method", Method}, + {"params", std::move(Params)}, + }); +} +void JSONTransport::reply(llvm::json::Value Id, + llvm::Expected Result) { + if (Result) { + return sendMessage(llvm::json::Object{ + {"jsonrpc", "2.0"}, + {"id", std::move(Id)}, + {"result", std::move(*Result)}, + }); + } + + sendMessage(llvm::json::Object{ + {"jsonrpc", "2.0"}, + {"id", std::move(Id)}, + {"error", encodeError(Result.takeError())}, + }); +} + +llvm::Error JSONTransport::run(MessageHandler &Handler) { + std::string Json; + while (!In->isEndOfInput()) { + if (In->hasError()) { + return llvm::errorCodeToError( + std::error_code(errno, std::system_category())); + } + + if (succeeded(In->readMessage(Json))) { + if (llvm::Expected Doc = llvm::json::parse(Json)) { 
+ if (!handleMessage(std::move(*Doc), Handler)) + return llvm::Error::success(); + } else { + Logger::error("JSON parse error: {0}", llvm::toString(Doc.takeError())); + } + } + } + return llvm::errorCodeToError(std::make_error_code(std::errc::io_error)); +} + +void JSONTransport::sendMessage(llvm::json::Value Msg) { + OutputBuffer.clear(); + llvm::raw_svector_ostream os(OutputBuffer); + os << llvm::formatv(PrettyOutput ? "{0:2}\n" : "{0}", Msg); + Out << "Content-Length: " << OutputBuffer.size() << "\r\n\r\n" + << OutputBuffer; + Out.flush(); + Logger::debug(">>> {0}\n", OutputBuffer); +} + +bool JSONTransport::handleMessage(llvm::json::Value Msg, + MessageHandler &Handler) { + // Message must be an object with "jsonrpc":"2.0". + llvm::json::Object *Object = Msg.getAsObject(); + if (!Object || + Object->getString("jsonrpc") != std::optional("2.0")) + return false; + + // `id` may be any JSON value. If absent, this is a notification. + std::optional Id; + if (llvm::json::Value *I = Object->get("id")) + Id = std::move(*I); + std::optional Method = Object->getString("method"); + + // This is a response. + if (!Method) { + if (!Id) + return false; + if (auto *Err = Object->getObject("error")) + return Handler.onReply(std::move(*Id), decodeError(*Err)); + // result should be given, use null if not. + llvm::json::Value Result = nullptr; + if (llvm::json::Value *R = Object->get("result")) + Result = std::move(*R); + return Handler.onReply(std::move(*Id), std::move(Result)); + } + + // Params should be given, use null if not. + llvm::json::Value Params = nullptr; + if (llvm::json::Value *P = Object->get("params")) + Params = std::move(*P); + + if (Id) + return Handler.onCall(*Method, std::move(Params), std::move(*Id)); + return Handler.onNotify(*Method, std::move(Params)); +} + +/// Tries to read a line up to and including \n. +/// If failing, feof(), ferror(), or shutdownRequested() will be set. 
+LogicalResult readLine(std::FILE *In, SmallVectorImpl &Out) { + // Big enough to hold any reasonable header line. May not fit content lines + // in delimited mode, but performance doesn't matter for that mode. + static constexpr int BufSize = 128; + size_t Size = 0; + Out.clear(); + for (;;) { + Out.resize_for_overwrite(Size + BufSize); + if (!std::fgets(&Out[Size], BufSize, In)) + return failure(); + + clearerr(In); + + // If the line contained null bytes, anything after it (including \n) will + // be ignored. Fortunately this is not a legal header or JSON. + size_t Read = std::strlen(&Out[Size]); + if (Read > 0 && Out[Size + Read - 1] == '\n') { + Out.resize(Size + Read); + return success(); + } + Size += Read; + } +} + +// Returns std::nullopt when: +// - ferror(), feof(), or shutdownRequested() are set. +// - Content-Length is missing or empty (protocol error) +LogicalResult +JSONTransportInputOverFile::readStandardMessage(std::string &Json) { + // A Language Server Protocol message starts with a set of HTTP headers, + // delimited by \r\n, and terminated by an empty line (\r\n). + unsigned long long ContentLength = 0; + llvm::SmallString<128> Line; + while (true) { + if (feof(In) || hasError() || failed(readLine(In, Line))) + return failure(); + + // Content-Length is a mandatory header, and the only one we handle. + StringRef LineRef = Line; + if (LineRef.consume_front("Content-Length: ")) { + llvm::getAsUnsignedInteger(LineRef.trim(), 0, ContentLength); + } else if (!LineRef.trim().empty()) { + // It's another header, ignore it. + continue; + } else { + // An empty line indicates the end of headers. Go ahead and read the JSON. 
+ break; + } + } + + // The fuzzer likes crashing us by sending "Content-Length: 9999999999999999" + if (ContentLength == 0 || ContentLength > 1 << 30) + return failure(); + + Json.resize(ContentLength); + for (size_t Pos = 0, Read; Pos < ContentLength; Pos += Read) { + Read = std::fread(&Json[Pos], 1, ContentLength - Pos, In); + if (Read == 0) + return failure(); + + // If we're done, the error was transient. If we're not done, either it was + // transient or we'll see it again on retry. + clearerr(In); + Pos += Read; + } + return success(); +} + +/// For lit tests we support a simplified syntax: +/// - messages are delimited by '// -----' on a line by itself +/// - lines starting with // are ignored. +/// This is a testing path, so favor simplicity over performance here. +/// When returning failure: feof(), ferror(), or shutdownRequested() will be +/// set. +LogicalResult +JSONTransportInputOverFile::readDelimitedMessage(std::string &Json) { + Json.clear(); + llvm::SmallString<128> Line; + while (succeeded(readLine(In, Line))) { + StringRef LineRef = Line.str().trim(); + if (LineRef.starts_with("//")) { + // Found a delimiter for the message. 
+ if (LineRef == "// -----") + break; + continue; + } + + Json += Line; + } + + return failure(ferror(In)); +} diff --git a/llvm/lib/Support/Twine.cpp b/llvm/lib/Support/Twine.cpp index d6b48166fb0f6..9d449161c298b 100644 --- a/llvm/lib/Support/Twine.cpp +++ b/llvm/lib/Support/Twine.cpp @@ -88,19 +88,19 @@ void Twine::printOneChild(raw_ostream &OS, Child Ptr, NodeKind Kind) const { OS << Ptr.decI; break; case Twine::DecULKind: - OS << *Ptr.decUL; + OS << Ptr.decUL; break; case Twine::DecLKind: - OS << *Ptr.decL; + OS << Ptr.decL; break; case Twine::DecULLKind: - OS << *Ptr.decULL; + OS << Ptr.decULL; break; case Twine::DecLLKind: - OS << *Ptr.decLL; + OS << Ptr.decLL; break; case Twine::UHexKind: - OS.write_hex(*Ptr.uHex); + OS.write_hex(Ptr.uHex); break; } } @@ -144,16 +144,16 @@ void Twine::printOneChildRepr(raw_ostream &OS, Child Ptr, NodeKind Kind) const { OS << "decI:\"" << Ptr.decI << "\""; break; case Twine::DecULKind: - OS << "decUL:\"" << *Ptr.decUL << "\""; + OS << "decUL:\"" << Ptr.decUL << "\""; break; case Twine::DecLKind: - OS << "decL:\"" << *Ptr.decL << "\""; + OS << "decL:\"" << Ptr.decL << "\""; break; case Twine::DecULLKind: - OS << "decULL:\"" << *Ptr.decULL << "\""; + OS << "decULL:\"" << Ptr.decULL << "\""; break; case Twine::DecLLKind: - OS << "decLL:\"" << *Ptr.decLL << "\""; + OS << "decLL:\"" << Ptr.decLL << "\""; break; case Twine::UHexKind: OS << "uhex:\"" << Ptr.uHex << "\""; diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index 2f563e2899b56..0d991ead72416 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -600,6 +600,11 @@ std::error_code resize_file(int FD, uint64_t Size) { return std::error_code(); } +std::error_code resize_file_sparse(int FD, uint64_t Size) { + // On Unix, this is the same as `resize_file`. 
+ return resize_file(FD, Size); +} + static int convertAccessMode(AccessMode Mode) { switch (Mode) { case AccessMode::Exist: diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 5d4248819f1fb..cf784595c2f1c 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -2707,19 +2707,9 @@ static void getVFSEntries(RedirectingFileSystem::Entry *SrcE, Entries.push_back(YAMLVFSEntry(VPath.c_str(), FE->getExternalContentsPath())); } -void vfs::collectVFSFromYAML(std::unique_ptr Buffer, - SourceMgr::DiagHandlerTy DiagHandler, - StringRef YAMLFilePath, - SmallVectorImpl &CollectedEntries, - void *DiagContext, - IntrusiveRefCntPtr ExternalFS) { - std::unique_ptr VFS = RedirectingFileSystem::create( - std::move(Buffer), DiagHandler, YAMLFilePath, DiagContext, - std::move(ExternalFS)); - if (!VFS) - return; - ErrorOr RootResult = - VFS->lookupPath("/"); +void vfs::collectVFSEntries(RedirectingFileSystem &VFS, + SmallVectorImpl &CollectedEntries) { + ErrorOr RootResult = VFS.lookupPath("/"); if (!RootResult) return; SmallVector Components; diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 6672d8e0ec777..be007b7abdb51 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -27,6 +27,7 @@ #include "llvm/Support/Windows/WindowsSupport.h" #include #include +#include #undef max @@ -617,6 +618,22 @@ std::error_code resize_file(int FD, uint64_t Size) { return std::error_code(error, std::generic_category()); } +std::error_code resize_file_sparse(int FD, uint64_t Size) { + HANDLE hFile = reinterpret_cast(::_get_osfhandle(FD)); + DWORD temp; + if (!DeviceIoControl(hFile, FSCTL_SET_SPARSE, NULL, 0, NULL, 0, &temp, + NULL)) { + return mapWindowsError(GetLastError()); + } + LARGE_INTEGER liSize; + liSize.QuadPart = Size; + if (!SetFilePointerEx(hFile, liSize, NULL, FILE_BEGIN) || + !SetEndOfFile(hFile)) { + return 
mapWindowsError(GetLastError()); + } + return std::error_code(); +} + std::error_code access(const Twine &Path, AccessMode Mode) { SmallVector PathUtf16; diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index 799af5559966c..ec785e407cc57 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -220,6 +220,9 @@ static bool Execute(ProcessInfo &PI, StringRef Program, llvm::append_range(EnvBlock, EnvString); EnvBlock.push_back(0); } + // Empty environments need to be terminated with two nulls. + if (Env->size() == 0) + EnvBlock.push_back(0); EnvBlock.push_back(0); } diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp index fd1c681672138..3b510d357fd5d 100644 --- a/llvm/lib/Support/raw_socket_stream.cpp +++ b/llvm/lib/Support/raw_socket_stream.cpp @@ -255,7 +255,7 @@ manageTimeout(const std::chrono::milliseconds &Timeout, // has been canceled by another thread if (getActiveFD() == -1 || (CancelFD.has_value() && FD[1].revents & POLLIN)) return std::make_error_code(std::errc::operation_canceled); -#if _WIN32 +#ifdef _WIN32 if (PollStatus == SOCKET_ERROR) #else if (PollStatus == -1) diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 484bc2a4be8fa..cb831963759b5 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -630,7 +630,7 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { const MCInstrDesc &MCID = TII->get(Opc); // Create a dummy virtual register for the SUBS def. Register DestReg = - MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); + MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI)); // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. 
BuildMI(*Head, Head->end(), TermDL, MCID) .addReg(DestReg, RegState::Define | RegState::Dead) @@ -639,7 +639,7 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { .addImm(0); // SUBS uses the GPR*sp register classes. MRI->constrainRegClass(HeadCond[2].getReg(), - TII->getRegClass(MCID, 1, TRI, *MF)); + TII->getRegClass(MCID, 1, TRI)); } Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); @@ -686,10 +686,10 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); const MCInstrDesc &MCID = TII->get(Opc); MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), - TII->getRegClass(MCID, 0, TRI, *MF)); + TII->getRegClass(MCID, 0, TRI)); if (CmpMI->getOperand(FirstOp + 1).isReg()) MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), - TII->getRegClass(MCID, 1, TRI, *MF)); + TII->getRegClass(MCID, 1, TRI)); MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) .add(CmpMI->getOperand(FirstOp)); // Register Rn if (isZBranch) diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 987dfbcdd53e9..75361f5d313c6 100644 --- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -115,7 +115,6 @@ static bool atomicReadDroppedOnZero(unsigned Opcode) { void AArch64DeadRegisterDefinitions::processMachineBasicBlock( MachineBasicBlock &MBB) { - const MachineFunction &MF = *MBB.getParent(); for (MachineInstr &MI : MBB) { if (usesFrameIndex(MI)) { // We need to skip this instruction because while it appears to have a @@ -157,7 +156,7 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); continue; } - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF); + const TargetRegisterClass *RC = 
TII->getRegClass(Desc, I, TRI); unsigned NewReg; if (RC == nullptr) { LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 175b5e04d82ff..fd53f04443766 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -293,14 +293,9 @@ static cl::opt DisableMultiVectorSpillFill( cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(false), cl::Hidden); -/// Returns how much of the incoming argument stack area (in bytes) we should -/// clean up in an epilogue. For the C calling convention this will be 0, for -/// guaranteed tail call conventions it can be positive (a normal return or a -/// tail call to a function that uses less stack space for arguments) or -/// negative (for a tail call to a function that needs more stack space than us -/// for arguments). -static int64_t getArgumentStackToRestore(MachineFunction &MF, - MachineBasicBlock &MBB) { +int64_t +AArch64FrameLowering::getArgumentStackToRestore(MachineFunction &MF, + MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); AArch64FunctionInfo *AFI = MF.getInfo(); bool IsTailCallReturn = (MBB.end() != MBBI) @@ -711,44 +706,6 @@ void AArch64FrameLowering::resetCFIToInitialState( } } -static void emitCalleeSavedRestores(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - bool SVE) { - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - - const std::vector &CSI = MFI.getCalleeSavedInfo(); - if (CSI.empty()) - return; - - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy); - - for (const auto &Info : CSI) { - if (SVE != - (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) - continue; - - 
MCRegister Reg = Info.getReg(); - if (SVE && - !static_cast(TRI).regNeedsCFI(Reg, Reg)) - continue; - - CFIBuilder.buildRestore(Info.getReg()); - } -} - -void AArch64FrameLowering::emitCalleeSavedGPRRestores( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { - emitCalleeSavedRestores(MBB, MBBI, false); -} - -void AArch64FrameLowering::emitCalleeSavedSVERestores( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { - emitCalleeSavedRestores(MBB, MBBI, true); -} - // Return the maximum possible number of bytes for `Size` due to the // architectural limit on the size of a SVE register. static int64_t upperBound(StackOffset Size) { @@ -1642,28 +1599,6 @@ bool AArch64FrameLowering::isSVECalleeSave( } } -static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII, - MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool NeedsWinCFI) { - // Shadow call stack epilog: ldr x30, [x18, #-8]! - BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre)) - .addReg(AArch64::X18, RegState::Define) - .addReg(AArch64::LR, RegState::Define) - .addReg(AArch64::X18) - .addImm(-8) - .setMIFlag(MachineInstr::FrameDestroy); - - if (NeedsWinCFI) - BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameDestroy); - - if (MF.getInfo()->needsAsyncDwarfUnwindInfo(MF)) - CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) - .buildRestore(AArch64::X18); -} - void AArch64FrameLowering::emitPacRetPlusLeafHardening( MachineFunction &MF) const { const AArch64Subtarget &Subtarget = MF.getSubtarget(); @@ -1703,389 +1638,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, PrologueEmitter.emitPrologue(); } -static bool isFuncletReturnInstr(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return false; - case AArch64::CATCHRET: - case AArch64::CLEANUPRET: - return true; - } -} - void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock 
&MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - AArch64FunctionInfo *AFI = MF.getInfo(); - const AArch64Subtarget &Subtarget = MF.getSubtarget(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL; - bool NeedsWinCFI = needsWinCFI(MF); - bool EmitCFI = AFI->needsAsyncDwarfUnwindInfo(MF); - bool HasWinCFI = false; - bool IsFunclet = false; - - if (MBB.end() != MBBI) { - DL = MBBI->getDebugLoc(); - IsFunclet = isFuncletReturnInstr(*MBBI); - } - - MachineBasicBlock::iterator EpilogStartI = MBB.end(); - - auto FinishingTouches = make_scope_exit([&]() { - if (AFI->needsShadowCallStackPrologueEpilogue(MF)) { - emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL, - NeedsWinCFI); - HasWinCFI |= NeedsWinCFI; - } - if (EmitCFI) - emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator()); - if (AFI->shouldSignReturnAddress(MF)) { - // If pac-ret+leaf is in effect, PAUTH_EPILOGUE pseudo instructions - // are inserted by emitPacRetPlusLeafHardening(). - if (!shouldSignReturnAddressEverywhere(MF)) { - BuildMI(MBB, MBB.getFirstTerminator(), DL, - TII->get(AArch64::PAUTH_EPILOGUE)) - .setMIFlag(MachineInstr::FrameDestroy); - } - // AArch64PointerAuth pass will insert SEH_PACSignLR - HasWinCFI |= NeedsWinCFI; - } - if (HasWinCFI) { - BuildMI(MBB, MBB.getFirstTerminator(), DL, - TII->get(AArch64::SEH_EpilogEnd)) - .setMIFlag(MachineInstr::FrameDestroy); - if (!MF.hasWinCFI()) - MF.setHasWinCFI(true); - } - if (NeedsWinCFI) { - assert(EpilogStartI != MBB.end()); - if (!HasWinCFI) - MBB.erase(EpilogStartI); - } - }); - - int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF) - : MFI.getStackSize(); - - // All calls are tail calls in GHC calling conv, and functions have no - // prologue/epilogue. 
- if (MF.getFunction().getCallingConv() == CallingConv::GHC) - return; - - // How much of the stack used by incoming arguments this function is expected - // to restore in this particular epilogue. - int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB); - bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(), - MF.getFunction().isVarArg()); - unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); - - int64_t AfterCSRPopSize = ArgumentStackToRestore; - auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; - // We cannot rely on the local stack size set in emitPrologue if the function - // has funclets, as funclets have different local stack size requirements, and - // the current value set in emitPrologue may be that of the containing - // function. - if (MF.hasEHFunclets()) - AFI->setLocalStackSize(NumBytes - PrologueSaveSize); - if (homogeneousPrologEpilog(MF, &MBB)) { - assert(!NeedsWinCFI); - auto FirstHomogenousEpilogI = MBB.getFirstTerminator(); - if (FirstHomogenousEpilogI != MBB.begin()) { - auto HomogeneousEpilog = std::prev(FirstHomogenousEpilogI); - if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog) - FirstHomogenousEpilogI = HomogeneousEpilog; - } - - // Adjust local stack - emitFrameOffset(MBB, FirstHomogenousEpilogI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getLocalStackSize()), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // SP has been already adjusted while restoring callee save regs. - // We've bailed-out the case with adjusting SP for arguments. - assert(AfterCSRPopSize == 0); - return; - } - - bool FPAfterSVECalleeSaves = - Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); - - bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); - // Assume we can't combine the last pop with the sp restore. 
- bool CombineAfterCSRBump = false; - if (FPAfterSVECalleeSaves) { - AfterCSRPopSize += FixedObject; - } else if (!CombineSPBump && PrologueSaveSize != 0) { - MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); - while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION || - AArch64InstrInfo::isSEHInstruction(*Pop)) - Pop = std::prev(Pop); - // Converting the last ldp to a post-index ldp is valid only if the last - // ldp's offset is 0. - const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1); - // If the offset is 0 and the AfterCSR pop is not actually trying to - // allocate more stack for arguments (in space that an untimely interrupt - // may clobber), convert it to a post-index ldp. - if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) { - convertCalleeSaveRestoreToSPPrePostIncDec( - MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI, - MachineInstr::FrameDestroy, PrologueSaveSize); - } else { - // If not, make sure to emit an add after the last ldp. - // We're doing this by transferring the size to be restored from the - // adjustment *before* the CSR pops to the adjustment *after* the CSR - // pops. - AfterCSRPopSize += PrologueSaveSize; - CombineAfterCSRBump = true; - } - } - - // Move past the restores of the callee-saved registers. - // If we plan on combining the sp bump of the local stack size and the callee - // save stack size, we might need to adjust the CSR save and restore offsets. 
- MachineBasicBlock::iterator FirstGPRRestoreI = MBB.getFirstTerminator(); - MachineBasicBlock::iterator Begin = MBB.begin(); - while (FirstGPRRestoreI != Begin) { - --FirstGPRRestoreI; - if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) { - ++FirstGPRRestoreI; - break; - } else if (CombineSPBump) - fixupCalleeSaveRestoreStackOffset( - *FirstGPRRestoreI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); - } - - if (NeedsWinCFI) { - // Note that there are cases where we insert SEH opcodes in the - // epilogue when we had no SEH opcodes in the prologue. For - // example, when there is no stack frame but there are stack - // arguments. Insert the SEH_EpilogStart and remove it later if it - // we didn't emit any SEH opcodes to avoid generating WinCFI for - // functions that don't need it. - BuildMI(MBB, FirstGPRRestoreI, DL, TII->get(AArch64::SEH_EpilogStart)) - .setMIFlag(MachineInstr::FrameDestroy); - EpilogStartI = FirstGPRRestoreI; - --EpilogStartI; - } - - if (hasFP(MF) && AFI->hasSwiftAsyncContext()) { - switch (MF.getTarget().Options.SwiftAsyncFramePointer) { - case SwiftAsyncFramePointerMode::DeploymentBased: - // Avoid the reload as it is GOT relative, and instead fall back to the - // hardcoded value below. This allows a mismatch between the OS and - // application without immediately terminating on the difference. - [[fallthrough]]; - case SwiftAsyncFramePointerMode::Always: - // We need to reset FP to its untagged state on return. Bit 60 is - // currently used to show the presence of an extended frame. 
- - // BIC x29, x29, #0x1000_0000_0000_0000 - BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri), - AArch64::FP) - .addUse(AArch64::FP) - .addImm(0x10fe) - .setMIFlag(MachineInstr::FrameDestroy); - if (NeedsWinCFI) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlags(MachineInstr::FrameDestroy); - HasWinCFI = true; - } - break; - - case SwiftAsyncFramePointerMode::Never: - break; - } - } - - const StackOffset &SVEStackSize = getSVEStackSize(MF); - - // If there is a single SP update, insert it before the ret and we're done. - if (CombineSPBump) { - assert(!SVEStackSize && "Cannot combine SP bump with SVE"); - - // When we are about to restore the CSRs, the CFA register is SP again. - if (EmitCFI && hasFP(MF)) - CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy) - .buildDefCFA(AArch64::SP, NumBytes); - - emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes + AfterCSRPopSize), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, - EmitCFI, StackOffset::getFixed(NumBytes)); - return; - } - - NumBytes -= PrologueSaveSize; - assert(NumBytes >= 0 && "Negative stack allocation size!?"); - - // Process the SVE callee-saves to determine what space needs to be - // deallocated. 
- StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; - MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, - RestoreEnd = FirstGPRRestoreI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - if (FPAfterSVECalleeSaves) - RestoreEnd = MBB.getFirstTerminator(); - - RestoreBegin = std::prev(RestoreEnd); - while (RestoreBegin != MBB.begin() && - isSVECalleeSave(std::prev(RestoreBegin))) - --RestoreBegin; - - assert(isSVECalleeSave(RestoreBegin) && - isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); - - StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(CalleeSavedSize); - DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; - DeallocateAfter = CalleeSavedSizeAsOffset; - } - - // Deallocate the SVE area. - if (FPAfterSVECalleeSaves) { - // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate - // them explicitly. - if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { - emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); - } - - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate callee-save SVE registers. 
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); - } else if (SVEStackSize) { - int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); - // If we have stack realignment or variable-sized objects we must use the - // FP to restore SVE callee saves (as there is an unknown amount of - // data/padding between the SP and SVE CS area). - Register BaseForSVEDealloc = - (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP - : AArch64::SP; - if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); - } - // The code below will deallocate the stack space space by moving the - // SP to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - StackOffset::getScalable(-SVECalleeSavedSize), TII, - MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - if (SVECalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. 
- emitFrameOffset( - MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF), - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize)); - NumBytes = 0; - } - - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF), - SVEStackSize + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); - - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF), - DeallocateAfter + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); - } - if (EmitCFI) - emitCalleeSavedSVERestores(MBB, RestoreEnd); - } - - if (!hasFP(MF)) { - bool RedZone = canUseRedZone(MF); - // If this was a redzone leaf function, we don't need to restore the - // stack pointer (but we may need to pop stack args for fastcc). - if (RedZone && AfterCSRPopSize == 0) - return; - - // Pop the local variables off the stack. If there are no callee-saved - // registers, it means we are actually positioned at the terminator and can - // combine stack increment for the locals and the stack increment for - // callee-popped arguments into (possibly) a single instruction and be done. - bool NoCalleeSaveRestore = PrologueSaveSize == 0; - int64_t StackRestoreBytes = RedZone ? 0 : NumBytes; - if (NoCalleeSaveRestore) - StackRestoreBytes += AfterCSRPopSize; - - emitFrameOffset( - MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(StackRestoreBytes), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI, - StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize)); - - // If we were able to combine the local stack pop with the argument pop, - // then we're done. 
- if (NoCalleeSaveRestore || AfterCSRPopSize == 0) { - return; - } - - NumBytes = 0; - } - - // Restore the original stack pointer. - // FIXME: Rather than doing the math here, we should instead just use - // non-post-indexed loads for the restores if we aren't actually going to - // be able to save any instructions. - if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { - emitFrameOffset( - MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::FP, - StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()), - TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - } else if (NumBytes) - emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // When we are about to restore the CSRs, the CFA register is SP again. - if (EmitCFI && hasFP(MF)) - CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy) - .buildDefCFA(AArch64::SP, PrologueSaveSize); - - // This must be placed after the callee-save restore code because that code - // assumes the SP is at the same location as it was after the callee-save save - // code in the prologue. - if (AfterCSRPopSize) { - assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an " - "interrupt may have clobbered"); - - emitFrameOffset( - MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI, EmitCFI, - StackOffset::getFixed(CombineAfterCSRBump ? 
PrologueSaveSize : 0)); - } + AArch64EpilogueEmitter EpilogueEmitter(MF, MBB, *this); + EpilogueEmitter.emitEpilogue(); } bool AArch64FrameLowering::enableCFIFixup(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index a9d65441a4e30..0825d03bcb0d8 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -22,6 +22,7 @@ namespace llvm { class TargetLowering; class AArch64FunctionInfo; class AArch64PrologueEmitter; +class AArch64EpilogueEmitter; class AArch64FrameLowering : public TargetFrameLowering { public: @@ -134,7 +135,6 @@ class AArch64FrameLowering : public TargetFrameLowering { return StackId != TargetStackID::ScalableVector; } - friend class AArch64PrologueEmitter; void orderFrameObjects(const MachineFunction &MF, SmallVectorImpl &ObjectsToAllocate) const override; @@ -147,6 +147,9 @@ class AArch64FrameLowering : public TargetFrameLowering { StackOffset getSVEStackSize(const MachineFunction &MF) const; + friend class AArch64PrologueEmitter; + friend class AArch64EpilogueEmitter; + protected: bool hasFPImpl(const MachineFunction &MF) const override; @@ -170,10 +173,6 @@ class AArch64FrameLowering : public TargetFrameLowering { int &MaxCSFrameIndex) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, uint64_t StackBumpBytes) const; - void emitCalleeSavedGPRRestores(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; - void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; void allocateStackSpace(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, int64_t RealignmentPadding, StackOffset AllocSize, @@ -215,6 +214,15 @@ class AArch64FrameLowering : public TargetFrameLowering { StackOffset getStackOffset(const MachineFunction &MF, int64_t ObjectOffset) const; + /// Returns how much of the incoming argument stack area (in 
bytes) we should + /// clean up in an epilogue. For the C calling convention this will be 0, for + /// guaranteed tail call conventions it can be positive (a normal return or a + /// tail call to a function that uses less stack space for arguments) or + /// negative (for a tail call to a function that needs more stack space than + /// us for arguments). + int64_t getArgumentStackToRestore(MachineFunction &MF, + MachineBasicBlock &MBB) const; + // Find a scratch register that we can use at the start of the prologue to // re-align the stack pointer. We avoid using callee-save registers since // they may appear to be free when this is called from canUseAsPrologue diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6fdc981fc21a5..54bdb8750f709 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -71,6 +71,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { template bool SelectRDVLImm(SDValue N, SDValue &Imm); + template + bool SelectRDSVLShiftImm(SDValue N, SDValue &Imm); + bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); bool SelectArithUXTXRegister(SDValue N, SDValue &Reg, SDValue &Shift); bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); @@ -574,7 +577,7 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( // We need to make sure that this one operand does not end up in XZR, thus // require the address to be in a PointerRegClass register. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(); SDLoc dl(Op); SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64); SDValue NewOp = @@ -938,6 +941,21 @@ bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) { return false; } +// Returns a suitable RDSVL multiplier from a left shift. 
+template +bool AArch64DAGToDAGISel::SelectRDSVLShiftImm(SDValue N, SDValue &Imm) { + if (!isa(N)) + return false; + + int64_t MulImm = 1LL << cast(N)->getSExtValue(); + if (MulImm >= Low && MulImm <= High) { + Imm = CurDAG->getSignedTargetConstant(MulImm, SDLoc(N), MVT::i32); + return true; + } + + return false; +} + /// SelectArithExtendedRegister - Select a "extended register" operand. This /// operand folds in an extend followed by an optional left shift. bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a194147d09396..232d5ae170976 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2184,8 +2184,7 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic( const IntrinsicInst *I) const { - assert(I->getIntrinsicID() == - Intrinsic::experimental_vector_partial_reduce_add && + assert(I->getIntrinsicID() == Intrinsic::vector_partial_reduce_add && "Unexpected intrinsic!"); return true; } @@ -2940,6 +2939,63 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, return NextInst->getParent(); } +MachineBasicBlock * +AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass; + const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass; + + Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR); + Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src + Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR); + Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + 
DebugLoc DL = MI.getDebugLoc(); + + // RDVL requires GPR64, ADDSVL requires GPR64sp + // We need to insert COPY instructions, these will later be removed by the + // RegisterCoalescer + BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1); + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp) + .addReg(RegVL_GPR); + + BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp) + .addReg(RegVL_GPRsp) + .addImm(-1); + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR) + .addReg(RegSVL_GPRsp); + + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction::iterator It = ++MBB->getIterator(); + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, TrapBB); + MF->insert(It, PassBB); + + // Continue if vector lengths match + BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX)) + .addReg(RegSVL_GPR) + .addMBB(PassBB); + + // Transfer rest of current BB to PassBB + PassBB->splice(PassBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + PassBB->transferSuccessorsAndUpdatePHIs(MBB); + + // Trap if vector lengths mismatch + BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1); + + MBB->addSuccessor(TrapBB); + MBB->addSuccessor(PassBB); + + MI.eraseFromParent(); + return PassBB; +} + MachineBasicBlock * AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, @@ -3343,6 +3399,9 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::PROBED_STACKALLOC_DYN: return EmitDynamicProbedAlloc(MI, BB); + case AArch64::CHECK_MATCHING_VL_PSEUDO: + return EmitCheckMatchingVL(MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_B: return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_H: @@ -6392,25 +6451,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_clz: 
return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sme_cntsb: - return DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), - DAG.getConstant(1, DL, MVT::i32)); - case Intrinsic::aarch64_sme_cntsh: { - SDValue One = DAG.getConstant(1, DL, MVT::i32); - SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), One); - return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes, One); - } - case Intrinsic::aarch64_sme_cntsw: { - SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), - DAG.getConstant(1, DL, MVT::i32)); - return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes, - DAG.getConstant(2, DL, MVT::i32)); - } case Intrinsic::aarch64_sme_cntsd: { SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), DAG.getConstant(1, DL, MVT::i32)); return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes, - DAG.getConstant(3, DL, MVT::i32)); + DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact); } case Intrinsic::aarch64_sve_cnt: { SDValue Data = Op.getOperand(3); @@ -8037,6 +8082,17 @@ static bool isPassedInFPR(EVT VT) { (VT.isFloatingPoint() && !VT.isScalableVector()); } +static SDValue getZT0FrameIndex(MachineFrameInfo &MFI, + AArch64FunctionInfo &FuncInfo, + SelectionDAG &DAG) { + if (!FuncInfo.hasZT0SpillSlotIndex()) + FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16))); + + return DAG.getFrameIndex( + FuncInfo.getZT0SpillSlotIndex(), + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); +} + SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, SelectionDAG &DAG) const { assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); @@ -9122,14 +9178,29 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, } } -SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, - bool Enable, SDValue Chain, - 
SDValue InGlue, - unsigned Condition) const { +SDValue AArch64TargetLowering::changeStreamingMode( + SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, + unsigned Condition, bool InsertVectorLengthCheck) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setHasStreamingModeChanges(true); + auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue { + SmallVector Ops = {Chain}; + if (InGlue) + Ops.push_back(InGlue); + return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Ops); + }; + + if (InsertVectorLengthCheck && Enable) { + // Non-streaming -> Streaming + // Insert vector length check before smstart + SDValue CheckVL = GetCheckVL(Chain, InGlue); + Chain = CheckVL.getValue(0); + InGlue = CheckVL.getValue(1); + } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()); SDValue MSROp = @@ -9156,7 +9227,16 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, if (InGlue) Ops.push_back(InGlue); - return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); + SDValue SMChange = + DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); + + if (!InsertVectorLengthCheck || Enable) + return SMChange; + + // Streaming -> Non-streaming + // Insert vector length check after smstop since we cannot read VL + // in streaming mode + return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1)); } // Emit a call to __arm_sme_save or __arm_sme_restore. 
@@ -9226,7 +9306,7 @@ static bool shouldLowerTailCallStackArg(const MachineFunction &MF, if (CallOffset != MFI.getObjectOffset(FI)) return true; uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits(); - if (SizeInBits / 8 != MFI.getObjectSize(FI)) + if (SizeInBits / 8 != static_cast(MFI.getObjectSize(FI))) return true; return false; } @@ -9312,6 +9392,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, std::optional ZAMarkerNode; bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); + if (UseNewSMEABILowering) { if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState()) @@ -9440,10 +9521,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the caller has ZT0 state which will not be preserved by the callee, // spill ZT0 before the call. if (ShouldPreserveZT0) { - unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16)); - ZTFrameIdx = DAG.getFrameIndex( - ZTObj, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG); Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other), {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); @@ -9740,9 +9818,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue InGlue; if (RequiresSMChange) { - Chain = - changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(), - Chain, InGlue, getSMToggleCondition(CallAttrs)); + bool InsertVectorLengthCheck = + (CallConv == CallingConv::AArch64_SVE_VectorCall); + Chain = changeStreamingMode( + DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue, + getSMToggleCondition(CallAttrs), InsertVectorLengthCheck); InGlue = Chain.getValue(1); } @@ -17393,8 +17473,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value())))) return true; if (match(SingleUser, - m_Intrinsic< - Intrinsic::experimental_vector_partial_reduce_add>( + m_Intrinsic( 
m_Value(), m_Specific(I)))) return true; return false; @@ -18995,6 +19074,17 @@ static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { else if (SDValue R = performUADDVZextCombine(A, DAG)) return R; } + + // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane. + MVT OpVT = A.getSimpleValueType(); + assert(N->getSimpleValueType(0) == OpVT && + "The operand type should be consistent with the result type of UADDV"); + APInt Mask = APInt::getAllOnes(OpVT.getVectorNumElements()); + Mask.clearBit(0); + KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask); + if (KnownLeadingLanes.isZero()) + return A; + return SDValue(); } @@ -22424,140 +22514,6 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, return SDValue(); } -SDValue tryLowerPartialReductionToDot(SDNode *N, - const AArch64Subtarget *Subtarget, - SelectionDAG &DAG) { - - assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - getIntrinsicID(N) == - Intrinsic::experimental_vector_partial_reduce_add && - "Expected a partial reduction node"); - - bool Scalable = N->getValueType(0).isScalableVector(); - if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable()) - return SDValue(); - if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd())) - return SDValue(); - - SDLoc DL(N); - - SDValue Op2 = N->getOperand(2); - unsigned Op2Opcode = Op2->getOpcode(); - SDValue MulOpLHS, MulOpRHS; - bool MulOpLHSIsSigned, MulOpRHSIsSigned; - if (ISD::isExtOpcode(Op2Opcode)) { - MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND); - MulOpLHS = Op2->getOperand(0); - MulOpRHS = DAG.getConstant(1, DL, MulOpLHS.getValueType()); - } else if (Op2Opcode == ISD::MUL) { - SDValue ExtMulOpLHS = Op2->getOperand(0); - SDValue ExtMulOpRHS = Op2->getOperand(1); - - unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode(); - unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode(); - if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) || - 
!ISD::isExtOpcode(ExtMulOpRHSOpcode)) - return SDValue(); - - MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND; - MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND; - - MulOpLHS = ExtMulOpLHS->getOperand(0); - MulOpRHS = ExtMulOpRHS->getOperand(0); - - if (MulOpLHS.getValueType() != MulOpRHS.getValueType()) - return SDValue(); - } else - return SDValue(); - - SDValue Acc = N->getOperand(1); - EVT ReducedVT = N->getValueType(0); - EVT MulSrcVT = MulOpLHS.getValueType(); - - // Dot products operate on chunks of four elements so there must be four times - // as many elements in the wide type - if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) && - !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) && - !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) && - !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) && - !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) && - !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8)) - return SDValue(); - - // If the extensions are mixed, we should lower it to a usdot instead - unsigned Opcode = 0; - if (MulOpLHSIsSigned != MulOpRHSIsSigned) { - if (!Subtarget->hasMatMulInt8()) - return SDValue(); - - bool Scalable = N->getValueType(0).isScalableVT(); - // There's no nxv2i64 version of usdot - if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64) - return SDValue(); - - Opcode = AArch64ISD::USDOT; - // USDOT expects the signed operand to be last - if (!MulOpRHSIsSigned) - std::swap(MulOpLHS, MulOpRHS); - } else - Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT; - - // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot - // product followed by a zero / sign extension - if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) || - (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) { - EVT ReducedVTI32 = - (ReducedVT.isScalableVector()) ? 
MVT::nxv4i32 : MVT::v4i32; - - SDValue DotI32 = - DAG.getNode(Opcode, DL, ReducedVTI32, - DAG.getConstant(0, DL, ReducedVTI32), MulOpLHS, MulOpRHS); - SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, ReducedVT); - return DAG.getNode(ISD::ADD, DL, ReducedVT, Acc, Extended); - } - - return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS); -} - -SDValue tryLowerPartialReductionToWideAdd(SDNode *N, - const AArch64Subtarget *Subtarget, - SelectionDAG &DAG) { - - assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - getIntrinsicID(N) == - Intrinsic::experimental_vector_partial_reduce_add && - "Expected a partial reduction node"); - - if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable()) - return SDValue(); - - SDLoc DL(N); - - if (!ISD::isExtOpcode(N->getOperand(2).getOpcode())) - return SDValue(); - SDValue Acc = N->getOperand(1); - SDValue Ext = N->getOperand(2); - EVT AccVT = Acc.getValueType(); - EVT ExtVT = Ext.getValueType(); - if (ExtVT.getVectorElementType() != AccVT.getVectorElementType()) - return SDValue(); - - SDValue ExtOp = Ext->getOperand(0); - EVT ExtOpVT = ExtOp.getValueType(); - - if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) && - !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) && - !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16)) - return SDValue(); - - bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND; - unsigned BottomOpcode = - ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB; - unsigned TopOpcode = ExtOpIsSigned ? 
AArch64ISD::SADDWT : AArch64ISD::UADDWT; - SDValue BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, ExtOp); - return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, ExtOp); -} - static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); @@ -22590,17 +22546,6 @@ static SDValue performIntrinsicCombine(SDNode *N, switch (IID) { default: break; - case Intrinsic::experimental_vector_partial_reduce_add: { - if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG)) - return Dot; - if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG)) - return WideAdd; - SDLoc DL(N); - SDValue Input = N->getOperand(2); - return DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, DL, N->getValueType(0), - N->getOperand(1), Input, - DAG.getConstant(1, DL, Input.getValueType())); - } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); @@ -26109,6 +26054,17 @@ static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static bool isSignExtInReg(const SDValue &V) { + if (V.getOpcode() != AArch64ISD::VASHR || + V.getOperand(0).getOpcode() != AArch64ISD::VSHL) + return false; + + unsigned BitWidth = V->getValueType(0).getScalarSizeInBits(); + unsigned ShiftAmtR = V.getConstantOperandVal(1); + unsigned ShiftAmtL = V.getOperand(0).getConstantOperandVal(1); + return (ShiftAmtR == ShiftAmtL && ShiftAmtR == (BitWidth - 1)); +} + static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && @@ -26149,6 +26105,27 @@ performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { LHS->getOperand(0), Pred); } + // setcc_merge_zero( + // pred, insert_subvector(undef, signext_inreg(vNi1), 0), != splat(0)) + // => setcc_merge_zero( + // pred, insert_subvector(undef, shl(vNi1), 0), != splat(0)) + if (Cond == 
ISD::SETNE && isZerosVector(RHS.getNode()) && + LHS->getOpcode() == ISD::INSERT_SUBVECTOR && LHS.hasOneUse()) { + SDValue L0 = LHS->getOperand(0); + SDValue L1 = LHS->getOperand(1); + SDValue L2 = LHS->getOperand(2); + + if (L0.getOpcode() == ISD::UNDEF && isNullConstant(L2) && + isSignExtInReg(L1)) { + SDLoc DL(N); + SDValue Shl = L1.getOperand(0); + SDValue NewLHS = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, + LHS.getValueType(), L0, Shl, L2); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, N->getValueType(0), + Pred, NewLHS, RHS, N->getOperand(3)); + } + } + return SDValue(); } @@ -27585,6 +27562,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false)) return R; return performFlagSettingCombine(N, DCI, AArch64ISD::SBC); + case AArch64ISD::ADDS: + return performFlagSettingCombine(N, DCI, ISD::ADD); + case AArch64ISD::SUBS: + return performFlagSettingCombine(N, DCI, ISD::SUB); case AArch64ISD::BICi: { APInt DemandedBits = APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f5d14905cac66..ff073d3eafb1f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -168,6 +168,9 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitCheckMatchingVL(MachineInstr &MI, + MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const; @@ -532,8 +535,8 @@ class AArch64TargetLowering : public TargetLowering { /// node. \p Condition should be one of the enum values from /// AArch64SME::ToggleCondition. 
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, - SDValue Chain, SDValue InGlue, - unsigned Condition) const; + SDValue Chain, SDValue InGlue, unsigned Condition, + bool InsertVectorLengthCheck = false) const; bool isVScaleKnownToBeAPowerOfTwo() const override { return true; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index bf3d47ac43607..5a51c812732e6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6273,6 +6273,11 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( Offset, Bytes, NumPredicateVectors, NumDataVectors); + // Insert ADDSXri for scalable offset at the end. + bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors); + if (NeedsFinalDefNZCV) + SetNZCV = false; + // First emit non-scalable frame offsets, or a simple 'mov'. if (Bytes || (!Offset && SrcReg != DestReg)) { assert((DestReg != AArch64::SP || Bytes % 8 == 0) && @@ -6292,8 +6297,6 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, FrameReg = DestReg; } - assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && - "SetNZCV not supported with SVE vectors"); assert(!(NeedsWinCFI && NumPredicateVectors) && "WinCFI can't allocate fractions of an SVE data vector"); @@ -6313,6 +6316,12 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, FrameReg); } + + if (NeedsFinalDefNZCV) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg) + .addReg(DestReg) + .addImm(0) + .addImm(0); } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( @@ -10951,9 +10960,8 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, MRI.getRegClass(NewMI->getOperand(0).getReg())); NewMI->getOperand(I).setReg(Result); } else if (I == ReplaceOprNum) { - MRI.constrainRegClass( - ReplaceReg, - 
TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent())); + MRI.constrainRegClass(ReplaceReg, + TII->getRegClass(NewMI->getDesc(), I, TRI)); NewMI->getOperand(I).setReg(ReplaceReg); } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f0020a9a3c91d..3fcafc6d35090 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5409,6 +5409,11 @@ defm UCVTF : IntegerToFP<0b00, 0b011, "ucvtf", any_uint_to_fp>; let Predicates = [HasNEON, HasFPRCVT] in { defm SCVTF : IntegerToFPSIMDScalar<0b11, 0b100, "scvtf", any_sint_to_fp>; defm UCVTF : IntegerToFPSIMDScalar<0b11, 0b101, "ucvtf", any_uint_to_fp>; + + def : Pat<(v1f64 (extract_subvector (v2f64 (sint_to_fp (v2i64 (sext (v2i32 V64:$Rn))))), (i64 0))), + (SCVTFDSr (EXTRACT_SUBREG V64:$Rn, ssub))>; + def : Pat<(v1f64 (extract_subvector (v2f64 (uint_to_fp (v2i64 (zext (v2i32 V64:$Rn))))), (i64 0))), + (UCVTFDSr (EXTRACT_SUBREG V64:$Rn, ssub))>; } def : Pat<(f16 (fdiv (f16 (any_sint_to_fp (i32 GPR32:$Rn))), fixedpoint_f16_i32:$scale)), diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index fd4ef2aa28f8a..04e76c7abd202 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -594,19 +594,18 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm( // NewDstReg = Opcode.second NewTmpReg Imm1 // Determine register classes for destinations and register operands - MachineFunction *MF = MI.getMF(); const TargetRegisterClass *FirstInstrDstRC = - TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF); + TII->getRegClass(TII->get(Opcode.first), 0, TRI); const TargetRegisterClass *FirstInstrOperandRC = - TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF); + TII->getRegClass(TII->get(Opcode.first), 1, TRI); const TargetRegisterClass *SecondInstrDstRC = (Opcode.first == Opcode.second) ? 
FirstInstrDstRC - : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF); + : TII->getRegClass(TII->get(Opcode.second), 0, TRI); const TargetRegisterClass *SecondInstrOperandRC = (Opcode.first == Opcode.second) ? FirstInstrOperandRC - : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF); + : TII->getRegClass(TII->get(Opcode.second), 1, TRI); // Get old registers destinations and new register destinations Register DstReg = MI.getOperand(0).getReg(); @@ -785,14 +784,14 @@ bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) { } const TargetRegisterClass *DstRC64 = - TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI, *MI.getMF()); + TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI); const TargetRegisterClass *DstRC32 = TRI->getSubRegisterClass(DstRC64, AArch64::sub_32); assert(DstRC32 && "Destination register class of UBFMXri doesn't have a " "sub_32 subregister class"); const TargetRegisterClass *SrcRC64 = - TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI, *MI.getMF()); + TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI); const TargetRegisterClass *SrcRC32 = TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32); assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 " diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index b4197a04840b7..a81f5b3d436a9 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -28,7 +28,10 @@ yaml::AArch64FunctionInfo::AArch64FunctionInfo( : HasRedZone(MFI.hasRedZone()), StackSizeSVE(MFI.hasCalculatedStackSizeSVE() ? std::optional(MFI.getStackSizeSVE()) - : std::nullopt) {} + : std::nullopt), + HasStackFrame(MFI.hasStackFrame() + ? 
std::optional(MFI.hasStackFrame()) + : std::nullopt) {} void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) { MappingTraits::mapping(YamlIO, *this); @@ -40,6 +43,8 @@ void AArch64FunctionInfo::initializeBaseYamlFields( HasRedZone = YamlMFI.HasRedZone; if (YamlMFI.StackSizeSVE) setStackSizeSVE(*YamlMFI.StackSizeSVE); + if (YamlMFI.HasStackFrame) + setHasStackFrame(*YamlMFI.HasStackFrame); } static std::pair GetSignReturnAddress(const Function &F) { diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 993cff112ba84..897c7e8539608 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -239,6 +239,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // support). Register EarlyAllocSMESaveBuffer = AArch64::NoRegister; + // Holds the spill slot for ZT0. + int ZT0SpillSlotIndex = std::numeric_limits::max(); + // Note: The following properties are only used for the old SME ABI lowering: /// The frame-index for the TPIDR2 object used for lazy saves. 
TPIDR2Object TPIDR2; @@ -265,6 +268,15 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { return EarlyAllocSMESaveBuffer; } + void setZT0SpillSlotIndex(int FI) { ZT0SpillSlotIndex = FI; } + int getZT0SpillSlotIndex() const { + assert(hasZT0SpillSlotIndex() && "ZT0 spill slot index not set!"); + return ZT0SpillSlotIndex; + } + bool hasZT0SpillSlotIndex() const { + return ZT0SpillSlotIndex != std::numeric_limits::max(); + } + // Old SME ABI lowering state getters/setters: Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }; void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }; @@ -600,6 +612,7 @@ namespace yaml { struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { std::optional HasRedZone; std::optional StackSizeSVE; + std::optional HasStackFrame; AArch64FunctionInfo() = default; AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI); @@ -612,6 +625,7 @@ template <> struct MappingTraits { static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) { YamlIO.mapOptional("hasRedZone", MFI.HasRedZone); YamlIO.mapOptional("stackSizeSVE", MFI.StackSizeSVE); + YamlIO.mapOptional("hasStackFrame", MFI.HasStackFrame); } }; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index af424987b8ddb..700c45a8aec9a 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -791,4 +791,436 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( } } +static bool isFuncletReturnInstr(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::CATCHRET: + case AArch64::CLEANUPRET: + return true; + } +} + +AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF, + MachineBasicBlock &MBB, + const AArch64FrameLowering &AFL) + : MF(MF), MBB(MBB), MFI(MF.getFrameInfo()), + Subtarget(MF.getSubtarget()), AFL(AFL) { + TII = 
Subtarget.getInstrInfo(); + AFI = MF.getInfo(); + + NeedsWinCFI = AFL.needsWinCFI(MF); + EmitCFI = AFI->needsAsyncDwarfUnwindInfo(MF); + SEHEpilogueStartI = MBB.end(); +} + +void AArch64EpilogueEmitter::emitEpilogue() { + MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); + if (MBB.end() != EpilogueEndI) { + DL = EpilogueEndI->getDebugLoc(); + IsFunclet = isFuncletReturnInstr(*EpilogueEndI); + } + + int64_t NumBytes = + IsFunclet ? AFL.getWinEHFuncletFrameSize(MF) : MFI.getStackSize(); + + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction().getCallingConv() == CallingConv::GHC) + return; + + // How much of the stack used by incoming arguments this function is expected + // to restore in this particular epilogue. + int64_t ArgumentStackToRestore = AFL.getArgumentStackToRestore(MF, MBB); + bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(), + MF.getFunction().isVarArg()); + unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); + + int64_t AfterCSRPopSize = ArgumentStackToRestore; + auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; + // We cannot rely on the local stack size set in emitPrologue if the function + // has funclets, as funclets have different local stack size requirements, and + // the current value set in emitPrologue may be that of the containing + // function. 
+ if (MF.hasEHFunclets()) + AFI->setLocalStackSize(NumBytes - PrologueSaveSize); + + if (AFL.homogeneousPrologEpilog(MF, &MBB)) { + assert(!NeedsWinCFI); + auto FirstHomogenousEpilogI = MBB.getFirstTerminator(); + if (FirstHomogenousEpilogI != MBB.begin()) { + auto HomogeneousEpilog = std::prev(FirstHomogenousEpilogI); + if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog) + FirstHomogenousEpilogI = HomogeneousEpilog; + } + + // Adjust local stack + emitFrameOffset(MBB, FirstHomogenousEpilogI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AFI->getLocalStackSize()), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + + // SP has been already adjusted while restoring callee save regs. + // We've bailed-out the case with adjusting SP for arguments. + assert(AfterCSRPopSize == 0); + return; + } + + bool FPAfterSVECalleeSaves = + Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); + + bool CombineSPBump = + AFL.shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); + // Assume we can't combine the last pop with the sp restore. + bool CombineAfterCSRBump = false; + if (FPAfterSVECalleeSaves) { + AfterCSRPopSize += FixedObject; + } else if (!CombineSPBump && PrologueSaveSize != 0) { + MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); + while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION || + AArch64InstrInfo::isSEHInstruction(*Pop)) + Pop = std::prev(Pop); + // Converting the last ldp to a post-index ldp is valid only if the last + // ldp's offset is 0. + const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1); + // If the offset is 0 and the AfterCSR pop is not actually trying to + // allocate more stack for arguments (in space that an untimely interrupt + // may clobber), convert it to a post-index ldp. 
+ if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) { + AFL.convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI, + MachineInstr::FrameDestroy, PrologueSaveSize); + } else { + // If not, make sure to emit an add after the last ldp. + // We're doing this by transferring the size to be restored from the + // adjustment *before* the CSR pops to the adjustment *after* the CSR + // pops. + AfterCSRPopSize += PrologueSaveSize; + CombineAfterCSRBump = true; + } + } + + // Move past the restores of the callee-saved registers. + // If we plan on combining the sp bump of the local stack size and the callee + // save stack size, we might need to adjust the CSR save and restore offsets. + MachineBasicBlock::iterator FirstGPRRestoreI = MBB.getFirstTerminator(); + MachineBasicBlock::iterator Begin = MBB.begin(); + while (FirstGPRRestoreI != Begin) { + --FirstGPRRestoreI; + if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || + (!FPAfterSVECalleeSaves && AFL.isSVECalleeSave(FirstGPRRestoreI))) { + ++FirstGPRRestoreI; + break; + } else if (CombineSPBump) + AFL.fixupCalleeSaveRestoreStackOffset( + *FirstGPRRestoreI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); + } + + if (NeedsWinCFI) { + // Note that there are cases where we insert SEH opcodes in the + // epilogue when we had no SEH opcodes in the prologue. For + // example, when there is no stack frame but there are stack + // arguments. Insert the SEH_EpilogStart and remove it later if it + // we didn't emit any SEH opcodes to avoid generating WinCFI for + // functions that don't need it. 
+ BuildMI(MBB, FirstGPRRestoreI, DL, TII->get(AArch64::SEH_EpilogStart)) + .setMIFlag(MachineInstr::FrameDestroy); + SEHEpilogueStartI = FirstGPRRestoreI; + --SEHEpilogueStartI; + } + + if (AFL.hasFP(MF) && AFI->hasSwiftAsyncContext()) + emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); + + const StackOffset &SVEStackSize = AFL.getSVEStackSize(MF); + + // If there is a single SP update, insert it before the ret and we're done. + if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + + // When we are about to restore the CSRs, the CFA register is SP again. + if (EmitCFI && AFL.hasFP(MF)) + CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy) + .buildDefCFA(AArch64::SP, NumBytes); + + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(NumBytes + AfterCSRPopSize), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, + EmitCFI, StackOffset::getFixed(NumBytes)); + return; + } + + NumBytes -= PrologueSaveSize; + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + // Process the SVE callee-saves to determine what space needs to be + // deallocated. + StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; + MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, + RestoreEnd = FirstGPRRestoreI; + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + if (FPAfterSVECalleeSaves) + RestoreEnd = MBB.getFirstTerminator(); + + RestoreBegin = std::prev(RestoreEnd); + while (RestoreBegin != MBB.begin() && + AFL.isSVECalleeSave(std::prev(RestoreBegin))) + --RestoreBegin; + + assert(AFL.isSVECalleeSave(RestoreBegin) && + AFL.isSVECalleeSave(std::prev(RestoreEnd)) && + "Unexpected instruction"); + + StackOffset CalleeSavedSizeAsOffset = + StackOffset::getScalable(CalleeSavedSize); + DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; + DeallocateAfter = CalleeSavedSizeAsOffset; + } + + // Deallocate the SVE area. 
+ if (FPAfterSVECalleeSaves) { + // If the callee-save area is before FP, restoring the FP implicitly + // deallocates non-callee-save SVE allocations. Otherwise, deallocate + // them explicitly. + if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI); + } + + // Deallocate callee-save non-SVE registers. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + + // Deallocate fixed objects. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(FixedObject), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + + // Deallocate callee-save SVE registers. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + DeallocateAfter, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI); + } else if (SVEStackSize) { + int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); + // If we have stack realignment or variable-sized objects we must use the + // FP to restore SVE callee saves (as there is an unknown amount of + // data/padding between the SP and SVE CS area). + Register BaseForSVEDealloc = + (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP + : AArch64::SP; + if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { + Register CalleeSaveBase = AArch64::FP; + if (int64_t CalleeSaveBaseOffset = + AFI->getCalleeSaveBaseToFrameRecordOffset()) { + // If we have a non-zero offset to the non-SVE CS base we need to + // compute the base address by subtracting the offset in a temporary + // register first (to avoid briefly deallocating the SVE CS). 
+ CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, + StackOffset::getFixed(-CalleeSaveBaseOffset), TII, + MachineInstr::FrameDestroy); + } + // The code below will deallocate the stack space by moving the + // SP to the start of the SVE callee-save area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, + StackOffset::getScalable(-SVECalleeSavedSize), TII, + MachineInstr::FrameDestroy); + } else if (BaseForSVEDealloc == AArch64::SP) { + if (SVECalleeSavedSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. + emitFrameOffset( + MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !AFL.hasFP(MF), + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + NumBytes = 0; + } + + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !AFL.hasFP(MF), + SVEStackSize + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + DeallocateAfter, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !AFL.hasFP(MF), + DeallocateAfter + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + } + if (EmitCFI) + emitCalleeSavedSVERestores(RestoreEnd); + } + + if (!AFL.hasFP(MF)) { + bool RedZone = AFL.canUseRedZone(MF); + // If this was a redzone leaf function, we don't need to restore the + // stack pointer (but we may need to pop stack args for fastcc). + if (RedZone && AfterCSRPopSize == 0) + return; + + // Pop the local variables off the stack. 
If there are no callee-saved + // registers, it means we are actually positioned at the terminator and can + // combine stack increment for the locals and the stack increment for + // callee-popped arguments into (possibly) a single instruction and be done. + bool NoCalleeSaveRestore = PrologueSaveSize == 0; + int64_t StackRestoreBytes = RedZone ? 0 : NumBytes; + if (NoCalleeSaveRestore) + StackRestoreBytes += AfterCSRPopSize; + + emitFrameOffset( + MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(StackRestoreBytes), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI, + StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize)); + + // If we were able to combine the local stack pop with the argument pop, + // then we're done. + if (NoCalleeSaveRestore || AfterCSRPopSize == 0) + return; + + NumBytes = 0; + } + + // Restore the original stack pointer. + // FIXME: Rather than doing the math here, we should instead just use + // non-post-indexed loads for the restores if we aren't actually going to + // be able to save any instructions. + if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { + emitFrameOffset( + MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::FP, + StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()), + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + } else if (NumBytes) + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(NumBytes), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + + // When we are about to restore the CSRs, the CFA register is SP again. 
+ if (EmitCFI && AFL.hasFP(MF)) + CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy) + .buildDefCFA(AArch64::SP, PrologueSaveSize); + + // This must be placed after the callee-save restore code because that code + // assumes the SP is at the same location as it was after the callee-save save + // code in the prologue. + if (AfterCSRPopSize) { + assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an " + "interrupt may have clobbered"); + + emitFrameOffset( + MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI, + StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0)); + } +} + +void AArch64EpilogueEmitter::emitSwiftAsyncContextFramePointer( + MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const { + switch (MF.getTarget().Options.SwiftAsyncFramePointer) { + case SwiftAsyncFramePointerMode::DeploymentBased: + // Avoid the reload as it is GOT relative, and instead fall back to the + // hardcoded value below. This allows a mismatch between the OS and + // application without immediately terminating on the difference. + [[fallthrough]]; + case SwiftAsyncFramePointerMode::Always: + // We need to reset FP to its untagged state on return. Bit 60 is + // currently used to show the presence of an extended frame. + + // BIC x29, x29, #0x1000_0000_0000_0000 + BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri), + AArch64::FP) + .addUse(AArch64::FP) + .addImm(0x10fe) + .setMIFlag(MachineInstr::FrameDestroy); + if (NeedsWinCFI) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlags(MachineInstr::FrameDestroy); + HasWinCFI = true; + } + break; + + case SwiftAsyncFramePointerMode::Never: + break; + } +} + +void AArch64EpilogueEmitter::emitShadowCallStackEpilogue( + MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const { + // Shadow call stack epilog: ldr x30, [x18, #-8]! 
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXpre)) + .addReg(AArch64::X18, RegState::Define) + .addReg(AArch64::LR, RegState::Define) + .addReg(AArch64::X18) + .addImm(-8) + .setMIFlag(MachineInstr::FrameDestroy); + + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameDestroy); + + if (AFI->needsAsyncDwarfUnwindInfo(MF)) + CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) + .buildRestore(AArch64::X18); +} + +void AArch64EpilogueEmitter::emitCalleeSavedRestores( + MachineBasicBlock::iterator MBBI, bool SVE) const { + const std::vector &CSI = MFI.getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy); + + for (const auto &Info : CSI) { + if (SVE != + (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) + continue; + + MCRegister Reg = Info.getReg(); + if (SVE && + !static_cast(TRI).regNeedsCFI(Reg, Reg)) + continue; + + CFIBuilder.buildRestore(Info.getReg()); + } +} + +void AArch64EpilogueEmitter::finalizeEpilogue() const { + if (AFI->needsShadowCallStackPrologueEpilogue(MF)) { + emitShadowCallStackEpilogue(MBB.getFirstTerminator(), DL); + HasWinCFI |= NeedsWinCFI; + } + if (EmitCFI) + emitCalleeSavedGPRRestores(MBB.getFirstTerminator()); + if (AFI->shouldSignReturnAddress(MF)) { + // If pac-ret+leaf is in effect, PAUTH_EPILOGUE pseudo instructions + // are inserted by emitPacRetPlusLeafHardening(). 
+ if (!AFL.shouldSignReturnAddressEverywhere(MF)) { + BuildMI(MBB, MBB.getFirstTerminator(), DL, + TII->get(AArch64::PAUTH_EPILOGUE)) + .setMIFlag(MachineInstr::FrameDestroy); + } + // AArch64PointerAuth pass will insert SEH_PACSignLR + HasWinCFI |= NeedsWinCFI; + } + if (HasWinCFI) { + BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) + .setMIFlag(MachineInstr::FrameDestroy); + if (!MF.hasWinCFI()) + MF.setHasWinCFI(true); + } + if (NeedsWinCFI) { + assert(SEHEpilogueStartI != MBB.end()); + if (!HasWinCFI) + MBB.erase(SEHEpilogueStartI); + } +} + } // namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index 94029ede60c76..20bbffcdb33f2 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -7,8 +7,9 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file contains the declaration of the AArch64PrologueEmitter class, -/// which is is used to emit the prologue on AArch64. +/// This file contains the declaration of the AArch64PrologueEmitter and +/// AArch64EpilogueEmitter classes, which are used to emit the prologue and +/// epilogue on AArch64. /// //===----------------------------------------------------------------------===// @@ -106,6 +107,63 @@ class AArch64PrologueEmitter { AArch64FunctionInfo *AFI = nullptr; }; +/// A helper class for emitting the epilogue. Substantial new functionality +/// should be factored into a new method. Where possible "emit*" methods should +/// be const, and any flags that change how the epilogue is emitted should be +/// set in the constructor. +class AArch64EpilogueEmitter {
+public: + AArch64EpilogueEmitter(MachineFunction &MF, MachineBasicBlock &MBB, + const AArch64FrameLowering &AFL); + + /// Emit the epilogue. 
+ void emitEpilogue(); + + ~AArch64EpilogueEmitter() { finalizeEpilogue(); } + +private: + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const; + + void emitShadowCallStackEpilogue(MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const; + + void emitCalleeSavedRestores(MachineBasicBlock::iterator MBBI, + bool SVE) const; + + void emitCalleeSavedGPRRestores(MachineBasicBlock::iterator MBBI) const { + emitCalleeSavedRestores(MBBI, /*SVE=*/false); + } + + void emitCalleeSavedSVERestores(MachineBasicBlock::iterator MBBI) const { + emitCalleeSavedRestores(MBBI, /*SVE=*/true); + } + + void finalizeEpilogue() const; + + MachineFunction &MF; + MachineBasicBlock &MBB; + + const MachineFrameInfo &MFI; + const AArch64Subtarget &Subtarget; + const AArch64FrameLowering &AFL; + + // Epilogue flags. These generally should not change outside of the + // constructor (or early in emitEpilogue). + bool NeedsWinCFI = false; + bool EmitCFI = false; + bool IsFunclet = false; + + // Note: "HasWinCFI" is mutable as it can change in any "emit" function. 
+ mutable bool HasWinCFI = false; + + const TargetInstrInfo *TII = nullptr; + AArch64FunctionInfo *AFI = nullptr; + + DebugLoc DL; + MachineBasicBlock::iterator SEHEpilogueStartI; +}; + } // namespace llvm #endif diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 77dfab83a834a..2b0c8ad0578bc 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -610,8 +610,7 @@ bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF, } const TargetRegisterClass * -AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { +AArch64RegisterInfo::getPointerRegClass(unsigned Kind) const { return &AArch64::GPR64spRegClass; } @@ -893,7 +892,7 @@ AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); - MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); + MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this)); unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); BuildMI(*MBB, Ins, DL, MCID, BaseReg) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 1ed8e959fdd2d..72a7676241770 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -102,8 +102,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; + getPointerRegClass(unsigned Kind = 0) const override; const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 601dc34d74b9c..e552afee0d8cf 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -48,6 +48,17 @@ let usesCustomInserter = 1 in { } def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>; +// Pseudo-instruction that compares the current SVE vector length (VL) with the +// streaming vector length (SVL). If the two lengths do not match, the check +// lowers to a `brk`, causing a trap. +let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in +def CHECK_MATCHING_VL_PSEUDO : Pseudo<(outs), (ins), []>, Sched<[]>; + +def AArch64_check_matching_vl + : SDNode<"AArch64ISD::CHECK_MATCHING_VL", SDTypeProfile<0, 0,[]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def : Pat<(AArch64_check_matching_vl), (CHECK_MATCHING_VL_PSEUDO)>; + //===----------------------------------------------------------------------===// // Old SME ABI lowering ISD nodes/pseudos (deprecated) //===----------------------------------------------------------------------===// @@ -134,11 +145,16 @@ def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>; def SDT_AArch64RDSVL : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>; def AArch64rdsvl : SDNode<"AArch64ISD::RDSVL", SDT_AArch64RDSVL>; +def sme_rdsvl_shl_imm : ComplexPattern">; + let Predicates = [HasSMEandIsNonStreamingSafe] in { def RDSVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdsvl", /*streaming_sve=*/0b1>; def ADDSPL_XXI : sve_int_arith_vl<0b1, "addspl", /*streaming_sve=*/0b1>; def ADDSVL_XXI : sve_int_arith_vl<0b0, "addsvl", /*streaming_sve=*/0b1>; +def : Pat<(i64 (shl (AArch64rdsvl (i32 1)), (sme_rdsvl_shl_imm i64:$imm))), + (RDSVLI_XI (!cast("trunc_imm") $imm))>; + def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td 
index 7604ffdc9f646..7fe4f7acdbd49 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4238,6 +4238,13 @@ defm UDOT_ZZZ_HtoS : sve2p1_two_way_dot_vv<"udot", 0b1, int_aarch64_sve_udot_x2 defm SDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"sdot", 0b0, int_aarch64_sve_sdot_lane_x2>; defm UDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"udot", 0b1, int_aarch64_sve_udot_lane_x2>; +let Predicates = [HasSVE2p1_or_SME2] in { + def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), + (UDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>; + def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), + (SDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>; +} // End HasSVE2p1_or_SME2 + defm SQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtn", 0b00, int_aarch64_sve_sqcvtn_x2>; defm UQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"uqcvtn", 0b01, int_aarch64_sve_uqcvtn_x2>; defm SQCVTUN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtun", 0b10, int_aarch64_sve_sqcvtun_x2>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td index 524fa33f498bb..50142afccd48d 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td @@ -15,7 +15,9 @@ //===----------------------------------------------------------------------===// def NeoverseN1Model : SchedMachineModel { - let IssueWidth = 8; // Maximum micro-ops dispatch rate. + let IssueWidth = 3; // This value comes from the decode bandwidth + // and empirical measurements showed that this + // value is better. let MicroOpBufferSize = 128; // NOTE: Copied from Cortex-A76. let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 11; // Cycles cost of branch mispredicted. 
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td index e44d40f8d7020..cd0d8a9186d5b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// def NeoverseN3Model : SchedMachineModel { - let IssueWidth = 10; // Micro-ops dispatched at a time. + let IssueWidth = 5; // Micro-ops dispatched at a time. let MicroOpBufferSize = 160; // Entries in micro-op re-order buffer. NOTE: Copied from N2. let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2. diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index 44625a2034d9d..f28df44bfdb38 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -19,7 +19,9 @@ //===----------------------------------------------------------------------===// def NeoverseV1Model : SchedMachineModel { - let IssueWidth = 15; // Maximum micro-ops dispatch rate. + let IssueWidth = 8; // This value comes from the decode bandwidth + // and empirical measurements showed that this + // value is better. let MicroOpBufferSize = 256; // Micro-op re-order buffer. let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 11; // Cycles cost of branch mispredicted. 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 4650b2d0c8151..5b80b08375f8c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -295,27 +295,6 @@ static std::unique_ptr createTLOF(const Triple &TT) { return std::make_unique(); } -// Helper function to build a DataLayout string -static std::string computeDataLayout(const Triple &TT, - const MCTargetOptions &Options, - bool LittleEndian) { - if (TT.isOSBinFormatMachO()) { - if (TT.getArch() == Triple::aarch64_32) - return "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-" - "n32:64-S128-Fn32"; - return "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-" - "Fn32"; - } - if (TT.isOSBinFormatCOFF()) - return "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:" - "128-n32:64-S128-Fn32"; - std::string Endian = LittleEndian ? "e" : "E"; - std::string Ptr32 = TT.getEnvironment() == Triple::GNUILP32 ? 
"-p:32:32" : ""; - return Endian + "-m:e" + Ptr32 + - "-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-" - "n32:64-S128-Fn32"; -} - static StringRef computeDefaultCPU(const Triple &TT, StringRef CPU) { if (CPU.empty() && TT.isArm64e()) return "apple-a12"; @@ -368,11 +347,10 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, std::optional CM, CodeGenOptLevel OL, bool JIT, bool LittleEndian) - : CodeGenTargetMachineImpl( - T, computeDataLayout(TT, Options.MCOptions, LittleEndian), TT, - computeDefaultCPU(TT, CPU), FS, Options, - getEffectiveRelocModel(TT, RM), - getEffectiveAArch64CodeModel(TT, CM, JIT), OL), + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, + computeDefaultCPU(TT, CPU), FS, Options, + getEffectiveRelocModel(TT, RM), + getEffectiveAArch64CodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian), UseNewSMEABILowering(EnableNewSMEABILowering) { initAsmInfo(); @@ -716,12 +694,6 @@ bool AArch64PassConfig::addPreISel() { // is disabled as we emit the .subsections_via_symbols directive which // means that merging extern globals is not safe. bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO(); - - // FIXME: extern global merging is only enabled when we optimise for size - // because there are some regressions with it also enabled for performance. 
- if (!OnlyOptimizeForSize) - MergeExternalByDefault = false; - addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize, MergeExternalByDefault)); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 92321a76dbd80..8c4b4f6e4d6de 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2103,15 +2103,15 @@ instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { } static std::optional -instCombineSMECntsElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts, - const AArch64Subtarget *ST) { +instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, + const AArch64Subtarget *ST) { if (!ST->isStreaming()) return std::nullopt; - // In streaming-mode, aarch64_sme_cnts is equivalent to aarch64_sve_cnt + // In streaming-mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd // with SVEPredPattern::all - Value *Cnt = IC.Builder.CreateElementCount( - II.getType(), ElementCount::getScalable(NumElts)); + Value *Cnt = + IC.Builder.CreateElementCount(II.getType(), ElementCount::getScalable(2)); Cnt->takeName(&II); return IC.replaceInstUsesWith(II, Cnt); } @@ -2747,6 +2747,15 @@ static std::optional instCombineDMB(InstCombiner &IC, return std::nullopt; } +static std::optional instCombineWhilelo(InstCombiner &IC, + IntrinsicInst &II) { + return IC.replaceInstUsesWith( + II, + IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {II.getType(), II.getOperand(0)->getType()}, + {II.getOperand(0), II.getOperand(1)})); +} + static std::optional instCombinePTrue(InstCombiner &IC, IntrinsicInst &II) { if (match(II.getOperand(0), m_ConstantInt())) @@ -2826,13 +2835,7 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, case Intrinsic::aarch64_sve_cntb: return instCombineSVECntElts(IC, II, 16); case Intrinsic::aarch64_sme_cntsd: - return instCombineSMECntsElts(IC, II, 2, ST); - case 
Intrinsic::aarch64_sme_cntsw: - return instCombineSMECntsElts(IC, II, 4, ST); - case Intrinsic::aarch64_sme_cntsh: - return instCombineSMECntsElts(IC, II, 8, ST); - case Intrinsic::aarch64_sme_cntsb: - return instCombineSMECntsElts(IC, II, 16, ST); + return instCombineSMECntsd(IC, II, ST); case Intrinsic::aarch64_sve_ptest_any: case Intrinsic::aarch64_sve_ptest_first: case Intrinsic::aarch64_sve_ptest_last: @@ -2889,6 +2892,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVEDupqLane(IC, II); case Intrinsic::aarch64_sve_insr: return instCombineSVEInsr(IC, II); + case Intrinsic::aarch64_sve_whilelo: + return instCombineWhilelo(IC, II); case Intrinsic::aarch64_sve_ptrue: return instCombinePTrue(IC, II); case Intrinsic::aarch64_sve_uxtb: diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index ff09b375c3108..ea2196a584127 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1348,6 +1348,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampMaxNumElements(1, s64, 2) .clampMaxNumElements(1, s32, 4) .clampMaxNumElements(1, s16, 8) + .scalarize(1) .lower(); getActionDefinitionsBuilder(G_VECREDUCE_MUL) diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index c39a5cc2fcb16..cced0faa28889 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -110,6 +110,71 @@ struct PhysRegSave { Register X0Save = AArch64::NoRegister; }; +/// Contains the needed ZA state (and live registers) at an instruction. That is +/// the state ZA must be in _before_ "InsertPt". 
+struct InstInfo { + ZAState NeededState{ZAState::ANY}; + MachineBasicBlock::iterator InsertPt; + LiveRegs PhysLiveRegs = LiveRegs::None; +}; + +/// Contains the needed ZA state for each instruction in a block. Instructions +/// that do not require a ZA state are not recorded. +struct BlockInfo { + ZAState FixedEntryState{ZAState::ANY}; + SmallVector Insts; + LiveRegs PhysLiveRegsAtEntry = LiveRegs::None; + LiveRegs PhysLiveRegsAtExit = LiveRegs::None; +}; + +/// Contains the needed ZA state information for all blocks within a function. +struct FunctionInfo { + SmallVector Blocks; + std::optional AfterSMEProloguePt; + LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None; +}; + +/// State/helpers that is only needed when emitting code to handle +/// saving/restoring ZA. +class EmitContext { +public: + EmitContext() = default; + + /// Get or create a TPIDR2 block in \p MF. + int getTPIDR2Block(MachineFunction &MF) { + if (TPIDR2BlockFI) + return *TPIDR2BlockFI; + MachineFrameInfo &MFI = MF.getFrameInfo(); + TPIDR2BlockFI = MFI.CreateStackObject(16, Align(16), false); + return *TPIDR2BlockFI; + } + + /// Get or create agnostic ZA buffer pointer in \p MF. + Register getAgnosticZABufferPtr(MachineFunction &MF) { + if (AgnosticZABufferPtr != AArch64::NoRegister) + return AgnosticZABufferPtr; + Register BufferPtr = + MF.getInfo()->getEarlyAllocSMESaveBuffer(); + AgnosticZABufferPtr = + BufferPtr != AArch64::NoRegister + ? BufferPtr + : MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + return AgnosticZABufferPtr; + } + + /// Returns true if the function must allocate a ZA save buffer on entry. This + /// will be the case if, at any point in the function, a ZA save was emitted. 
+ bool needsSaveBuffer() const { + assert(!(TPIDR2BlockFI && AgnosticZABufferPtr) && + "Cannot have both a TPIDR2 block and agnostic ZA buffer"); + return TPIDR2BlockFI || AgnosticZABufferPtr != AArch64::NoRegister; + } + +private: + std::optional TPIDR2BlockFI; + Register AgnosticZABufferPtr = AArch64::NoRegister; +}; + static bool isLegalEdgeBundleZAState(ZAState State) { switch (State) { case ZAState::ACTIVE: @@ -119,9 +184,6 @@ static bool isLegalEdgeBundleZAState(ZAState State) { return false; } } -struct TPIDR2State { - int FrameIndex = -1; -}; StringRef getZAStateString(ZAState State) { #define MAKE_CASE(V) \ @@ -192,25 +254,28 @@ struct MachineSMEABI : public MachineFunctionPass { /// Collects the needed ZA state (and live registers) before each instruction /// within the machine function. - void collectNeededZAStates(SMEAttrs); + FunctionInfo collectNeededZAStates(SMEAttrs SMEFnAttrs); /// Assigns each edge bundle a ZA state based on the needed states of blocks /// that have incoming or outgoing edges in that bundle. - void assignBundleZAStates(); + SmallVector assignBundleZAStates(const EdgeBundles &Bundles, + const FunctionInfo &FnInfo); /// Inserts code to handle changes between ZA states within the function. /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA. - void insertStateChanges(); + void insertStateChanges(EmitContext &, const FunctionInfo &FnInfo, + const EdgeBundles &Bundles, + ArrayRef BundleStates); // Emission routines for private and shared ZA functions (using lazy saves). 
void emitNewZAPrologue(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - void emitRestoreLazySave(MachineBasicBlock &MBB, + void emitRestoreLazySave(EmitContext &, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs); - void emitSetupLazySave(MachineBasicBlock &MBB, + void emitSetupLazySave(EmitContext &, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - void emitAllocateLazySaveBuffer(MachineBasicBlock &MBB, + void emitAllocateLazySaveBuffer(EmitContext &, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool ClearTPIDR2); @@ -222,78 +287,49 @@ struct MachineSMEABI : public MachineFunctionPass { // Emit a "full" ZA save or restore. It is "full" in the sense that this // function will emit a call to __arm_sme_save or __arm_sme_restore, which // handles saving and restoring both ZA and ZT0. - void emitFullZASaveRestore(MachineBasicBlock &MBB, + void emitFullZASaveRestore(EmitContext &, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs, bool IsSave); - void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB, + void emitAllocateFullZASaveBuffer(EmitContext &, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs); - void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - ZAState From, ZAState To, LiveRegs PhysLiveRegs); + void emitStateChange(EmitContext &, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, ZAState From, + ZAState To, LiveRegs PhysLiveRegs); // Helpers for switching between lazy/full ZA save/restore routines. 
- void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - LiveRegs PhysLiveRegs) { + void emitZASave(EmitContext &Context, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) { if (AFI->getSMEFnAttrs().hasAgnosticZAInterface()) - return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true); - return emitSetupLazySave(MBB, MBBI); + return emitFullZASaveRestore(Context, MBB, MBBI, PhysLiveRegs, + /*IsSave=*/true); + return emitSetupLazySave(Context, MBB, MBBI); } - void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - LiveRegs PhysLiveRegs) { + void emitZARestore(EmitContext &Context, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) { if (AFI->getSMEFnAttrs().hasAgnosticZAInterface()) - return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false); - return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs); + return emitFullZASaveRestore(Context, MBB, MBBI, PhysLiveRegs, + /*IsSave=*/false); + return emitRestoreLazySave(Context, MBB, MBBI, PhysLiveRegs); } - void emitAllocateZASaveBuffer(MachineBasicBlock &MBB, + void emitAllocateZASaveBuffer(EmitContext &Context, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) { if (AFI->getSMEFnAttrs().hasAgnosticZAInterface()) - return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs); - return emitAllocateLazySaveBuffer(MBB, MBBI); + return emitAllocateFullZASaveBuffer(Context, MBB, MBBI, PhysLiveRegs); + return emitAllocateLazySaveBuffer(Context, MBB, MBBI); } /// Save live physical registers to virtual registers. PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL); /// Restore physical registers from a save of their previous values. 
- void restorePhyRegSave(PhysRegSave const &RegSave, MachineBasicBlock &MBB, + void restorePhyRegSave(const PhysRegSave &RegSave, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL); - /// Get or create a TPIDR2 block in this function. - TPIDR2State getTPIDR2Block(); - - Register getAgnosticZABufferPtr(); - private: - /// Contains the needed ZA state (and live registers) at an instruction. - struct InstInfo { - ZAState NeededState{ZAState::ANY}; - MachineBasicBlock::iterator InsertPt; - LiveRegs PhysLiveRegs = LiveRegs::None; - }; - - /// Contains the needed ZA state for each instruction in a block. - /// Instructions that do not require a ZA state are not recorded. - struct BlockInfo { - ZAState FixedEntryState{ZAState::ANY}; - SmallVector Insts; - LiveRegs PhysLiveRegsAtEntry = LiveRegs::None; - LiveRegs PhysLiveRegsAtExit = LiveRegs::None; - }; - - // All pass state that must be cleared between functions. - struct PassState { - SmallVector Blocks; - SmallVector BundleStates; - std::optional TPIDR2Block; - std::optional AfterSMEProloguePt; - Register AgnosticZABufferPtr = AArch64::NoRegister; - LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None; - } State; - MachineFunction *MF = nullptr; - EdgeBundles *Bundles = nullptr; const AArch64Subtarget *Subtarget = nullptr; const AArch64RegisterInfo *TRI = nullptr; const AArch64FunctionInfo *AFI = nullptr; @@ -301,14 +337,18 @@ struct MachineSMEABI : public MachineFunctionPass { MachineRegisterInfo *MRI = nullptr; }; -void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { +FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) && "Expected function to have ZA/ZT0 state!"); - State.Blocks.resize(MF->getNumBlockIDs()); + SmallVector Blocks(MF->getNumBlockIDs()); + LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None; + std::optional AfterSMEProloguePt; + for 
(MachineBasicBlock &MBB : *MF) { - BlockInfo &Block = State.Blocks[MBB.getNumber()]; + BlockInfo &Block = Blocks[MBB.getNumber()]; + if (MBB.isEntryBlock()) { // Entry block: Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface() @@ -347,8 +387,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { // allocation -- which is a safe point for this pass to insert any TPIDR2 // block setup. if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) { - State.AfterSMEProloguePt = MBBI; - State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs; + AfterSMEProloguePt = MBBI; + PhysLiveRegsAfterSMEPrologue = PhysLiveRegs; } // Note: We treat Agnostic ZA as inout_za with an alternate save/restore. auto [NeededState, InsertPt] = getZAStateBeforeInst( @@ -368,11 +408,18 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { // Reverse vector (as we had to iterate backwards for liveness). std::reverse(Block.Insts.begin(), Block.Insts.end()); } + + return FunctionInfo{std::move(Blocks), AfterSMEProloguePt, + PhysLiveRegsAfterSMEPrologue}; } -void MachineSMEABI::assignBundleZAStates() { - State.BundleStates.resize(Bundles->getNumBundles()); - for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) { +/// Assigns each edge bundle a ZA state based on the needed states of blocks +/// that have incoming or outgoing edges in that bundle. +SmallVector +MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles, + const FunctionInfo &FnInfo) { + SmallVector BundleStates(Bundles.getNumBundles()); + for (unsigned I = 0, E = Bundles.getNumBundles(); I != E; ++I) { LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n'); // Attempt to assign a ZA state for this bundle that minimizes state @@ -381,16 +428,16 @@ void MachineSMEABI::assignBundleZAStates() { // TODO: We should propagate desired incoming/outgoing states through blocks // that have the "ANY" state first to make better global decisions. 
int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0}; - for (unsigned BlockID : Bundles->getBlocks(I)) { + for (unsigned BlockID : Bundles.getBlocks(I)) { LLVM_DEBUG(dbgs() << "- bb." << BlockID); - const BlockInfo &Block = State.Blocks[BlockID]; + const BlockInfo &Block = FnInfo.Blocks[BlockID]; if (Block.Insts.empty()) { LLVM_DEBUG(dbgs() << " (no state preference)\n"); continue; } - bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I; - bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I; + bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I; + bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I; ZAState DesiredIncomingState = Block.Insts.front().NeededState; if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) { @@ -423,15 +470,20 @@ void MachineSMEABI::assignBundleZAStates() { dbgs() << "\n\n"; }); - State.BundleStates[I] = BundleState; + BundleStates[I] = BundleState; } + + return BundleStates; } -void MachineSMEABI::insertStateChanges() { +void MachineSMEABI::insertStateChanges(EmitContext &Context, + const FunctionInfo &FnInfo, + const EdgeBundles &Bundles, + ArrayRef BundleStates) { for (MachineBasicBlock &MBB : *MF) { - const BlockInfo &Block = State.Blocks[MBB.getNumber()]; - ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(), - /*Out=*/false)]; + const BlockInfo &Block = FnInfo.Blocks[MBB.getNumber()]; + ZAState InState = BundleStates[Bundles.getBundle(MBB.getNumber(), + /*Out=*/false)]; ZAState CurrentState = Block.FixedEntryState; if (CurrentState == ZAState::ANY) @@ -439,8 +491,8 @@ void MachineSMEABI::insertStateChanges() { for (auto &Inst : Block.Insts) { if (CurrentState != Inst.NeededState) - emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState, - Inst.PhysLiveRegs); + emitStateChange(Context, MBB, Inst.InsertPt, CurrentState, + Inst.NeededState, Inst.PhysLiveRegs); CurrentState = Inst.NeededState; } @@ -448,21 +500,13 @@ void MachineSMEABI::insertStateChanges() { 
continue; ZAState OutState = - State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)]; + BundleStates[Bundles.getBundle(MBB.getNumber(), /*Out=*/true)]; if (CurrentState != OutState) - emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState, - Block.PhysLiveRegsAtExit); + emitStateChange(Context, MBB, MBB.getFirstTerminator(), CurrentState, + OutState, Block.PhysLiveRegsAtExit); } } -TPIDR2State MachineSMEABI::getTPIDR2Block() { - if (State.TPIDR2Block) - return *State.TPIDR2Block; - MachineFrameInfo &MFI = MF->getFrameInfo(); - State.TPIDR2Block = TPIDR2State{MFI.CreateStackObject(16, Align(16), false)}; - return *State.TPIDR2Block; -} - static DebugLoc getDebugLoc(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { if (MBBI != MBB.end()) @@ -470,7 +514,8 @@ static DebugLoc getDebugLoc(MachineBasicBlock &MBB, return DebugLoc(); } -void MachineSMEABI::emitSetupLazySave(MachineBasicBlock &MBB, +void MachineSMEABI::emitSetupLazySave(EmitContext &Context, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { DebugLoc DL = getDebugLoc(MBB, MBBI); @@ -478,7 +523,7 @@ void MachineSMEABI::emitSetupLazySave(MachineBasicBlock &MBB, Register TPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64spRegClass); Register TPIDR2Ptr = MRI->createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2) - .addFrameIndex(getTPIDR2Block().FrameIndex) + .addFrameIndex(Context.getTPIDR2Block(*MF)) .addImm(0) .addImm(0); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), TPIDR2Ptr) @@ -512,7 +557,7 @@ PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs, return RegSave; } -void MachineSMEABI::restorePhyRegSave(PhysRegSave const &RegSave, +void MachineSMEABI::restorePhyRegSave(const PhysRegSave &RegSave, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) { @@ -528,7 +573,8 @@ void MachineSMEABI::restorePhyRegSave(PhysRegSave const &RegSave, .addReg(RegSave.X0Save); } 
-void MachineSMEABI::emitRestoreLazySave(MachineBasicBlock &MBB, +void MachineSMEABI::emitRestoreLazySave(EmitContext &Context, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) { auto *TLI = Subtarget->getTargetLowering(); @@ -548,7 +594,7 @@ void MachineSMEABI::emitRestoreLazySave(MachineBasicBlock &MBB, .addImm(AArch64SysReg::TPIDR2_EL0); // Get pointer to TPIDR2 block. BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2) - .addFrameIndex(getTPIDR2Block().FrameIndex) + .addFrameIndex(Context.getTPIDR2Block(*MF)) .addImm(0) .addImm(0); // (Conditionally) restore ZA state. @@ -582,7 +628,8 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, } void MachineSMEABI::emitAllocateLazySaveBuffer( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { + EmitContext &Context, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL = getDebugLoc(MBB, MBBI); Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass); @@ -630,7 +677,7 @@ void MachineSMEABI::emitAllocateLazySaveBuffer( BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi)) .addReg(Buffer) .addReg(SVL) - .addFrameIndex(getTPIDR2Block().FrameIndex) + .addFrameIndex(Context.getTPIDR2Block(*MF)) .addImm(0); } } @@ -662,18 +709,8 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB, .addImm(1); } -Register MachineSMEABI::getAgnosticZABufferPtr() { - if (State.AgnosticZABufferPtr != AArch64::NoRegister) - return State.AgnosticZABufferPtr; - Register BufferPtr = AFI->getEarlyAllocSMESaveBuffer(); - State.AgnosticZABufferPtr = - BufferPtr != AArch64::NoRegister - ? 
BufferPtr - : MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); - return State.AgnosticZABufferPtr; -} - -void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB, +void MachineSMEABI::emitFullZASaveRestore(EmitContext &Context, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs, bool IsSave) { auto *TLI = Subtarget->getTargetLowering(); @@ -684,7 +721,7 @@ void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB, // Copy the buffer pointer into X0. BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr) - .addReg(getAgnosticZABufferPtr()); + .addReg(Context.getAgnosticZABufferPtr(*MF)); // Call __arm_sme_save/__arm_sme_restore. BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) @@ -699,14 +736,14 @@ void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB, } void MachineSMEABI::emitAllocateFullZASaveBuffer( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - LiveRegs PhysLiveRegs) { + EmitContext &Context, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) { // Buffer already allocated in SelectionDAG. if (AFI->getEarlyAllocSMESaveBuffer()) return; DebugLoc DL = getDebugLoc(MBB, MBBI); - Register BufferPtr = getAgnosticZABufferPtr(); + Register BufferPtr = Context.getAgnosticZABufferPtr(*MF); Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass); PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL); @@ -742,11 +779,11 @@ void MachineSMEABI::emitAllocateFullZASaveBuffer( restorePhyRegSave(RegSave, MBB, MBBI, DL); } -void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB, +void MachineSMEABI::emitStateChange(EmitContext &Context, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, ZAState From, ZAState To, LiveRegs PhysLiveRegs) { - // ZA not used. 
if (From == ZAState::ANY || To == ZAState::ANY) return; @@ -774,9 +811,9 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB, } if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED) - emitZASave(MBB, InsertPt, PhysLiveRegs); + emitZASave(Context, MBB, InsertPt, PhysLiveRegs); else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE) - emitZARestore(MBB, InsertPt, PhysLiveRegs); + emitZARestore(Context, MBB, InsertPt, PhysLiveRegs); else if (To == ZAState::OFF) { assert(From != ZAState::CALLER_DORMANT && "CALLER_DORMANT to OFF should have already been handled"); @@ -807,32 +844,33 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!"); - // Reset pass state. - State = PassState{}; this->MF = &MF; - Bundles = &getAnalysis().getEdgeBundles(); Subtarget = &MF.getSubtarget(); TII = Subtarget->getInstrInfo(); TRI = Subtarget->getRegisterInfo(); MRI = &MF.getRegInfo(); - collectNeededZAStates(SMEFnAttrs); - assignBundleZAStates(); - insertStateChanges(); + const EdgeBundles &Bundles = + getAnalysis().getEdgeBundles(); + + FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs); + SmallVector BundleStates = assignBundleZAStates(Bundles, FnInfo); + + EmitContext Context; + insertStateChanges(Context, FnInfo, Bundles, BundleStates); - // Allocate save buffer (if needed). - if (State.AgnosticZABufferPtr != AArch64::NoRegister || State.TPIDR2Block) { - if (State.AfterSMEProloguePt) { + if (Context.needsSaveBuffer()) { + if (FnInfo.AfterSMEProloguePt) { // Note: With inline stack probes the AfterSMEProloguePt may not be in the // entry block (due to the probing loop). 
- emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(), - *State.AfterSMEProloguePt, - State.PhysLiveRegsAfterSMEPrologue); + MachineBasicBlock::iterator MBBI = *FnInfo.AfterSMEProloguePt; + emitAllocateZASaveBuffer(Context, *MBBI->getParent(), MBBI, + FnInfo.PhysLiveRegsAfterSMEPrologue); } else { MachineBasicBlock &EntryBlock = MF.front(); emitAllocateZASaveBuffer( - EntryBlock, EntryBlock.getFirstNonPHI(), - State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry); + Context, EntryBlock, EntryBlock.getFirstNonPHI(), + FnInfo.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ffbda14dcd849..35e64486184b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -580,6 +580,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16", "Use true 16-bit registers" >; +def FeatureD16Writes32BitVgpr : SubtargetFeature<"d16-write-vgpr32", + "EnableD16Writes32BitVgpr", + "true", + "D16 instructions potentially have 32-bit data dependencies" +>; + def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts", "HasBF16TransInsts", "true", @@ -1928,7 +1934,9 @@ def FeatureISAVersion11_Common : FeatureSet< FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureMemoryAtomicFAddF32DenormalSupport, - FeatureRealTrue16Insts]>; + FeatureRealTrue16Insts, + FeatureD16Writes32BitVgpr, +]>; // There are few workarounds that need to be // added to all targets. This pessimizes codegen @@ -2563,6 +2571,11 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. 
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; +def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">, + AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>; +def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">, + AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not FeatureD16Writes32BitVgpr))>; + def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, AssemblerPredicate<(all_of FeatureBF16TransInsts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index d158f0f58d711..dda8033f47398 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -107,6 +107,14 @@ AMDGPUFunctionArgInfo::getPreloadedValue( case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: return std::tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X: + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y: + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z: + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X: + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y: + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z: + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID: + return std::tuple(nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::LDS_KERNEL_ID: return std::tuple(LDSKernelId ? 
&LDSKernelId : nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index e07d47381ecca..1064e57b9da9e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -111,18 +111,25 @@ struct AMDGPUFunctionArgInfo { DISPATCH_ID = 4, FLAT_SCRATCH_INIT = 5, LDS_KERNEL_ID = 6, // LLVM internal, not part of the ABI - WORKGROUP_ID_X = 10, - WORKGROUP_ID_Y = 11, - WORKGROUP_ID_Z = 12, + WORKGROUP_ID_X = 10, // Also used for cluster ID X. + WORKGROUP_ID_Y = 11, // Also used for cluster ID Y. + WORKGROUP_ID_Z = 12, // Also used for cluster ID Z. PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, IMPLICIT_BUFFER_PTR = 15, IMPLICIT_ARG_PTR = 16, PRIVATE_SEGMENT_SIZE = 17, + CLUSTER_WORKGROUP_ID_X = 21, + CLUSTER_WORKGROUP_ID_Y = 22, + CLUSTER_WORKGROUP_ID_Z = 23, + CLUSTER_WORKGROUP_MAX_ID_X = 24, + CLUSTER_WORKGROUP_MAX_ID_Y = 25, + CLUSTER_WORKGROUP_MAX_ID_Z = 26, + CLUSTER_WORKGROUP_MAX_FLAT_ID = 27, // VGPRS: - WORKITEM_ID_X = 18, - WORKITEM_ID_Y = 19, - WORKITEM_ID_Z = 20, + WORKITEM_ID_X = 28, + WORKITEM_ID_Y = 29, + WORKITEM_ID_Z = 30, FIRST_VGPR_VALUE = WORKITEM_ID_X }; // clang-format on diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def index 8c1c8219690ba..4c9715e4a1737 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def @@ -31,5 +31,8 @@ AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id") AMDGPU_ATTRIBUTE(DEFAULT_QUEUE, "amdgpu-no-default-queue") AMDGPU_ATTRIBUTE(COMPLETION_ACTION, "amdgpu-no-completion-action") AMDGPU_ATTRIBUTE(FLAT_SCRATCH_INIT, "amdgpu-no-flat-scratch-init") +AMDGPU_ATTRIBUTE(CLUSTER_ID_X, "amdgpu-no-cluster-id-x") +AMDGPU_ATTRIBUTE(CLUSTER_ID_Y, "amdgpu-no-cluster-id-y") +AMDGPU_ATTRIBUTE(CLUSTER_ID_Z, "amdgpu-no-cluster-id-z") #undef AMDGPU_ATTRIBUTE diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index f646457f9d76f..2ba31562c4784 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -77,6 +77,13 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: return WORKGROUP_ID_Z; + case Intrinsic::amdgcn_cluster_id_x: + NonKernelOnly = true; + return CLUSTER_ID_X; + case Intrinsic::amdgcn_cluster_id_y: + return CLUSTER_ID_Y; + case Intrinsic::amdgcn_cluster_id_z: + return CLUSTER_ID_Z; case Intrinsic::amdgcn_lds_kernel_id: return LDS_KERNEL_ID; case Intrinsic::amdgcn_dispatch_ptr: @@ -1296,6 +1303,157 @@ struct AAAMDGPUNoAGPR const char AAAMDGPUNoAGPR::ID = 0; +/// An abstract attribute to propagate the function attribute +/// "amdgpu-cluster-dims" from kernel entry functions to device functions. +struct AAAMDGPUClusterDims + : public StateWrapper { + using Base = StateWrapper; + AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + /// Create an abstract attribute view for the position \p IRP. + static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// See AbstractAttribute::getName(). + StringRef getName() const override { return "AAAMDGPUClusterDims"; } + + /// See AbstractAttribute::getIdAddr(). + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is + /// AAAMDGPUClusterDims. 
+ static bool classof(const AbstractAttribute *AA) { + return AA->getIdAddr() == &ID; + } + + virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = 0; + + /// Unique ID (due to the unique address) + static const char ID; +}; + +const char AAAMDGPUClusterDims::ID = 0; + +struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims { + AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A) + : AAAMDGPUClusterDims(IRP, A) {} + + void initialize(Attributor &A) override { + Function *F = getAssociatedFunction(); + assert(F && "empty associated function"); + + Attr = AMDGPU::ClusterDimsAttr::get(*F); + + // No matter what a kernel function has, it is final. + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + if (Attr.isUnknown()) + indicatePessimisticFixpoint(); + else + indicateOptimisticFixpoint(); + } + } + + const std::string getAsStr(Attributor *A) const override { + if (!getAssumed() || Attr.isUnknown()) + return "unknown"; + if (Attr.isNoCluster()) + return "no"; + if (Attr.isVariableDims()) + return "variable"; + return Attr.to_string(); + } + + void trackStatistics() const override {} + + ChangeStatus updateImpl(Attributor &A) override { + auto OldState = Attr; + + auto CheckCallSite = [&](AbstractCallSite CS) { + const auto *CallerAA = A.getAAFor( + *this, IRPosition::function(*CS.getInstruction()->getFunction()), + DepClassTy::REQUIRED); + if (!CallerAA || !CallerAA->isValidState()) + return false; + + return merge(CallerAA->getClusterDims()); + }; + + bool UsedAssumedInformation = false; + if (!A.checkForAllCallSites(CheckCallSite, *this, + /*RequireAllCallSites=*/true, + UsedAssumedInformation)) + return indicatePessimisticFixpoint(); + + return OldState == Attr ? 
ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    if (Attr.isUnknown())
+      return ChangeStatus::UNCHANGED;
+    return A.manifestAttrs(
+        getIRPosition(),
+        {Attribute::get(getAssociatedFunction()->getContext(), AttrName,
+                        Attr.to_string())},
+        /*ForceReplace=*/true);
+  }
+
+  const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
+    return Attr;
+  }
+
+private:
+  bool merge(const AMDGPU::ClusterDimsAttr &Other) {
+    // Case 1: Both of them are still unknown; we do nothing and continue
+    // waiting for propagation.
+    if (Attr.isUnknown() && Other.isUnknown())
+      return true;
+
+    // Case 2: The other is determined, but we are unknown yet, we simply take
+    // the other's value.
+    if (Attr.isUnknown()) {
+      Attr = Other;
+      return true;
+    }
+
+    // Case 3: We are determined but the other is unknown yet, we simply keep
+    // everything unchanged.
+    if (Other.isUnknown())
+      return true;
+
+    // After this point, both are determined.
+
+    // Case 4: If they are the same, we do nothing.
+    if (Attr == Other)
+      return true;
+
+    // Now they are not the same.
+
+    // Case 5: If either of us uses cluster (but not both; otherwise case 4
+    // would hold), then it is unknown whether cluster will be used, and the
+    // state is final, unlike case 1.
+    if (Attr.isNoCluster() || Other.isNoCluster()) {
+      Attr.setUnknown();
+      return false;
+    }
+
+    // Case 6: Both of us use cluster, but the dims are different, so the result
+    // is that cluster is used, but we just don't have fixed dims.
+ Attr.setVariableDims(); + return true; + } + + AMDGPU::ClusterDimsAttr Attr; + + static constexpr const char AttrName[] = "amdgpu-cluster-dims"; +}; + +AAAMDGPUClusterDims & +AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) { + if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION) + return *new (A.Allocator) AAAMDGPUClusterDimsFunction(IRP, A); + llvm_unreachable("AAAMDGPUClusterDims is only valid for function position"); +} + static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, AMDGPUAttributorOptions Options, ThinOrFullLTOPhase LTOPhase) { @@ -1314,7 +1472,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, - &AAIndirectCallInfo::ID}); + &AAIndirectCallInfo::ID, &AAAMDGPUClusterDims::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; @@ -1352,6 +1510,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, A.getOrCreateAAFor(IRPosition::function(*F)); } + const GCNSubtarget &ST = TM.getSubtarget(*F); + if (!F->isDeclaration() && ST.hasClusters()) + A.getOrCreateAAFor(IRPosition::function(*F)); + for (auto &I : instructions(F)) { Value *Ptr = nullptr; if (auto *LI = dyn_cast(&I)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 21255f691e4ad..7afaddea164f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -809,15 +809,15 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, AMDGPUFunctionArgInfo::LDS_KERNEL_ID, }; - static constexpr StringLiteral ImplicitAttrNames[] = { - "amdgpu-no-dispatch-ptr", - "amdgpu-no-queue-ptr", - "amdgpu-no-implicitarg-ptr", - "amdgpu-no-dispatch-id", - 
"amdgpu-no-workgroup-id-x", - "amdgpu-no-workgroup-id-y", - "amdgpu-no-workgroup-id-z", - "amdgpu-no-lds-kernel-id", + static constexpr StringLiteral ImplicitAttrNames[][2] = { + {"amdgpu-no-dispatch-ptr", ""}, + {"amdgpu-no-queue-ptr", ""}, + {"amdgpu-no-implicitarg-ptr", ""}, + {"amdgpu-no-dispatch-id", ""}, + {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}, + {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}, + {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}, + {"amdgpu-no-lds-kernel-id", ""}, }; MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -833,7 +833,9 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, LLT ArgTy; // If the callee does not use the attribute value, skip copying the value. - if (Info.CB->hasFnAttr(ImplicitAttrNames[I++])) + if (all_of(ImplicitAttrNames[I++], [&](StringRef AttrName) { + return AttrName.empty() || Info.CB->hasFnAttr(AttrName); + })) continue; std::tie(OutgoingArg, ArgRC, ArgTy) = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 19b8757e6ad6e..3c88d1b8214f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -254,9 +254,9 @@ void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func, } void MetadataStreamerMsgPackV4::emitKernelAttrs(const AMDGPUTargetMachine &TM, - const Function &Func, + const MachineFunction &MF, msgpack::MapDocNode Kern) { - + const Function &Func = MF.getFunction(); if (auto *Node = Func.getMetadata("reqd_work_group_size")) Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node); if (auto *Node = Func.getMetadata("work_group_size_hint")) @@ -599,7 +599,7 @@ void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF, Kern[".symbol"] = Kern.getDocument()->getNode( (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true); emitKernelLanguage(Func, Kern); - emitKernelAttrs(TM, Func, Kern); 
+ emitKernelAttrs(TM, MF, Kern); emitKernelArgs(MF, Kern); } @@ -726,10 +726,11 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( } void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM, - const Function &Func, + const MachineFunction &MF, msgpack::MapDocNode Kern) { - MetadataStreamerMsgPackV4::emitKernelAttrs(TM, Func, Kern); + MetadataStreamerMsgPackV4::emitKernelAttrs(TM, MF, Kern); + const Function &Func = MF.getFunction(); if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool()) Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1); } @@ -745,5 +746,21 @@ void MetadataStreamerMsgPackV6::emitVersion() { getRootMetadata("amdhsa.version") = Version; } +void MetadataStreamerMsgPackV6::emitKernelAttrs(const AMDGPUTargetMachine &TM, + const MachineFunction &MF, + msgpack::MapDocNode Kern) { + MetadataStreamerMsgPackV5::emitKernelAttrs(TM, MF, Kern); + + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + ClusterDimsAttr Attr = MFI.getClusterDims(); + if (Attr.isFixedDims()) { + msgpack::ArrayDocNode ClusterDimsNode = HSAMetadataDoc->getArrayNode(); + ClusterDimsNode.push_back(HSAMetadataDoc->getNode(Attr.getDims()[0])); + ClusterDimsNode.push_back(HSAMetadataDoc->getNode(Attr.getDims()[1])); + ClusterDimsNode.push_back(HSAMetadataDoc->getNode(Attr.getDims()[2])); + Kern[".cluster_dims"] = ClusterDimsNode; + } +} + } // end namespace AMDGPU::HSAMD } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 22dfcb4a4ec1d..1b4b113fad61c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -61,7 +61,7 @@ class MetadataStreamer { virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) = 0; virtual void emitKernelAttrs(const AMDGPUTargetMachine &TM, - const Function &Func, + const MachineFunction &MF, 
msgpack::MapDocNode Kern) = 0; }; @@ -102,7 +102,7 @@ class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4 void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern); - void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func, + void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF, msgpack::MapDocNode Kern) override; void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern); @@ -149,7 +149,7 @@ class MetadataStreamerMsgPackV5 : public MetadataStreamerMsgPackV4 { void emitVersion() override; void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) override; - void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func, + void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF, msgpack::MapDocNode Kern) override; public: @@ -164,6 +164,9 @@ class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 { public: MetadataStreamerMsgPackV6() = default; ~MetadataStreamerMsgPackV6() = default; + + void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF, + msgpack::MapDocNode Kern) override; }; } // end namespace HSAMD diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a44af5f854c18..f069b591eb315 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5287,6 +5287,30 @@ SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); } +bool AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) const { + if (!Subtarget->isGCN()) + return false; + + ConstantSDNode *SDConstant = dyn_cast(N); + ConstantFPSDNode *SDFPConstant = dyn_cast(N); + auto &ST = DAG.getSubtarget(); + const auto *TII = ST.getInstrInfo(); + + if (!ST.hasMovB64() || (!SDConstant && !SDFPConstant)) + return false; + + if 
(ST.has64BitLiterals()) + return true; + + if (SDConstant) { + const APInt &APVal = SDConstant->getAPIntValue(); + return isUInt<32>(APVal.getZExtValue()) || TII->isInlineConstant(APVal); + } + + APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt(); + return isUInt<32>(Val.getZExtValue()) || TII->isInlineConstant(Val); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -5336,6 +5360,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, SDValue Src = N->getOperand(0); if (ConstantSDNode *C = dyn_cast(Src)) { SDLoc SL(N); + if (isInt64ImmLegal(C, DAG)) + break; uint64_t CVal = C->getZExtValue(); SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, DAG.getConstant(Lo_32(CVal), SL, MVT::i32), @@ -5346,6 +5372,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, if (ConstantFPSDNode *C = dyn_cast(Src)) { const APInt &Val = C->getValueAPF().bitcastToAPInt(); SDLoc SL(N); + if (isInt64ImmLegal(C, DAG)) + break; uint64_t CVal = Val.getZExtValue(); SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, DAG.getConstant(Lo_32(CVal), SL, MVT::i32), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index bdaf48652d107..610f0ebb4caa5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -103,6 +103,9 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; protected: + /// Check whether value Val can be supported by v_mov_b64, for the current + /// target. 
+ bool isInt64ImmLegal(SDNode *Val, SelectionDAG &DAG) const; bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h new file mode 100644 index 0000000000000..df80196d95176 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h @@ -0,0 +1,89 @@ +//===- AMDGPULaneMaskUtils.h - Exec/lane mask helper functions -*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H + +#include "GCNSubtarget.h" +#include "llvm/CodeGen/Register.h" + +namespace llvm { + +class GCNSubtarget; + +namespace AMDGPU { + +class LaneMaskConstants { +public: + const Register ExecReg; + const Register VccReg; + const unsigned AndOpc; + const unsigned AndTermOpc; + const unsigned AndN2Opc; + const unsigned AndN2SaveExecOpc; + const unsigned AndN2TermOpc; + const unsigned AndSaveExecOpc; + const unsigned AndSaveExecTermOpc; + const unsigned BfmOpc; + const unsigned CMovOpc; + const unsigned CSelectOpc; + const unsigned MovOpc; + const unsigned MovTermOpc; + const unsigned OrOpc; + const unsigned OrTermOpc; + const unsigned OrSaveExecOpc; + const unsigned XorOpc; + const unsigned XorTermOpc; + const unsigned WQMOpc; + + constexpr LaneMaskConstants(bool IsWave32) + : ExecReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC), + VccReg(IsWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC), + AndOpc(IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64), + AndTermOpc(IsWave32 ? 
AMDGPU::S_AND_B32_term : AMDGPU::S_AND_B64_term), + AndN2Opc(IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64), + AndN2SaveExecOpc(IsWave32 ? AMDGPU::S_ANDN2_SAVEEXEC_B32 + : AMDGPU::S_ANDN2_SAVEEXEC_B64), + AndN2TermOpc(IsWave32 ? AMDGPU::S_ANDN2_B32_term + : AMDGPU::S_ANDN2_B64_term), + AndSaveExecOpc(IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 + : AMDGPU::S_AND_SAVEEXEC_B64), + AndSaveExecTermOpc(IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32_term + : AMDGPU::S_AND_SAVEEXEC_B64_term), + BfmOpc(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), + CMovOpc(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + CSelectOpc(IsWave32 ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), + MovOpc(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), + MovTermOpc(IsWave32 ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term), + OrOpc(IsWave32 ? AMDGPU::S_OR_B32 : AMDGPU::S_OR_B64), + OrTermOpc(IsWave32 ? AMDGPU::S_OR_B32_term : AMDGPU::S_OR_B64_term), + OrSaveExecOpc(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64), + XorOpc(IsWave32 ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64), + XorTermOpc(IsWave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term), + WQMOpc(IsWave32 ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64) {} + + static inline const LaneMaskConstants &get(const GCNSubtarget &ST); +}; + +static constexpr LaneMaskConstants LaneMaskConstants32 = + LaneMaskConstants(/*IsWave32=*/true); +static constexpr LaneMaskConstants LaneMaskConstants64 = + LaneMaskConstants(/*IsWave32=*/false); + +inline const LaneMaskConstants &LaneMaskConstants::get(const GCNSubtarget &ST) { + unsigned WavefrontSize = ST.getWavefrontSize(); + assert(WavefrontSize == 32 || WavefrontSize == 64); + return WavefrontSize == 32 ? 
LaneMaskConstants32 : LaneMaskConstants64; +} + +} // end namespace AMDGPU + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index f18536cd4ab93..c690b2b7129b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2293,16 +2293,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE || !ST.hasGloballyAddressableScratch()) && "Cannot use src_private_base with globally addressable scratch!"); - // FIXME: It would be more natural to emit a COPY here, but then copy - // coalescing would kick in and it would think it's okay to use the "HI" - // subregister (instead of extracting the HI 32 bits) which is an artificial - // (unusable) register. - // Register TableGen definitions would need an overhaul to get rid of the - // artificial "HI" aperture registers and prevent this kind of issue from - // happening. Register Dst = MRI.createGenericVirtualRegister(S64); MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); - B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); + B.buildCopy({Dst}, {Register(ApertureRegNo)}); return B.buildUnmerge(S32, Dst).getReg(1); } @@ -4452,6 +4445,74 @@ void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg, } } +bool AMDGPULegalizerInfo::legalizeWorkGroupId( + MachineInstr &MI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV, + AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, + AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const { + Register DstReg = MI.getOperand(0).getReg(); + if (!ST.hasClusters()) { + if (!loadInputValue(DstReg, B, WorkGroupIdPV)) + return false; + MI.eraseFromParent(); + return true; + } + + // Clusters are supported. Return the global position in the grid. 
If clusters + // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID. + + // WorkGroupIdXYZ = ClusterId == 0 ? + // ClusterIdXYZ : + // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ + MachineRegisterInfo &MRI = *B.getMRI(); + const LLT S32 = LLT::scalar(32); + Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32); + Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32); + Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32); + if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) || + !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) || + !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV)) + return false; + + auto One = B.buildConstant(S32, 1); + auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One); + auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ, + B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ)); + + const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); + + switch (MFI->getClusterDims().getKind()) { + case AMDGPU::ClusterDimsAttr::Kind::FixedDims: + case AMDGPU::ClusterDimsAttr::Kind::VariableDims: { + B.buildCopy(DstReg, GlobalIdXYZ); + MI.eraseFromParent(); + return true; + } + case AMDGPU::ClusterDimsAttr::Kind::NoCluster: { + B.buildCopy(DstReg, ClusterIdXYZ); + MI.eraseFromParent(); + return true; + } + case AMDGPU::ClusterDimsAttr::Kind::Unknown: { + using namespace AMDGPU::Hwreg; + unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4); + Register ClusterId = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass); + B.buildInstr(AMDGPU::S_GETREG_B32_const) + .addDef(ClusterId) + .addImm(ClusterIdField); + auto Zero = B.buildConstant(S32, 0); + auto NoClusters = + B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero); + B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ); + MI.eraseFromParent(); + return true; + } + } + + llvm_unreachable("nothing should reach here"); +} + bool 
AMDGPULegalizerInfo::loadInputValue( Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { @@ -4471,8 +4532,31 @@ bool AMDGPULegalizerInfo::loadInputValue( AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); const ArgDescriptor WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + const ArgDescriptor ClusterWorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu); + const ArgDescriptor ClusterWorkGroupIDY = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u); + const ArgDescriptor ClusterWorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u); + const ArgDescriptor ClusterWorkGroupMaxIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u); + const ArgDescriptor ClusterWorkGroupMaxIDY = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u); + const ArgDescriptor ClusterWorkGroupMaxIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u); + const ArgDescriptor ClusterWorkGroupMaxFlatID = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u); + + auto LoadConstant = [&](unsigned N) { + B.buildConstant(DstReg, N); + return true; + }; + if (ST.hasArchitectedSGPRs() && (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { + AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims(); + bool HasFixedDims = ClusterDims.isFixedDims(); + switch (ArgType) { case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: Arg = &WorkGroupIDX; @@ -4489,6 +4573,53 @@ bool AMDGPULegalizerInfo::loadInputValue( ArgRC = &AMDGPU::SReg_32RegClass; ArgTy = LLT::scalar(32); break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X: + if (HasFixedDims && ClusterDims.getDims()[0] == 1) + return LoadConstant(0); + Arg = &ClusterWorkGroupIDX; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y: + if (HasFixedDims && ClusterDims.getDims()[1] == 1) + return LoadConstant(0); + 
Arg = &ClusterWorkGroupIDY; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z: + if (HasFixedDims && ClusterDims.getDims()[2] == 1) + return LoadConstant(0); + Arg = &ClusterWorkGroupIDZ; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X: + if (HasFixedDims) + return LoadConstant(ClusterDims.getDims()[0] - 1); + Arg = &ClusterWorkGroupMaxIDX; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y: + if (HasFixedDims) + return LoadConstant(ClusterDims.getDims()[1] - 1); + Arg = &ClusterWorkGroupMaxIDY; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z: + if (HasFixedDims) + return LoadConstant(ClusterDims.getDims()[2] - 1); + Arg = &ClusterWorkGroupMaxIDZ; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID: + Arg = &ClusterWorkGroupMaxFlatID; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; default: break; } @@ -4499,10 +4630,9 @@ bool AMDGPULegalizerInfo::loadInputValue( if (!Arg) { if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { - // The intrinsic may appear when we have a 0 sized kernarg segment, in which - // case the pointer argument may be missing and we use null. - B.buildConstant(DstReg, 0); - return true; + // The intrinsic may appear when we have a 0 sized kernarg segment, in + // which case the pointer argument may be missing and we use null. 
+ return LoadConstant(0); } // It's undefined behavior if a function marked with the amdgpu-no-* @@ -7415,6 +7545,22 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI, + MachineIRBuilder &B, + AMDGPU::Hwreg::Id HwReg, + unsigned LowBit, + unsigned Width) const { + MachineRegisterInfo &MRI = *B.getMRI(); + Register DstReg = MI.getOperand(0).getReg(); + if (!MRI.getRegClassOrNull(DstReg)) + MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass); + B.buildInstr(AMDGPU::S_GETREG_B32_const) + .addDef(DstReg) + .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width)); + MI.eraseFromParent(); + return true; +} + static constexpr unsigned FPEnvModeBitField = AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); @@ -7577,14 +7723,64 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, AMDGPUFunctionArgInfo::WORKITEM_ID_Z); case Intrinsic::amdgcn_workgroup_id_x: - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKGROUP_ID_X); + return legalizeWorkGroupId( + MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X); case Intrinsic::amdgcn_workgroup_id_y: - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); + return legalizeWorkGroupId( + MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y); case Intrinsic::amdgcn_workgroup_id_z: - return legalizePreloadedArgIntrin(MI, MRI, B, + return legalizeWorkGroupId( + MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z); + case Intrinsic::amdgcn_cluster_id_x: + return ST.hasClusters() && + legalizePreloadedArgIntrin(MI, 
MRI, B, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X); + case Intrinsic::amdgcn_cluster_id_y: + return ST.hasClusters() && + legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); + case Intrinsic::amdgcn_cluster_id_z: + return ST.hasClusters() && + legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_cluster_workgroup_id_x: + return ST.hasClusters() && + legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X); + case Intrinsic::amdgcn_cluster_workgroup_id_y: + return ST.hasClusters() && + legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y); + case Intrinsic::amdgcn_cluster_workgroup_id_z: + return ST.hasClusters() && + legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z); + case Intrinsic::amdgcn_cluster_workgroup_flat_id: + return ST.hasClusters() && + legalizeConstHwRegRead(MI, B, AMDGPU::Hwreg::ID_IB_STS2, 21, 4); + case Intrinsic::amdgcn_cluster_workgroup_max_id_x: + return ST.hasClusters() && + legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X); + case Intrinsic::amdgcn_cluster_workgroup_max_id_y: + return ST.hasClusters() && + legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y); + case Intrinsic::amdgcn_cluster_workgroup_max_id_z: + return ST.hasClusters() && + legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z); + case Intrinsic::amdgcn_cluster_workgroup_max_flat_id: + return ST.hasClusters() && + legalizePreloadedArgIntrin( + MI, MRI, B, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID); case Intrinsic::amdgcn_wave_id: return legalizeWaveID(MI, B); case Intrinsic::amdgcn_lds_kernel_id: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 1f4e02b0d600a..cd44a9ba0807c 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -114,6 +114,11 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const; + bool legalizeWorkGroupId( + MachineInstr &MI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, + AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, + AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const; bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; @@ -218,6 +223,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, + AMDGPU::Hwreg::Id HwReg, unsigned LowBit, + unsigned Width) const; bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 7dc1ec07cf0f9..d9bfeae52e213 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2328,6 +2328,12 @@ void SplitPtrStructs::processFunction(Function &F) { LLVM_DEBUG(dbgs() << "Splitting pointer structs in function: " << F.getName() << "\n"); for (Instruction *I : Originals) { + // In some cases, instruction order doesn't reflect program order, + // so the visit() call will have already visited certain instructions + // by the time this loop gets to them. Avoid re-visiting these so as to, + // for example, avoid processing the same conditional twice. 
+ if (SplitUsers.contains(I)) + continue; auto [Rsrc, Off] = visit(I); assert(((Rsrc && Off) || (!Rsrc && !Off)) && "Can't have a resource but no offset"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index a30d9cb0412a4..d490788a97685 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #define DEBUG_TYPE "amdgpu-lower-intrinsics" @@ -49,7 +50,6 @@ class AMDGPULowerIntrinsicsLegacy : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.setPreservesCFG(); } }; @@ -73,6 +73,7 @@ bool AMDGPULowerIntrinsicsImpl::run() { case Intrinsic::amdgcn_s_barrier_signal: case Intrinsic::amdgcn_s_barrier_signal_isfirst: case Intrinsic::amdgcn_s_barrier_wait: + case Intrinsic::amdgcn_s_cluster_barrier: forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); }); break; } @@ -81,13 +82,14 @@ bool AMDGPULowerIntrinsicsImpl::run() { return Changed; } -// Optimize barriers and lower s_barrier to a sequence of split barrier -// intrinsics. +// Optimize barriers and lower s_(cluster_)barrier to a sequence of split +// barrier intrinsics. 
bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) { assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier || I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal || I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst || - I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait); + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait || + I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier); const GCNSubtarget &ST = TM.getSubtarget(*I.getFunction()); bool IsSingleWaveWG = false; @@ -99,7 +101,59 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) { IRBuilder<> B(&I); - if (IsSingleWaveWG) { + // Lower the s_cluster_barrier intrinsic first. There is no corresponding + // hardware instruction in any subtarget. + if (I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier) { + // The default cluster barrier expects one signal per workgroup. So we need + // a workgroup barrier first. + if (IsSingleWaveWG) { + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {}); + } else { + Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP); + Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP); + Value *IsFirst = B.CreateIntrinsic( + B.getInt1Ty(), Intrinsic::amdgcn_s_barrier_signal_isfirst, + {BarrierID_32}); + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait, + {BarrierID_16}); + + Instruction *ThenTerm = + SplitBlockAndInsertIfThen(IsFirst, I.getIterator(), false); + B.SetInsertPoint(ThenTerm); + } + + // Now we can signal the cluster barrier from a single wave and wait for the + // barrier in all waves. 
+ Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::CLUSTER); + Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::CLUSTER); + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal, + {BarrierID_32}); + + B.SetInsertPoint(&I); + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait, + {BarrierID_16}); + + I.eraseFromParent(); + return true; + } + + bool IsWorkgroupScope = false; + + if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst) { + int BarrierID = cast(I.getArgOperand(0))->getSExtValue(); + if (BarrierID == AMDGPU::Barrier::TRAP || + BarrierID == AMDGPU::Barrier::WORKGROUP || + (BarrierID >= AMDGPU::Barrier::NAMED_BARRIER_FIRST && + BarrierID <= AMDGPU::Barrier::NAMED_BARRIER_LAST)) + IsWorkgroupScope = true; + } else { + assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier); + IsWorkgroupScope = true; + } + + if (IsWorkgroupScope && IsSingleWaveWG) { // Down-grade waits, remove split signals. 
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier || I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) { @@ -134,9 +188,7 @@ PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M, AMDGPULowerIntrinsicsImpl Impl(M, TM); if (!Impl.run()) return PreservedAnalyses::all(); - PreservedAnalyses PA; - PA.preserveSet(); - return PA; + return PreservedAnalyses::none(); } bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index eeb6de512bf5e..e17c2113ca398 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -360,6 +360,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { if (const IntrinsicInst *II = dyn_cast(DefInst)) { switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_s_cluster_barrier: case Intrinsic::amdgcn_s_barrier_signal: case Intrinsic::amdgcn_s_barrier_signal_var: case Intrinsic::amdgcn_s_barrier_signal_isfirst: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 7dbe1235a98b5..ddabd25894414 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -406,6 +406,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, SmallVector &NewInsts) { // TODO: Extracting a "multiple of X" from a GEP might be a useful generic // helper. 
+ LLVMContext &Ctx = GEP->getContext(); unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType()); SmallMapVector VarOffsets; APInt ConstOffset(BW, 0); @@ -438,27 +439,24 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, assert(CurPtr == Alloca && "GEP not based on alloca"); - unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy); + int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy); if (VarOffsets.size() > 1) return nullptr; APInt IndexQuot; - APInt Rem; - APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize), - IndexQuot, Rem); - if (!Rem.isZero()) + int64_t Rem; + APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem); + if (Rem != 0) return nullptr; if (VarOffsets.size() == 0) - return ConstantInt::get(GEP->getContext(), IndexQuot); + return ConstantInt::get(Ctx, IndexQuot); IRBuilder<> Builder(GEP); const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; - APInt::sdivrem(VarOffset.second, - APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot, - Rem); - if (!Rem.isZero() || OffsetQuot.isZero()) + APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); + if (Rem != 0 || OffsetQuot.isZero()) return nullptr; Value *Offset = VarOffset.first; @@ -468,7 +466,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, if (!OffsetQuot.isOne()) { ConstantInt *ConstMul = - ConstantInt::get(OffsetType, OffsetQuot.getSExtValue()); + ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth())); Offset = Builder.CreateMul(Offset, ConstMul); if (Instruction *NewInst = dyn_cast(Offset)) NewInsts.push_back(NewInst); @@ -477,7 +475,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, return Offset; ConstantInt *ConstIndex = - ConstantInt::get(OffsetType, IndexQuot.getSExtValue()); + ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth())); Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex); if (Instruction *NewInst = dyn_cast(IndexAdd)) 
NewInsts.push_back(NewInst); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 36b27bef350ed..848d9a5a9eb98 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -73,6 +73,7 @@ #include "AMDGPU.h" #include "AMDGPUGlobalISelUtils.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -783,17 +784,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineFunction *MF = &B.getMF(); const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); - const unsigned MovExecOpc = - Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - const unsigned MovExecTermOpc = - Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; - - const unsigned XorTermOpc = Subtarget.isWave32() ? - AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; - const unsigned AndSaveExecOpc = Subtarget.isWave32() ? - AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - const unsigned ExecReg = Subtarget.isWave32() ? - AMDGPU::EXEC_LO : AMDGPU::EXEC; + const AMDGPU::LaneMaskConstants &LMC = + AMDGPU::LaneMaskConstants::get(Subtarget); #ifndef NDEBUG const int OrigRangeSize = std::distance(Range.begin(), Range.end()); @@ -941,19 +933,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MRI.setRegClass(CondReg, WaveRC); // Update EXEC, save the original EXEC value to VCC. - B.buildInstr(AndSaveExecOpc) - .addDef(NewExec) - .addReg(CondReg, RegState::Kill); + B.buildInstr(LMC.AndSaveExecOpc) + .addDef(NewExec) + .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); B.setInsertPt(*BodyBB, BodyBB->end()); // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
- B.buildInstr(XorTermOpc) - .addDef(ExecReg) - .addReg(ExecReg) - .addReg(NewExec); + B.buildInstr(LMC.XorTermOpc) + .addDef(LMC.ExecReg) + .addReg(LMC.ExecReg) + .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use // s_cbranch_scc0? @@ -962,14 +954,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); // Save the EXEC mask before the loop. - BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) - .addReg(ExecReg); + BuildMI(MBB, MBB.end(), DL, TII->get(LMC.MovOpc), SaveExecReg) + .addReg(LMC.ExecReg); // Restore the EXEC mask after the loop. B.setMBB(*RestoreExecBB); - B.buildInstr(MovExecTermOpc) - .addDef(ExecReg) - .addReg(SaveExecReg); + B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg); // Set the insert point after the original instruction, so any new // instructions will be in the remainder. @@ -3855,21 +3845,27 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // want the most straightforward mapping, so just directly handle this. const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); - assert(SrcBank && "src bank should have been assigned already"); // For COPY between a physical reg and an s1, there is no type associated so // we need to take the virtual register's type as a hint on how to interpret // s1 values. 
+ unsigned Size; if (!SrcReg.isVirtual() && !DstBank && - MRI.getType(DstReg) == LLT::scalar(1)) + MRI.getType(DstReg) == LLT::scalar(1)) { DstBank = &AMDGPU::VCCRegBank; - else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1)) + Size = 1; + } else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1)) { DstBank = &AMDGPU::VCCRegBank; + Size = 1; + } else { + Size = getSizeInBits(DstReg, MRI, *TRI); + } if (!DstBank) DstBank = SrcBank; + else if (!SrcBank) + SrcBank = DstBank; - unsigned Size = getSizeInBits(DstReg, MRI, *TRI); if (MI.getOpcode() != AMDGPU::G_FREEZE && cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size))) return getInvalidInstructionMapping(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 21cf9cc6878fb..fedb694bfcc2a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -29,6 +29,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/LiveStacks.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" @@ -42,6 +44,9 @@ namespace { STATISTIC(NumMFMAsRewrittenToAGPR, "Number of MFMA instructions rewritten to use AGPR form"); +/// Map from spill slot frame index to list of instructions which reference it. 
+using SpillReferenceMap = DenseMap>; + class AMDGPURewriteAGPRCopyMFMAImpl { MachineFunction &MF; const GCNSubtarget &ST; @@ -51,6 +56,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl { VirtRegMap &VRM; LiveRegMatrix &LRM; LiveIntervals &LIS; + LiveStacks &LSS; const RegisterClassInfo &RegClassInfo; bool attemptReassignmentsToAGPR(SmallSetVector &InterferingRegs, @@ -59,10 +65,11 @@ class AMDGPURewriteAGPRCopyMFMAImpl { public: AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS, + LiveStacks &LSS, const RegisterClassInfo &RegClassInfo) : MF(MF), ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), - LIS(LIS), RegClassInfo(RegClassInfo) {} + LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo) {} bool isRewriteCandidate(const MachineInstr &MI) const { return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1; @@ -103,6 +110,22 @@ class AMDGPURewriteAGPRCopyMFMAImpl { bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const; bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const; + + /// Replace spill instruction \p SpillMI which loads/stores from/to \p SpillFI + /// with a COPY to the replacement register value \p VReg. + void replaceSpillWithCopyToVReg(MachineInstr &SpillMI, int SpillFI, + Register VReg) const; + + /// Create a map from frame index to use instructions for spills. If a use of + /// the frame index does not consist only of spill instructions, it will not + /// be included in the map. + void collectSpillIndexUses(ArrayRef StackIntervals, + SpillReferenceMap &Map) const; + + /// Attempt to unspill VGPRs by finding a free register and replacing the + /// spill instructions with copies. 
+ void eliminateSpillsOfReassignedVGPRs() const; + bool run(MachineFunction &MF) const; }; @@ -391,6 +414,138 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR( return MadeChange; } +void AMDGPURewriteAGPRCopyMFMAImpl::replaceSpillWithCopyToVReg( + MachineInstr &SpillMI, int SpillFI, Register VReg) const { + const DebugLoc &DL = SpillMI.getDebugLoc(); + MachineBasicBlock &MBB = *SpillMI.getParent(); + MachineInstr *NewCopy; + if (SpillMI.mayStore()) { + NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY), VReg) + .add(SpillMI.getOperand(0)); + } else { + NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY)) + .add(SpillMI.getOperand(0)) + .addReg(VReg); + } + + LIS.ReplaceMachineInstrInMaps(SpillMI, *NewCopy); + SpillMI.eraseFromParent(); +} + +void AMDGPURewriteAGPRCopyMFMAImpl::collectSpillIndexUses( + ArrayRef StackIntervals, SpillReferenceMap &Map) const { + + SmallSet NeededFrameIndexes; + for (const LiveInterval *LI : StackIntervals) + NeededFrameIndexes.insert(LI->reg().stackSlotIndex()); + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isFI() || !NeededFrameIndexes.count(MO.getIndex())) + continue; + + if (TII.isVGPRSpill(MI)) { + SmallVector &References = Map[MO.getIndex()]; + References.push_back(&MI); + break; + } + + // Verify this was really a spill instruction, if it's not just ignore + // all uses. + + // TODO: This should probably be verifier enforced. 
+ NeededFrameIndexes.erase(MO.getIndex()); + Map.erase(MO.getIndex()); + } + } + } +} + +void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const { + unsigned NumSlots = LSS.getNumIntervals(); + if (NumSlots == 0) + return; + + MachineFrameInfo &MFI = MF.getFrameInfo(); + + SmallVector StackIntervals; + StackIntervals.reserve(NumSlots); + + for (auto &[Slot, LI] : LSS) { + if (!MFI.isSpillSlotObjectIndex(Slot) || MFI.isDeadObjectIndex(Slot)) + continue; + + const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot); + if (TRI.hasVGPRs(RC)) + StackIntervals.push_back(&LI); + } + + sort(StackIntervals, [](const LiveInterval *A, const LiveInterval *B) { + /// Sort heaviest intervals first to prioritize their unspilling + if (A->weight() > B->weight()) + return true; + + if (A->getSize() > B->getSize()) + return true; + + // Tie breaker by number to avoid need for stable sort + return A->reg().stackSlotIndex() < B->reg().stackSlotIndex(); + }); + + // FIXME: The APIs for dealing with the LiveInterval of a frame index are + // cumbersome. LiveStacks owns its LiveIntervals which refer to stack + // slots. We cannot use the usual LiveRegMatrix::assign and unassign on these, + // and must create a substitute virtual register to do so. This makes + // incremental updating here difficult; we need to actually perform the IR + // mutation to get the new vreg references in place to compute the register + // LiveInterval to perform an assignment to track the new interference + // correctly, and we can't simply migrate the LiveInterval we already have. + // + // To avoid walking through the entire function for each index, pre-collect + // all the instructions' slot references. 
+ + DenseMap> SpillSlotReferences; + collectSpillIndexUses(StackIntervals, SpillSlotReferences); + + for (LiveInterval *LI : StackIntervals) { + int Slot = LI->reg().stackSlotIndex(); + auto SpillReferences = SpillSlotReferences.find(Slot); + if (SpillReferences == SpillSlotReferences.end()) + continue; + + const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot); + + LLVM_DEBUG(dbgs() << "Trying to eliminate " << printReg(Slot, &TRI) + << " by reassigning\n"); + + ArrayRef AllocOrder = RegClassInfo.getOrder(RC); + + for (MCPhysReg PhysReg : AllocOrder) { + if (LRM.checkInterference(*LI, PhysReg) != LiveRegMatrix::IK_Free) + continue; + + LLVM_DEBUG(dbgs() << "Reassigning " << *LI << " to " + << printReg(PhysReg, &TRI) << '\n'); + + const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot); + Register NewVReg = MRI.createVirtualRegister(RC); + + for (MachineInstr *SpillMI : SpillReferences->second) + replaceSpillWithCopyToVReg(*SpillMI, Slot, NewVReg); + + // TODO: We should be able to transfer the information from the stack + // slot's LiveInterval without recomputing from scratch with the + // replacement vreg uses. + LiveInterval &NewLI = LIS.createAndComputeVirtRegInterval(NewVReg); + VRM.grow(); + LRM.assign(NewLI, PhysReg); + MFI.RemoveStackObject(Slot); + break; + } + } +} + bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // This only applies on subtargets that have a configurable AGPR vs. VGPR // allocation. @@ -417,6 +572,12 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { MadeChange = true; } + // If we've successfully rewritten some MFMAs, we've alleviated some VGPR + // pressure. See if we can eliminate some spills now that those registers are + // more available. 
+ if (MadeChange) + eliminateSpillsOfReassignedVGPRs(); + return MadeChange; } @@ -440,10 +601,13 @@ class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -456,6 +620,7 @@ INITIALIZE_PASS_BEGIN(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE, INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) +INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy) INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE, "AMDGPU Rewrite AGPR-Copy-MFMA", false, false) @@ -474,8 +639,8 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction( auto &VRM = getAnalysis().getVRM(); auto &LRM = getAnalysis().getLRM(); auto &LIS = getAnalysis().getLIS(); - - AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo); + auto &LSS = getAnalysis().getLS(); + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo); return Impl.run(MF); } @@ -485,13 +650,15 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, VirtRegMap &VRM = MFAM.getResult(MF); LiveRegMatrix &LRM = MFAM.getResult(MF); LiveIntervals &LIS = MFAM.getResult(MF); + LiveStacks &LSS = MFAM.getResult(MF); RegisterClassInfo RegClassInfo; RegClassInfo.runOnMachineFunction(MF); - AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo); + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo); if (!Impl.run(MF)) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); PA.preserveSet(); + PA.preserve(); return PA; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 73acb1ddbd2a7..26e0b3dfc2e8a 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const { return hasTrue16BitInsts() && EnableRealTrue16Insts; } +bool AMDGPUSubtarget::hasD16Writes32BitVgpr() const { + return EnableD16Writes32BitVgpr; +} + // Returns the maximum per-workgroup LDS allocation size (in bytes) that still // allows the given function to achieve an occupancy of NWaves waves per // SIMD / EU, taking into account only the function's *maximum* workgroup size. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 57b757c990e1a..ed03ef21b6dda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -59,6 +59,7 @@ class AMDGPUSubtarget { bool HasCvtPkF16F32Inst = false; bool HasF32ToF16BF16ConversionSRInsts = false; bool EnableRealTrue16Insts = false; + bool EnableD16Writes32BitVgpr = false; bool HasBF16TransInsts = false; bool HasBF16ConversionInsts = false; bool HasBF16PackedInsts = false; @@ -224,6 +225,8 @@ class AMDGPUSubtarget { // supported and the support for fake True16 instructions is removed. bool useRealTrue16Insts() const; + bool hasD16Writes32BitVgpr() const; + bool hasBF16TransInsts() const { return HasBF16TransInsts; } bool hasBF16ConversionInsts() const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9afe7590fe4ef..92a587b5771b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -720,25 +720,6 @@ static MachineSchedRegistry GCNILPSchedRegistry( "Run GCN iterative scheduler for ILP scheduling (experimental)", createIterativeILPMachineScheduler); -static StringRef computeDataLayout(const Triple &TT) { - if (TT.getArch() == Triple::r600) { - // 32-bit pointers. 
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; - } - - // 32-bit private, local, and region pointers. 64-bit global, constant and - // flat. 160-bit non-integral fat buffer pointers that include a 128-bit - // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values - // (address space 7), and 128-bit non-integral buffer resourcees (address - // space 8) which cannot be non-trivilally accessed by LLVM memory operations - // like getelementptr. - return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" - "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-" - "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-" - "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"; -} - LLVM_READNONE static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { if (!GPU.empty()) @@ -764,7 +745,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, std::optional CM, CodeGenOptLevel OptLevel) : CodeGenTargetMachineImpl( - T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options, + T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), TLOF(createTLOF(getTargetTriple())) { diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index e420f2ad676f9..2ced4d6813766 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -188,20 +188,6 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyByteSel, }; - // Immediate operand kind. - // It helps to identify the location of an offending operand after an error. - // Note that regular literals and mandatory literals (KImm) must be handled - // differently. 
When looking for an offending operand, we should usually - // ignore mandatory literals because they are part of the instruction and - // cannot be changed. Report location of mandatory operands only for VOPD, - // when both OpX and OpY have a KImm and there are no other literals. - enum ImmKindTy { - ImmKindTyNone, - ImmKindTyLiteral, - ImmKindTyMandatoryLiteral, - ImmKindTyConst, - }; - private: struct TokOp { const char *Data; @@ -212,7 +198,6 @@ class AMDGPUOperand : public MCParsedAsmOperand { int64_t Val; ImmTy Type; bool IsFPImm; - mutable ImmKindTy Kind; Modifiers Mods; }; @@ -228,6 +213,9 @@ class AMDGPUOperand : public MCParsedAsmOperand { const MCExpr *Expr; }; + // The index of the associated MCInst operand. + mutable int MCOpIdx = -1; + public: bool isToken() const override { return Kind == Token; } @@ -239,38 +227,6 @@ class AMDGPUOperand : public MCParsedAsmOperand { return Kind == Immediate; } - void setImmKindNone() const { - assert(isImm()); - Imm.Kind = ImmKindTyNone; - } - - void setImmKindLiteral() const { - assert(isImm()); - Imm.Kind = ImmKindTyLiteral; - } - - void setImmKindMandatoryLiteral() const { - assert(isImm()); - Imm.Kind = ImmKindTyMandatoryLiteral; - } - - void setImmKindConst() const { - assert(isImm()); - Imm.Kind = ImmKindTyConst; - } - - bool IsImmKindLiteral() const { - return isImm() && Imm.Kind == ImmKindTyLiteral; - } - - bool IsImmKindMandatoryLiteral() const { - return isImm() && Imm.Kind == ImmKindTyMandatoryLiteral; - } - - bool isImmKindConst() const { - return isImm() && Imm.Kind == ImmKindTyConst; - } - bool isInlinableImm(MVT type) const; bool isLiteralImm(MVT type) const; @@ -1055,6 +1011,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { return SMRange(StartLoc, EndLoc); } + int getMCOpIdx() const { return MCOpIdx; } + Modifiers getModifiers() const { assert(isRegKind() || isImmTy(ImmTyNone)); return isRegKind() ? 
Reg.Mods : Imm.Mods; @@ -1242,7 +1200,6 @@ class AMDGPUOperand : public MCParsedAsmOperand { auto Op = std::make_unique(Immediate, AsmParser); Op->Imm.Val = Val; Op->Imm.IsFPImm = IsFPImm; - Op->Imm.Kind = ImmKindTyNone; Op->Imm.Type = Type; Op->Imm.Mods = Modifiers(); Op->StartLoc = Loc; @@ -1485,20 +1442,6 @@ class AMDGPUAsmParser : public MCTargetAsmParser { : MCTargetAsmParser(Options, STI, MII), Parser(_Parser) { MCAsmParserExtension::Initialize(Parser); - if (getFeatureBits().none()) { - // Set default features. - copySTI().ToggleFeature("southern-islands"); - } - - FeatureBitset FB = getFeatureBits(); - if (!FB[AMDGPU::FeatureWavefrontSize64] && - !FB[AMDGPU::FeatureWavefrontSize32]) { - // If there is no default wave size it must be a generation before gfx10, - // these have FeatureWavefrontSize64 in their definition already. For - // gfx10+ set wave32 as a default. - copySTI().ToggleFeature(AMDGPU::FeatureWavefrontSize32); - } - setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); @@ -1836,25 +1779,24 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseHwregFunc(OperandInfoTy &HwReg, OperandInfoTy &Offset, OperandInfoTy &Width); + static SMLoc getLaterLoc(SMLoc a, SMLoc b); + SMLoc getFlatOffsetLoc(const OperandVector &Operands) const; SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const; SMLoc getBLGPLoc(const OperandVector &Operands) const; + SMLoc getOperandLoc(const OperandVector &Operands, int MCOpIdx) const; SMLoc getOperandLoc(std::function Test, const OperandVector &Operands) const; - SMLoc getImmLoc(AMDGPUOperand::ImmTy Type, const OperandVector &Operands) const; - SMLoc getRegLoc(MCRegister Reg, const OperandVector &Operands) const; - SMLoc getLitLoc(const OperandVector &Operands, - bool SearchMandatoryLiterals = false) const; - SMLoc getMandatoryLitLoc(const OperandVector &Operands) const; - SMLoc getConstLoc(const OperandVector &Operands) 
const; + SMLoc getImmLoc(AMDGPUOperand::ImmTy Type, + const OperandVector &Operands) const; SMLoc getInstLoc(const OperandVector &Operands) const; bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands); bool validateOffset(const MCInst &Inst, const OperandVector &Operands); bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands); bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands); - bool validateSOPLiteral(const MCInst &Inst) const; + bool validateSOPLiteral(const MCInst &Inst, const OperandVector &Operands); bool validateConstantBusLimitations(const MCInst &Inst, const OperandVector &Operands); std::optional checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3); @@ -1895,7 +1837,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { const unsigned CPol); bool validateTFE(const MCInst &Inst, const OperandVector &Operands); bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands); - std::optional validateLdsDirect(const MCInst &Inst); + bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands); bool validateWMMA(const MCInst &Inst, const OperandVector &Operands); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); @@ -2337,6 +2279,8 @@ uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const } void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { + MCOpIdx = Inst.getNumOperands(); + if (isExpr()) { Inst.addOperand(MCOperand::createExpr(Expr)); return; @@ -2350,7 +2294,6 @@ void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers } else { assert(!isImmTy(ImmTyNone) || !hasModifiers()); Inst.addOperand(MCOperand::createImm(Imm.Val)); - setImmKindNone(); } } @@ -2379,7 +2322,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo if 
(AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); - setImmKindConst(); return; } @@ -2400,7 +2342,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo } Inst.addOperand(MCOperand::createImm(Val)); - setImmKindLiteral(); return; } @@ -2411,7 +2352,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_KIMM64: Inst.addOperand(MCOperand::createImm(Val)); - setImmKindMandatoryLiteral(); return; case AMDGPU::OPERAND_REG_IMM_BF16: @@ -2424,7 +2364,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // 1/(2*pi) = 0.15915494 since bf16 is in fact fp32 with cleared low 16 // bits. Prevent rounding below. Inst.addOperand(MCOperand::createImm(0x3e22)); - setImmKindLiteral(); return; } [[fallthrough]]; @@ -2459,11 +2398,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); Inst.addOperand(MCOperand::createImm(ImmVal)); - if (OpTy == AMDGPU::OPERAND_KIMM32 || OpTy == AMDGPU::OPERAND_KIMM16) { - setImmKindMandatoryLiteral(); - } else { - setImmKindLiteral(); - } return; } default: @@ -2488,26 +2422,14 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: - if (isSafeTruncation(Val, 32) && - AMDGPU::isInlinableLiteral32(static_cast(Val), - AsmParser->hasInv2PiInlineImm())) { - Inst.addOperand(MCOperand::createImm(Val)); - setImmKindConst(); - return; - } - [[fallthrough]]; - case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: - - Inst.addOperand(MCOperand::createImm(Lo_32(Val))); - setImmKindLiteral(); + Inst.addOperand(MCOperand::createImm(Val)); return; case AMDGPU::OPERAND_REG_IMM_INT64: case 
AMDGPU::OPERAND_REG_INLINE_C_INT64: if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); - setImmKindConst(); return; } @@ -2519,7 +2441,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo Val = Lo_32(Val); Inst.addOperand(MCOperand::createImm(Val)); - setImmKindLiteral(); return; case AMDGPU::OPERAND_REG_IMM_FP64: @@ -2527,7 +2448,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_AC_FP64: if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); - setImmKindConst(); return; } @@ -2547,95 +2467,36 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo } Inst.addOperand(MCOperand::createImm(Val)); - setImmKindLiteral(); return; case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - if (isSafeTruncation(Val, 16) && - AMDGPU::isInlinableIntLiteral(static_cast(Val))) { - Inst.addOperand(MCOperand::createImm(Lo_32(Val))); - setImmKindConst(); - return; - } - - Inst.addOperand(MCOperand::createImm(Val & 0xffff)); - setImmKindLiteral(); - return; - case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_IMM_FP16: - if (isSafeTruncation(Val, 16) && - AMDGPU::isInlinableLiteralFP16(static_cast(Val), - AsmParser->hasInv2PiInlineImm())) { - Inst.addOperand(MCOperand::createImm(Val)); - setImmKindConst(); - return; - } - - Inst.addOperand(MCOperand::createImm(Val & 0xffff)); - setImmKindLiteral(); - return; - case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_INLINE_C_BF16: - if (isSafeTruncation(Val, 16) && - AMDGPU::isInlinableLiteralBF16(static_cast(Val), - AsmParser->hasInv2PiInlineImm())) { - Inst.addOperand(MCOperand::createImm(Val)); - setImmKindConst(); - return; - } - - Inst.addOperand(MCOperand::createImm(Val & 0xffff)); - setImmKindLiteral(); - return; - - case 
AMDGPU::OPERAND_REG_INLINE_C_V2INT16: { - assert(isSafeTruncation(Val, 16)); - assert(AMDGPU::isInlinableIntLiteral(static_cast(Val))); - Inst.addOperand(MCOperand::createImm(Val)); - return; - } - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { - assert(isSafeTruncation(Val, 16)); - assert(AMDGPU::isInlinableLiteralFP16(static_cast(Val), - AsmParser->hasInv2PiInlineImm())); - - Inst.addOperand(MCOperand::createImm(Val)); - return; - } - - case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: { - assert(isSafeTruncation(Val, 16)); - assert(AMDGPU::isInlinableLiteralBF16(static_cast(Val), - AsmParser->hasInv2PiInlineImm())); - - Inst.addOperand(MCOperand::createImm(Val)); - return; - } - + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_KIMM32: - Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue())); - setImmKindMandatoryLiteral(); - return; case AMDGPU::OPERAND_KIMM16: - Inst.addOperand(MCOperand::createImm(Literal.getLoBits(16).getZExtValue())); - setImmKindMandatoryLiteral(); + Inst.addOperand(MCOperand::createImm(Val)); return; + case AMDGPU::OPERAND_KIMM64: if ((isInt<32>(Val) || isUInt<32>(Val)) && !getModifiers().Lit64) Val <<= 32; Inst.addOperand(MCOperand::createImm(Val)); - setImmKindMandatoryLiteral(); return; + default: - llvm_unreachable("invalid operand size"); + llvm_unreachable("invalid operand type"); } } void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { + MCOpIdx = Inst.getNumOperands(); Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI()))); } @@ -3942,6 +3803,8 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( OperandIndices OpIndices = getSrcOperandIndices(Opcode); + unsigned ConstantBusLimit = getConstantBusLimit(Opcode); + for (int OpIdx : OpIndices) { if (OpIdx == -1) continue; @@ -3985,17 +3848,14 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( } } } - } - 
ConstantBusUseCount += NumLiterals; - - if (ConstantBusUseCount <= getConstantBusLimit(Opcode)) - return true; - SMLoc LitLoc = getLitLoc(Operands); - SMLoc RegLoc = getRegLoc(LastSGPR, Operands); - SMLoc Loc = (LitLoc.getPointer() < RegLoc.getPointer()) ? RegLoc : LitLoc; - Error(Loc, "invalid operand (violates constant bus restrictions)"); - return false; + if (ConstantBusUseCount + NumLiterals > ConstantBusLimit) { + Error(getOperandLoc(Operands, OpIdx), + "invalid operand (violates constant bus restrictions)"); + return false; + } + } + return true; } std::optional @@ -4408,19 +4268,15 @@ bool AMDGPUAsmParser::validateMovrels(const MCInst &Inst, const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); assert(Src0Idx != -1); - SMLoc ErrLoc; const MCOperand &Src0 = Inst.getOperand(Src0Idx); if (Src0.isReg()) { auto Reg = mc2PseudoReg(Src0.getReg()); const MCRegisterInfo *TRI = getContext().getRegisterInfo(); if (!isSGPR(Reg, TRI)) return true; - ErrLoc = getRegLoc(Reg, Operands); - } else { - ErrLoc = getConstLoc(Operands); } - Error(ErrLoc, "source operand must be a VGPR"); + Error(getOperandLoc(Operands, Src0Idx), "source operand must be a VGPR"); return false; } @@ -4442,7 +4298,7 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst, auto Reg = mc2PseudoReg(Src0.getReg()); const MCRegisterInfo *TRI = getContext().getRegisterInfo(); if (!isGFX90A() && isSGPR(Reg, TRI)) { - Error(getRegLoc(Reg, Operands), + Error(getOperandLoc(Operands, Src0Idx), "source operand must be either a VGPR or an inline constant"); return false; } @@ -4464,7 +4320,7 @@ bool AMDGPUAsmParser::validateMAISrc2(const MCInst &Inst, return true; if (Inst.getOperand(Src2Idx).isImm() && isInlineConstant(Inst, Src2Idx)) { - Error(getConstLoc(Operands), + Error(getOperandLoc(Operands, Src2Idx), "inline constants are not allowed for this operand"); return false; } @@ -4494,16 +4350,14 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, bool Success = true; if 
(Info->NumRegsSrcA != mfmaScaleF8F6F4FormatToNumRegs(CBSZ)) { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - Error(getRegLoc(mc2PseudoReg(Inst.getOperand(Src0Idx).getReg()), - Operands), + Error(getOperandLoc(Operands, Src0Idx), "wrong register tuple size for cbsz value " + Twine(CBSZ)); Success = false; } if (Info->NumRegsSrcB != mfmaScaleF8F6F4FormatToNumRegs(BLGP)) { int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - Error(getRegLoc(mc2PseudoReg(Inst.getOperand(Src1Idx).getReg()), - Operands), + Error(getOperandLoc(Operands, Src1Idx), "wrong register tuple size for blgp value " + Twine(BLGP)); Success = false; } @@ -4530,7 +4384,7 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, return true; if (TRI->regsOverlap(Src2Reg, DstReg)) { - Error(getRegLoc(mc2PseudoReg(Src2Reg), Operands), + Error(getOperandLoc(Operands, Src2Idx), "source 2 operand must not partially overlap with dst"); return false; } @@ -4724,9 +4578,8 @@ static bool IsRevOpcode(const unsigned Opcode) } } -std::optional -AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { - +bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst, + const OperandVector &Operands) { using namespace SIInstrFlags; const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); @@ -4735,7 +4588,7 @@ AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { // with 9-bit operands only. Ignore encodings which do not accept these. 
const auto Enc = VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA; if ((Desc.TSFlags & Enc) == 0) - return std::nullopt; + return true; for (auto SrcName : {OpName::src0, OpName::src1, OpName::src2}) { auto SrcIdx = getNamedOperandIdx(Opcode, SrcName); @@ -4744,18 +4597,27 @@ AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { const auto &Src = Inst.getOperand(SrcIdx); if (Src.isReg() && Src.getReg() == LDS_DIRECT) { - if (isGFX90A() || isGFX11Plus()) - return StringRef("lds_direct is not supported on this GPU"); + if (isGFX90A() || isGFX11Plus()) { + Error(getOperandLoc(Operands, SrcIdx), + "lds_direct is not supported on this GPU"); + return false; + } - if (IsRevOpcode(Opcode) || (Desc.TSFlags & SIInstrFlags::SDWA)) - return StringRef("lds_direct cannot be used with this instruction"); + if (IsRevOpcode(Opcode) || (Desc.TSFlags & SIInstrFlags::SDWA)) { + Error(getOperandLoc(Operands, SrcIdx), + "lds_direct cannot be used with this instruction"); + return false; + } - if (SrcName != OpName::src0) - return StringRef("lds_direct may be used as src0 only"); + if (SrcName != OpName::src0) { + Error(getOperandLoc(Operands, SrcIdx), + "lds_direct may be used as src0 only"); + return false; + } } } - return std::nullopt; + return true; } SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const { @@ -4881,7 +4743,8 @@ bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst, return false; } -bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { +bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst, + const OperandVector &Operands) { unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); if (!(Desc.TSFlags & (SIInstrFlags::SOP2 | SIInstrFlags::SOPC))) @@ -4894,7 +4757,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { unsigned NumExprs = 0; unsigned NumLiterals = 0; - uint64_t LiteralValue; + int64_t LiteralValue; for (int OpIdx : OpIndices) { if (OpIdx == -1) break; @@ 
-4903,7 +4766,9 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { // Exclude special imm operands (like that used by s_set_gpr_idx_on) if (AMDGPU::isSISrcOperand(Desc, OpIdx)) { if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { - uint64_t Value = static_cast(MO.getImm()); + auto OpType = static_cast( + Desc.operands()[OpIdx].OperandType); + int64_t Value = encode32BitLiteral(MO.getImm(), OpType); if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; @@ -4914,7 +4779,12 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { } } - return NumLiterals + NumExprs <= 1; + if (NumLiterals + NumExprs <= 1) + return true; + + Error(getOperandLoc(Operands, Src1Idx), + "only one unique literal operand is allowed"); + return false; } bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { @@ -5090,9 +4960,8 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, const MCOperand &Src1 = Inst.getOperand(Src1Idx); const MCRegisterInfo *TRI = getContext().getRegisterInfo(); if (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI)) { - auto Reg = mc2PseudoReg(Inst.getOperand(Src1Idx).getReg()); - SMLoc S = getRegLoc(Reg, Operands); - Error(S, "invalid operand for instruction"); + Error(getOperandLoc(Operands, Src1Idx), + "invalid operand for instruction"); return false; } if (Src1.isImm()) { @@ -5125,9 +4994,8 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, OperandIndices OpIndices = getSrcOperandIndices(Opcode, HasMandatoryLiteral); - unsigned NumExprs = 0; - unsigned NumLiterals = 0; - uint64_t LiteralValue; + std::optional LiteralOpIdx; + std::optional LiteralValue; for (int OpIdx : OpIndices) { if (OpIdx == -1) @@ -5139,6 +5007,7 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, if (!isSISrcOperand(Desc, OpIdx)) continue; + bool IsAnotherLiteral = false; if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { uint64_t Value = static_cast(MO.getImm()); bool IsForcedFP64 = @@ 
-5151,34 +5020,37 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value) && !IsForcedFP64 && (!has64BitLiterals() || Desc.getSize() != 4)) { - Error(getLitLoc(Operands), "invalid operand for instruction"); + Error(getOperandLoc(Operands, OpIdx), + "invalid operand for instruction"); return false; } if (IsFP64 && IsValid32Op && !IsForcedFP64) Value = Hi_32(Value); - if (NumLiterals == 0 || LiteralValue != Value) { - LiteralValue = Value; - ++NumLiterals; - } + IsAnotherLiteral = !LiteralValue || *LiteralValue != Value; + LiteralValue = Value; } else if (MO.isExpr()) { - ++NumExprs; + // Literal value not known, so we conservately assume it's different. + IsAnotherLiteral = true; } - } - NumLiterals += NumExprs; - if (!NumLiterals) - return true; + if (IsAnotherLiteral && !HasMandatoryLiteral && + !getFeatureBits()[FeatureVOP3Literal]) { + Error(getOperandLoc(Operands, OpIdx), + "literal operands are not supported"); + return false; + } - if (!HasMandatoryLiteral && !getFeatureBits()[FeatureVOP3Literal]) { - Error(getLitLoc(Operands), "literal operands are not supported"); - return false; - } + if (LiteralOpIdx && IsAnotherLiteral) { + Error(getLaterLoc(getOperandLoc(Operands, OpIdx), + getOperandLoc(Operands, *LiteralOpIdx)), + "only one unique literal operand is allowed"); + return false; + } - if (NumLiterals > 1) { - Error(getLitLoc(Operands, true), "only one unique literal operand is allowed"); - return false; + if (IsAnotherLiteral) + LiteralOpIdx = OpIdx; } return true; @@ -5352,8 +5224,7 @@ bool AMDGPUAsmParser::validateWaitCnt(const MCInst &Inst, if (Reg == AMDGPU::SGPR_NULL) return true; - SMLoc RegLoc = getRegLoc(Reg, Operands); - Error(RegLoc, "src0 must be null"); + Error(getOperandLoc(Operands, Src0Idx), "src0 must be null"); return false; } @@ -5400,8 +5271,7 @@ bool AMDGPUAsmParser::validateGWS(const MCInst &Inst, auto Reg = Inst.getOperand(Data0Pos).getReg(); auto RegIdx = Reg - 
(VGPR32.contains(Reg) ? AMDGPU::VGPR0 : AMDGPU::AGPR0); if (RegIdx & 1) { - SMLoc RegLoc = getRegLoc(Reg, Operands); - Error(RegLoc, "vgpr must be even aligned"); + Error(getOperandLoc(Operands, Data0Pos), "vgpr must be even aligned"); return false; } @@ -5598,7 +5468,7 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", "MATRIX_FMT_FP4"}; - Error(getRegLoc(mc2PseudoReg(Inst.getOperand(SrcIdx).getReg()), Operands), + Error(getOperandLoc(Operands, SrcIdx), "wrong register tuple size for " + Twine(FmtNames[Fmt])); return false; }; @@ -5610,20 +5480,15 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands) { - if (auto ErrMsg = validateLdsDirect(Inst)) { - Error(getRegLoc(LDS_DIRECT, Operands), *ErrMsg); + if (!validateLdsDirect(Inst, Operands)) return false; - } if (!validateTrue16OpSel(Inst)) { Error(getImmLoc(AMDGPUOperand::ImmTyOpSel, Operands), "op_sel operand conflicts with 16-bit operand suffix"); return false; } - if (!validateSOPLiteral(Inst)) { - Error(getLitLoc(Operands), - "only one unique literal operand is allowed"); + if (!validateSOPLiteral(Inst, Operands)) return false; - } if (!validateVOPLiteral(Inst, Operands)) { return false; } @@ -8563,6 +8428,21 @@ SMLoc AMDGPUAsmParser::getInstLoc(const OperandVector &Operands) const { return ((AMDGPUOperand &)*Operands[0]).getStartLoc(); } +// Returns one of the given locations that comes later in the source. +SMLoc AMDGPUAsmParser::getLaterLoc(SMLoc a, SMLoc b) { + return a.getPointer() < b.getPointer() ? 
b : a; +} + +SMLoc AMDGPUAsmParser::getOperandLoc(const OperandVector &Operands, + int MCOpIdx) const { + for (const auto &Op : Operands) { + const auto TargetOp = static_cast(*Op); + if (TargetOp.getMCOpIdx() == MCOpIdx) + return TargetOp.getStartLoc(); + } + llvm_unreachable("No such MC operand!"); +} + SMLoc AMDGPUAsmParser::getOperandLoc(std::function Test, const OperandVector &Operands) const { @@ -8581,40 +8461,6 @@ AMDGPUAsmParser::getImmLoc(AMDGPUOperand::ImmTy Type, return getOperandLoc(Test, Operands); } -SMLoc AMDGPUAsmParser::getRegLoc(MCRegister Reg, - const OperandVector &Operands) const { - auto Test = [=](const AMDGPUOperand& Op) { - return Op.isRegKind() && Op.getReg() == Reg; - }; - return getOperandLoc(Test, Operands); -} - -SMLoc AMDGPUAsmParser::getLitLoc(const OperandVector &Operands, - bool SearchMandatoryLiterals) const { - auto Test = [](const AMDGPUOperand& Op) { - return Op.IsImmKindLiteral() || Op.isExpr(); - }; - SMLoc Loc = getOperandLoc(Test, Operands); - if (SearchMandatoryLiterals && Loc == getInstLoc(Operands)) - Loc = getMandatoryLitLoc(Operands); - return Loc; -} - -SMLoc AMDGPUAsmParser::getMandatoryLitLoc(const OperandVector &Operands) const { - auto Test = [](const AMDGPUOperand &Op) { - return Op.IsImmKindMandatoryLiteral(); - }; - return getOperandLoc(Test, Operands); -} - -SMLoc -AMDGPUAsmParser::getConstLoc(const OperandVector &Operands) const { - auto Test = [](const AMDGPUOperand& Op) { - return Op.isImmKindConst(); - }; - return getOperandLoc(Test, Operands); -} - ParseStatus AMDGPUAsmParser::parseStructuredOpFields(ArrayRef Fields) { if (!trySkipToken(AsmToken::LCurly)) @@ -10220,7 +10066,6 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, Op.addRegOperands(Inst, 1); } else if (Op.isImm() && Desc.operands()[Inst.getNumOperands()].RegClass != -1) { - assert(!Op.IsImmKindLiteral() && "Cannot use literal with DPP"); Op.addImmOperands(Inst, 1); } else if (Op.isImm()) { 
OptionalIdx[Op.getImmTy()] = I; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index a1306565bbe29..7e5ae25ff30e6 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -228,15 +228,14 @@ class GlobalSaddrTable { // saddr is 32-bit (which isn't handled here yet). class FLAT_Load_Pseudo< string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0, - bit HasSaddr = 0, bit EnableSaddr = 0> + bit HasSaddr = 0, bit EnableSaddr = 0, + RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> : FLAT_Pseudo { let OutOperandList = (outs vdata_op:$vdst); let InOperandList = !con( - !if(EnableSaddr, - (ins SReg_64_XEXEC_XNULL:$saddr, VGPR_32:$vaddr), - (ins VReg_64:$vaddr)), - (ins flat_offset:$offset), + !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), + (ins VaddrRC:$vaddr, flat_offset:$offset), // FIXME: Operands with default values do not work with following // non-optional operands. 
!if(HasTiedOutput, (ins CPol:$cpol, vdata_op:$vdst_in), @@ -268,15 +267,13 @@ multiclass FLAT_Flat_Load_Pseudo_t16 { } class FLAT_Store_Pseudo : FLAT_Pseudo< - opName, - (outs), - !con( - !if(EnableSaddr, - (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64_XEXEC_XNULL:$saddr), - (ins VReg_64:$vaddr, vdataClass:$vdata)), - (ins flat_offset:$offset, CPol_0:$cpol)), - " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> { + bit HasSaddr = 0, bit EnableSaddr = 0, + RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> : FLAT_Pseudo { + let InOperandList = !con( + (ins VaddrRC:$vaddr, vdataClass:$vdata), + !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), + (ins flat_offset:$offset, CPol_0:$cpol)); + let AsmOperands = " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"; let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -833,99 +830,83 @@ multiclass FLAT_Atomic_Pseudo< defm "" : FLAT_Atomic_Pseudo_RTN; } -multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< +class FLAT_Global_Atomic_Pseudo_NO_RTN< string opName, RegisterOperand vdst_op, ValueType vt, ValueType data_vt = vt, - RegisterOperand data_op = vdst_op> { - - let is_flat_global = 1 in { - def "" : FLAT_AtomicNoRet_Pseudo , - GlobalSaddrTable<0, opName> { - let has_saddr = 1; - let FPAtomic = data_vt.isFP; - } - - def _SADDR : FLAT_AtomicNoRet_Pseudo , - GlobalSaddrTable<1, opName> { - let has_saddr = 1; - let enabled_saddr = 1; - let FPAtomic = data_vt.isFP; - } - } + RegisterOperand data_op = vdst_op, + bit EnableSaddr = false, + RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + : FLAT_AtomicNoRet_Pseudo, GlobalSaddrTable { + let InOperandList = !con( + (ins VaddrRC:$vaddr, data_op:$vdata), + !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), + (ins flat_offset:$offset, CPol_0:$cpol)); + let AsmOperands = " $vaddr, $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"; + let has_saddr = 1; + let 
enabled_saddr = EnableSaddr; + let FPAtomic = data_vt.isFP; + let is_flat_global = 1; } -multiclass FLAT_Global_Atomic_Pseudo_RTN< - string opName, +multiclass FLAT_Global_Atomic_Pseudo_Helper_NO_RTN { + def "" : FLAT_Global_Atomic_Pseudo_NO_RTN; + def _SADDR : FLAT_Global_Atomic_Pseudo_NO_RTN; +} - defvar vdst_op_vgpr = getEquivalentVGPROperand.ret; - defvar data_op_vgpr = getEquivalentVGPROperand.ret; - - let is_flat_global = 1 in { - def _RTN : FLAT_AtomicRet_Pseudo , - GlobalSaddrTable<0, opName#"_rtn"> { - let has_saddr = 1; - let FPAtomic = data_vt.isFP; - } +class FLAT_Global_Atomic_Pseudo_RTN< + string opName, + RegisterOperand vdst_op, + ValueType vt, + ValueType data_vt = vt, + RegisterOperand data_op = vdst_op, + bit EnableSaddr = false, + bit IsVGPR = false, + RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + : FLAT_AtomicRet_Pseudo, GlobalSaddrTable { - def _SADDR_RTN : FLAT_AtomicRet_Pseudo , - GlobalSaddrTable<1, opName#"_rtn"> { - let has_saddr = 1; - let enabled_saddr = 1; - let FPAtomic = data_vt.isFP; - } + defvar vdst_rc= !if(IsVGPR, getEquivalentVGPROperand.ret, getEquivalentAGPROperand.ret); + defvar data_rc = !if(IsVGPR, getEquivalentVGPROperand.ret, getEquivalentAGPROperand.ret); - defvar vdst_op_agpr = getEquivalentAGPROperand.ret; - defvar data_op_agpr = getEquivalentAGPROperand.ret; + let OutOperandList = (outs vdst_rc:$vdst); + let InOperandList = !con( + (ins VaddrRC:$vaddr, data_rc:$vdata), + !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), + (ins flat_offset:$offset, CPol_GLC1:$cpol)); + let AsmOperands = " $vdst, $vaddr, $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let FPAtomic = data_vt.isFP; + let is_flat_global = 1; +} +multiclass FLAT_Global_Atomic_Pseudo_Helper_RTN { + def _RTN : FLAT_Global_Atomic_Pseudo_RTN; + def _SADDR_RTN : FLAT_Global_Atomic_Pseudo_RTN; let SubtargetPredicate = isGFX90APlus in { - def _RTN_agpr : 
FLAT_AtomicRet_Pseudo , - GlobalSaddrTable<0, opName#"_rtn_agpr"> { - let has_saddr = 1; - let FPAtomic = data_vt.isFP; - } - - def _SADDR_RTN_agpr : FLAT_AtomicRet_Pseudo , - GlobalSaddrTable<1, opName#"_rtn_agpr"> { - let has_saddr = 1; - let enabled_saddr = 1; - let FPAtomic = data_vt.isFP; - } - } + def _RTN_agpr : FLAT_Global_Atomic_Pseudo_RTN; + def _SADDR_RTN_agpr : FLAT_Global_Atomic_Pseudo_RTN; } } + multiclass FLAT_Global_Atomic_Pseudo< string opName, RegisterOperand vdst_rc, ValueType vt, ValueType data_vt = vt, RegisterOperand data_rc = vdst_rc> { - defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN; - defm "" : FLAT_Global_Atomic_Pseudo_RTN; + defm "" : FLAT_Global_Atomic_Pseudo_Helper_NO_RTN; + defm "" : FLAT_Global_Atomic_Pseudo_Helper_RTN; } //===----------------------------------------------------------------------===// @@ -1356,19 +1337,19 @@ let SubtargetPredicate = isGFX10Plus in { } // End SubtargetPredicate = isGFX10Plus let SubtargetPredicate = HasAtomicFaddNoRtnInsts in - defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_Helper_NO_RTN < "global_atomic_add_f32", AVLdSt_32, f32 >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in - defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_Helper_NO_RTN < "global_atomic_pk_add_f16", AVLdSt_32, v2f16 >; let SubtargetPredicate = HasAtomicFaddRtnInsts in - defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_Helper_RTN < "global_atomic_add_f32", AVLdSt_32, f32 >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in - defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_Helper_RTN < "global_atomic_pk_add_f16", AVLdSt_32, v2f16 >; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index cbd6f64976d21..920a47b5afe07 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1839,6 +1839,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// \returns true if the subtarget requires a wait for xcnt before atomic /// flat/global stores & rmw. bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } + + /// \returns the number of significant bits in the immediate field of the + /// S_NOP instruction. + unsigned getSNopBits() const { + if (getGeneration() >= AMDGPUSubtarget::GFX12) + return 7; + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 4; + return 3; + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index fd65f95334f75..bf212bbca934c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -464,8 +464,9 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, assert(STI.hasFeature(AMDGPU::Feature64BitLiterals)); support::endian::write(CB, Imm, llvm::endianness::little); } else { - if (Desc.operands()[i].OperandType == AMDGPU::OPERAND_REG_IMM_FP64) - Imm = Hi_32(Imm); + auto OpType = + static_cast(Desc.operands()[i].OperandType); + Imm = AMDGPU::encode32BitLiteral(Imm, OpType); support::endian::write(CB, Imm, llvm::endianness::little); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 90c56f6901460..f2e2d0ed3f8a6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -83,6 +83,9 @@ createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { createAMDGPUMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); // FIXME: We 
should error for the default target. + if (STI->getFeatureBits().none()) + STI->ToggleFeature(AMDGPU::FeatureSouthernIslands); + if (!STI->hasFeature(AMDGPU::FeatureWavefrontSize64) && !STI->hasFeature(AMDGPU::FeatureWavefrontSize32)) { // If there is no default wave size it must be a generation before gfx10, diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index bf787b230067d..291c03ab223a8 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -272,13 +272,13 @@ class MIMGNSAHelper +class PartialNSAHelper : NSAHelper<> { - list addr_types = + list addr_types = !if(!ge(num_addrs, max_addr), - !listconcat(!listsplat(VGPR_32, !sub(max_addr, 1)), [LastAddrRC]), - !listsplat(VGPR_32, num_addrs)); + !listconcat(!listsplat(VGPROp_32, !sub(max_addr, 1)), [LastAddrRC]), + !listsplat(VGPROp_32, num_addrs)); int VAddrCount = !if(!gt(num_addrs, max_addr), max_addr, num_addrs); list AddrAsmNames = !foreach(i, !range(VAddrCount), "vaddr" # i); @@ -359,7 +359,7 @@ class MIMG_gfx11 // Note that 1-dword addresses always use non-NSA variants. 
class MIMG_nsa_gfx11 addr_types=[], - RegisterClass LastAddrRC = VGPR_32> + RegisterOperand LastAddrRC = VGPROp_32> : MIMG, MIMGe_gfx11 { let SubtargetPredicate = isGFX11Only; let AssemblerPredicate = isGFX11Only; @@ -400,7 +400,7 @@ class VIMAGE_gfx12 + RegisterOperand Addr3RC> : VSAMPLE, VSAMPLEe { let SubtargetPredicate = isGFX12Plus; let AssemblerPredicate = isGFX12Plus; @@ -421,7 +421,7 @@ class VSAMPLE_gfx12 : MIMG_gfx6789 { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, @@ -434,10 +434,10 @@ class MIMG_NoSampler_Helper : MIMG_gfx90a .ret:$vdata), dns> { - let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, + let InOperandList = !con((ins getAlign2RegOp.ret:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -446,7 +446,7 @@ class MIMG_NoSampler_Helper_gfx90a : MIMG_gfx10 { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -471,7 +471,7 @@ class MIMG_NoSampler_nsa_gfx10 : MIMG_gfx11 { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -508,7 +508,7 @@ class VIMAGE_NoSampler_gfx12 : VSAMPLE_gfx12 { let InOperandList = !con(AddrIns, @@ -525,7 +525,7 @@ class VSAMPLE_Sampler_gfx12 : VSAMPLE_gfx12 { let InOperandList = !con(AddrIns, @@ -550,16 +550,16 @@ multiclass MIMG_NoSampler_Src_Helper ; if !not(ExtendedImageInst) then - def _V1_gfx90a : MIMG_NoSampler_Helper_gfx90a ; - def _V1_gfx10 : MIMG_NoSampler_gfx10; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_NoSampler_gfx11; } } @@ -576,14 +576,14 @@ multiclass MIMG_NoSampler_Src_Helper ; + def _V2 : MIMG_NoSampler_Helper ; if !not(ExtendedImageInst) then - def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a ; - def _V2_gfx10 : MIMG_NoSampler_gfx10; + def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a ; + def _V2_gfx10 : MIMG_NoSampler_gfx10; def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; } if op.HAS_GFX11 then { 
- def _V2_gfx11 : MIMG_NoSampler_gfx11; + def _V2_gfx11 : MIMG_NoSampler_gfx11; def _V2_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11; } } @@ -600,14 +600,14 @@ multiclass MIMG_NoSampler_Src_Helper ; + def _V3 : MIMG_NoSampler_Helper ; if !not(ExtendedImageInst) then - def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a ; - def _V3_gfx10 : MIMG_NoSampler_gfx10; + def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a ; + def _V3_gfx10 : MIMG_NoSampler_gfx10; def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_NoSampler_gfx11; + def _V3_gfx11 : MIMG_NoSampler_gfx11; def _V3_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11; } } @@ -624,15 +624,15 @@ multiclass MIMG_NoSampler_Src_Helper ; + def _V4 : MIMG_NoSampler_Helper ; if !not(ExtendedImageInst) then - def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a ; - def _V4_gfx10 : MIMG_NoSampler_gfx10; + def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a ; + def _V4_gfx10 : MIMG_NoSampler_gfx10; def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_NoSampler_gfx11; + def _V4_gfx11 : MIMG_NoSampler_gfx11; def _V4_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11; } @@ -640,7 +640,7 @@ multiclass MIMG_NoSampler_Src_Helper ; } else { @@ -1128,7 +1128,7 @@ multiclass MIMG_Atomic_Renamed ; class MIMG_Sampler_Helper + RegisterOperand src_rc, string dns=""> : MIMG_gfx6789 { let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, @@ -1139,7 +1139,7 @@ class MIMG_Sampler_Helper + RegisterOperand src_rc, string dns=""> : MIMG_gfx90a { let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, @@ -1164,7 +1164,7 @@ class MIMG_Sampler_Asm_gfx10p { } class MIMG_Sampler_gfx10 : MIMG_gfx10 { let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret; @@ -1180,7 +1180,7 @@ class MIMG_Sampler_nsa_gfx10 : MIMG_gfx10 { let InOperandList = 
MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret; @@ -1200,7 +1200,7 @@ class MIMG_Sampler_nortn_nsa_gfx10 : MIMG_gfx11 { let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret; @@ -1209,7 +1209,7 @@ class MIMG_Sampler_gfx11 + RegisterOperand LastVAddrSize, string dns=""> : MIMG_nsa_gfx11 { let InOperandList = MIMG_Sampler_OpList_gfx10p.ret; @@ -1217,7 +1217,7 @@ class MIMG_Sampler_nsa_gfx11 : MIMG_gfx11 { let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret; @@ -1227,7 +1227,7 @@ class MIMG_Sampler_nortn_gfx11 + RegisterOperand LastVAddrSize, string dns=""> : MIMG_nsa_gfx11 { let InOperandList = MIMG_Sampler_OpList_gfx10p.ret; let AsmString = MIMG_Sampler_Asm_gfx10p.ret; @@ -1237,20 +1237,20 @@ class MIMG_Sampler_nortn_nsa_gfx11 { int NumWords = dw; - RegisterClass RegClass = !if(!le(AddrDW, 0), ?, - !if(!eq(AddrDW, 1), VGPR_32, - !if(!eq(AddrDW, 2), VReg_64, - !if(!eq(AddrDW, 3), VReg_96, - !if(!eq(AddrDW, 4), VReg_128, - !if(!eq(AddrDW, 5), VReg_160, - !if(!eq(AddrDW, 6), VReg_192, - !if(!eq(AddrDW, 7), VReg_224, - !if(!eq(AddrDW, 8), VReg_256, - !if(!eq(AddrDW, 9), VReg_288, - !if(!eq(AddrDW, 10), VReg_320, - !if(!eq(AddrDW, 11), VReg_352, - !if(!eq(AddrDW, 12), VReg_384, - !if(!le(AddrDW, 16), VReg_512, ?)))))))))))))); + RegisterOperand RegClass = !if(!le(AddrDW, 0), ?, + !if(!eq(AddrDW, 1), VGPROp_32, + !if(!eq(AddrDW, 2), VGPROp_64, + !if(!eq(AddrDW, 3), VGPROp_96, + !if(!eq(AddrDW, 4), VGPROp_128, + !if(!eq(AddrDW, 5), VGPROp_160, + !if(!eq(AddrDW, 6), VGPROp_192, + !if(!eq(AddrDW, 7), VGPROp_224, + !if(!eq(AddrDW, 8), VGPROp_256, + !if(!eq(AddrDW, 9), VGPROp_288, + !if(!eq(AddrDW, 10), VGPROp_320, + !if(!eq(AddrDW, 11), VGPROp_352, + !if(!eq(AddrDW, 12), VGPROp_384, + !if(!le(AddrDW, 16), VGPROp_512, ?)))))))))))))); // Whether the instruction variant with this vaddr size should be enabled for // the auto-generated disassembler. 
@@ -1514,8 +1514,10 @@ multiclass MIMG_Gather_WQM class MIMG_IntersectRay_Helper { int num_addrs = !if(isBVH8, 11, !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11))); - RegisterClass RegClass = MIMGAddrSize.RegClass; - int VAddrDwords = !srl(RegClass.Size, 5); + RegisterOperand RegClass = MIMGAddrSize.RegClass; + + defvar Size = !cast(RegClass.RegClass).Size; + int VAddrDwords = !srl(Size, 5); int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32); @@ -1526,7 +1528,7 @@ class MIMG_IntersectRay_Helper { true : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); } -class MIMG_IntersectRay_gfx10 +class MIMG_IntersectRay_gfx10 : MIMG_gfx10 { let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16); let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16"; @@ -1540,7 +1542,7 @@ class MIMG_IntersectRay_nsa_gfx10 let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16"; } -class MIMG_IntersectRay_gfx11 +class MIMG_IntersectRay_gfx11 : MIMG_gfx11 { let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16); let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16"; diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index c1ed176ed29d2..301cb21a808f8 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -319,6 +319,11 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { MachineLoopInfo &MLI = getAnalysis().getLI(); + const InstrItineraryData *II = ST.getInstrItineraryData(); + // If there is no itineraries information, abandon. + if (II->Itineraries == nullptr) + return false; + // Instantiate the packetizer. 
R600PacketizerList Packetizer(Fn, ST, MLI); diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index ecc4659ee0e81..ecc28244cc71e 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -1108,7 +1108,14 @@ enum Register_Flag : uint8_t { namespace AMDGPU { namespace Barrier { -enum Type { TRAP = -2, WORKGROUP = -1 }; +enum Type { + CLUSTER_TRAP = -4, + CLUSTER = -3, + TRAP = -2, + WORKGROUP = -1, + NAMED_BARRIER_FIRST = 1, + NAMED_BARRIER_LAST = 16, +}; enum { BARRIER_SCOPE_WORKGROUP = 0, diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 6533d4c8eca35..7793907c032d2 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -66,6 +66,7 @@ #include "SIFixSGPRCopies.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineDominators.h" @@ -1145,7 +1146,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { } void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { - bool IsWave32 = MF.getSubtarget().isWave32(); + const AMDGPU::LaneMaskConstants &LMC = + AMDGPU::LaneMaskConstants::get(MF.getSubtarget()); for (MachineBasicBlock &MBB : MF) { for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { @@ -1159,10 +1161,7 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { Register SCCCopy = MRI->createVirtualRegister(TRI->getWaveMaskRegClass()); I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), - MI.getDebugLoc(), - TII->get(IsWave32 ? 
AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), - SCCCopy) + MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy) .addImm(-1) .addImm(0); I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(), @@ -1172,14 +1171,12 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { continue; } if (DstReg == AMDGPU::SCC) { - unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC()); I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), - MI.getDebugLoc(), TII->get(Opcode)) + MI.getDebugLoc(), TII->get(LMC.AndOpc)) .addReg(Tmp, getDefRegState(true)) .addReg(SrcReg) - .addReg(Exec); + .addReg(LMC.ExecReg); MI.eraseFromParent(); } } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 5297816ec1f2b..38331b614bceb 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -709,23 +709,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // Verify the register is compatible with the operand. 
if (const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) { - const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg()); - const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg()); - unsigned NewSubReg = New->getSubReg(); - unsigned OldSubReg = Old.getSubReg(); - - const TargetRegisterClass *ConstrainRC = OpRC; - if (NewSubReg && OldSubReg) { - unsigned PreA, PreB; - ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC, - NewSubReg, PreA, PreB); - } else if (OldSubReg) { - ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg); - } else if (NewSubReg) { - ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg); - } - + TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { + const TargetRegisterClass *NewRC = + TRI->getRegClassForReg(*MRI, New->getReg()); + const TargetRegisterClass *ConstrainRC = + TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg()); if (!ConstrainRC) return false; @@ -740,8 +728,12 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // 16-bit SGPRs instead of 32-bit ones. if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg())) Old.setSubReg(AMDGPU::NoSubRegister); - Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); - Old.setIsUndef(New->isUndef()); + if (New->getReg().isPhysical()) { + Old.substPhysReg(New->getReg(), *TRI); + } else { + Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); + Old.setIsUndef(New->isUndef()); + } return true; } @@ -2010,7 +2002,9 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy( if (!FoldingImm && !OpToFold.isReg()) return false; - if (OpToFold.isReg() && !OpToFold.getReg().isVirtual()) + // Fold virtual registers and constant physical registers. + if (OpToFold.isReg() && OpToFold.getReg().isPhysical() && + !TRI->isConstantPhysReg(OpToFold.getReg())) return false; // Prevent folding operands backwards in the function. 
For example, @@ -2409,8 +2403,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const TargetRegisterClass *OpRC = - TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF()); + const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI); if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index ce25bf499c41e..7c5d4fc2dacf6 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -8,6 +8,7 @@ #include "SIFrameLowering.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" @@ -984,6 +985,7 @@ void SIFrameLowering::emitCSRSpillStores( const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch // registers. However, save all lanes of callee-saved VGPRs. Due to this, we @@ -1015,8 +1017,7 @@ void SIFrameLowering::emitCSRSpillStores( StoreWWMRegisters(WWMScratchRegs); auto EnableAllLanes = [&]() { - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1); }; if (!WWMCalleeSavedRegs.empty()) { @@ -1043,8 +1044,7 @@ void SIFrameLowering::emitCSRSpillStores( TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); } else if (ScratchExecCopy) { // FIXME: Split block and make terminator. - unsigned ExecMov = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg) .addReg(ScratchExecCopy, RegState::Kill); LiveUnits.addReg(ScratchExecCopy); } @@ -1092,6 +1092,7 @@ void SIFrameLowering::emitCSRSpillRestores( const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); Register FramePtrReg = FuncInfo->getFrameOffsetReg(); for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { @@ -1147,16 +1148,14 @@ void SIFrameLowering::emitCSRSpillRestores( Register OrigExec = Return.getOperand(0).getReg(); if (!WWMScratchRegs.empty()) { - unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; - BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec()) + BuildMI(MBB, MBBI, DL, TII->get(LMC.XorOpc), LMC.ExecReg) .addReg(OrigExec) .addImm(-1); RestoreWWMRegisters(WWMScratchRegs); } // Restore original EXEC. - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec); + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addReg(OrigExec); // Drop the first operand and update the opcode. Return.removeOperand(0); @@ -1173,8 +1172,7 @@ void SIFrameLowering::emitCSRSpillRestores( RestoreWWMRegisters(WWMScratchRegs); if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { - unsigned MovOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1); } else { ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ false, @@ -1185,8 +1183,7 @@ void SIFrameLowering::emitCSRSpillRestores( RestoreWWMRegisters(WWMCalleeSavedRegs); if (ScratchExecCopy) { // FIXME: Split block and make terminator. - unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg) .addReg(ScratchExecCopy, RegState::Kill); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index cb3e544449bbf..363717b017ef0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPULaneMaskUtils.h" #include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -2098,10 +2099,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || AddrSpace == AMDGPUAS::FLAT_ADDRESS) { bool AlignedBy4 = Alignment >= Align(4); + if (Subtarget->hasUnalignedScratchAccessEnabled()) { + if (IsFast) + *IsFast = AlignedBy4 ? Size : 1; + return true; + } + if (IsFast) *IsFast = AlignedBy4; - return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled(); + return AlignedBy4; } // So long as they are correct, wide global memory operations perform better @@ -2359,6 +2366,37 @@ SDValue SITargetLowering::lowerKernargMemParameter( return DAG.getMergeValues({Val, Load.getValue(1)}, SL); } +/// Coerce an argument which was passed in a different ABI type to the original +/// expected value type. 
+SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG, + SDValue Val, + CCValAssign &VA, + const SDLoc &SL) const { + EVT ValVT = VA.getValVT(); + + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + case CCValAssign::Full: + return Val; + case CCValAssign::BCvt: + return DAG.getNode(ISD::BITCAST, SL, ValVT, Val); + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val, + DAG.getValueType(ValVT)); + return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val); + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val, + DAG.getValueType(ValVT)); + return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val); + case CCValAssign::AExt: + return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val); + default: + llvm_unreachable("Unknown loc info!"); + } +} + SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, const SDLoc &SL, SDValue Chain, @@ -2379,7 +2417,6 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, // Create load nodes to retrieve arguments from the stack. 
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); - SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; @@ -2402,10 +2439,62 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, break; } - ArgValue = DAG.getExtLoad( + SDValue ArgValue = DAG.getExtLoad( ExtType, SL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); - return ArgValue; + + SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL); + if (ConvertedVal == ArgValue) + return ConvertedVal; + + return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL); +} + +SDValue SITargetLowering::lowerWorkGroupId( + SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, + AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV, + AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, + AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const { + if (!Subtarget->hasClusters()) + return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV); + + // Clusters are supported. Return the global position in the grid. If clusters + // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID. + + // WorkGroupIdXYZ = ClusterId == 0 ? 
+ // ClusterIdXYZ : + // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ + SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV); + SDLoc SL(ClusterIdXYZ); + SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV); + SDValue One = DAG.getConstant(1, SL, VT); + SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One); + SDValue ClusterWorkGroupIdXYZ = + getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV); + SDValue GlobalIdXYZ = + DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ, + DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ)); + + switch (MFI.getClusterDims().getKind()) { + case AMDGPU::ClusterDimsAttr::Kind::FixedDims: + case AMDGPU::ClusterDimsAttr::Kind::VariableDims: + return GlobalIdXYZ; + case AMDGPU::ClusterDimsAttr::Kind::NoCluster: + return ClusterIdXYZ; + case AMDGPU::ClusterDimsAttr::Kind::Unknown: { + using namespace AMDGPU::Hwreg; + SDValue ClusterIdField = + DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT); + SDNode *GetReg = + DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField); + SDValue ClusterId(GetReg, 0); + SDValue Zero = DAG.getConstant(0, SL, VT); + return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ, + GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ)); + } + } + + llvm_unreachable("nothing should reach here"); } SDValue SITargetLowering::getPreloadedValue( @@ -2426,9 +2515,30 @@ SDValue SITargetLowering::getPreloadedValue( AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? 
~0u : 0xFFFFu); const ArgDescriptor WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + const ArgDescriptor ClusterWorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu); + const ArgDescriptor ClusterWorkGroupIDY = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u); + const ArgDescriptor ClusterWorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u); + const ArgDescriptor ClusterWorkGroupMaxIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u); + const ArgDescriptor ClusterWorkGroupMaxIDY = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u); + const ArgDescriptor ClusterWorkGroupMaxIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u); + const ArgDescriptor ClusterWorkGroupMaxFlatID = + ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u); + + auto LoadConstant = [&](unsigned N) { + return DAG.getConstant(N, SDLoc(), VT); + }; + if (Subtarget->hasArchitectedSGPRs() && - (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx || - CC == CallingConv::AMDGPU_Gfx_WholeWave)) { + (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { + AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims(); + bool HasFixedDims = ClusterDims.isFixedDims(); + switch (PVID) { case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: Reg = &WorkGroupIDX; @@ -2445,6 +2555,53 @@ SDValue SITargetLowering::getPreloadedValue( RC = &AMDGPU::SReg_32RegClass; Ty = LLT::scalar(32); break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X: + if (HasFixedDims && ClusterDims.getDims()[0] == 1) + return LoadConstant(0); + Reg = &ClusterWorkGroupIDX; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y: + if (HasFixedDims && ClusterDims.getDims()[1] == 1) + return LoadConstant(0); + Reg = &ClusterWorkGroupIDY; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z: + if 
(HasFixedDims && ClusterDims.getDims()[2] == 1) + return LoadConstant(0); + Reg = &ClusterWorkGroupIDZ; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X: + if (HasFixedDims) + return LoadConstant(ClusterDims.getDims()[0] - 1); + Reg = &ClusterWorkGroupMaxIDX; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y: + if (HasFixedDims) + return LoadConstant(ClusterDims.getDims()[1] - 1); + Reg = &ClusterWorkGroupMaxIDY; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z: + if (HasFixedDims) + return LoadConstant(ClusterDims.getDims()[2] - 1); + Reg = &ClusterWorkGroupMaxIDZ; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID: + Reg = &ClusterWorkGroupMaxFlatID; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; default: break; } @@ -3378,7 +3535,6 @@ SDValue SITargetLowering::LowerFormalArguments( RC = &AMDGPU::SGPR_32RegClass; else llvm_unreachable("Unexpected register class in LowerFormalArguments!"); - EVT ValVT = VA.getValVT(); Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); @@ -3396,30 +3552,7 @@ SDValue SITargetLowering::LowerFormalArguments( DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); } - // If this is an 8 or 16-bit value, it is really passed promoted - // to 32 bits. Insert an assert[sz]ext to capture this, then - // truncate to the right size. 
- switch (VA.getLocInfo()) { - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); - break; - case CCValAssign::SExt: - Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT)); - Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); - break; - case CCValAssign::ZExt: - Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT)); - Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); - break; - case CCValAssign::AExt: - Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); - break; - default: - llvm_unreachable("Unknown loc info!"); - } - + Val = convertABITypeToValueType(DAG, Val, VA, DL); InVals.push_back(Val); } @@ -3660,21 +3793,23 @@ void SITargetLowering::passSpecialInputs( // in the same location as the input. // clang-format off static constexpr std::pair ImplicitAttrs[] = { - {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"}, - {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" }, - {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"}, - {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"}, - {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"}, - {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"}, - {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}, - {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"}, - }; + std::array> ImplicitAttrs[] = { + {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}}, + {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}}, + {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}}, + {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}}, + {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}}, + {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}}, + {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, 
{"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}}, + {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}}, + }; // clang-format on - for (auto [InputID, Attr] : ImplicitAttrs) { + for (auto [InputID, Attrs] : ImplicitAttrs) { // If the callee does not use the attribute value, skip copying the value. - if (CLI.CB->hasFnAttr(Attr)) + if (all_of(Attrs, [&](StringRef Attr) { + return Attr.empty() || CLI.CB->hasFnAttr(Attr); + })) continue; const auto [OutgoingArg, ArgRC, ArgTy] = @@ -4895,6 +5030,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineFunction *MF = OrigBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); MachineBasicBlock::iterator I = LoopBB.begin(); const TargetRegisterClass *BoolRC = TRI->getBoolRC(); @@ -4926,10 +5062,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, .addReg(Idx.getReg(), 0, Idx.getSubReg()); // Update EXEC, save the original EXEC value to VCC. - BuildMI(LoopBB, I, DL, - TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 - : AMDGPU::S_AND_SAVEEXEC_B64), - NewExec) + BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec) .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); @@ -4956,13 +5089,9 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, } // Update EXEC, switch all done bits to 0 and all todo bits to 1. - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; MachineInstr *InsertPt = - BuildMI(LoopBB, I, DL, - TII->get(ST.isWave32() ? 
AMDGPU::S_XOR_B32_term - : AMDGPU::S_XOR_B64_term), - Exec) - .addReg(Exec) + BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use @@ -4997,15 +5126,14 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, Register DstReg = MI.getOperand(0).getReg(); Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); // Save the EXEC mask // clang-format off - BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec) - .addReg(Exec); + BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec) + .addReg(LMC.ExecReg); // clang-format on auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false); @@ -5025,7 +5153,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, LoopBB->addSuccessor(LandingPad); MachineBasicBlock::iterator First = LandingPad->begin(); // clang-format off - BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec) + BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg) .addReg(SaveExec); // clang-format on @@ -8033,25 +8161,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // it returns a wrong value (all zeroes?). The real value is in the upper 32 // bits. // - // To work around the issue, directly emit a 64 bit mov from this register + // To work around the issue, emit a 64 bit copy from this register // then extract the high bits. 
Note that this shouldn't even result in a // shift being emitted and simply become a pair of registers (e.g.): // s_mov_b64 s[6:7], src_shared_base // v_mov_b32_e32 v1, s7 - // - // FIXME: It would be more natural to emit a CopyFromReg here, but then copy - // coalescing would kick in and it would think it's okay to use the "HI" - // subregister directly (instead of extracting the HI 32 bits) which is an - // artificial (unusable) register. - // Register TableGen definitions would need an overhaul to get rid of the - // artificial "HI" aperture registers and prevent this kind of issue from - // happening. - SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, - DAG.getRegister(ApertureRegNo, MVT::i64)); - return DAG.getNode( - ISD::TRUNCATE, DL, MVT::i32, - DAG.getNode(ISD::SRL, DL, MVT::i64, - {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)})); + SDValue Copy = + DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32); + return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1); } // For code object version 5, private_base and shared_base are passed through @@ -9528,6 +9645,19 @@ SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT)); } +SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op, + AMDGPU::Hwreg::Id HwReg, + unsigned LowBit, + unsigned Width) const { + SDLoc SL(Op); + using namespace AMDGPU::Hwreg; + return {DAG.getMachineNode( + AMDGPU::S_GETREG_B32_const, SL, MVT::i32, + DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width), + SL, MVT::i32)), + 0}; +} + SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, const ArgDescriptor &Arg) const { @@ -9674,14 +9804,81 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::amdgcn_workgroup_id_x: - return getPreloadedValue(DAG, 
*MFI, VT, - AMDGPUFunctionArgInfo::WORKGROUP_ID_X); + return lowerWorkGroupId(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X); case Intrinsic::amdgcn_workgroup_id_y: - return getPreloadedValue(DAG, *MFI, VT, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); + return lowerWorkGroupId(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y); case Intrinsic::amdgcn_workgroup_id_z: - return getPreloadedValue(DAG, *MFI, VT, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + return lowerWorkGroupId(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z); + case Intrinsic::amdgcn_cluster_id_x: + return Subtarget->hasClusters() + ? getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_id_y: + return Subtarget->hasClusters() + ? getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_id_z: + return Subtarget->hasClusters() + ? getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_workgroup_id_x: + return Subtarget->hasClusters() + ? getPreloadedValue( + DAG, *MFI, VT, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_workgroup_id_y: + return Subtarget->hasClusters() + ? getPreloadedValue( + DAG, *MFI, VT, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_workgroup_id_z: + return Subtarget->hasClusters() + ? 
getPreloadedValue( + DAG, *MFI, VT, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_workgroup_flat_id: + return Subtarget->hasClusters() + ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4) + : SDValue(); + case Intrinsic::amdgcn_cluster_workgroup_max_id_x: + return Subtarget->hasClusters() + ? getPreloadedValue( + DAG, *MFI, VT, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_workgroup_max_id_y: + return Subtarget->hasClusters() + ? getPreloadedValue( + DAG, *MFI, VT, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_workgroup_max_id_z: + return Subtarget->hasClusters() + ? getPreloadedValue( + DAG, *MFI, VT, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z) + : DAG.getPOISON(VT); + case Intrinsic::amdgcn_cluster_workgroup_max_flat_id: + return Subtarget->hasClusters() + ? getPreloadedValue( + DAG, *MFI, VT, + AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID) + : DAG.getPOISON(VT); case Intrinsic::amdgcn_wave_id: return lowerWaveID(DAG, Op); case Intrinsic::amdgcn_lds_kernel_id: { @@ -15007,13 +15204,36 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N, return V; } + // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx) + // => + // i32:Lo(k) if Idx == 0, or + // i32:Hi(k) if Idx == 1 + auto *Idx = dyn_cast(N->getOperand(1)); + if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) { + SDLoc SL(N); + SDValue PeekThrough = Vec.getOperand(0); + auto *KImm = dyn_cast(PeekThrough); + if (KImm && KImm->getValueType(0).getSizeInBits() == 64) { + uint64_t KImmValue = KImm->getZExtValue(); + return DAG.getConstant( + (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32); + } + auto *KFPImm = dyn_cast(PeekThrough); + if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) { + uint64_t KFPImmValue = + 
KFPImm->getValueAPF().bitcastToAPInt().getZExtValue(); + return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) & + 0xffffffff, + SL, MVT::i32); + } + } + if (!DCI.isBeforeLegalize()) return SDValue(); // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit // elements. This exposes more load reduction opportunities by replacing // multiple small extract_vector_elements with a single 32-bit extract. - auto *Idx = dyn_cast(N->getOperand(1)); if (isa(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && VecSize > 32 && VecSize % 32 == 0 && Idx) { EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 4886fcf9fd012..ba408a8f64540 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -16,6 +16,7 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUISelLowering.h" +#include "SIDefines.h" #include "llvm/CodeGen/MachineFunction.h" namespace llvm { @@ -58,9 +59,17 @@ class SITargetLowering final : public AMDGPUTargetLowering { Align Alignment, ImplicitParameter Param) const; + SDValue convertABITypeToValueType(SelectionDAG &DAG, SDValue Val, + CCValAssign &VA, const SDLoc &SL) const; + SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, const SDLoc &SL, SDValue Chain, const ISD::InputArg &Arg) const; + SDValue lowerWorkGroupId( + SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, + AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, + AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, + AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const; SDValue getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, @@ -81,6 +90,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { unsigned NewOpcode) const; SDValue lowerWaveID(SelectionDAG &DAG, SDValue Op) const; + SDValue lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op, + 
AMDGPU::Hwreg::Id HwReg, unsigned LowBit, + unsigned Width) const; SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, const ArgDescriptor &ArgDesc) const; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b163a274396ff..3a6c3b2e0c2c5 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -845,6 +845,15 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); assert(Size % 16 == 0); Result.second = Result.first + (Size / 16); + + if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) { + // Regardless of which lo16/hi16 is used, consider the full 32-bit + // register used. + if (AMDGPU::isHi16Reg(MCReg, *TRI)) + Result.first -= 1; + else + Result.second += 1; + } } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) { // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar // sources like SRC_PRIVATE_BASE. @@ -1941,13 +1950,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // LOAD_CNT is only relevant to vgpr or LDS. unsigned RegNo = FIRST_LDS_VGPR; - // Only objects with alias scope info were added to LDSDMAScopes array. - // In the absense of the scope info we will not be able to disambiguate - // aliasing here. There is no need to try searching for a corresponding - // store slot. This is conservatively correct because in that case we - // will produce a wait using the first (general) LDS DMA wait slot which - // will wait on all of them anyway. 
- if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { + if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { if (MI.mayAlias(AA, *LDSDMAStores[I], true)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 398c99b3bd127..c39da779ecf8c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -14,6 +14,7 @@ #include "SIInstrInfo.h" #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNHazardRecognizer.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" @@ -912,7 +913,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { + if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } @@ -1195,6 +1196,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && "Not a VGPR32 reg"); @@ -1213,10 +1215,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, switch (Cond[0].getImm()) { case SIInstrInfo::SCC_TRUE: { Register SReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), SReg) - .addImm(1) - .addImm(0); + BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) .addReg(FalseReg) @@ -1227,10 +1226,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, } case SIInstrInfo::SCC_FALSE: { Register SReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), SReg) - .addImm(0) - .addImm(1); + BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) .addReg(FalseReg) @@ -1270,13 +1266,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::EXECNZ: { Register SReg = MRI.createVirtualRegister(BoolXExecRC); Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) - .addImm(0); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), SReg) - .addImm(1) - .addImm(0); + BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0); + BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) .addReg(FalseReg) @@ -1288,13 +1279,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::EXECZ: { Register SReg = MRI.createVirtualRegister(BoolXExecRC); Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); - BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) - .addImm(0); - BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), SReg) - .addImm(0) - .addImm(1); + BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0); + BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) .addReg(FalseReg) @@ -1946,8 +1932,9 @@ void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const { DebugLoc DL = MBB.findDebugLoc(MI); + unsigned MaxSNopCount = 1u << ST.getSNopBits(); while (Quantity > 0) { - unsigned Arg = std::min(Quantity, 8u); + unsigned Arg = std::min(Quantity, MaxSNopCount); Quantity -= Arg; BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); } @@ -2046,6 +2033,7 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); switch (MI.getOpcode()) { default: return TargetInstrInfo::expandPostRAPseudo(MI); case AMDGPU::S_MOV_B64_term: @@ -2470,18 +2458,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::ENTER_STRICT_WWM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // Whole Wave Mode is entered. - MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64)); + MI.setDesc(get(LMC.OrSaveExecOpc)); break; } case AMDGPU::ENTER_STRICT_WQM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // STRICT_WQM is entered. - const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; - const unsigned MovOp = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); - BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); + BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg()) + .addReg(LMC.ExecReg); + BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg); MI.eraseFromParent(); break; @@ -2490,7 +2475,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::EXIT_STRICT_WQM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // WWM/STICT_WQM is exited. - MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); + MI.setDesc(get(LMC.MovOpc)); break; } case AMDGPU::SI_RETURN: { @@ -2598,7 +2583,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, const MCInstrDesc &TID = get(NewOpcode); const TargetRegisterClass *NewRC = - RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF)); + RI.getAllocatableClass(getRegClass(TID, 0, &RI)); MRI.setRegClass(DestReg, NewRC); UseMO->setReg(DestReg); @@ -3615,7 +3600,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) { const MCInstrDesc &MovDesc = get(MovOp); - const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF); + const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI); if (Is16Bit) { // We just need to find a correctly sized register class, so the // subregister index compatibility doesn't matter since we're statically @@ -5590,7 +5575,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, Data = nullptr; if (ST.hasGFX90AInsts()) { - if (Dst && Data && + if (Dst && Data && !Dst->isTied() && !Data->isTied() && (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { ErrInfo = "Invalid register class: " "vdata and vdst should be both VGPR or AGPR"; @@ -5923,25 +5908,22 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, SlotIndexes 
*Indexes) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - bool IsWave32 = ST.isWave32(); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); if (IsSCCLive) { // Insert two move instructions, one to save the original value of EXEC and // the other to turn on all bits in EXEC. This is required as we can't use // the single instruction S_OR_SAVEEXEC that clobbers SCC. - unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg) - .addReg(Exec, RegState::Kill); - auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg) + .addReg(LMC.ExecReg, RegState::Kill); + auto FlipExecMI = + BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1); if (Indexes) { Indexes->insertMachineInstrInMaps(*StoreExecMI); Indexes->insertMachineInstrInMaps(*FlipExecMI); } } else { - const unsigned OrSaveExec = - IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; auto SaveExec = - BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1); SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. if (Indexes) Indexes->insertMachineInstrInMaps(*SaveExec); @@ -5952,10 +5934,9 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes) const { - unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto ExecRestoreMI = - BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); + auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg) + .addReg(Reg, RegState::Kill); if (Indexes) Indexes->insertMachineInstrInMaps(*ExecRestoreMI); } @@ -5975,12 +5956,8 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, - const MCInstrDesc &TID, unsigned RCID, - bool IsAllocatable) { - if ((IsAllocatable || !ST.hasGFX90AInsts()) && - (((TID.mayLoad() || TID.mayStore()) && - !(TID.TSFlags & SIInstrFlags::Spill)) || - (TID.TSFlags & SIInstrFlags::MIMG))) { + const MCInstrDesc &TID, unsigned RCID) { + if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) { switch (RCID) { case AMDGPU::AV_32RegClassID: RCID = AMDGPU::VGPR_32RegClassID; @@ -6008,19 +5985,17 @@ adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); } -const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, - unsigned OpNum, const TargetRegisterInfo *TRI, - const MachineFunction &MF) - const { +const TargetRegisterClass * +SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, + const TargetRegisterInfo *TRI) const { if (OpNum >= TID.getNumOperands()) return nullptr; auto RegClass = TID.operands()[OpNum].RegClass; - if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO) { - // Special pseudos have no alignment requirement + // Special pseudos have no alignment requirement. 
+ if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID)) return RI.getRegClass(RegClass); - } - return adjustAllocatableRegClass(ST, RI, TID, RegClass, false); + return adjustAllocatableRegClass(ST, RI, TID, RegClass); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, @@ -6039,7 +6014,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, } unsigned RCID = Desc.operands()[OpNo].RegClass; - return adjustAllocatableRegClass(ST, RI, Desc, RCID, true); + return adjustAllocatableRegClass(ST, RI, Desc, RCID); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6127,12 +6102,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); if (!SuperRC) return false; - - DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); - if (!DRC) - return false; + return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr; } - return RC->hasSuperClassEq(DRC); + + return RI.getCommonSubClass(DRC, RC) != nullptr; } bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, @@ -6759,8 +6732,8 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, if (moveFlatAddrToVGPR(MI)) return; - const TargetRegisterClass *DeclaredRC = getRegClass( - MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent()); + const TargetRegisterClass *DeclaredRC = + getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI); Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC); SAddr->setReg(ToSGPR); @@ -6820,13 +6793,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineFunction &MF = *LoopBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned SaveExecOpc = - ST.isWave32() ? 
AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - unsigned XorTermOpc = - ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; - unsigned AndOpc = - ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); MachineBasicBlock::iterator I = LoopBB.begin(); @@ -6854,7 +6821,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, CondReg = NewCondReg; else { // If not the first, we create an AND. Register AndReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) + BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg) .addReg(CondReg) .addReg(NewCondReg); CondReg = AndReg; @@ -6910,7 +6877,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, CondReg = NewCondReg; else { // If not the first, we create an AND. Register AndReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) + BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg) .addReg(CondReg) .addReg(NewCondReg); CondReg = AndReg; @@ -6939,15 +6906,15 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MRI.setSimpleHint(SaveExec, CondReg); // Update EXEC to matching lanes, saving original to SaveExec. - BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) + BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec) .addReg(CondReg, RegState::Kill); // The original instruction is here; we insert the terminators after it. I = BodyBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
- BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) - .addReg(Exec) + BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(SaveExec); BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); @@ -6974,8 +6941,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ++End; } const DebugLoc &DL = MI.getDebugLoc(); - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); // Save SCC. Waterfall Loop may overwrite SCC. @@ -6997,7 +6963,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask - BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); + BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg); // Killed uses in the instruction we are waterfalling around will be // incorrect due to the added control-flow. @@ -7058,7 +7024,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } // Restore the EXEC mask - BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); + BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg) + .addReg(SaveExec); return BodyBB; } @@ -7753,12 +7720,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // Clear unused bits of vcc Register CondReg = Inst.getOperand(1).getReg(); bool IsSCC = CondReg == AMDGPU::SCC; - Register VCC = RI.getVCC(); - Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) - .addReg(EXEC) - .addReg(IsSCC ? 
VCC : CondReg); + const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST); + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg) + .addReg(LMC.ExecReg) + .addReg(IsSCC ? LMC.VccReg : CondReg); Inst.removeOperand(1); } break; @@ -10211,9 +10176,7 @@ MachineInstr *SIInstrInfo::createPHISourceCopy( InsPt->definesRegister(Src, /*TRI=*/nullptr)) { InsPt++; return BuildMI(MBB, InsPt, DL, - get(ST.isWave32() ? AMDGPU::S_MOV_B32_term - : AMDGPU::S_MOV_B64_term), - Dst) + get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst) .addReg(Src, 0, SrcSubReg) .addReg(AMDGPU::EXEC, RegState::Implicit); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index f7dde2b90b68e..dffb3d7459e64 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -797,10 +797,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::Spill; } - static bool isSpill(const MachineInstr &MI) { - return MI.getDesc().TSFlags & SIInstrFlags::Spill; + static bool isSpill(const MCInstrDesc &Desc) { + return Desc.TSFlags & SIInstrFlags::Spill; } + static bool isSpill(const MachineInstr &MI) { return isSpill(MI.getDesc()); } + static bool isWWMRegSpillOpcode(uint16_t Opcode) { return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE || @@ -926,7 +928,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return Opcode == AMDGPU::S_CMPK_EQ_U32 || Opcode == AMDGPU::S_CMPK_LG_U32 || Opcode == AMDGPU::S_CMPK_GT_U32 || Opcode == AMDGPU::S_CMPK_GE_U32 || Opcode == AMDGPU::S_CMPK_LT_U32 || Opcode == AMDGPU::S_CMPK_LE_U32 || - Opcode == AMDGPU::S_GETREG_B32; + Opcode == AMDGPU::S_GETREG_B32 || + Opcode == AMDGPU::S_GETREG_B32_const; } /// \returns true if this is an s_store_dword* instruction. 
This is more @@ -1534,10 +1537,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Return true if this opcode should not be used by codegen. bool isAsmOnlyOpcode(int MCOp) const; - const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI, - const MachineFunction &MF) - const override; + const TargetRegisterClass * + getRegClass(const MCInstrDesc &TID, unsigned OpNum, + const TargetRegisterInfo *TRI) const override; void fixImplicitOperands(MachineInstr &MI) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index aa5dae09ca185..c49f1930705aa 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1353,7 +1353,7 @@ def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; def ScaleSel : NamedIntOperand<"scale_sel"> { - let Validator = "isUInt<3>"; + let Validator = "isUInt<4>"; } class KImmFPOperand : ImmOperand { @@ -2596,24 +2596,42 @@ class getAlign2RegOp { RegisterOperand ret = !cond(!eq(RC, VGPROp_16) : VGPROp_16, !eq(RC, VGPROp_32) : VGPROp_32, + !eq(RC, VGPROp_64) : VGPROp_64_Align2, !eq(RC, VGPROp_64_Align1) : VGPROp_64_Align2, + !eq(RC, VGPROp_64_Align2) : VGPROp_64_Align2, + !eq(RC, VGPROp_96) : VGPROp_96_Align2, !eq(RC, VGPROp_96_Align1) : VGPROp_96_Align2, + !eq(RC, VGPROp_96_Align2) : VGPROp_96_Align2, + !eq(RC, VGPROp_128) : VGPROp_128_Align2, !eq(RC, VGPROp_128_Align1) : VGPROp_128_Align2, + !eq(RC, VGPROp_128_Align2) : VGPROp_128_Align2, + !eq(RC, VGPROp_160) : VGPROp_160_Align2, !eq(RC, VGPROp_160_Align1) : VGPROp_160_Align2, + !eq(RC, VGPROp_160_Align2) : VGPROp_160_Align2, + !eq(RC, VGPROp_1024) : VGPROp_1024_Align2, !eq(RC, VGPROp_1024_Align1) : VGPROp_1024_Align2, + !eq(RC, VGPROp_1024_Align2) : VGPROp_1024_Align2, + !eq(RC, AVLdSt_32) : AVLdSt_32, + !eq(RC, AVLdSt_64_Align1) : AVLdSt_64_Align2, !eq(RC, AVLdSt_64) : AVLdSt_64_Align2, + !eq(RC, 
AVLdSt_96) : AVLdSt_96_Align2, + !eq(RC, AVLdSt_96_Align2) : AVLdSt_96_Align2, !eq(RC, AVLdSt_96_Align1) : AVLdSt_96_Align2, + !eq(RC, AVLdSt_128) : AVLdSt_128_Align2, !eq(RC, AVLdSt_128_Align1) : AVLdSt_128_Align2, + !eq(RC, AVLdSt_128_Align2) : AVLdSt_128_Align2, + !eq(RC, AVLdSt_160) : AVLdSt_160_Align2, - !eq(RC, AVLdSt_160_Align1) : AVLdSt_160_Align2); + !eq(RC, AVLdSt_160_Align1) : AVLdSt_160_Align2, + !eq(RC, AVLdSt_160_Align2) : AVLdSt_160_Align2); } class getEquivalentAGPROperand { diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 73a2d0a56aebe..6537b79d58021 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" @@ -27,21 +28,22 @@ namespace { class SILateBranchLowering { private: - const SIRegisterInfo *TRI = nullptr; - const SIInstrInfo *TII = nullptr; - MachineDominatorTree *MDT = nullptr; + const GCNSubtarget &ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineDominatorTree *MDT; + const AMDGPU::LaneMaskConstants &LMC; void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST, bool DynamicVGPR); void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock); public: - SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {} + SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT) + : ST(ST), TII(ST.getInstrInfo()), TRI(&TII->getRegisterInfo()), MDT(MDT), + LMC(AMDGPU::LaneMaskConstants::get(ST)) {} bool run(MachineFunction &MF); - - unsigned MovOpc; - Register ExecReg; }; class SILateBranchLoweringLegacy : public MachineFunctionPass { @@ -50,8 +52,9 @@ 
SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { + const GCNSubtarget &ST = MF.getSubtarget(); auto *MDT = &getAnalysis().getDomTree(); - return SILateBranchLowering(MDT).run(MF); + return SILateBranchLowering(ST, MDT).run(MF); } StringRef getPassName() const override { @@ -166,17 +169,16 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI, copyOpWithoutRegFlags(SelectCallee, *TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee)); - auto SelectExec = BuildMI(*MI.getParent(), MI, DL, - TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64)) - .addDef(ExecReg); + auto SelectExec = BuildMI(*MI.getParent(), MI, DL, TII->get(LMC.CSelectOpc)) + .addDef(LMC.ExecReg); copyOpWithoutRegFlags(SelectExec, *TII->getNamedOperand(MI, AMDGPU::OpName::exec)); copyOpWithoutRegFlags(SelectExec, *TII->getNamedOperand(MI, AMDGPU::OpName::fbexec)); } else { - auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg); + auto SetExec = + BuildMI(*MI.getParent(), MI, DL, TII->get(LMC.MovOpc), LMC.ExecReg); copyOpWithoutRegFlags(SetExec, *TII->getNamedOperand(MI, AMDGPU::OpName::exec)); } @@ -206,8 +208,9 @@ void SILateBranchLowering::earlyTerm(MachineInstr &MI, PreservedAnalyses llvm::SILateBranchLoweringPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget &ST = MF.getSubtarget(); auto *MDT = &MFAM.getResult(MF); - if (!SILateBranchLowering(MDT).run(MF)) + if (!SILateBranchLowering(ST, MDT).run(MF)) return PreservedAnalyses::all(); return getMachineFunctionPassPreservedAnalyses() @@ -215,13 +218,6 @@ llvm::SILateBranchLoweringPass::run(MachineFunction &MF, } bool SILateBranchLowering::run(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - TII = ST.getInstrInfo(); - TRI = &TII->getRegisterInfo(); - - MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - ExecReg = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - SmallVector EarlyTermInstrs; SmallVector EpilogInstrs; bool MadeChange = false; @@ -270,8 +266,8 @@ bool SILateBranchLowering::run(MachineFunction &MF) { DebugLoc DL; MF.insert(MF.end(), EarlyExitBlock); - BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc), - ExecReg) + BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(LMC.MovOpc), + LMC.ExecReg) .addImm(0); generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF); diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 69d02e7c2934c..f0d1117664983 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1337,10 +1337,10 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, AMDGPU::OpName::data1); const TargetRegisterClass *DataRC0 = - TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF); + TII->getRegClass(Write2Opc, Data0Idx, TRI); const TargetRegisterClass *DataRC1 = - TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF); + TII->getRegClass(Write2Opc, Data1Idx, TRI); if (unsigned SubReg = Data0->getSubReg()) { DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index e97536d36bab2..115a020f44098 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -50,6 +50,7 @@ #include "SILowerControlFlow.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallSet.h" @@ -85,15 +86,7 @@ class SILowerControlFlow { SmallSet RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; - unsigned AndOpc; - unsigned OrOpc; - unsigned XorOpc; - unsigned MovTermOpc; - unsigned Andn2TermOpc; - unsigned XorTermrOpc; - unsigned OrTermrOpc; - unsigned OrSaveExecOpc; 
- unsigned Exec; + const AMDGPU::LaneMaskConstants &LMC; bool EnableOptimizeEndCf = false; @@ -139,9 +132,11 @@ class SILowerControlFlow { void optimizeEndCf(); public: - SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV, - MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) - : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {} + SILowerControlFlow(const GCNSubtarget *ST, LiveIntervals *LIS, + LiveVariables *LV, MachineDominatorTree *MDT, + MachinePostDominatorTree *PDT) + : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT), + LMC(AMDGPU::LaneMaskConstants::get(*ST)) {} bool run(MachineFunction &MF); }; @@ -243,18 +238,15 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // will interfere with trying to form s_and_saveexec_b64 later. Register CopyReg = SimpleIf ? SaveExecReg : MRI->createVirtualRegister(BoolRC); - MachineInstr *CopyExec = - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) - .addReg(Exec) - .addReg(Exec, RegState::ImplicitDefine); + MachineInstr *CopyExec = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) + .addReg(LMC.ExecReg) + .addReg(LMC.ExecReg, RegState::ImplicitDefine); LoweredIf.insert(CopyReg); Register Tmp = MRI->createVirtualRegister(BoolRC); MachineInstr *And = - BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) - .addReg(CopyReg) - .add(Cond); + BuildMI(MBB, I, DL, TII->get(LMC.AndOpc), Tmp).addReg(CopyReg).add(Cond); if (LV) LV->replaceKillInstruction(Cond.getReg(), MI, *And); @@ -262,18 +254,17 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineInstr *Xor = nullptr; if (!SimpleIf) { - Xor = - BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg) - .addReg(Tmp) - .addReg(CopyReg); + Xor = BuildMI(MBB, I, DL, TII->get(LMC.XorOpc), SaveExecReg) + .addReg(Tmp) + .addReg(CopyReg); setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); } // Use a copy that is a terminator to get correct spill code placement it with // fast regalloc. 
MachineInstr *SetExec = - BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) - .addReg(Tmp, RegState::Kill); + BuildMI(MBB, I, DL, TII->get(LMC.MovTermOpc), LMC.ExecReg) + .addReg(Tmp, RegState::Kill); if (LV) LV->getVarInfo(Tmp).Kills.push_back(SetExec); @@ -327,8 +318,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { // else. Register SaveReg = MRI->createVirtualRegister(BoolRC); MachineInstr *OrSaveExec = - BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg) - .add(MI.getOperand(1)); // Saved EXEC + BuildMI(MBB, Start, DL, TII->get(LMC.OrSaveExecOpc), SaveReg) + .add(MI.getOperand(1)); // Saved EXEC if (LV) LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec); @@ -338,14 +329,14 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { // This accounts for any modification of the EXEC mask within the block and // can be optimized out pre-RA when not required. - MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg) - .addReg(Exec) + MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(LMC.AndOpc), DstReg) + .addReg(LMC.ExecReg) .addReg(SaveReg); MachineInstr *Xor = - BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec) - .addReg(Exec) - .addReg(DstReg); + BuildMI(MBB, ElsePt, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) + .addReg(DstReg); // Skip ahead to the unconditional branch in case there are other terminators // present. 
@@ -400,16 +391,16 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { Register AndReg; if (!SkipAnding) { AndReg = MRI->createVirtualRegister(BoolRC); - And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg) - .addReg(Exec) - .add(MI.getOperand(1)); + And = BuildMI(MBB, &MI, DL, TII->get(LMC.AndOpc), AndReg) + .addReg(LMC.ExecReg) + .add(MI.getOperand(1)); if (LV) LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And); - Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) + Or = BuildMI(MBB, &MI, DL, TII->get(LMC.OrOpc), Dst) .addReg(AndReg) .add(MI.getOperand(2)); } else { - Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) + Or = BuildMI(MBB, &MI, DL, TII->get(LMC.OrOpc), Dst) .add(MI.getOperand(1)) .add(MI.getOperand(2)); if (LV) @@ -436,8 +427,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *AndN2 = - BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec) - .addReg(Exec) + BuildMI(MBB, &MI, DL, TII->get(LMC.AndN2TermOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .add(MI.getOperand(0)); if (LV) LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2); @@ -505,7 +496,7 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { } } - unsigned Opcode = OrOpc; + unsigned Opcode = LMC.OrOpc; MachineBasicBlock *SplitBB = &MBB; if (NeedBlockSplit) { SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS); @@ -522,14 +513,13 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { if (PDT) PDT->applyUpdates(DTUpdates); } - Opcode = OrTermrOpc; + Opcode = LMC.OrTermOpc; InsPt = MI; } - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); + MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(Opcode), LMC.ExecReg) + .addReg(LMC.ExecReg) + .add(MI.getOperand(0)); if (LV) { LV->replaceKillInstruction(DataReg, MI, *NewMI); @@ -597,12 +587,12 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned 
OpNo, // does not really modify exec. for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) if (I->modifiesRegister(AMDGPU::EXEC, TRI) && - !(I->isCopy() && I->getOperand(0).getReg() != Exec)) + !(I->isCopy() && I->getOperand(0).getReg() != LMC.ExecReg)) return; for (const auto &SrcOp : Def->explicit_operands()) if (SrcOp.isReg() && SrcOp.isUse() && - (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec)) + (SrcOp.getReg().isVirtual() || SrcOp.getReg() == LMC.ExecReg)) Src.push_back(SrcOp); } @@ -781,28 +771,6 @@ bool SILowerControlFlow::run(MachineFunction &MF) { MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); - if (ST.isWave32()) { - AndOpc = AMDGPU::S_AND_B32; - OrOpc = AMDGPU::S_OR_B32; - XorOpc = AMDGPU::S_XOR_B32; - MovTermOpc = AMDGPU::S_MOV_B32_term; - Andn2TermOpc = AMDGPU::S_ANDN2_B32_term; - XorTermrOpc = AMDGPU::S_XOR_B32_term; - OrTermrOpc = AMDGPU::S_OR_B32_term; - OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; - Exec = AMDGPU::EXEC_LO; - } else { - AndOpc = AMDGPU::S_AND_B64; - OrOpc = AMDGPU::S_OR_B64; - XorOpc = AMDGPU::S_XOR_B64; - MovTermOpc = AMDGPU::S_MOV_B64_term; - Andn2TermOpc = AMDGPU::S_ANDN2_B64_term; - XorTermrOpc = AMDGPU::S_XOR_B64_term; - OrTermrOpc = AMDGPU::S_OR_B64_term; - OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; - Exec = AMDGPU::EXEC; - } - // Compute set of blocks with kills const bool CanDemote = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; @@ -876,6 +844,7 @@ bool SILowerControlFlow::run(MachineFunction &MF) { } bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget *ST = &MF.getSubtarget(); // This doesn't actually need LiveIntervals, but we can preserve them. auto *LISWrapper = getAnalysisIfAvailable(); LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; @@ -888,12 +857,13 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { getAnalysisIfAvailable(); MachinePostDominatorTree *PDT = PDTWrapper ? 
&PDTWrapper->getPostDomTree() : nullptr; - return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); + return SILowerControlFlow(ST, LIS, LV, MDT, PDT).run(MF); } PreservedAnalyses SILowerControlFlowPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget *ST = &MF.getSubtarget(); LiveIntervals *LIS = MFAM.getCachedResult(MF); LiveVariables *LV = MFAM.getCachedResult(MF); MachineDominatorTree *MDT = @@ -901,7 +871,7 @@ SILowerControlFlowPass::run(MachineFunction &MF, MachinePostDominatorTree *PDT = MFAM.getCachedResult(MF); - bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); + bool Changed = SILowerControlFlow(ST, LIS, LV, MDT, PDT).run(MF); if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 54426d33d3473..908d856d386f5 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -132,13 +132,16 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, if (!AMDGPU::isGraphics(CC) || ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) && ST.hasArchitectedSGPRs())) { - if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) + if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x") || + !F.hasFnAttribute("amdgpu-no-cluster-id-x")) WorkGroupIDX = true; - if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y")) + if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y") || + !F.hasFnAttribute("amdgpu-no-cluster-id-y")) WorkGroupIDY = true; - if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z")) + if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z") || + !F.hasFnAttribute("amdgpu-no-cluster-id-z")) WorkGroupIDZ = true; } @@ -195,6 +198,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, VGPRForAGPRCopy = AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1); } + + ClusterDims = AMDGPU::ClusterDimsAttr::get(F); } 
MachineFunctionInfo *SIMachineFunctionInfo::clone( diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index ca8f8033a2d54..45606153db58e 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -465,6 +465,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // Default/requested number of work groups for the function. SmallVector MaxNumWorkGroups = {0, 0, 0}; + // Requested cluster dimensions. + AMDGPU::ClusterDimsAttr ClusterDims; + private: unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; @@ -1207,6 +1210,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; } unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; } unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; } + + AMDGPU::ClusterDimsAttr getClusterDims() const { return ClusterDims; } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 1637c06936f9b..c501ebba0c7ed 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -462,10 +462,6 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; - bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; @@ -1375,41 +1371,6 @@ bool SIGfx90ACacheControl::enableLoadCacheBypass( return Changed; } -bool SIGfx90ACacheControl::enableStoreCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(!MI->mayLoad() && MI->mayStore()); - bool 
Changed = false; - - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - /// Do not set glc for store atomic operations as they implicitly write - /// through the L1 cache. - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. Store atomics implicitly write through the L1 - // cache. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - bool SIGfx90ACacheControl::enableRMWCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 745e4086bc7fe..aa028c850bd49 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -8,6 +8,7 @@ #include "SIOptimizeExecMasking.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" @@ -25,12 +26,20 @@ using namespace llvm; namespace { class SIOptimizeExecMasking { - MachineFunction *MF = nullptr; - const GCNSubtarget *ST = nullptr; - const SIRegisterInfo *TRI = nullptr; - const SIInstrInfo *TII = nullptr; - const MachineRegisterInfo *MRI = nullptr; - MCRegister Exec; +public: + SIOptimizeExecMasking(MachineFunction *MF) + : MF(MF), ST(&MF->getSubtarget()), TII(ST->getInstrInfo()), + TRI(&TII->getRegisterInfo()), MRI(&MF->getRegInfo()), + LMC(AMDGPU::LaneMaskConstants::get(*ST)) {} + bool run(); + +private: + MachineFunction *MF; + const 
GCNSubtarget *ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + const AMDGPU::LaneMaskConstants &LMC; DenseMap SaveExecVCmpMapping; SmallVector, 1> OrXors; @@ -57,13 +66,10 @@ class SIOptimizeExecMasking { bool optimizeExecSequence(); void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI); bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, - MachineInstr &VCmp, MCRegister Exec) const; + MachineInstr &VCmp) const; void tryRecordOrSaveexecXorSequence(MachineInstr &MI); bool optimizeOrSaveexecXorSequences(); - -public: - bool run(MachineFunction &MF); }; class SIOptimizeExecMaskingLegacy : public MachineFunctionPass { @@ -91,9 +97,9 @@ class SIOptimizeExecMaskingLegacy : public MachineFunctionPass { PreservedAnalyses SIOptimizeExecMaskingPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &) { - SIOptimizeExecMasking Impl; + SIOptimizeExecMasking Impl(&MF); - if (!Impl.run(MF)) + if (!Impl.run()) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); @@ -120,7 +126,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B32_term: { const MachineOperand &Src = MI.getOperand(1); - if (Src.isReg() && Src.getReg() == Exec) + if (Src.isReg() && Src.getReg() == LMC.ExecReg) return MI.getOperand(0).getReg(); } } @@ -135,7 +141,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const { case AMDGPU::S_MOV_B64: case AMDGPU::S_MOV_B32: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg()) + if (Dst.isReg() && Dst.getReg() == LMC.ExecReg && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; } @@ -471,7 +477,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) { LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); - 
PrepareExecInst->getOperand(0).setReg(Exec); + PrepareExecInst->getOperand(0).setReg(LMC.ExecReg); LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); @@ -496,7 +502,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { J = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); J != JE; ++J) { - if (SaveExecInst && J->readsRegister(Exec, TRI)) { + if (SaveExecInst && J->readsRegister(LMC.ExecReg, TRI)) { LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); // Make sure this is inserted after any VALU ops that may have been // scheduled in between. @@ -580,8 +586,8 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { CopyToExecInst->eraseFromParent(); for (MachineInstr *OtherInst : OtherUseInsts) { - OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, - *TRI); + OtherInst->substituteRegister(CopyToExec, LMC.ExecReg, + AMDGPU::NoSubRegister, *TRI); } Changed = true; @@ -593,7 +599,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() { // Inserts the optimized s_mov_b32 / v_cmpx sequence based on the // operands extracted from a v_cmp ..., s_and_saveexec pattern. bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence( - MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const { + MachineInstr &SaveExecInstr, MachineInstr &VCmp) const { const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); if (NewOpcode == -1) @@ -610,7 +616,7 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence( unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(*SaveExecInstr.getParent(), InsertPosIt, SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) - .addReg(Exec); + .addReg(LMC.ExecReg); } // Omit dst as V_CMPX is implicitly writing to EXEC. @@ -661,10 +667,7 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( if (!ST->hasGFX10_3Insts()) return; - const unsigned AndSaveExecOpcode = - ST->isWave32() ? 
AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - - if (MI.getOpcode() != AndSaveExecOpcode) + if (MI.getOpcode() != LMC.AndSaveExecOpc) return; Register SaveExecDest = MI.getOperand(0).getReg(); @@ -690,7 +693,7 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); }, - {Exec, SaveExecSrc0->getReg()}); + {LMC.ExecReg, SaveExecSrc0->getReg()}); if (!VCmp) return; @@ -748,32 +751,28 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( // to be replaced with // s_andn2_saveexec s_o, s_i. void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) { - const unsigned XorOpcode = - ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; - - if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) { + if (MI.getOpcode() == LMC.XorOpc && &MI != &MI.getParent()->front()) { const MachineOperand &XorDst = MI.getOperand(0); const MachineOperand &XorSrc0 = MI.getOperand(1); const MachineOperand &XorSrc1 = MI.getOperand(2); - if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() && + if (XorDst.isReg() && XorDst.getReg() == LMC.ExecReg && XorSrc0.isReg() && XorSrc1.isReg() && - (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) { - const unsigned OrSaveexecOpcode = ST->isWave32() - ? AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64; + (XorSrc0.getReg() == LMC.ExecReg || XorSrc1.getReg() == LMC.ExecReg)) { // Peek at the previous instruction and check if this is a relevant // s_or_saveexec instruction. 
MachineInstr &PossibleOrSaveexec = *MI.getPrevNode(); - if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode) + if (PossibleOrSaveexec.getOpcode() != LMC.OrSaveExecOpc) return; const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0); const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1); if (OrDst.isReg() && OrSrc0.isReg()) { - if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) || - (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) { + if ((XorSrc0.getReg() == LMC.ExecReg && + XorSrc1.getReg() == OrDst.getReg()) || + (XorSrc0.getReg() == OrDst.getReg() && + XorSrc1.getReg() == LMC.ExecReg)) { OrXors.emplace_back(&PossibleOrSaveexec, &MI); } } @@ -787,15 +786,13 @@ bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() { } bool Changed = false; - const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32 - : AMDGPU::S_ANDN2_SAVEEXEC_B64; for (const auto &Pair : OrXors) { MachineInstr *Or = nullptr; MachineInstr *Xor = nullptr; std::tie(Or, Xor) = Pair; BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(), - TII->get(Andn2Opcode), Or->getOperand(0).getReg()) + TII->get(LMC.AndN2SaveExecOpc), Or->getOperand(0).getReg()) .addReg(Or->getOperand(1).getReg()); Or->eraseFromParent(); @@ -811,24 +808,17 @@ bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - return SIOptimizeExecMasking().run(MF); + return SIOptimizeExecMasking(&MF).run(); } -bool SIOptimizeExecMasking::run(MachineFunction &MF) { - this->MF = &MF; - ST = &MF.getSubtarget(); - TRI = ST->getRegisterInfo(); - TII = ST->getInstrInfo(); - MRI = &MF.getRegInfo(); - Exec = TRI->getExec(); - +bool SIOptimizeExecMasking::run() { bool Changed = optimizeExecSequence(); OrXors.clear(); SaveExecVCmpMapping.clear(); KillFlagCandidates.clear(); static unsigned SearchWindow = 10; - for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock &MBB : *MF) { unsigned 
SearchCount = 0; for (auto &MI : llvm::reverse(MBB)) { @@ -842,7 +832,7 @@ bool SIOptimizeExecMasking::run(MachineFunction &MF) { tryRecordOrSaveexecXorSequence(MI); tryRecordVCmpxAndSaveexecSequence(MI); - if (MI.modifiesRegister(Exec, TRI)) { + if (MI.modifiesRegister(LMC.ExecReg, TRI)) { break; } @@ -855,7 +845,7 @@ bool SIOptimizeExecMasking::run(MachineFunction &MF) { MachineInstr *SaveExecInstr = Entry.getFirst(); MachineInstr *VCmpInstr = Entry.getSecond(); - Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec); + Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index b2228574378f1..c186f5af78b7f 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -14,6 +14,7 @@ #include "SIOptimizeExecMaskingPreRA.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -28,15 +29,13 @@ namespace { class SIOptimizeExecMaskingPreRA { private: + const GCNSubtarget &ST; const SIRegisterInfo *TRI; const SIInstrInfo *TII; MachineRegisterInfo *MRI; LiveIntervals *LIS; + const AMDGPU::LaneMaskConstants &LMC; - unsigned AndOpc; - unsigned Andn2Opc; - unsigned OrSaveExecOpc; - unsigned XorTermrOpc; MCRegister CondReg; MCRegister ExecReg; @@ -44,7 +43,10 @@ class SIOptimizeExecMaskingPreRA { bool optimizeElseBranch(MachineBasicBlock &MBB); public: - SIOptimizeExecMaskingPreRA(LiveIntervals *LIS) : LIS(LIS) {} + SIOptimizeExecMaskingPreRA(MachineFunction &MF, LiveIntervals *LIS) + : ST(MF.getSubtarget()), TRI(ST.getRegisterInfo()), + TII(ST.getInstrInfo()), MRI(&MF.getRegInfo()), LIS(LIS), + LMC(AMDGPU::LaneMaskConstants::get(ST)) {} bool run(MachineFunction &MF); }; @@ -138,8 +140,8 @@ bool 
SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS); - if (!And || And->getOpcode() != AndOpc || - !And->getOperand(1).isReg() || !And->getOperand(2).isReg()) + if (!And || And->getOpcode() != LMC.AndOpc || !And->getOperand(1).isReg() || + !And->getOperand(2).isReg()) return false; MachineOperand *AndCC = &And->getOperand(1); @@ -207,7 +209,7 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { << *And); MachineInstr *Andn2 = - BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), + BuildMI(MBB, *And, And->getDebugLoc(), TII->get(LMC.AndN2Opc), And->getOperand(0).getReg()) .addReg(ExecReg) .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg()); @@ -294,11 +296,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) { // Check this is an else block. auto First = MBB.begin(); MachineInstr &SaveExecMI = *First; - if (SaveExecMI.getOpcode() != OrSaveExecOpc) + if (SaveExecMI.getOpcode() != LMC.OrSaveExecOpc) return false; auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) { - return MI.getOpcode() == XorTermrOpc; + return MI.getOpcode() == LMC.XorTermOpc; }); if (I == MBB.terminators().end()) return false; @@ -314,7 +316,7 @@ bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) { MachineInstr *AndExecMI = nullptr; I--; while (I != First && !AndExecMI) { - if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg && + if (I->getOpcode() == LMC.AndOpc && I->getOperand(0).getReg() == DstReg && I->getOperand(1).getReg() == Register(ExecReg)) AndExecMI = &*I; I--; @@ -352,7 +354,7 @@ PreservedAnalyses SIOptimizeExecMaskingPreRAPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { auto &LIS = MFAM.getResult(MF); - SIOptimizeExecMaskingPreRA(&LIS).run(MF); + SIOptimizeExecMaskingPreRA(MF, &LIS).run(MF); return 
PreservedAnalyses::all(); } @@ -362,23 +364,12 @@ bool SIOptimizeExecMaskingPreRALegacy::runOnMachineFunction( return false; auto *LIS = &getAnalysis().getLIS(); - return SIOptimizeExecMaskingPreRA(LIS).run(MF); + return SIOptimizeExecMaskingPreRA(MF, LIS).run(MF); } bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - TRI = ST.getRegisterInfo(); - TII = ST.getInstrInfo(); - MRI = &MF.getRegInfo(); - - const bool Wave32 = ST.isWave32(); - AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; - OrSaveExecOpc = - Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; - XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; - CondReg = MCRegister::from(Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC); - ExecReg = MCRegister::from(Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); + CondReg = MCRegister::from(LMC.VccReg); + ExecReg = MCRegister::from(LMC.ExecReg); DenseSet RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI}); bool Changed = false; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 22488384759be..205237fefe785 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1108,8 +1108,8 @@ bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, SIInstrFlags::FlatScratch); } -const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( - const MachineFunction &MF, unsigned Kind) const { +const TargetRegisterClass * +SIRegisterInfo::getPointerRegClass(unsigned Kind) const { // This is inaccurate. It depends on the instruction and address space. The // only place where we should hit this is for dealing with frame indexes / // private accesses, so this is correct in that case. 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index eeefef1116aa3..7b91ba7bc581f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -154,8 +154,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override; - const TargetRegisterClass *getPointerRegClass( - const MachineFunction &MF, unsigned Kind = 0) const override; + const TargetRegisterClass * + getPointerRegClass(unsigned Kind = 0) const override; /// Returns a legal register class to copy a register in the specified class /// to or from. If it is possible to copy the register directly without using diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 5f5eec49bab06..4e1876db41d3d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -228,16 +228,12 @@ def SGPR_NULL64 : // need them, we need to do a 64 bit load and extract the bits manually. multiclass ApertureRegister regIdx> { let isConstant = true in { - // FIXME: We shouldn't need to define subregisters for these (nor add them to any 16 bit - // register classes), but if we don't it seems to confuse the TableGen - // backend and we end up with a lot of weird register pressure sets and classes. 
defm _LO : SIRegLoHi16 ; - defm _HI : SIRegLoHi16 <"", regIdx>; - - def "" : RegisterWithSubRegs(NAME#_LO), !cast(NAME#_HI)]> { + def "" : RegisterWithSubRegs(NAME#_LO)]> { let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; + let SubRegIndices = [sub0]; let HWEncoding = !cast(NAME#_LO).HWEncoding; + let CoveredBySubRegs = 0; } } // isConstant = true } @@ -790,8 +786,7 @@ let GeneratePressureSet = 0, HasSGPR = 1 in { def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO, - SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI, - SRC_SHARED_LIMIT_HI, SRC_PRIVATE_BASE_HI, SRC_PRIVATE_LIMIT_HI, SRC_POPS_EXITING_WAVE_ID, + SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_POPS_EXITING_WAVE_ID, SRC_VCCZ, SRC_EXECZ, SRC_SCC, SRC_FLAT_SCRATCH_BASE_LO, SRC_FLAT_SCRATCH_BASE_HI)> { let AllocationPriority = 0; } @@ -801,10 +796,9 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16, TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16, SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16, - SRC_SHARED_BASE_HI_LO16, SRC_SHARED_LIMIT_HI_LO16, SRC_PRIVATE_BASE_HI_LO16, - SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, - SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16, - SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> { + SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16, + EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16, SRC_FLAT_SCRATCH_BASE_LO_LO16, + SRC_FLAT_SCRATCH_BASE_HI_LO16)> { let Size = 16; let isAllocatable = 0; let BaseClassOrder = 16; @@ -825,6 +819,13 @@ def 
SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2 let AllocationPriority = 0; } +def APERTURE_Class : SIRegisterClass<"AMDGPU", Reg64Types.types, 32, + (add SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { + let isAllocatable = 0; + let Size = 64; + let BaseClassOrder = 10000; +} + } // End GeneratePressureSet = 0 // Register class for all scalar registers (SGPRs + Special Registers) @@ -876,8 +877,7 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], } def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32, - (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE, - SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA, + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA, SRC_FLAT_SCRATCH_BASE)> { let CopyCost = 1; let AllocationPriority = 1; @@ -900,6 +900,14 @@ def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f1 let Size = 64; } +def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32, + (add SReg_64, APERTURE_Class)> { + let CopyCost = 1; + let isAllocatable = 0; + let HasSGPR = 1; + let Size = 64; +} + def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, (add SReg_64_XEXEC, SReg_32_XEXEC)> { let CopyCost = 1; @@ -1123,7 +1131,8 @@ def VS_32_Lo256 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2 let Size = 32; } -def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_64)> { +def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, + (add VReg_64, SReg_64_Encodable)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; @@ -1131,7 +1140,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6 } def VS_64_Align2 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, - (add VReg_64_Align2, SReg_64)> { + (add VReg_64_Align2, SReg_64_Encodable)> { let 
isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; @@ -1145,7 +1154,8 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3 let Size = 32; } -def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64_Lo256_Align2, SReg_64)> { +def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, + (add VReg_64_Lo256_Align2, SReg_64_Encodable)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; @@ -1225,7 +1235,7 @@ def SSrc_bf16: SrcRegOrImm9 ; def SSrc_f16 : SrcRegOrImm9 ; def SSrc_b32 : SrcRegOrImm9 ; def SSrc_f32 : SrcRegOrImm9 ; -def SSrc_b64 : SrcRegOrImm9 ; +def SSrc_b64 : SrcRegOrImm9 ; def SSrcOrLds_b32 : SrcRegOrImm9 ; @@ -1327,7 +1337,7 @@ def VGPROp_16 : VGPROp { } def VGPROp_32 : VGPROp; -foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "512", "1024"] in { +foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "320", "352", "384", "512", "1024"] in { def VGPROp_#size : VGPROp("VReg_"#size)>; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 1198bbc310daa..6611e1e6507e1 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -69,6 +69,7 @@ #include "SIWholeQuadMode.h" #include "AMDGPU.h" +#include "AMDGPULaneMaskUtils.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/MapVector.h" @@ -155,7 +156,7 @@ class SIWholeQuadMode { MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) : ST(&MF.getSubtarget()), TII(ST->getInstrInfo()), TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT), - PDT(PDT) {} + PDT(PDT), LMC(AMDGPU::LaneMaskConstants::get(*ST)) {} bool run(MachineFunction &MF); private: @@ -166,15 +167,8 @@ class SIWholeQuadMode { LiveIntervals *LIS; MachineDominatorTree *MDT; MachinePostDominatorTree *PDT; + const AMDGPU::LaneMaskConstants &LMC; - unsigned AndOpc; - unsigned AndTermOpc; 
- unsigned AndN2Opc; - unsigned XorOpc; - unsigned AndSaveExecOpc; - unsigned AndSaveExecTermOpc; - unsigned WQMOpc; - Register Exec; Register LiveMaskReg; DenseMap Instructions; @@ -882,14 +876,12 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) { const MachineOperand &Op1 = MI.getOperand(1); // VCC represents lanes killed. - Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; - if (TRI->isVGPR(*MRI, Op0.getReg())) { Opcode = AMDGPU::getVOPe32(Opcode); VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); } else { VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .addReg(VCC, RegState::Define) + .addReg(LMC.VccReg, RegState::Define) .addImm(0) // src0 modifiers .add(Op1) .addImm(0) // src1 modifiers @@ -898,9 +890,9 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) { } MachineInstr *MaskUpdateMI = - BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) - .addReg(VCC); + .addReg(LMC.VccReg); // State of SCC represents whether any lanes are live in mask, // if SCC is 0 then no lanes will be alive anymore. 
@@ -908,7 +900,9 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); MachineInstr *ExecMaskMI = - BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC); + BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LMC.ExecReg) + .addReg(LMC.ExecReg) + .addReg(LMC.VccReg); assert(MBB.succ_size() == 1); @@ -942,9 +936,9 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) { if (Op.isImm()) { if (Op.getImm() == KillVal) { // Static: all active lanes are killed - MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) - .addReg(Exec); + .addReg(LMC.ExecReg); } else { // Static: kill does nothing bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end(); @@ -964,14 +958,15 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) { // Op represents live lanes after kill, // so exec mask needs to be factored in. 
TmpReg = MRI->createVirtualRegister(TRI->getBoolRC()); - ComputeKilledMaskMI = - BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op); - MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + ComputeKilledMaskMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), TmpReg) + .addReg(LMC.ExecReg) + .add(Op); + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) .addReg(TmpReg); } else { // Op represents lanes to kill - MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) .add(Op); } @@ -990,24 +985,25 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) { if (IsDemote) { // Demote - deactivate quads with only helper lanes LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); - WQMMaskMI = - BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); - NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec) - .addReg(Exec) + WQMMaskMI = BuildMI(MBB, MI, DL, TII->get(LMC.WQMOpc), LiveMaskWQM) + .addReg(LiveMaskReg); + NewTerm = BuildMI(MBB, MI, DL, TII->get(LMC.AndOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(LiveMaskWQM); } else { // Kill - deactivate lanes no longer in live mask if (Op.isImm()) { - unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); + NewTerm = + BuildMI(MBB, &MI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(0); } else if (!IsWQM) { - NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) - .addReg(Exec) + NewTerm = BuildMI(MBB, &MI, DL, TII->get(LMC.AndOpc), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(LiveMaskReg); } else { - unsigned Opcode = KillVal ? AndN2Opc : AndOpc; - NewTerm = - BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); + unsigned Opcode = KillVal ? 
LMC.AndN2Opc : LMC.AndOpc; + NewTerm = BuildMI(MBB, &MI, DL, TII->get(Opcode), LMC.ExecReg) + .addReg(LMC.ExecReg) + .add(Op); } } @@ -1183,13 +1179,14 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineInstr *MI; if (SaveWQM) { - unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; + unsigned Opcode = + IsTerminator ? LMC.AndSaveExecTermOpc : LMC.AndSaveExecOpc; MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) .addReg(LiveMaskReg); } else { - unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc; - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec) - .addReg(Exec) + unsigned Opcode = IsTerminator ? LMC.AndTermOpc : LMC.AndOpc; + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), LMC.ExecReg) + .addReg(LMC.ExecReg) .addReg(LiveMaskReg); } @@ -1203,10 +1200,11 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, MachineInstr *MI; if (SavedWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), LMC.ExecReg) .addReg(SavedWQM); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg) + .addReg(LMC.ExecReg); } LIS->InsertMachineInstrInMaps(*MI); @@ -1246,11 +1244,11 @@ void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, if (CurrentStrictState == StateStrictWWM) { MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), - Exec) + LMC.ExecReg) .addReg(SavedOrig); } else { MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM), - Exec) + LMC.ExecReg) .addReg(SavedOrig); } LIS->InsertMachineInstrInMaps(*MI); @@ -1280,7 +1278,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI, if (IsEntry) { // Skip the instruction that saves LiveMask if (II != IE && II->getOpcode() == AMDGPU::COPY && - II->getOperand(1).getReg() == TRI->getExec()) + 
II->getOperand(1).getReg() == LMC.ExecReg) ++II; } @@ -1565,18 +1563,14 @@ bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { MachineBasicBlock *MBB = MI.getParent(); - bool IsWave32 = ST->isWave32(); if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) { assert(MBB == &MBB->getParent()->front() && "init whole wave not in entry block"); Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC()); - MachineInstr *SaveExec = - BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), - TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 - : AMDGPU::S_OR_SAVEEXEC_B64), - EntryExec) - .addImm(-1); + MachineInstr *SaveExec = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), + TII->get(LMC.OrSaveExecOpc), EntryExec) + .addImm(-1); // Replace all uses of MI's destination reg with EntryExec. MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec); @@ -1596,11 +1590,9 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { // This should be before all vector instructions. - MachineInstr *InitMI = - BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), - TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), - Exec) - .addImm(MI.getOperand(0).getImm()); + MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), + TII->get(LMC.MovOpc), LMC.ExecReg) + .addImm(MI.getOperand(0).getImm()); if (LIS) { LIS->RemoveMachineInstrFromMaps(MI); LIS->InsertMachineInstrInMaps(*InitMI); @@ -1644,19 +1636,14 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) .addReg(InputReg) .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); - auto BfmMI = - BuildMI(*MBB, FirstMI, DL, - TII->get(IsWave32 ? 
AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) - .addReg(CountReg) - .addImm(0); + auto BfmMI = BuildMI(*MBB, FirstMI, DL, TII->get(LMC.BfmOpc), LMC.ExecReg) + .addReg(CountReg) + .addImm(0); auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) .addReg(CountReg, RegState::Kill) .addImm(WavefrontSize); auto CmovMI = - BuildMI(*MBB, FirstMI, DL, - TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), - Exec) - .addImm(-1); + BuildMI(*MBB, FirstMI, DL, TII->get(LMC.CMovOpc), LMC.ExecReg).addImm(-1); if (!LIS) { MI.eraseFromParent(); @@ -1711,30 +1698,10 @@ bool SIWholeQuadMode::run(MachineFunction &MF) { SetInactiveInstrs.clear(); StateTransition.clear(); - if (ST->isWave32()) { - AndOpc = AMDGPU::S_AND_B32; - AndTermOpc = AMDGPU::S_AND_B32_term; - AndN2Opc = AMDGPU::S_ANDN2_B32; - XorOpc = AMDGPU::S_XOR_B32; - AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; - AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term; - WQMOpc = AMDGPU::S_WQM_B32; - Exec = AMDGPU::EXEC_LO; - } else { - AndOpc = AMDGPU::S_AND_B64; - AndTermOpc = AMDGPU::S_AND_B64_term; - AndN2Opc = AMDGPU::S_ANDN2_B64; - XorOpc = AMDGPU::S_XOR_B64; - AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; - AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term; - WQMOpc = AMDGPU::S_WQM_B64; - Exec = AMDGPU::EXEC; - } - const char GlobalFlags = analyzeFunction(MF); bool Changed = false; - LiveMaskReg = Exec; + LiveMaskReg = LMC.ExecReg; MachineBasicBlock &Entry = MF.front(); MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed); @@ -1748,7 +1715,7 @@ bool SIWholeQuadMode::run(MachineFunction &MF) { LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(Exec); + .addReg(LMC.ExecReg); LIS->InsertMachineInstrInMaps(*MI); Changed = true; } @@ -1779,8 +1746,9 @@ bool SIWholeQuadMode::run(MachineFunction &MF) { Changed |= lowerKillInstrs(false); } else if (GlobalFlags == 
StateWQM) { // Shader only needs WQM - auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec) - .addReg(Exec); + auto MI = + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg) + .addReg(LMC.ExecReg); LIS->InsertMachineInstrInMaps(*MI); lowerKillInstrs(true); Changed = true; @@ -1798,7 +1766,7 @@ bool SIWholeQuadMode::run(MachineFunction &MF) { } // Compute live range for live mask - if (LiveMaskReg != Exec) + if (LiveMaskReg != LMC.ExecReg) LIS->createAndComputeVirtRegInterval(LiveMaskReg); // Physical registers like SCC aren't tracked by default anyway, so just diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index fe94887cdff98..296ce5a46287c 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1127,19 +1127,26 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo < "$sdst, $simm16" >; -// This is hasSideEffects to allow its use in readcyclecounter selection. // FIXME: Need to truncate immediate to 16-bits. -// FIXME: Should have separate pseudos for known may read MODE and -// only read MODE. -def S_GETREG_B32 : SOPK_Pseudo < +class S_GETREG_B32_Pseudo pattern=[]> : SOPK_Pseudo < "s_getreg_b32", (outs SReg_32:$sdst), (ins hwreg:$simm16), - "$sdst, $simm16", - [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> { + "$sdst, $simm16", pattern>; + +// This is hasSideEffects to allow its use in readcyclecounter selection. +// FIXME: Should have separate pseudos for known may read MODE and +// only read MODE. +def S_GETREG_B32 : S_GETREG_B32_Pseudo< + [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> { let hasSideEffects = 1; let Uses = [MODE]; } +// A version of the pseudo for reading hardware register fields that are +// known to remain the same during the course of the run. Has no side +// effects and doesn't read MODE. 
+def S_GETREG_B32_const : S_GETREG_B32_Pseudo; + let Defs = [MODE], Uses = [MODE] in { // FIXME: Need to truncate immediate to 16-bits. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 40da4f96aefdb..c80302e03beea 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3157,6 +3157,34 @@ bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { return isUInt<32>(Val) || isInt<32>(Val); } +int64_t encode32BitLiteral(int64_t Imm, OperandType Type) { + switch (Type) { + default: + break; + case OPERAND_REG_IMM_BF16: + case OPERAND_REG_IMM_FP16: + case OPERAND_REG_INLINE_C_BF16: + case OPERAND_REG_INLINE_C_FP16: + return Imm & 0xffff; + case OPERAND_INLINE_SPLIT_BARRIER_INT32: + case OPERAND_REG_IMM_FP32: + case OPERAND_REG_IMM_INT32: + case OPERAND_REG_IMM_V2BF16: + case OPERAND_REG_IMM_V2FP16: + case OPERAND_REG_IMM_V2FP32: + case OPERAND_REG_IMM_V2INT16: + case OPERAND_REG_IMM_V2INT32: + case OPERAND_REG_INLINE_AC_FP32: + case OPERAND_REG_INLINE_AC_INT32: + case OPERAND_REG_INLINE_C_FP32: + case OPERAND_REG_INLINE_C_INT32: + return Lo_32(Imm); + case OPERAND_REG_IMM_FP64: + return Hi_32(Imm); + } + return Imm; +} + bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); @@ -3533,6 +3561,54 @@ bool isPackedFP32Inst(unsigned Opc) { } } +const std::array &ClusterDimsAttr::getDims() const { + assert(isFixedDims() && "expect kind to be FixedDims"); + return Dims; +} + +std::string ClusterDimsAttr::to_string() const { + SmallString<10> Buffer; + raw_svector_ostream OS(Buffer); + + switch (getKind()) { + case Kind::Unknown: + return ""; + case Kind::NoCluster: { + OS << EncoNoCluster << ',' << EncoNoCluster << ',' << EncoNoCluster; + return Buffer.c_str(); + } + case Kind::VariableDims: { + OS << EncoVariableDims << ',' << EncoVariableDims << ',' + << EncoVariableDims; + return Buffer.c_str(); + } + case Kind::FixedDims: { + 
OS << Dims[0] << ',' << Dims[1] << ',' << Dims[2]; + return Buffer.c_str(); + } + } + llvm_unreachable("Unknown ClusterDimsAttr kind"); +} + +ClusterDimsAttr ClusterDimsAttr::get(const Function &F) { + std::optional> Attr = + getIntegerVecAttribute(F, "amdgpu-cluster-dims", /*Size=*/3); + ClusterDimsAttr::Kind AttrKind = Kind::FixedDims; + + if (!Attr.has_value()) + AttrKind = Kind::Unknown; + else if (all_of(*Attr, [](unsigned V) { return V == EncoNoCluster; })) + AttrKind = Kind::NoCluster; + else if (all_of(*Attr, [](unsigned V) { return V == EncoVariableDims; })) + AttrKind = Kind::VariableDims; + + ClusterDimsAttr A(AttrKind); + if (AttrKind == Kind::FixedDims) + A.Dims = {(*Attr)[0], (*Attr)[1], (*Attr)[2]}; + + return A; +} + } // namespace AMDGPU raw_ostream &operator<<(raw_ostream &OS, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3fcd16f9290b1..37b0262966160 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1718,6 +1718,9 @@ bool isInlinableLiteralV2F16(uint32_t Literal); LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); +LLVM_READNONE +int64_t encode32BitLiteral(int64_t Imm, OperandType Type); + bool isArgPassedInSGPR(const Argument *Arg); bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo); @@ -1813,6 +1816,50 @@ bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode); /// must be defined in terms of bytes. 
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST); +class ClusterDimsAttr { +public: + enum class Kind { Unknown, NoCluster, VariableDims, FixedDims }; + + ClusterDimsAttr() = default; + + Kind getKind() const { return AttrKind; } + + bool isUnknown() const { return getKind() == Kind::Unknown; } + + bool isNoCluster() const { return getKind() == Kind::NoCluster; } + + bool isFixedDims() const { return getKind() == Kind::FixedDims; } + + bool isVariableDims() const { return getKind() == Kind::VariableDims; } + + void setUnknown() { *this = ClusterDimsAttr(Kind::Unknown); } + + void setNoCluster() { *this = ClusterDimsAttr(Kind::NoCluster); } + + void setVariableDims() { *this = ClusterDimsAttr(Kind::VariableDims); } + + /// \returns the dims stored. Note that this function can only be called if + /// the kind is \p Fixed. + const std::array &getDims() const; + + bool operator==(const ClusterDimsAttr &RHS) const { + return AttrKind == RHS.AttrKind && Dims == RHS.Dims; + } + + std::string to_string() const; + + static ClusterDimsAttr get(const Function &F); + +private: + enum Encoding { EncoNoCluster = 0, EncoVariableDims = 1024 }; + + ClusterDimsAttr(Kind AttrKind) : AttrKind(AttrKind) {} + + std::array Dims = {0, 0, 0}; + + Kind AttrKind = Kind::Unknown; +}; + } // end namespace AMDGPU raw_ostream &operator<<(raw_ostream &OS, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 6f778a0d262af..f7279b664ed27 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -966,9 +966,14 @@ class MAIInst : MAIInst { // Append operands from V_MFMA_LD_SCALE_B32, but we need to rename them. + // Restrict to VGPR only (VRegSrc_32) for the scale operands to workaround a + // hardware design defect: For all Inline/SGPR constants, SP HW use bits + // [30:23] as the scale. 
+ // TODO: We may still be able to allow Inline Constants/SGPR, with a proper + // shift, to obtain a potentially better performance. let InOperandList = !con(BaseInst.InOperandList, - (ins VSrc_b32:$scale_src0, - VSrc_b32:$scale_src1, + (ins VRegSrc_32:$scale_src0, + VRegSrc_32:$scale_src1, op_sel0:$src0_modifiers, op_sel_hi0:$src1_modifiers)); let AsmOperands = diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 5550a0c08b918..b900510d7622a 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -414,10 +414,9 @@ class VOP3a_BITOP3_gfx12 op, VOPProfile p> : VOP3e_gfx11_gfx12 { } class VOP3a_ScaleSel_gfx1250 op, VOPProfile p> : VOP3e_gfx11_gfx12 { - bits<3> scale_sel; + bits<4> scale_sel; - let Inst{13-11} = scale_sel; - let Inst{14} = 0; + let Inst{14-11} = scale_sel; } class VOP3Interp_gfx10 op, VOPProfile p> : VOP3e_gfx10 { diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/llvm/lib/Target/ARC/ARCTargetMachine.cpp index 370336394ba7f..8e1944062a2c3 100644 --- a/llvm/lib/Target/ARC/ARCTargetMachine.cpp +++ b/llvm/lib/Target/ARC/ARCTargetMachine.cpp @@ -33,12 +33,9 @@ ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl( - T, - "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-" - "f32:32:32-i64:32-f64:32-a:0:32-n32", - TT, CPU, FS, Options, getRelocModel(RM), - getEffectiveCodeModel(CM, CodeModel::Small), OL), + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, + getRelocModel(RM), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique()), Subtarget(TT, std::string(CPU), std::string(FS), *this) { initAsmInfo(); diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index bc20daf0cfbbc..e94220af05a0d 100644 --- 
a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -310,8 +310,7 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, } const TargetRegisterClass * -ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) - const { +ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { return &ARM::GPRRegClass; } @@ -708,7 +707,7 @@ ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const MCInstrDesc &MCID = TII.get(ADDriOpc); Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); @@ -882,7 +881,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, const MCInstrDesc &MCID = MI.getDesc(); const TargetRegisterClass *RegClass = - TII.getRegClass(MCID, FIOperandNum, this, *MI.getParent()->getParent()); + TII.getRegClass(MCID, FIOperandNum, this); if (Offset == 0 && (FrameReg.isVirtual() || RegClass->contains(FrameReg))) // Must be addrmode4/6. 
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 69e10ac2a54d2..5b67b34089d7e 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -91,8 +91,7 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo { MCRegister PhysReg) const override; const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; + getPointerRegClass(unsigned Kind = 0) const override; const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index a8da70eadea5b..138981ad92a87 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2364,7 +2364,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, break; const MCInstrDesc &MCID = MI.getDesc(); - const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI, MF); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI); if (RegClass && !RegClass->contains(ARM::SP)) HasNonSPFrameIndex = true; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index eea0cb61af2bf..cd4299b7a1a53 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2424,7 +2424,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps( Ops.pop_back(); const MCInstrDesc &MCID = TII->get(NewOpc); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); MRI->constrainRegClass(FirstReg, TRC); MRI->constrainRegClass(SecondReg, TRC); @@ -3014,7 +3014,7 @@ static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg, MachineFunction *MF = MI->getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const MCInstrDesc &MCID = 
TII->get(MI->getOpcode()); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI, *MF); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI); MRI.constrainRegClass(NewBaseReg, TRC); int OldOffset = MI->getOperand(BaseOp + 1).getImm(); @@ -3071,10 +3071,10 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, const MCInstrDesc &MCID = TII->get(NewOpcode); // Constrain the def register class - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); MRI.constrainRegClass(NewReg, TRC); // And do the same for the base operand - TRC = TII->getRegClass(MCID, 2, TRI, *MF); + TRC = TII->getRegClass(MCID, 2, TRI); MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC); unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask); diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index fedf9e2cf34b1..346776e0c4b25 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -121,62 +121,6 @@ static std::unique_ptr createTLOF(const Triple &TT) { return std::make_unique(); } -static std::string computeDataLayout(const Triple &TT, - const TargetOptions &Options, - bool isLittle) { - auto ABI = ARM::computeTargetABI(TT, Options.MCOptions.ABIName); - std::string Ret; - - if (isLittle) - // Little endian. - Ret += "e"; - else - // Big endian. - Ret += "E"; - - Ret += DataLayout::getManglingComponent(TT); - - // Pointers are 32 bits and aligned to 32 bits. - Ret += "-p:32:32"; - - // Function pointers are aligned to 8 bits (because the LSB stores the - // ARM/Thumb state). - Ret += "-Fi8"; - - // ABIs other than APCS have 64 bit integers with natural alignment. - if (ABI != ARM::ARM_ABI_APCS) - Ret += "-i64:64"; - - // We have 64 bits floats. The APCS ABI requires them to be aligned to 32 - // bits, others to 64 bits. We always try to align to 64 bits. 
- if (ABI == ARM::ARM_ABI_APCS) - Ret += "-f64:32:64"; - - // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others - // to 64. We always ty to give them natural alignment. - if (ABI == ARM::ARM_ABI_APCS) - Ret += "-v64:32:64-v128:32:128"; - else if (ABI != ARM::ARM_ABI_AAPCS16) - Ret += "-v128:64:128"; - - // Try to align aggregates to 32 bits (the default is 64 bits, which has no - // particular hardware support on 32-bit ARM). - Ret += "-a:0:32"; - - // Integer registers are 32 bits. - Ret += "-n32"; - - // The stack is 64 bit aligned on AAPCS and 32 bit aligned everywhere else. - if (ABI == ARM::ARM_ABI_AAPCS16) - Ret += "-S128"; - else if (ABI == ARM::ARM_ABI_AAPCS) - Ret += "-S64"; - else - Ret += "-S32"; - - return Ret; -} - static Reloc::Model getEffectiveRelocModel(const Triple &TT, std::optional RM) { if (!RM) @@ -201,12 +145,13 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, const TargetOptions &Options, std::optional RM, std::optional CM, - CodeGenOptLevel OL, bool isLittle) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT, Options, isLittle), TT, - CPU, FS, Options, getEffectiveRelocModel(TT, RM), - getEffectiveCodeModel(CM, CodeModel::Small), OL), + CodeGenOptLevel OL) + : CodeGenTargetMachineImpl( + T, TT.computeDataLayout(Options.MCOptions.ABIName), TT, CPU, FS, + Options, getEffectiveRelocModel(TT, RM), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TargetABI(ARM::computeTargetABI(TT, Options.MCOptions.ABIName)), - TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) { + TLOF(createTLOF(getTargetTriple())), isLittle(TT.isLittleEndian()) { // Default to triple-appropriate float ABI if (Options.FloatABIType == FloatABI::Default) { @@ -334,7 +279,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + : ARMBaseTargetMachine(T, 
TT, CPU, FS, Options, RM, CM, OL) {} ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -342,7 +287,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} namespace { diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h index 1d73af1da6d02..c417c4c8bae65 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -42,8 +42,7 @@ class ARMBaseTargetMachine : public CodeGenTargetMachineImpl { ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional RM, - std::optional CM, CodeGenOptLevel OL, - bool isLittle); + std::optional CM, CodeGenOptLevel OL); ~ARMBaseTargetMachine() override; const ARMSubtarget *getSubtargetImpl(const Function &F) const override; diff --git a/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/llvm/lib/Target/ARM/MLxExpansionPass.cpp index 00d8d84654ded..8e1bf1d957400 100644 --- a/llvm/lib/Target/ARM/MLxExpansionPass.cpp +++ b/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -283,9 +283,7 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); - const MachineFunction &MF = *MI->getParent()->getParent(); - Register TmpReg = - MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI, MF)); + Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index ce4ee157289df..4b8c2fd569ead 
100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI) {} + : ARMBaseInstrInfo(STI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb1InstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index e91441b12fe6f..431ce38ad6e99 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -46,7 +46,7 @@ PreferNoCSEL("prefer-no-csel", cl::Hidden, cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI) {} + : ARMBaseInstrInfo(STI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb2InstrInfo::getNop() const { @@ -564,8 +564,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool isSub = false; MachineFunction &MF = *MI.getParent()->getParent(); - const TargetRegisterClass *RegClass = - TII.getRegClass(Desc, FrameRegIdx, TRI, MF); + const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx, TRI); // Memory operands in inline assembly always use AddrModeT2_i12. 
if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp index 911502605c227..12875c233312a 100644 --- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -35,12 +35,13 @@ extern cl::opt ReuseFrameIndexVals; using namespace llvm; -ThumbRegisterInfo::ThumbRegisterInfo() = default; +ThumbRegisterInfo::ThumbRegisterInfo(const ARMSubtarget &STI) + : IsThumb1Only(STI.isThumb1Only()) {} const TargetRegisterClass * ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const { - if (!MF.getSubtarget().isThumb1Only()) + if (!IsThumb1Only) return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC, MF); if (ARM::tGPRRegClass.hasSubClassEq(RC)) @@ -49,10 +50,9 @@ ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, } const TargetRegisterClass * -ThumbRegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { - if (!MF.getSubtarget().isThumb1Only()) - return ARMBaseRegisterInfo::getPointerRegClass(MF, Kind); +ThumbRegisterInfo::getPointerRegClass(unsigned Kind) const { + if (!IsThumb1Only) + return ARMBaseRegisterInfo::getPointerRegClass(Kind); return &ARM::tGPRRegClass; } diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/llvm/lib/Target/ARM/ThumbRegisterInfo.h index ccfe211b808a5..1512a09cae200 100644 --- a/llvm/lib/Target/ARM/ThumbRegisterInfo.h +++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.h @@ -23,16 +23,18 @@ namespace llvm { class ARMBaseInstrInfo; struct ThumbRegisterInfo : public ARMBaseRegisterInfo { +private: + const bool IsThumb1Only; + public: - ThumbRegisterInfo(); + explicit ThumbRegisterInfo(const ARMSubtarget &STI); const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override; const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - 
unsigned Kind = 0) const override; + getPointerRegClass(unsigned Kind = 0) const override; /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp index 051affe7110dd..18bea848baeab 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp @@ -289,8 +289,7 @@ Register AVRRegisterInfo::getFrameRegister(const MachineFunction &MF) const { } const TargetRegisterClass * -AVRRegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { +AVRRegisterInfo::getPointerRegClass(unsigned Kind) const { // FIXME: Currently we're using avr-gcc as reference, so we restrict // ptrs to Y and Z regs. Though avr-gcc has buggy implementation // of memory constraint, so we can fix it and bit avr-gcc here ;-) diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.h b/llvm/lib/Target/AVR/AVRRegisterInfo.h index 8eb0cf3039bbd..e69696b4d9160 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.h +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.h @@ -44,8 +44,7 @@ class AVRRegisterInfo : public AVRGenRegisterInfo { Register getFrameRegister(const MachineFunction &MF) const override; const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; + getPointerRegClass(unsigned Kind = 0) const override; /// Splits a 16-bit `DREGS` register into the lo/hi register pair. /// \param Reg A 16-bit register to split. diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp index fbd148478c894..f001d7974669a 100644 --- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp +++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -28,9 +28,6 @@ namespace llvm { -static const char *AVRDataLayout = - "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8:16-a:8"; - /// Processes a CPU name. 
static StringRef getCPU(StringRef CPU) { if (CPU.empty() || CPU == "generic") { @@ -50,8 +47,8 @@ AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, AVRDataLayout, TT, getCPU(CPU), FS, Options, - getEffectiveRelocModel(RM), + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, getCPU(CPU), FS, + Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), SubTarget(TT, std::string(getCPU(CPU)), std::string(FS), *this) { this->TLOF = std::make_unique(); diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index a347794a9a30c..d96f403d2f814 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -234,6 +234,7 @@ struct BPFOperand : public MCParsedAsmOperand { .Case("callx", true) .Case("goto", true) .Case("gotol", true) + .Case("gotox", true) .Case("may_goto", true) .Case("*", true) .Case("exit", true) diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index e3843e0e112e2..77dc4a75a7d68 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -11,52 +11,35 @@ // //===----------------------------------------------------------------------===// +#include "BPFAsmPrinter.h" #include "BPF.h" #include "BPFInstrInfo.h" #include "BPFMCInstLower.h" #include "BTFDebug.h" #include "MCTargetDesc/BPFInstPrinter.h" #include "TargetInfo/BPFTargetInfo.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" #include 
"llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; #define DEBUG_TYPE "asm-printer" -namespace { -class BPFAsmPrinter : public AsmPrinter { -public: - explicit BPFAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer) - : AsmPrinter(TM, std::move(Streamer), ID), BTF(nullptr) {} - - StringRef getPassName() const override { return "BPF Assembly Printer"; } - bool doInitialization(Module &M) override; - void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - const char *ExtraCode, raw_ostream &O) override; - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - const char *ExtraCode, raw_ostream &O) override; - - void emitInstruction(const MachineInstr *MI) override; - - static char ID; - -private: - BTFDebug *BTF; -}; -} // namespace - bool BPFAsmPrinter::doInitialization(Module &M) { AsmPrinter::doInitialization(M); @@ -69,6 +52,45 @@ bool BPFAsmPrinter::doInitialization(Module &M) { return false; } +const BPFTargetMachine &BPFAsmPrinter::getBTM() const { + return static_cast(TM); +} + +bool BPFAsmPrinter::doFinalization(Module &M) { + // Remove unused globals which are previously used for jump table. 
+ const BPFSubtarget *Subtarget = getBTM().getSubtargetImpl(); + if (Subtarget->hasGotox()) { + std::vector Targets; + for (GlobalVariable &Global : M.globals()) { + if (Global.getLinkage() != GlobalValue::PrivateLinkage) + continue; + if (!Global.isConstant() || !Global.hasInitializer()) + continue; + + Constant *CV = dyn_cast(Global.getInitializer()); + if (!CV) + continue; + ConstantArray *CA = dyn_cast(CV); + if (!CA) + continue; + + for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) { + if (!dyn_cast(CA->getOperand(i))) + continue; + } + Targets.push_back(&Global); + } + + for (GlobalVariable *GV : Targets) { + GV->replaceAllUsesWith(PoisonValue::get(GV->getType())); + GV->dropAllReferences(); + GV->eraseFromParent(); + } + } + + return AsmPrinter::doFinalization(M); +} + void BPFAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNum); @@ -150,6 +172,50 @@ void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } +MCSymbol *BPFAsmPrinter::getJTPublicSymbol(unsigned JTI) { + SmallString<60> Name; + raw_svector_ostream(Name) + << "BPF.JT." << MF->getFunctionNumber() << '.' 
<< JTI; + MCSymbol *S = OutContext.getOrCreateSymbol(Name); + if (auto *ES = static_cast(S)) { + ES->setBinding(ELF::STB_GLOBAL); + ES->setType(ELF::STT_OBJECT); + } + return S; +} + +void BPFAsmPrinter::emitJumpTableInfo() { + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + if (!MJTI) + return; + + const std::vector &JT = MJTI->getJumpTables(); + if (JT.empty()) + return; + + const TargetLoweringObjectFile &TLOF = getObjFileLowering(); + const Function &F = MF->getFunction(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); + assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); + unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); + OutStreamer->switchSection(JTS); + for (unsigned JTI = 0; JTI < JT.size(); JTI++) { + ArrayRef JTBBs = JT[JTI].MBBs; + if (JTBBs.empty()) + continue; + + MCSymbol *JTStart = getJTPublicSymbol(JTI); + OutStreamer->emitLabel(JTStart); + for (const MachineBasicBlock *MBB : JTBBs) { + const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); + OutStreamer->emitValue(LHS, EntrySize); + } + const MCExpr *JTSize = + MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); + OutStreamer->emitELFSize(JTStart, JTSize); + } +} + char BPFAsmPrinter::ID = 0; INITIALIZE_PASS(BPFAsmPrinter, "bpf-asm-printer", "BPF Assembly Printer", false, diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h new file mode 100644 index 0000000000000..0cfb2839c8ff9 --- /dev/null +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -0,0 +1,48 @@ +//===-- BPFFrameLowering.h - Define frame lowering for BPF -----*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_BPF_BPFASMPRINTER_H +#define LLVM_LIB_TARGET_BPF_BPFASMPRINTER_H + +#include "BPFTargetMachine.h" +#include "BTFDebug.h" +#include "llvm/CodeGen/AsmPrinter.h" + +namespace llvm { + +class BPFAsmPrinter : public AsmPrinter { +public: + explicit BPFAsmPrinter(TargetMachine &TM, + std::unique_ptr Streamer) + : AsmPrinter(TM, std::move(Streamer), ID), BTF(nullptr), TM(TM) {} + + StringRef getPassName() const override { return "BPF Assembly Printer"; } + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; + void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + const char *ExtraCode, raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + const char *ExtraCode, raw_ostream &O) override; + + void emitInstruction(const MachineInstr *MI) override; + MCSymbol *getJTPublicSymbol(unsigned JTI); + virtual void emitJumpTableInfo() override; + + static char ID; + +private: + BTFDebug *BTF; + TargetMachine &TM; + + const BPFTargetMachine &getBTM() const; +}; + +} // namespace llvm + +#endif /* LLVM_LIB_TARGET_BPF_BPFASMPRINTER_H */ diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index f4f414d192df0..6e5520c3dbb18 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/ValueTypes.h" @@ -38,6 +39,10 @@ static cl::opt 
BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order", cl::Hidden, cl::init(false), cl::desc("Expand memcpy into load/store pairs in order")); +static cl::opt BPFMinimumJumpTableEntries( + "bpf-min-jump-table-entries", cl::init(13), cl::Hidden, + cl::desc("Set minimum number of entries to use a jump table on BPF")); + static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg, SDValue Val = {}) { std::string Str; @@ -67,12 +72,16 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); setOperationAction(ISD::BRCOND, MVT::Other, Expand); + if (!STI.hasGotox()) + setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction(ISD::TRAP, MVT::Other, Custom); setOperationAction({ISD::GlobalAddress, ISD::ConstantPool}, MVT::i64, Custom); + if (STI.hasGotox()) + setOperationAction({ISD::JumpTable, ISD::BlockAddress}, MVT::i64, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); @@ -159,6 +168,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, setBooleanContents(ZeroOrOneBooleanContent); setMaxAtomicSizeInBitsSupported(64); + setMinimumJumpTableEntries(BPFMinimumJumpTableEntries); // Function alignments setMinFunctionAlignment(Align(8)); @@ -246,6 +256,10 @@ bool BPFTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return TargetLoweringBase::isZExtFree(Val, VT2); } +unsigned BPFTargetLowering::getJumpTableEncoding() const { + return MachineJumpTableInfo::EK_BlockAddress; +} + BPFTargetLowering::ConstraintType BPFTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { @@ -316,10 +330,14 @@ SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { report_fatal_error("unimplemented opcode: " + Twine(Op.getOpcode())); case ISD::BR_CC: 
return LowerBR_CC(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: + return LowerBlockAddress(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SDIV: @@ -780,6 +798,11 @@ SDValue BPFTargetLowering::LowerTRAP(SDValue Op, SelectionDAG &DAG) const { return LowerCall(CLI, InVals); } +SDValue BPFTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + JumpTableSDNode *N = cast(Op); + return getAddr(N, DAG); +} + const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((BPFISD::NodeType)Opcode) { case BPFISD::FIRST_NUMBER: @@ -800,17 +823,17 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } -static SDValue getTargetNode(GlobalAddressSDNode *N, const SDLoc &DL, EVT Ty, - SelectionDAG &DAG, unsigned Flags) { - return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags); -} - static SDValue getTargetNode(ConstantPoolSDNode *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG, unsigned Flags) { return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), N->getOffset(), Flags); } +static SDValue getTargetNode(JumpTableSDNode *N, const SDLoc &DL, EVT Ty, + SelectionDAG &DAG, unsigned Flags) { + return DAG.getTargetJumpTable(N->getIndex(), Ty, Flags); +} + template SDValue BPFTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { @@ -827,7 +850,15 @@ SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op, if (N->getOffset() != 0) report_fatal_error("invalid offset for global address: " + Twine(N->getOffset())); - return getAddr(N, DAG); + + const GlobalValue *GVal = N->getGlobal(); + SDLoc DL(Op); + + // Wrap it in a TargetGlobalAddress + SDValue Addr = DAG.getTargetGlobalAddress(GVal, DL, MVT::i64); + + // Emit pseudo instruction + return 
SDValue(DAG.getMachineNode(BPF::LDIMM64, DL, MVT::i64, Addr), 0); } SDValue BPFTargetLowering::LowerConstantPool(SDValue Op, @@ -837,6 +868,18 @@ SDValue BPFTargetLowering::LowerConstantPool(SDValue Op, return getAddr(N, DAG); } +SDValue BPFTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + const BlockAddress *BA = cast(Op)->getBlockAddress(); + SDLoc DL(Op); + + // Wrap it in a TargetBlockAddress + SDValue Addr = DAG.getTargetBlockAddress(BA, MVT::i64); + + // Emit pseudo instruction + return SDValue(DAG.getMachineNode(BPF::LDIMM64, DL, MVT::i64, Addr), 0); +} + unsigned BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg, bool isSigned) const { @@ -900,6 +943,86 @@ BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI, return BB; } +MachineBasicBlock *BPFTargetLowering::EmitInstrWithCustomInserterLDimm64( + MachineInstr &MI, MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const BPFInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetRegisterClass *RC = getRegClassFor(MVT::i64); + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + DebugLoc DL = MI.getDebugLoc(); + + // Build address taken map for Global Varaibles and BlockAddresses + DenseMap AddressTakenBBs; + for (MachineBasicBlock &MBB : *MF) { + if (const BasicBlock *BB = MBB.getBasicBlock()) + if (BB->hasAddressTaken()) + AddressTakenBBs[BB] = &MBB; + } + + MachineOperand &MO = MI.getOperand(1); + assert(MO.isBlockAddress() || MO.isGlobal()); + + MCRegister ResultReg = MI.getOperand(0).getReg(); + Register TmpReg = RegInfo.createVirtualRegister(RC); + + std::vector Targets; + unsigned JTI; + + if (MO.isBlockAddress()) { + auto *BA = MO.getBlockAddress(); + MachineBasicBlock *TgtMBB = AddressTakenBBs[BA->getBasicBlock()]; + assert(TgtMBB); + + Targets.push_back(TgtMBB); + JTI = MF->getOrCreateJumpTableInfo(getJumpTableEncoding()) + ->createJumpTableIndex(Targets); + + BuildMI(*BB, MI, DL, 
TII->get(BPF::LD_imm64), TmpReg) + .addJumpTableIndex(JTI); + BuildMI(*BB, MI, DL, TII->get(BPF::LDD), ResultReg) + .addReg(TmpReg) + .addImm(0); + MI.eraseFromParent(); + return BB; + } + + // Helper: emit LD_imm64 with operand GlobalAddress or JumpTable + auto emitLDImm64 = [&](const GlobalValue *GV = nullptr, unsigned JTI = -1) { + auto MIB = BuildMI(*BB, MI, DL, TII->get(BPF::LD_imm64), ResultReg); + if (GV) + MIB.addGlobalAddress(GV); + else + MIB.addJumpTableIndex(JTI); + MI.eraseFromParent(); + return BB; + }; + + // Must be a global at this point + const GlobalValue *GVal = MO.getGlobal(); + const auto *GV = dyn_cast(GVal); + + if (!GV || GV->getLinkage() != GlobalValue::PrivateLinkage || + !GV->isConstant() || !GV->hasInitializer()) + return emitLDImm64(GVal); + + const auto *CA = dyn_cast(GV->getInitializer()); + if (!CA) + return emitLDImm64(GVal); + + for (const Use &Op : CA->operands()) { + if (!isa(Op)) + return emitLDImm64(GVal); + auto *BA = cast(Op); + MachineBasicBlock *TgtMBB = AddressTakenBBs[BA->getBasicBlock()]; + assert(TgtMBB); + Targets.push_back(TgtMBB); + } + + JTI = MF->getOrCreateJumpTableInfo(getJumpTableEncoding()) + ->createJumpTableIndex(Targets); + return emitLDImm64(nullptr, JTI); +} + MachineBasicBlock * BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -912,6 +1035,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Opc == BPF::Select_32_64); bool isMemcpyOp = Opc == BPF::MEMCPY; + bool isLDimm64Op = Opc == BPF::LDIMM64; #ifndef NDEBUG bool isSelectRIOp = (Opc == BPF::Select_Ri || @@ -919,13 +1043,16 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Opc == BPF::Select_Ri_32 || Opc == BPF::Select_Ri_32_64); - if (!(isSelectRROp || isSelectRIOp || isMemcpyOp)) + if (!(isSelectRROp || isSelectRIOp || isMemcpyOp || isLDimm64Op)) report_fatal_error("unhandled instruction type: " + Twine(Opc)); #endif if (isMemcpyOp) return 
EmitInstrWithCustomInserterMemcpy(MI, BB); + if (isLDimm64Op) + return EmitInstrWithCustomInserterLDimm64(MI, BB); + bool is32BitCmp = (Opc == BPF::Select_32 || Opc == BPF::Select_32_64 || Opc == BPF::Select_Ri_32 || diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h index 8f60261c10e9e..5243d4944667d 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.h +++ b/llvm/lib/Target/BPF/BPFISelLowering.h @@ -66,6 +66,8 @@ class BPFTargetLowering : public TargetLowering { MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; + unsigned getJumpTableEncoding() const override; + private: // Control Instruction Selection Features bool HasAlu32; @@ -81,6 +83,8 @@ class BPFTargetLowering : public TargetLowering { SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRAP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; template SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; @@ -163,7 +167,9 @@ class BPFTargetLowering : public TargetLowering { MachineBasicBlock * EmitInstrWithCustomInserterMemcpy(MachineInstr &MI, MachineBasicBlock *BB) const; - + MachineBasicBlock * + EmitInstrWithCustomInserterLDimm64(MachineInstr &MI, + MachineBasicBlock *BB) const; }; } diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index fb4efcfe86142..409f8b4c253b8 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -182,6 +182,11 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (!isUnpredicatedTerminator(*I)) break; + // From base method doc: ... returning true if it cannot be understood ... + // Indirect branch has multiple destinations and no true/false concepts. 
+ if (I->isIndirectBranch()) + return true; + // A terminator that isn't a branch can't easily be handled // by this analysis. if (!I->isBranch()) @@ -260,3 +265,43 @@ unsigned BPFInstrInfo::removeBranch(MachineBasicBlock &MBB, return Count; } + +int BPFInstrInfo::getJumpTableIndex(const MachineInstr &MI) const { + if (MI.getOpcode() != BPF::JX) + return -1; + + // The pattern looks like: + // %0 = LD_imm64 %jump-table.0 ; load jump-table address + // %1 = ADD_rr %0, $another_reg ; address + offset + // %2 = LDD %1, 0 ; load the actual label + // JX %2 + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + Register Reg = MI.getOperand(0).getReg(); + if (!Reg.isVirtual()) + return -1; + MachineInstr *Ldd = MRI.getUniqueVRegDef(Reg); + if (Ldd == nullptr || Ldd->getOpcode() != BPF::LDD) + return -1; + + Reg = Ldd->getOperand(1).getReg(); + if (!Reg.isVirtual()) + return -1; + MachineInstr *Add = MRI.getUniqueVRegDef(Reg); + if (Add == nullptr || Add->getOpcode() != BPF::ADD_rr) + return -1; + + Reg = Add->getOperand(1).getReg(); + if (!Reg.isVirtual()) + return -1; + MachineInstr *LDimm64 = MRI.getUniqueVRegDef(Reg); + if (LDimm64 == nullptr || LDimm64->getOpcode() != BPF::LD_imm64) + return -1; + + const MachineOperand &MO = LDimm64->getOperand(1); + if (!MO.isJTI()) + return -1; + + return MO.getIndex(); +} diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h index 2359e43e483f8..911e880166d29 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.h +++ b/llvm/lib/Target/BPF/BPFInstrInfo.h @@ -59,6 +59,9 @@ class BPFInstrInfo : public BPFGenInstrInfo { MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL, int *BytesAdded = nullptr) const override; + + int getJumpTableIndex(const MachineInstr &MI) const override; + private: void expandMEMCPY(MachineBasicBlock::iterator) const; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 
de7dae2c8ca68..51c32b22510f0 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -61,6 +61,7 @@ def BPFNoMovsx : Predicate<"!Subtarget->hasMovsx()">; def BPFNoBswap : Predicate<"!Subtarget->hasBswap()">; def BPFHasStoreImm : Predicate<"Subtarget->hasStoreImm()">; def BPFHasLoadAcqStoreRel : Predicate<"Subtarget->hasLoadAcqStoreRel()">; +def BPFHasGotox : Predicate<"Subtarget->hasGotox()">; class ImmediateAsmOperand : AsmOperandClass { let Name = name; @@ -216,6 +217,18 @@ class JMP_RI let BPFClass = BPF_JMP; } +class JMP_IND Pattern> + : TYPE_ALU_JMP { + bits<4> dst; + + let Inst{51-48} = dst; + let BPFClass = BPF_JMP; +} + class JMP_JCOND Pattern> : TYPE_ALU_JMP; defm JSLE : J; def JCOND : JMP_JCOND; + +let Predicates = [BPFHasGotox] in { + let isIndirectBranch = 1, isBarrier = 1 in { + def JX : JMP_IND; + } +} } // ALU instructions @@ -849,8 +868,8 @@ let usesCustomInserter = 1, isCodeGenOnly = 1 in { } // load 64-bit global addr into register -def : Pat<(BPFWrapper tglobaladdr:$in), (LD_imm64 tglobaladdr:$in)>; def : Pat<(BPFWrapper tconstpool:$in), (LD_imm64 tconstpool:$in)>; +def : Pat<(BPFWrapper tjumptable:$in), (LD_imm64 tjumptable:$in)>; // 0xffffFFFF doesn't fit into simm32, optimize common case def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)), @@ -1370,3 +1389,8 @@ let usesCustomInserter = 1, isCodeGenOnly = 1 in { "#memcpy dst: $dst, src: $src, len: $len, align: $align", [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>; } + +// For GlobalValue and BlockAddress. 
+let usesCustomInserter = 1, isCodeGenOnly = 1 in { + def LDIMM64 : Pseudo<(outs GPR:$dst), (ins i64imm:$addr), "", []>; +} diff --git a/llvm/lib/Target/BPF/BPFMCInstLower.cpp b/llvm/lib/Target/BPF/BPFMCInstLower.cpp index 040a1fb750702..7d671d2c464e4 100644 --- a/llvm/lib/Target/BPF/BPFMCInstLower.cpp +++ b/llvm/lib/Target/BPF/BPFMCInstLower.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "BPFMCInstLower.h" +#include "BPFAsmPrinter.h" +#include "BPFISelLowering.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" @@ -19,6 +21,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -77,6 +80,9 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { case MachineOperand::MO_ConstantPoolIndex: MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); break; + case MachineOperand::MO_JumpTableIndex: + MCOp = LowerSymbolOperand(MO, Printer.getJTPublicSymbol(MO.getIndex())); + break; } OutMI.addOperand(MCOp); diff --git a/llvm/lib/Target/BPF/BPFMCInstLower.h b/llvm/lib/Target/BPF/BPFMCInstLower.h index 4bd0f1f0bf1cf..483edd9a02831 100644 --- a/llvm/lib/Target/BPF/BPFMCInstLower.h +++ b/llvm/lib/Target/BPF/BPFMCInstLower.h @@ -12,7 +12,7 @@ #include "llvm/Support/Compiler.h" namespace llvm { -class AsmPrinter; +class BPFAsmPrinter; class MCContext; class MCInst; class MCOperand; @@ -24,10 +24,10 @@ class MachineOperand; class LLVM_LIBRARY_VISIBILITY BPFMCInstLower { MCContext &Ctx; - AsmPrinter &Printer; + BPFAsmPrinter &Printer; public: - BPFMCInstLower(MCContext &ctx, AsmPrinter &printer) + BPFMCInstLower(MCContext &ctx, BPFAsmPrinter &printer) : Ctx(ctx), Printer(printer) {} void Lower(const MachineInstr *MI, MCInst &OutMI) const; diff 
--git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp index a7ecc39fad7b9..8f16fe5bfdb51 100644 --- a/llvm/lib/Target/BPF/BPFSubtarget.cpp +++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp @@ -43,6 +43,8 @@ static cl::opt static cl::opt Disable_load_acq_store_rel( "disable-load-acq-store-rel", cl::Hidden, cl::init(false), cl::desc("Disable load-acquire and store-release insns")); +static cl::opt Disable_gotox("disable-gotox", cl::Hidden, cl::init(false), + cl::desc("Disable gotox insn")); void BPFSubtarget::anchor() {} @@ -66,6 +68,7 @@ void BPFSubtarget::initializeEnvironment() { HasGotol = false; HasStoreImm = false; HasLoadAcqStoreRel = false; + HasGotox = false; } void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { @@ -96,6 +99,7 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { HasGotol = !Disable_gotol; HasStoreImm = !Disable_StoreImm; HasLoadAcqStoreRel = !Disable_load_acq_store_rel; + HasGotox = !Disable_gotox; return; } } diff --git a/llvm/lib/Target/BPF/BPFSubtarget.h b/llvm/lib/Target/BPF/BPFSubtarget.h index aed2211265e23..e870dfdc85ec9 100644 --- a/llvm/lib/Target/BPF/BPFSubtarget.h +++ b/llvm/lib/Target/BPF/BPFSubtarget.h @@ -65,7 +65,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo { // whether cpu v4 insns are enabled. 
bool HasLdsx, HasMovsx, HasBswap, HasSdivSmod, HasGotol, HasStoreImm, - HasLoadAcqStoreRel; + HasLoadAcqStoreRel, HasGotox; std::unique_ptr CallLoweringInfo; std::unique_ptr InstSelector; @@ -94,6 +94,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo { bool hasGotol() const { return HasGotol; } bool hasStoreImm() const { return HasStoreImm; } bool hasLoadAcqStoreRel() const { return HasLoadAcqStoreRel; } + bool hasGotox() const { return HasGotox; } bool isLittleEndian() const { return IsLittleEndian; } diff --git a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.cpp b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.cpp new file mode 100644 index 0000000000000..997f09870bad6 --- /dev/null +++ b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.cpp @@ -0,0 +1,19 @@ +//===------------------ BPFTargetLoweringObjectFile.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BPFTargetLoweringObjectFile.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" + +using namespace llvm; + +MCSection *BPFTargetLoweringObjectFileELF::getSectionForJumpTable( + const Function &F, const TargetMachine &TM, + const MachineJumpTableEntry *JTE) const { + return getContext().getELFSection(".jumptables", ELF::SHT_PROGBITS, 0); +} diff --git a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h new file mode 100644 index 0000000000000..f3064c0c8cb8a --- /dev/null +++ b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h @@ -0,0 +1,25 @@ +//===============- BPFTargetLoweringObjectFile.h -*- C++ -*-================// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_BPF_BPFTARGETLOWERINGOBJECTFILE +#define LLVM_LIB_TARGET_BPF_BPFTARGETLOWERINGOBJECTFILE + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { +class BPFTargetLoweringObjectFileELF : public TargetLoweringObjectFileELF { + +public: + virtual MCSection * + getSectionForJumpTable(const Function &F, const TargetMachine &TM, + const MachineJumpTableEntry *JTE) const override; +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_BPF_BPFTARGETLOWERINGOBJECTFILE diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 527a480354571..ad3df2c879fe7 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -12,6 +12,7 @@ #include "BPFTargetMachine.h" #include "BPF.h" +#include "BPFTargetLoweringObjectFile.h" #include "BPFTargetTransformInfo.h" #include "MCTargetDesc/BPFMCAsmInfo.h" #include "TargetInfo/BPFTargetInfo.h" @@ -59,14 +60,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() { initializeBPFMIPreEmitCheckingPass(PR); } -// DataLayout: little or big endian -static std::string computeDataLayout(const Triple &TT) { - if (TT.getArch() == Triple::bpfeb) - return "E-m:e-p:64:64-i64:64-i128:128-n32:64-S128"; - else - return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"; -} - static Reloc::Model getEffectiveRelocModel(std::optional RM) { return RM.value_or(Reloc::PIC_); } @@ -77,10 +70,10 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT), TT, CPU, FS, Options, + : CodeGenTargetMachineImpl(T, 
TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TLOF(std::make_unique()), + TLOF(std::make_unique()), Subtarget(TT, std::string(CPU), std::string(FS), *this) { if (!DisableCheckUnreachable) { this->Options.TrapUnreachable = true; diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt index eade4cacb7100..3678f1335ca36 100644 --- a/llvm/lib/Target/BPF/CMakeLists.txt +++ b/llvm/lib/Target/BPF/CMakeLists.txt @@ -37,6 +37,7 @@ add_llvm_target(BPFCodeGen BPFRegisterInfo.cpp BPFSelectionDAGInfo.cpp BPFSubtarget.cpp + BPFTargetLoweringObjectFile.cpp BPFTargetMachine.cpp BPFMIPeephole.cpp BPFMIChecking.cpp diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp index 230cf3b0ddbe4..813dddad7d75f 100644 --- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp +++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp @@ -26,6 +26,7 @@ #include using namespace llvm; +using namespace llvm::MCD; #define DEBUG_TYPE "bpf-disassembler" diff --git a/llvm/lib/Target/CSKY/CMakeLists.txt b/llvm/lib/Target/CSKY/CMakeLists.txt index 4b900bc99c271..433f3c821f9ee 100644 --- a/llvm/lib/Target/CSKY/CMakeLists.txt +++ b/llvm/lib/Target/CSKY/CMakeLists.txt @@ -44,6 +44,7 @@ add_llvm_target(CSKYCodeGen SelectionDAG Support Target + TargetParser ADD_TO_COMPONENT CSKY diff --git a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp index ae6ef89fdcd07..d0058b9af14be 100644 --- a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp +++ b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp @@ -33,28 +33,13 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTarget() { initializeCSKYDAGToDAGISelLegacyPass(*Registry); } -static std::string computeDataLayout(const Triple &TT) { - std::string Ret; - - // Only support little endian for now. - // TODO: Add support for big endian. 
- Ret += "e"; - - // CSKY is always 32-bit target with the CSKYv2 ABI as prefer now. - // It's a 4-byte aligned stack with ELF mangling only. - Ret += "-m:e-S32-p:32:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:32" - "-v128:32:32-a:0:32-Fi32-n32"; - - return Ret; -} - CSKYTargetMachine::CSKYTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT), TT, CPU, FS, Options, + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, RM.value_or(Reloc::Static), getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique()) { diff --git a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp index 887e281279535..39e651d52e4d3 100644 --- a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp +++ b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/Endian.h" using namespace llvm; +using namespace llvm::MCD; #define DEBUG_TYPE "csky-disassembler" diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp index 28d4dd64e8945..7e93474e73118 100644 --- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp +++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp @@ -214,7 +214,7 @@ static void validateRootSignature(Module &M, } } - for (const dxbc::RTS0::v1::StaticSampler &S : RSD.StaticSamplers) + for (const mcdxbc::StaticSampler &S : RSD.StaticSamplers) Builder.trackBinding(dxil::ResourceClass::Sampler, S.RegisterSpace, S.ShaderRegister, S.ShaderRegister, &S); diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index f5d5a73c926e9..bcf84403b2c0d 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ 
b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -134,11 +134,8 @@ DirectXTargetMachine::DirectXTargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl( - T, - "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-" - "f32:32-f64:64-n8:16:32:64", - TT, CPU, FS, Options, Reloc::Static, CodeModel::Small, OL), + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, + Reloc::Static, CodeModel::Small, OL), TLOF(std::make_unique()), Subtarget(std::make_unique(TT, CPU, FS, *this)) { initAsmInfo(); diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 0639878c1256f..974f6533411e0 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -34,6 +34,7 @@ #define DEBUG_TYPE "hexagon-disassembler" using namespace llvm; +using namespace llvm::MCD; using namespace Hexagon; using DecodeStatus = MCDisassembler::DecodeStatus; diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 3b7bd1cd1ba94..52e6b0b083c81 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1751,10 +1751,11 @@ namespace { class BitSimplification : public Transformation { public: BitSimplification(BitTracker &bt, const MachineDominatorTree &mdt, - const HexagonInstrInfo &hii, const HexagonRegisterInfo &hri, - MachineRegisterInfo &mri, MachineFunction &mf) - : Transformation(true), MDT(mdt), HII(hii), HRI(hri), MRI(mri), - MF(mf), BT(bt) {} + const HexagonInstrInfo &hii, + const HexagonRegisterInfo &hri, MachineRegisterInfo &mri, + MachineFunction &mf) + : Transformation(true), MDT(mdt), HII(hii), HRI(hri), MRI(mri), BT(bt) { + } bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; @@ 
-1797,7 +1798,6 @@ namespace { const HexagonInstrInfo &HII; const HexagonRegisterInfo &HRI; MachineRegisterInfo &MRI; - MachineFunction &MF; BitTracker &BT; }; @@ -1886,7 +1886,7 @@ bool BitSimplification::matchHalf(unsigned SelfR, bool BitSimplification::validateReg(BitTracker::RegisterRef R, unsigned Opc, unsigned OpNum) { - auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum, &HRI, MF); + auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum, &HRI); auto *RRC = HBS::getFinalVRegClass(R, MRI); return OpRC->hasSubClassEq(RRC); } diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td index fd6d873dd4188..dceb70c8abbf2 100644 --- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td +++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td @@ -6,6 +6,15 @@ // //===----------------------------------------------------------------------===// +// We cannot use the standard CCIfArgVarArg class here since in Hexagon there +// exists a special case for a musl environment. In a musl environment VarArgs +// are treated like non VarArgs. I.e., in a musl environment unnamed arguments +// can also be passed in registers. The CCIfArgVarArg class only checks each +// individual argument, but not whether State.isVarArg() is true. We also have +// to check State.isVarArg() which is determined by the TreatAsVarArg argument. +class CCIfStateVarArgAndArgVarArg + : CCIf<"State.isVarArg() && ArgFlags.isVarArg()", A>; + def CC_HexagonStack: CallingConv<[ CCIfType<[i32,v2i16,v4i8], CCAssignToStack<4,4>>, @@ -23,7 +32,7 @@ def CC_Hexagon_Legacy: CallingConv<[ CCIfByVal< CCPassByVal<8,8>>, - CCIfArgVarArg< + CCIfStateVarArgAndArgVarArg< CCDelegateTo>, // Pass split values in pairs, allocate odd register if necessary. @@ -53,7 +62,7 @@ def CC_Hexagon: CallingConv<[ CCIfByVal< CCPassByVal<8,1>>, - CCIfArgVarArg< + CCIfStateVarArgAndArgVarArg< CCDelegateTo>, // Pass split values in pairs, allocate odd register if necessary. 
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index b3c61e1829bf9..dd343d9fbe79f 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -2225,7 +2225,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, if (!Bad) { // If the addressing mode is ok, check the register class. unsigned OpNum = Load ? 0 : 2; - auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI, MF); + auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI); RC = getCommonRC(SI.RC, RC); if (RC == nullptr) Bad = true; @@ -2395,7 +2395,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(), SrcOp.getSubReg() }; - auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI, MF); + auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI); // The this-> is needed to unconfuse MSVC. Register FoundR = this->findPhysReg(MF, Range, IM, DM, RC); LLVM_DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI) diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index c54b67ccd8843..9f7f434b66fa1 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3352,7 +3352,6 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); - // Handle INLINEASM first. 
if (Opc == ISD::INLINEASM || Opc == ISD::INLINEASM_BR) return LowerINLINEASM(Op, DAG); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 9ebbbc6399b42..8d04edbea5b43 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -577,6 +577,8 @@ class HexagonTargetLowering : public TargetLowering { SDValue LowerHvxFpExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxFpToInt(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxPred32ToFp(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxPred64ToFp(SDValue Op, SelectionDAG &DAG) const; SDValue ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const; SDValue ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index f1fa40c1b9036..d0dfa47468705 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -208,6 +208,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::SPLAT_VECTOR, T, Legal); setOperationAction(ISD::UADDSAT, T, Legal); setOperationAction(ISD::SADDSAT, T, Legal); + setOperationAction(ISD::USUBSAT, T, Legal); + setOperationAction(ISD::SSUBSAT, T, Legal); if (T != ByteV) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal); @@ -302,6 +304,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::UADDSAT, T, Legal); setOperationAction(ISD::SADDSAT, T, Legal); setOperationAction(ISD::SUB, T, Legal); + setOperationAction(ISD::USUBSAT, T, Legal); + setOperationAction(ISD::SSUBSAT, T, Legal); setOperationAction(ISD::MUL, T, Custom); setOperationAction(ISD::MULHS, T, Custom); setOperationAction(ISD::MULHU, T, Custom); @@ -442,6 
+446,10 @@ HexagonTargetLowering::initializeHVXLowering() { } } + // Include cases which are not handled earlier + setOperationAction(ISD::UINT_TO_FP, MVT::v32i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v64i1, Custom); + setTargetDAGCombine({ISD::CONCAT_VECTORS, ISD::TRUNCATE, ISD::VSELECT}); } @@ -2329,6 +2337,123 @@ HexagonTargetLowering::LowerHvxFpToInt(SDValue Op, SelectionDAG &DAG) const { return ExpandHvxFpToInt(Op, DAG); } +// For vector type v32i1 uint_to_fp to v32f32: +// R1 = #1, R2 holds the v32i1 param +// V1 = vsplat(R1) +// V2 = vsplat(R2) +// Q0 = vand(V1,R1) +// V0.w=prefixsum(Q0) +// V0.w=vsub(V0.w,V1.w) +// V2.w = vlsr(V2.w,V0.w) +// V2 = vand(V2,V1) +// V2.sf = V2.w +SDValue HexagonTargetLowering::LowerHvxPred32ToFp(SDValue PredOp, + SelectionDAG &DAG) const { + + MVT ResTy = ty(PredOp); + const SDLoc &dl(PredOp); + + SDValue Const = DAG.getTargetConstant(0x1, dl, MVT::i32); + SDNode *RegConst = DAG.getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, Const); + SDNode *SplatConst = DAG.getMachineNode(Hexagon::V6_lvsplatw, dl, MVT::v32i32, + SDValue(RegConst, 0)); + SDNode *PredTransfer = + DAG.getMachineNode(Hexagon::V6_vandvrt, dl, MVT::v32i1, + SDValue(SplatConst, 0), SDValue(RegConst, 0)); + SDNode *PrefixSum = DAG.getMachineNode(Hexagon::V6_vprefixqw, dl, MVT::v32i32, + SDValue(PredTransfer, 0)); + SDNode *SplatParam = DAG.getMachineNode( + Hexagon::V6_lvsplatw, dl, MVT::v32i32, + DAG.getNode(ISD::BITCAST, dl, MVT::i32, PredOp.getOperand(0))); + SDNode *Vsub = + DAG.getMachineNode(Hexagon::V6_vsubw, dl, MVT::v32i32, + SDValue(PrefixSum, 0), SDValue(SplatConst, 0)); + SDNode *IndexShift = + DAG.getMachineNode(Hexagon::V6_vlsrwv, dl, MVT::v32i32, + SDValue(SplatParam, 0), SDValue(Vsub, 0)); + SDNode *MaskOff = + DAG.getMachineNode(Hexagon::V6_vand, dl, MVT::v32i32, + SDValue(IndexShift, 0), SDValue(SplatConst, 0)); + SDNode *Convert = DAG.getMachineNode(Hexagon::V6_vconv_sf_w, dl, ResTy, + SDValue(MaskOff, 0)); + return SDValue(Convert, 
0); +} + +// For vector type v64i1 uint_to_fp to v64f16: +// i64 R32 = bitcast v64i1 R3:2 (R3:2 holds v64i1) +// R3 = subreg_high (R32) +// R2 = subreg_low (R32) +// R1 = #1 +// V1 = vsplat(R1) +// V2 = vsplat(R2) +// V3 = vsplat(R3) +// Q0 = vand(V1,R1) +// V0.w=prefixsum(Q0) +// V0.w=vsub(V0.w,V1.w) +// V2.w = vlsr(V2.w,V0.w) +// V3.w = vlsr(V3.w,V0.w) +// V2 = vand(V2,V1) +// V3 = vand(V3,V1) +// V2.h = vpacke(V3.w,V2.w) +// V2.hf = V2.h +SDValue HexagonTargetLowering::LowerHvxPred64ToFp(SDValue PredOp, + SelectionDAG &DAG) const { + + MVT ResTy = ty(PredOp); + const SDLoc &dl(PredOp); + + SDValue Inp = DAG.getNode(ISD::BITCAST, dl, MVT::i64, PredOp.getOperand(0)); + // Get the hi and lo regs + SDValue HiReg = + DAG.getTargetExtractSubreg(Hexagon::isub_hi, dl, MVT::i32, Inp); + SDValue LoReg = + DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, Inp); + // Get constant #1 and splat into vector V1 + SDValue Const = DAG.getTargetConstant(0x1, dl, MVT::i32); + SDNode *RegConst = DAG.getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, Const); + SDNode *SplatConst = DAG.getMachineNode(Hexagon::V6_lvsplatw, dl, MVT::v32i32, + SDValue(RegConst, 0)); + // Splat the hi and lo args + SDNode *SplatHi = + DAG.getMachineNode(Hexagon::V6_lvsplatw, dl, MVT::v32i32, + DAG.getNode(ISD::BITCAST, dl, MVT::i32, HiReg)); + SDNode *SplatLo = + DAG.getMachineNode(Hexagon::V6_lvsplatw, dl, MVT::v32i32, + DAG.getNode(ISD::BITCAST, dl, MVT::i32, LoReg)); + // vand between splatted const and const + SDNode *PredTransfer = + DAG.getMachineNode(Hexagon::V6_vandvrt, dl, MVT::v32i1, + SDValue(SplatConst, 0), SDValue(RegConst, 0)); + // Get the prefixsum + SDNode *PrefixSum = DAG.getMachineNode(Hexagon::V6_vprefixqw, dl, MVT::v32i32, + SDValue(PredTransfer, 0)); + // Get the vsub + SDNode *Vsub = + DAG.getMachineNode(Hexagon::V6_vsubw, dl, MVT::v32i32, + SDValue(PrefixSum, 0), SDValue(SplatConst, 0)); + // Get vlsr for hi and lo + SDNode *IndexShift_hi = +
DAG.getMachineNode(Hexagon::V6_vlsrwv, dl, MVT::v32i32, + SDValue(SplatHi, 0), SDValue(Vsub, 0)); + SDNode *IndexShift_lo = + DAG.getMachineNode(Hexagon::V6_vlsrwv, dl, MVT::v32i32, + SDValue(SplatLo, 0), SDValue(Vsub, 0)); + // Get vand of hi and lo + SDNode *MaskOff_hi = + DAG.getMachineNode(Hexagon::V6_vand, dl, MVT::v32i32, + SDValue(IndexShift_hi, 0), SDValue(SplatConst, 0)); + SDNode *MaskOff_lo = + DAG.getMachineNode(Hexagon::V6_vand, dl, MVT::v32i32, + SDValue(IndexShift_lo, 0), SDValue(SplatConst, 0)); + // Pack them + SDNode *Pack = + DAG.getMachineNode(Hexagon::V6_vpackeh, dl, MVT::v64i16, + SDValue(MaskOff_hi, 0), SDValue(MaskOff_lo, 0)); + SDNode *Convert = + DAG.getMachineNode(Hexagon::V6_vconv_hf_h, dl, ResTy, SDValue(Pack, 0)); + return SDValue(Convert, 0); +} + SDValue HexagonTargetLowering::LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const { // Catch invalid conversion ops (just in case). @@ -2339,6 +2464,13 @@ HexagonTargetLowering::LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const { MVT IntTy = ty(Op.getOperand(0)).getVectorElementType(); MVT FpTy = ResTy.getVectorElementType(); + if (Op.getOpcode() == ISD::UINT_TO_FP) { + if (ResTy == MVT::v32f32 && ty(Op.getOperand(0)) == MVT::v32i1) + return LowerHvxPred32ToFp(Op, DAG); + if (ResTy == MVT::v64f16 && ty(Op.getOperand(0)) == MVT::v64i1) + return LowerHvxPred64ToFp(Op, DAG); + } + if (Subtarget.useHVXIEEEFPOps()) { // There are only conversions to f16. 
if (FpTy == MVT::f16) { diff --git a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp index b8060fb66680f..7cbd81ff227e1 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp @@ -646,7 +646,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, MachineInstr *CombI; if (Acc != 0) { const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(LowerAcc); NG.push_back(TfrI); @@ -677,7 +677,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, } else { // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(int(Acc)); NG.push_back(TfrI); diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index fb2ef59d99ef1..1637b91f1fa12 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -441,6 +441,21 @@ let Predicates = [UseHVX] in { def: OpR_RR_pat_sat; } +let Predicates = [UseHVX] in { + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; +} + // For now, we always deal with vector floating point in SF mode. 
class OpR_RR_pat_conv diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 2731c523963e5..77ce983d24785 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -444,7 +444,6 @@ bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) } const TargetRegisterClass * -HexagonRegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { +HexagonRegisterInfo::getPointerRegClass(unsigned Kind) const { return &Hexagon::IntRegsRegClass; } diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h index 72153980236e9..945b8608cd948 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -72,8 +72,7 @@ class HexagonRegisterInfo : public HexagonGenRegisterInfo { const TargetRegisterClass *RC) const; const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; + getPointerRegClass(unsigned Kind = 0) const override; bool isEHReturnCalleeSaveReg(Register Reg) const; }; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 66508fd767793..0afa04ab57e81 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -231,14 +231,10 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, // Specify the vector alignment explicitly. For v512x1, the calculated // alignment would be 512*alignment(i1), which is 512 bytes, instead of // the required minimum of 64 bytes. 
- : CodeGenTargetMachineImpl( - T, - "e-m:e-p:32:32:32-a:0-n16:32-" - "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-" - "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048", - TT, CPU, FS, Options, getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM, CodeModel::Small), - (HexagonNoOpt ? CodeGenOptLevel::None : OL)), + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, + getEffectiveRelocModel(RM), + getEffectiveCodeModel(CM, CodeModel::Small), + (HexagonNoOpt ? CodeGenOptLevel::None : OL)), TLOF(std::make_unique()), Subtarget(Triple(TT), CPU, FS, *this) { initAsmInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index bc486cd562bf4..cb88d1ac4af9f 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -653,7 +653,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, const MCInstrDesc& MCID = PacketMI.getDesc(); // First operand is always the result. - const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI, MF); + const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI); // Double regs can not feed into new value store: PRM section: 5.4.2.2. 
if (PacketRC == &Hexagon::DoubleRegsRegClass) return false; @@ -866,7 +866,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, return false; const MCInstrDesc& MCID = PI.getDesc(); - const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI, MF); + const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI); if (DisableVecDblNVStores && VecRC == &Hexagon::HvxWRRegClass) return false; diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp index 9cd0636306b16..96feaf28d0a0f 100644 --- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp +++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp @@ -28,6 +28,7 @@ #define DEBUG_TYPE "lanai-disassembler" using namespace llvm; +using namespace llvm::MCD; typedef MCDisassembler::DecodeStatus DecodeStatus; diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp index 3d6ba9ecc55e2..df56f9ae39fe2 100644 --- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -37,17 +37,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() { initializeLanaiMemAluCombinerPass(PR); } -static std::string computeDataLayout() { - // Data layout (keep in sync with clang/lib/Basic/Targets.cpp) - return "E" // Big endian - "-m:e" // ELF name manging - "-p:32:32" // 32-bit pointers, 32 bit aligned - "-i64:64" // 64 bit integers, 64 bit aligned - "-a:0:32" // 32 bit alignment of objects of aggregate type - "-n32" // 32 bit native integer width - "-S64"; // 64 bit natural stack alignment -} - static Reloc::Model getEffectiveRelocModel(std::optional RM) { return RM.value_or(Reloc::PIC_); } @@ -58,7 +47,7 @@ LanaiTargetMachine::LanaiTargetMachine( std::optional CodeModel, CodeGenOptLevel OptLevel, bool JIT) : CodeGenTargetMachineImpl( - T, computeDataLayout(), TT, Cpu, FeatureString, Options, + T, 
TT.computeDataLayout(), TT, Cpu, FeatureString, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CodeModel, CodeModel::Medium), OptLevel), Subtarget(TT, Cpu, FeatureString, *this, Options, getCodeModel(), diff --git a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp index 735f798afde24..d4058fac4304a 100644 --- a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp +++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/Endian.h" using namespace llvm; +using namespace llvm::MCD; #define DEBUG_TYPE "loongarch-disassembler" diff --git a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp index 069b181791ac7..0ccebeb393267 100644 --- a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp @@ -86,7 +86,7 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction( continue; LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); if (!(RC && RC->contains(LoongArch::R0))) { LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); continue; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 634914d3b3fd0..e8668860c2b38 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -311,6 +311,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); + setOperationAction(ISD::SADDSAT, VT, Legal); + 
setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); } for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) setOperationAction(ISD::BITREVERSE, VT, Custom); @@ -386,6 +390,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); } for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32}) @@ -2671,8 +2679,9 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, if (SplatBitSize == 64 && !Subtarget.is64Bit()) { // We can only handle 64-bit elements that are within - // the signed 32-bit range on 32-bit targets. - if (!SplatValue.isSignedIntN(32)) + // the signed 10-bit range on 32-bit targets. + // See the BUILD_VECTOR case in LoongArchDAGToDAGISel::Select(). 
+ if (!SplatValue.isSignedIntN(10)) return SDValue(); if ((Is128Vec && ResTy == MVT::v4i32) || (Is256Vec && ResTy == MVT::v8i32)) diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index a79c01cbe577a..2e8e11155c5fa 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1389,6 +1389,14 @@ def : Pat<(xor (vt LASX256:$xj), (vt LASX256:$xk)), foreach vt = [v32i8, v16i16, v8i32, v4i64] in def : Pat<(vnot (or (vt LASX256:$xj), (vt LASX256:$xk))), (XVNOR_V LASX256:$xj, LASX256:$xk)>; +// XVANDN_V +foreach vt = [v32i8, v16i16, v8i32, v4i64] in +def : Pat<(and (vt (vnot LASX256:$xj)), (vt LASX256:$xk)), + (XVANDN_V LASX256:$xj, LASX256:$xk)>; +// XVORN_V +foreach vt = [v32i8, v16i16, v8i32, v4i64] in +def : Pat<(or (vt LASX256:$xj), (vt (vnot LASX256:$xk))), + (XVORN_V LASX256:$xj, LASX256:$xk)>; // XVANDI_B def : Pat<(and (v32i8 LASX256:$xj), (v32i8 (SplatPat_uimm8 uimm8:$imm))), @@ -1990,6 +1998,12 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), defm : PatXrXr; defm : PatXrXrU; +// XVSADD_{B/H/W/D}[U], XVSSUB_{B/H/W/D}[U] +defm : PatXrXr; +defm : PatXrXr; +defm : PatXrXrU; +defm : PatXrXrU; + // Vector mask set by condition def : Pat<(loongarch_xvmskltz (v32i8 LASX256:$vj)), (PseudoXVMSKLTZ_B LASX256:$vj)>; def : Pat<(loongarch_xvmskltz (v16i16 LASX256:$vj)), (PseudoXVMSKLTZ_H LASX256:$vj)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index eb7120ffb41a6..5421bba0424bf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1583,6 +1583,14 @@ def : Pat<(xor (vt LSX128:$vj), (vt LSX128:$vk)), foreach vt = [v16i8, v8i16, v4i32, v2i64] in def : Pat<(vnot (or (vt LSX128:$vj), (vt LSX128:$vk))), (VNOR_V LSX128:$vj, LSX128:$vk)>; +// VANDN_V +foreach vt = [v16i8, v8i16, v4i32, v2i64] in +def : Pat<(and 
(vt (vnot LSX128:$vj)), (vt LSX128:$vk)), + (VANDN_V LSX128:$vj, LSX128:$vk)>; +// VORN_V +foreach vt = [v16i8, v8i16, v4i32, v2i64] in +def : Pat<(or (vt LSX128:$vj), (vt (vnot LSX128:$vk))), + (VORN_V LSX128:$vj, LSX128:$vk)>; // VANDI_B def : Pat<(and (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))), @@ -2147,6 +2155,12 @@ def : Pat<(f64 f64imm_vldi:$in), defm : PatVrVr; defm : PatVrVrU; +// VSADD_{B/H/W/D}[U], VSSUB_{B/H/W/D}[U] +defm : PatVrVr; +defm : PatVrVr; +defm : PatVrVrU; +defm : PatVrVrU; + // Vector mask set by condition def : Pat<(loongarch_vmskltz (v16i8 LSX128:$vj)), (PseudoVMSKLTZ_B LSX128:$vj)>; def : Pat<(loongarch_vmskltz (v8i16 LSX128:$vj)), (PseudoVMSKLTZ_H LSX128:$vj)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h index d1e40254c2972..53381c28898b8 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h @@ -33,8 +33,7 @@ struct LoongArchRegisterInfo : public LoongArchGenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const override; const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override { + getPointerRegClass(unsigned Kind = 0) const override { return &LoongArch::GPRRegClass; } diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index c36db9c75dd3a..d0a8ababe8e58 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -57,13 +57,6 @@ static cl::opt cl::desc("Enable the loop data prefetch pass"), cl::init(false)); -static std::string computeDataLayout(const Triple &TT) { - if (TT.isArch64Bit()) - return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"; - assert(TT.isArch32Bit() && "only LA32 and LA64 are currently supported"); - return "e-m:e-p:32:32-i64:64-n32-S128"; -} - static Reloc::Model 
getEffectiveRelocModel(const Triple &TT, std::optional RM) { return RM.value_or(Reloc::Static); @@ -93,7 +86,7 @@ LoongArchTargetMachine::LoongArchTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT), TT, CPU, FS, Options, + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveLoongArchCodeModel(TT, CM), OL), TLOF(std::make_unique()) { diff --git a/llvm/lib/Target/M68k/CMakeLists.txt b/llvm/lib/Target/M68k/CMakeLists.txt index b730f41b22353..1ac7e211a996c 100644 --- a/llvm/lib/Target/M68k/CMakeLists.txt +++ b/llvm/lib/Target/M68k/CMakeLists.txt @@ -51,6 +51,7 @@ add_llvm_target(M68kCodeGen SelectionDAG Support Target + TargetParser ADD_TO_COMPONENT M68k diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/llvm/lib/Target/M68k/M68kTargetMachine.cpp index ce15ee635e21b..847c27bac2cba 100644 --- a/llvm/lib/Target/M68k/M68kTargetMachine.cpp +++ b/llvm/lib/Target/M68k/M68kTargetMachine.cpp @@ -46,35 +46,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kTarget() { namespace { -std::string computeDataLayout(const Triple &TT, StringRef CPU, - const TargetOptions &Options) { - std::string Ret = ""; - // M68k is Big Endian - Ret += "E"; - - // FIXME how to wire it with the used object format? - Ret += "-m:e"; - - // M68k pointers are always 32 bit wide even for 16-bit CPUs. - // The ABI only specifies 16-bit alignment. - // On at least the 68020+ with a 32-bit bus, there is a performance benefit - // to having 32-bit alignment. - Ret += "-p:32:16:32"; - - // Bytes do not require special alignment, words are word aligned and - // long words are word aligned at minimum. 
- Ret += "-i8:8:8-i16:16:16-i32:16:32"; - - // FIXME no floats at the moment - - // The registers can hold 8, 16, 32 bits - Ret += "-n8:16:32"; - - Ret += "-a:0:16-S16"; - - return Ret; -} - Reloc::Model getEffectiveRelocModel(const Triple &TT, std::optional RM) { // If not defined we default to static @@ -101,8 +72,8 @@ M68kTargetMachine::M68kTargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT, CPU, Options), TT, CPU, - FS, Options, getEffectiveRelocModel(TT, RM), + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, + getEffectiveRelocModel(TT, RM), ::getEffectiveCodeModel(CM, JIT), OL), TLOF(std::make_unique()), Subtarget(TT, CPU, FS, *this) { diff --git a/llvm/lib/Target/MSP430/CMakeLists.txt b/llvm/lib/Target/MSP430/CMakeLists.txt index 4081d3472fd78..bcf9fd288dbd1 100644 --- a/llvm/lib/Target/MSP430/CMakeLists.txt +++ b/llvm/lib/Target/MSP430/CMakeLists.txt @@ -40,6 +40,7 @@ add_llvm_target(MSP430CodeGen SelectionDAG Support Target + TargetParser ADD_TO_COMPONENT MSP430 diff --git a/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp b/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp index 44596a1527a2d..c1a1e8e83e0d3 100644 --- a/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -91,8 +91,7 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const { } const TargetRegisterClass * -MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) - const { +MSP430RegisterInfo::getPointerRegClass(unsigned Kind) const { return &MSP430::GR16RegClass; } diff --git a/llvm/lib/Target/MSP430/MSP430RegisterInfo.h b/llvm/lib/Target/MSP430/MSP430RegisterInfo.h index 51e07f4e8e9ea..fbca97361232d 100644 --- a/llvm/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/llvm/lib/Target/MSP430/MSP430RegisterInfo.h @@ -28,9 +28,8 @@ class MSP430RegisterInfo : public 
MSP430GenRegisterInfo { const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; - const TargetRegisterClass* - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; + const TargetRegisterClass * + getPointerRegClass(unsigned Kind = 0) const override; bool eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp index e6024f4a62185..988bcae120f9f 100644 --- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -34,19 +34,14 @@ static Reloc::Model getEffectiveRelocModel(std::optional RM) { return RM.value_or(Reloc::Static); } -static std::string computeDataLayout(const Triple &TT, StringRef CPU, - const TargetOptions &Options) { - return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; -} - MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT, CPU, Options), TT, CPU, - FS, Options, getEffectiveRelocModel(RM), + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, + getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique()), Subtarget(TT, std::string(CPU), std::string(FS), *this) { diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 7b2ee832ae7db..8a5cb517c94c5 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -524,8 +524,8 @@ class MipsAsmParser : public MCTargetAsmParser { MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo 
&MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, sti, MII), - ABI(MipsABIInfo::computeTargetABI(sti.getTargetTriple(), sti.getCPU(), - Options)) { + ABI(MipsABIInfo::computeTargetABI(sti.getTargetTriple(), + Options.getABIName())) { MCAsmParserExtension::Initialize(parser); parser.addAliasForDirective(".asciiz", ".asciz"); diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index fa6cc0e3f0187..c22b8f61b12dc 100644 --- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -1003,7 +1003,7 @@ static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); Base = getReg(Decoder, Mips::GPR32RegClassID, Base); - if (Inst.getOpcode() == Mips::SC || + if (Inst.getOpcode() == Mips::SC || Inst.getOpcode() == Mips::SC64 || Inst.getOpcode() == Mips::SCD) Inst.addOperand(MCOperand::createReg(Reg)); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp index 1be29cf3c94b9..d7809e27e23f3 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp @@ -57,17 +57,16 @@ unsigned MipsABIInfo::GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const { llvm_unreachable("Unhandled ABI"); } -MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU, - const MCTargetOptions &Options) { - if (Options.getABIName().starts_with("o32")) +MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef ABIName) { + if (ABIName.starts_with("o32")) return MipsABIInfo::O32(); - if (Options.getABIName().starts_with("n32")) + if (ABIName.starts_with("n32")) return MipsABIInfo::N32(); - if (Options.getABIName().starts_with("n64")) + if (ABIName.starts_with("n64")) return MipsABIInfo::N64(); if (TT.isABIN32()) return MipsABIInfo::N32(); - 
assert(Options.getABIName().empty() && "Unknown ABI option for MIPS"); + assert(ABIName.empty() && "Unknown ABI option for MIPS"); if (TT.isMIPS64()) return MipsABIInfo::N64(); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h index 44b023c7c3ef6..d8003d2fcc164 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h @@ -33,8 +33,7 @@ class MipsABIInfo { static MipsABIInfo O32() { return MipsABIInfo(ABI::O32); } static MipsABIInfo N32() { return MipsABIInfo(ABI::N32); } static MipsABIInfo N64() { return MipsABIInfo(ABI::N64); } - static MipsABIInfo computeTargetABI(const Triple &TT, StringRef CPU, - const MCTargetOptions &Options); + static MipsABIInfo computeTargetABI(const Triple &TT, StringRef ABIName); bool IsKnown() const { return ThisABI != ABI::Unknown; } bool IsO32() const { return ThisABI == ABI::O32; } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 33aab71044b09..74e7baf1db293 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -619,7 +619,7 @@ MCAsmBackend *llvm::createMipsAsmBackend(const Target &T, return new WindowsMipsAsmBackend(T, MRI, STI); MipsABIInfo ABI = MipsABIInfo::computeTargetABI(STI.getTargetTriple(), - STI.getCPU(), Options); + Options.getABIName()); return new MipsAsmBackend(T, MRI, STI.getTargetTriple(), STI.getCPU(), ABI.IsN32()); } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index 8b28ee62b878c..e1c9954c19cc0 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -24,7 +24,8 @@ MipsELFMCAsmInfo::MipsELFMCAsmInfo(const Triple &TheTriple, const MCTargetOptions &Options) { IsLittleEndian = TheTriple.isLittleEndian(); - 
MipsABIInfo ABI = MipsABIInfo::computeTargetABI(TheTriple, "", Options); + MipsABIInfo ABI = + MipsABIInfo::computeTargetABI(TheTriple, Options.getABIName()); if (TheTriple.isMIPS64() && !ABI.IsN32()) CodePointerSize = CalleeSaveStackSlotSize = 8; diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index cafc11b8a0d9b..5d08f560c3c36 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -37,7 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-instrinfo" Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, Mips::Bimm16) {} + : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {} const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { return RI; diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.td b/llvm/lib/Target/Mips/Mips16InstrInfo.td index ab473c133b8e3..296414c6a06db 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.td @@ -88,7 +88,7 @@ class FRI16_ins op, string asmstr, class FRI16_TCP_ins _op, string asmstr, InstrItinClass itin>: - FRI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm8, i32imm:$size), + FRI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm8), !strconcat(asmstr, "\t$rx, $imm8\t# 16 bit inst"), [], itin>; class FRI16R_ins_base op, string asmstr, string asmstr2, @@ -216,7 +216,7 @@ class FEXT_RI16_B_ins _op, string asmstr, class FEXT_RI16_TCP_ins _op, string asmstr, InstrItinClass itin>: - FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm16, i32imm:$size), + FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm16), !strconcat(asmstr, "\t$rx, $imm16"), [], itin>; class FEXT_2RI16_ins _op, string asmstr, @@ -856,6 +856,7 @@ def LwRxSpImmX16: FEXT_RRI16_mem_ins<0b10010, "lw", mem16sp, II_LW>, MayLoad; def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad; def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad; + // // Format: MOVE r32, rz MIPS16e // 
Purpose: Move diff --git a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp index d257f02b2bc6f..66099593b6311 100644 --- a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -28,7 +28,8 @@ using namespace llvm; #define DEBUG_TYPE "mips16-registerinfo" -Mips16RegisterInfo::Mips16RegisterInfo() = default; +Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &STI) + : MipsRegisterInfo(STI) {} bool Mips16RegisterInfo::requiresRegisterScavenging (const MachineFunction &MF) const { diff --git a/llvm/lib/Target/Mips/Mips16RegisterInfo.h b/llvm/lib/Target/Mips/Mips16RegisterInfo.h index ff115b30162b9..29d08b4003ed4 100644 --- a/llvm/lib/Target/Mips/Mips16RegisterInfo.h +++ b/llvm/lib/Target/Mips/Mips16RegisterInfo.h @@ -16,10 +16,9 @@ #include "MipsRegisterInfo.h" namespace llvm { - class Mips16RegisterInfo : public MipsRegisterInfo { public: - Mips16RegisterInfo(); + explicit Mips16RegisterInfo(const MipsSubtarget &STI); bool requiresRegisterScavenging(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 8699807b6bf2b..31a229a0fd102 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -1647,7 +1647,6 @@ void MipsConstantIslands::prescanForConstants() { MI.removeOperand(1); MI.removeOperand(1); MI.addOperand(MachineOperand::CreateCPI(index, 0)); - MI.addOperand(MachineOperand::CreateImm(4)); } break; } diff --git a/llvm/lib/Target/Mips/MipsExpandPseudo.cpp b/llvm/lib/Target/Mips/MipsExpandPseudo.cpp index 34ff41f6e02da..95822c94946a1 100644 --- a/llvm/lib/Target/Mips/MipsExpandPseudo.cpp +++ b/llvm/lib/Target/Mips/MipsExpandPseudo.cpp @@ -432,13 +432,24 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( Register OldVal = I->getOperand(6).getReg(); Register BinOpRes = I->getOperand(7).getReg(); Register 
StoreVal = I->getOperand(8).getReg(); + bool NoMovnInstr = (IsMin || IsMax) && !STI->hasMips4() && !STI->hasMips32(); const BasicBlock *LLVM_BB = BB.getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop1MBB; + MachineBasicBlock *loop2MBB; + if (NoMovnInstr) { + loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); + loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); + } MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++BB.getIterator(); MF->insert(It, loopMBB); + if (NoMovnInstr) { + MF->insert(It, loop1MBB); + MF->insert(It, loop2MBB); + } MF->insert(It, sinkMBB); MF->insert(It, exitMBB); @@ -446,9 +457,19 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( exitMBB->transferSuccessorsAndUpdatePHIs(&BB); BB.addSuccessor(loopMBB, BranchProbability::getOne()); - loopMBB->addSuccessor(sinkMBB); - loopMBB->addSuccessor(loopMBB); + if (NoMovnInstr) { + loopMBB->addSuccessor(loop1MBB); + loopMBB->addSuccessor(loop2MBB); + } else { + loopMBB->addSuccessor(sinkMBB); + loopMBB->addSuccessor(loopMBB); + } loopMBB->normalizeSuccProbs(); + if (NoMovnInstr) { + loop1MBB->addSuccessor(loop2MBB); + loop2MBB->addSuccessor(loopMBB); + loop2MBB->addSuccessor(exitMBB, BranchProbability::getOne()); + } BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0); if (IsNand) { @@ -525,7 +546,7 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( BuildMI(loopMBB, DL, TII->get(OR), BinOpRes) .addReg(BinOpRes) .addReg(Scratch4); - } else { + } else if (STI->hasMips4() || STI->hasMips32()) { // max: move BinOpRes, StoreVal // movn BinOpRes, Incr, Scratch4, BinOpRes // min: move BinOpRes, StoreVal @@ -537,12 +558,59 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( .addReg(Incr) .addReg(Scratch4) .addReg(BinOpRes); + } else { + // if min: + // loopMBB: move BinOpRes, StoreVal + // beq Scratch4, 0, loop1MBB + // j loop2MBB 
+ // loop1MBB: move BinOpRes, Incr + // loop2MBB: and BinOpRes, BinOpRes, Mask + // and StoreVal, OlddVal, Mask2 + // or StoreVal, StoreVal, BinOpRes + // StoreVal = sc StoreVal, 0(Ptr) + // beq StoreVal, zero, loopMBB + // + // if max: + // loopMBB: move BinOpRes, Incr + // beq Scratch4, 0, loop1MBB + // j loop2MBB + // loop1MBB: move BinOpRes, StoreVal + // loop2MBB: and BinOpRes, BinOpRes, Mask + // and StoreVal, OlddVal, Mask2 + // or StoreVal, StoreVal, BinOpRes + // StoreVal = sc StoreVal, 0(Ptr) + // beq StoreVal, zero, loopMBB + if (IsMin) { + BuildMI(loopMBB, DL, TII->get(OR), BinOpRes) + .addReg(StoreVal) + .addReg(Mips::ZERO); + BuildMI(loop1MBB, DL, TII->get(OR), BinOpRes) + .addReg(Incr) + .addReg(Mips::ZERO); + } else { + BuildMI(loopMBB, DL, TII->get(OR), BinOpRes) + .addReg(Incr) + .addReg(Mips::ZERO); + BuildMI(loop1MBB, DL, TII->get(OR), BinOpRes) + .addReg(StoreVal) + .addReg(Mips::ZERO); + } + BuildMI(loopMBB, DL, TII->get(BEQ)) + .addReg(Scratch4) + .addReg(Mips::ZERO) + .addMBB(loop1MBB); + BuildMI(loopMBB, DL, TII->get(Mips::B)).addMBB(loop2MBB); } // and BinOpRes, BinOpRes, Mask - BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes) - .addReg(BinOpRes) - .addReg(Mask); + if (NoMovnInstr) + BuildMI(loop2MBB, DL, TII->get(Mips::AND), BinOpRes) + .addReg(BinOpRes) + .addReg(Mask); + else + BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes) + .addReg(BinOpRes) + .addReg(Mask); } else if (!IsSwap) { // binopres, oldval, incr2 @@ -564,14 +632,37 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( // or StoreVal, StoreVal, BinOpRes // StoreVal = sc StoreVal, 0(Ptr) // beq StoreVal, zero, loopMBB - BuildMI(loopMBB, DL, TII->get(Mips::AND), StoreVal) - .addReg(OldVal).addReg(Mask2); - BuildMI(loopMBB, DL, TII->get(Mips::OR), StoreVal) - .addReg(StoreVal).addReg(BinOpRes); - BuildMI(loopMBB, DL, TII->get(SC), StoreVal) - .addReg(StoreVal).addReg(Ptr).addImm(0); - BuildMI(loopMBB, DL, TII->get(BEQ)) - 
.addReg(StoreVal).addReg(Mips::ZERO).addMBB(loopMBB); + if (NoMovnInstr) { + BuildMI(loop2MBB, DL, TII->get(Mips::AND), StoreVal) + .addReg(OldVal) + .addReg(Mask2); + BuildMI(loop2MBB, DL, TII->get(Mips::OR), StoreVal) + .addReg(StoreVal) + .addReg(BinOpRes); + BuildMI(loop2MBB, DL, TII->get(SC), StoreVal) + .addReg(StoreVal) + .addReg(Ptr) + .addImm(0); + BuildMI(loop2MBB, DL, TII->get(BEQ)) + .addReg(StoreVal) + .addReg(Mips::ZERO) + .addMBB(loopMBB); + } else { + BuildMI(loopMBB, DL, TII->get(Mips::AND), StoreVal) + .addReg(OldVal) + .addReg(Mask2); + BuildMI(loopMBB, DL, TII->get(Mips::OR), StoreVal) + .addReg(StoreVal) + .addReg(BinOpRes); + BuildMI(loopMBB, DL, TII->get(SC), StoreVal) + .addReg(StoreVal) + .addReg(Ptr) + .addImm(0); + BuildMI(loopMBB, DL, TII->get(BEQ)) + .addReg(StoreVal) + .addReg(Mips::ZERO) + .addMBB(loopMBB); + } // sinkMBB: // and maskedoldval1,oldval,mask @@ -600,6 +691,10 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( LivePhysRegs LiveRegs; computeAndAddLiveIns(LiveRegs, *loopMBB); + if (NoMovnInstr) { + computeAndAddLiveIns(LiveRegs, *loop1MBB); + computeAndAddLiveIns(LiveRegs, *loop2MBB); + } computeAndAddLiveIns(LiveRegs, *sinkMBB); computeAndAddLiveIns(LiveRegs, *exitMBB); @@ -746,20 +841,41 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, llvm_unreachable("Unknown pseudo atomic!"); } + bool NoMovnInstr = (IsMin || IsMax) && !STI->hasMips4() && !STI->hasMips32(); const BasicBlock *LLVM_BB = BB.getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop1MBB; + MachineBasicBlock *loop2MBB; + if (NoMovnInstr) { + loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); + loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); + } MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++BB.getIterator(); MF->insert(It, loopMBB); + if (NoMovnInstr) { + MF->insert(It, loop1MBB); + MF->insert(It, loop2MBB); + } MF->insert(It, exitMBB); 
exitMBB->splice(exitMBB->begin(), &BB, std::next(I), BB.end()); exitMBB->transferSuccessorsAndUpdatePHIs(&BB); BB.addSuccessor(loopMBB, BranchProbability::getOne()); - loopMBB->addSuccessor(exitMBB); - loopMBB->addSuccessor(loopMBB); + if (NoMovnInstr) { + loopMBB->addSuccessor(loop1MBB); + loopMBB->addSuccessor(loop2MBB); + } else { + loopMBB->addSuccessor(exitMBB); + loopMBB->addSuccessor(loopMBB); + } loopMBB->normalizeSuccProbs(); + if (NoMovnInstr) { + loop1MBB->addSuccessor(loop2MBB); + loop2MBB->addSuccessor(loopMBB); + loop2MBB->addSuccessor(exitMBB, BranchProbability::getOne()); + } BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0); assert((OldVal != Ptr) && "Clobbered the wrong ptr reg!"); @@ -802,7 +918,7 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, BuildMI(loopMBB, DL, TII->get(OR), Scratch) .addReg(Scratch) .addReg(Scratch2); - } else { + } else if (STI->hasMips4() || STI->hasMips32()) { // max: move Scratch, OldVal // movn Scratch, Incr, Scratch2, Scratch // min: move Scratch, OldVal @@ -814,6 +930,38 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, .addReg(Incr) .addReg(Scratch2) .addReg(Scratch); + } else { + // if min: + // loopMBB: move Scratch, OldVal + // beq Scratch2_32, 0, loop1MBB + // j loop2MBB + // loop1MBB: move Scratch, Incr + // loop2MBB: sc $2, 0($4) + // beqz $2, $BB0_1 + // nop + // + // if max: + // loopMBB: move Scratch, Incr + // beq Scratch2_32, 0, loop1MBB + // j loop2MBB + // loop1MBB: move Scratch, OldVal + // loop2MBB: sc $2, 0($4) + // beqz $2, $BB0_1 + // nop + if (IsMin) { + BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(OldVal).addReg(ZERO); + BuildMI(loop1MBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO); + } else { + BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO); + BuildMI(loop1MBB, DL, TII->get(OR), Scratch) + .addReg(OldVal) + .addReg(ZERO); + } + BuildMI(loopMBB, DL, TII->get(BEQ)) + .addReg(Scratch2_32) + .addReg(ZERO) + 
.addMBB(loop1MBB); + BuildMI(loopMBB, DL, TII->get(Mips::B)).addMBB(loop2MBB); } } else if (Opcode) { @@ -829,20 +977,35 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO); } - BuildMI(loopMBB, DL, TII->get(SC), Scratch) - .addReg(Scratch) - .addReg(Ptr) - .addImm(0); - BuildMI(loopMBB, DL, TII->get(BEQ)) - .addReg(Scratch) - .addReg(ZERO) - .addMBB(loopMBB); + if (NoMovnInstr) { + BuildMI(loop2MBB, DL, TII->get(SC), Scratch) + .addReg(Scratch) + .addReg(Ptr) + .addImm(0); + BuildMI(loop2MBB, DL, TII->get(BEQ)) + .addReg(Scratch) + .addReg(ZERO) + .addMBB(loopMBB); + } else { + BuildMI(loopMBB, DL, TII->get(SC), Scratch) + .addReg(Scratch) + .addReg(Ptr) + .addImm(0); + BuildMI(loopMBB, DL, TII->get(BEQ)) + .addReg(Scratch) + .addReg(ZERO) + .addMBB(loopMBB); + } NMBBI = BB.end(); I->eraseFromParent(); LivePhysRegs LiveRegs; computeAndAddLiveIns(LiveRegs, *loopMBB); + if (!STI->hasMips4() && !STI->hasMips32()) { + computeAndAddLiveIns(LiveRegs, *loop1MBB); + computeAndAddLiveIns(LiveRegs, *loop2MBB); + } computeAndAddLiveIns(LiveRegs, *exitMBB); return true; diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index 539288e8da592..4d105bddd4d9c 100644 --- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -37,27 +37,26 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "MipsGenRegisterInfo.inc" -MipsRegisterInfo::MipsRegisterInfo() : MipsGenRegisterInfo(Mips::RA) { +MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &STI) + : MipsGenRegisterInfo(Mips::RA), ArePtrs64bit(STI.getABI().ArePtrs64bit()) { MIPS_MC::initLLVMToCVRegMapping(this); } unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } const TargetRegisterClass * -MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { - MipsABIInfo ABI = MF.getSubtarget().getABI(); 
+MipsRegisterInfo::getPointerRegClass(unsigned Kind) const { MipsPtrClass PtrClassKind = static_cast(Kind); switch (PtrClassKind) { case MipsPtrClass::Default: - return ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; + return ArePtrs64bit ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; case MipsPtrClass::GPR16MM: return &Mips::GPRMM16RegClass; case MipsPtrClass::StackPointer: - return ABI.ArePtrs64bit() ? &Mips::SP64RegClass : &Mips::SP32RegClass; + return ArePtrs64bit ? &Mips::SP64RegClass : &Mips::SP32RegClass; case MipsPtrClass::GlobalPointer: - return ABI.ArePtrs64bit() ? &Mips::GP64RegClass : &Mips::GP32RegClass; + return ArePtrs64bit ? &Mips::GP64RegClass : &Mips::GP32RegClass; } llvm_unreachable("Unknown pointer kind"); diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.h b/llvm/lib/Target/Mips/MipsRegisterInfo.h index b002f4cf3ae7a..dbdb0501998bf 100644 --- a/llvm/lib/Target/Mips/MipsRegisterInfo.h +++ b/llvm/lib/Target/Mips/MipsRegisterInfo.h @@ -25,6 +25,9 @@ namespace llvm { class TargetRegisterClass; class MipsRegisterInfo : public MipsGenRegisterInfo { +private: + const bool ArePtrs64bit; + public: enum class MipsPtrClass { /// The default register class for integer values. @@ -38,14 +41,13 @@ class MipsRegisterInfo : public MipsGenRegisterInfo { GlobalPointer = 3, }; - MipsRegisterInfo(); + explicit MipsRegisterInfo(const MipsSubtarget &STI); /// Get PIC indirect call register static unsigned getPICCallReg(); /// Code Generation virtual methods... 
- const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const override; + const TargetRegisterClass *getPointerRegClass(unsigned Kind) const override; unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index caa20f72aacf9..dbdbb179a583d 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -28,7 +28,7 @@ static unsigned getUnconditionalBranch(const MipsSubtarget &STI) { } MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI() {} + : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI(STI) {} const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { return RI; @@ -682,8 +682,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MCInstrDesc &Desc = get(Opc); assert(Desc.NumOperands == 2 && "Unary instruction expected."); const MipsRegisterInfo *RI = &getRegisterInfo(); - unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI, MF)); - unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI, MF)); + unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI)); + unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI)); return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize); } diff --git a/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp index feb2b3d2010b4..1326878f7e17e 100644 --- a/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -34,7 +34,8 @@ using namespace llvm; #define DEBUG_TYPE "mips-reg-info" -MipsSERegisterInfo::MipsSERegisterInfo() = default; +MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &STI) + : MipsRegisterInfo(STI) {} bool MipsSERegisterInfo:: requiresRegisterScavenging(const MachineFunction 
&MF) const { diff --git a/llvm/lib/Target/Mips/MipsSERegisterInfo.h b/llvm/lib/Target/Mips/MipsSERegisterInfo.h index cc8496e0268be..93de2c778063a 100644 --- a/llvm/lib/Target/Mips/MipsSERegisterInfo.h +++ b/llvm/lib/Target/Mips/MipsSERegisterInfo.h @@ -20,7 +20,7 @@ namespace llvm { class MipsSERegisterInfo : public MipsRegisterInfo { public: - MipsSERegisterInfo(); + explicit MipsSERegisterInfo(const MipsSubtarget &STI); bool requiresRegisterScavenging(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 8c519fa379dd8..03bedc5b15c4f 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -77,42 +77,6 @@ static std::unique_ptr createTLOF(const Triple &TT) { return std::make_unique(); } -static std::string computeDataLayout(const Triple &TT, StringRef CPU, - const TargetOptions &Options, - bool isLittle) { - std::string Ret; - MipsABIInfo ABI = MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions); - - // There are both little and big endian mips. - if (isLittle) - Ret += "e"; - else - Ret += "E"; - - if (ABI.IsO32()) - Ret += "-m:m"; - else - Ret += "-m:e"; - - // Pointers are 32 bit on some ABIs. - if (!ABI.IsN64()) - Ret += "-p:32:32"; - - // 8 and 16 bit integers only need to have natural alignment, but try to - // align them to 32 bits. 64 bit integers have natural alignment. - Ret += "-i8:8:32-i16:16:32-i64:64"; - - // 32 bit registers are always available and the stack is at least 64 bit - // aligned. On N64 64 bit registers are also available and the stack is - // 128 bit aligned. 
- if (ABI.IsN64() || ABI.IsN32()) - Ret += "-i128:128-n32:64-S128"; - else - Ret += "-n32-S64"; - - return Ret; -} - static Reloc::Model getEffectiveRelocModel(bool JIT, std::optional RM) { if (!RM || JIT) @@ -132,12 +96,12 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT, std::optional CM, CodeGenOptLevel OL, bool JIT, bool isLittle) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT, CPU, Options, isLittle), - TT, CPU, FS, Options, - getEffectiveRelocModel(JIT, RM), - getEffectiveCodeModel(CM, CodeModel::Small), OL), + : CodeGenTargetMachineImpl( + T, TT.computeDataLayout(Options.MCOptions.getABIName()), TT, CPU, FS, + Options, getEffectiveRelocModel(JIT, RM), + getEffectiveCodeModel(CM, CodeModel::Small), OL), isLittle(isLittle), TLOF(createTLOF(getTargetTriple())), - ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)), + ABI(MipsABIInfo::computeTargetABI(TT, Options.MCOptions.getABIName())), Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this, std::nullopt), NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16", diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 833f014a4c870..a6837a482608c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -118,24 +118,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { initializeNVPTXPrologEpilogPassPass(PR); } -static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { - std::string Ret = "e"; - - // Tensor Memory (addrspace:6) is always 32-bits. - // Distributed Shared Memory (addrspace:7) follows shared memory - // (addrspace:3). 
- if (!is64Bit) - Ret += "-p:32:32-p6:32:32-p7:32:32"; - else if (UseShortPointers) - Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32"; - else - Ret += "-p6:32:32"; - - Ret += "-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"; - - return Ret; -} - NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -144,10 +126,10 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, CodeGenOptLevel OL, bool is64bit) // The pic relocation model is used regardless of what the client has // specified, as it is the only relocation model currently supported. - : CodeGenTargetMachineImpl(T, - computeDataLayout(is64bit, UseShortPointersOpt), - TT, CPU, FS, Options, Reloc::PIC_, - getEffectiveCodeModel(CM, CodeModel::Small), OL), + : CodeGenTargetMachineImpl( + T, TT.computeDataLayout(UseShortPointersOpt ? "shortptr" : ""), TT, + CPU, FS, Options, Reloc::PIC_, + getEffectiveCodeModel(CM, CodeModel::Small), OL), is64bit(is64bit), TLOF(std::make_unique()), Subtarget(TT, std::string(CPU), std::string(FS), *this), StrPool(StrAlloc) { diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 5e27f06c94f06..47586c417cfe3 100644 --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -18,6 +18,7 @@ #include "llvm/Support/Endian.h" using namespace llvm; +using namespace llvm::MCD; DEFINE_PPC_REGCLASSES diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index ea7c2203662bd..386d0f65d1ed1 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -58,8 +58,13 @@ def DirectivePwrFuture // Specifies that the selected CPU supports 64-bit instructions, regardless of // whether we are in 32-bit or 64-bit mode. 
-def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", - "Enable 64-bit instructions">; +def Feature64BitSupport : SubtargetFeature<"64bit-support", "Has64BitSupport", "true", + "Supports 64-bit instructions">; +// 64-bit is enabled. +def Feature64Bit : SubtargetFeature<"64bit", "IsPPC64", "true", + "Enable 64-bit mode", + [Feature64BitSupport]>; + def AIXOS: SubtargetFeature<"aix", "IsAIX", "true", "AIX OS">; def FeatureModernAIXAs : SubtargetFeature<"modern-aix-as", "HasModernAIXAs", "true", @@ -85,7 +90,7 @@ def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true", def FeatureSPE : SubtargetFeature<"spe","HasSPE", "true", "Enable SPE instructions", [FeatureHardFloat]>; -def FeatureEFPU2 : SubtargetFeature<"efpu2", "HasEFPU2", "true", +def FeatureEFPU2 : SubtargetFeature<"efpu2", "HasEFPU2", "true", "Enable Embedded Floating-Point APU 2 instructions", [FeatureSPE]>; def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true", @@ -353,6 +358,43 @@ def FeaturePredictableSelectIsExpensive : def FeatureFastMFLR : SubtargetFeature<"fast-MFLR", "HasFastMFLR", "true", "MFLR is a fast instruction">; +//===----------------------------------------------------------------------===// +// PowerPC Instruction Predicate Definitions. 
+def In32BitMode : Predicate<"!Subtarget->isPPC64()">; +def In64BitMode : Predicate<"Subtarget->isPPC64()">; +def IsBookE : Predicate<"Subtarget->isBookE()">; +def IsNotBookE : Predicate<"!Subtarget->isBookE()">; +def HasOnlyMSYNC : Predicate<"Subtarget->hasOnlyMSYNC()">; +def HasSYNC : Predicate<"!Subtarget->hasOnlyMSYNC()">; +def IsPPC4xx : Predicate<"Subtarget->isPPC4xx()">; +def IsPPC6xx : Predicate<"Subtarget->isPPC6xx()">; +def IsE500 : Predicate<"Subtarget->isE500()">; +def HasSPE : Predicate<"Subtarget->hasSPE()">; +def HasICBT : Predicate<"Subtarget->hasICBT()">; +def HasPartwordAtomics : Predicate<"Subtarget->hasPartwordAtomics()">; +def HasQuadwordAtomics : Predicate<"Subtarget->hasQuadwordAtomics()">; +def NoNaNsFPMath + : Predicate<"Subtarget->getTargetMachine().Options.NoNaNsFPMath">; +def NaNsFPMath + : Predicate<"!Subtarget->getTargetMachine().Options.NoNaNsFPMath">; +def HasBPERMD : Predicate<"Subtarget->hasBPERMD()">; +def HasExtDiv : Predicate<"Subtarget->hasExtDiv()">; +def IsISA2_06 : Predicate<"Subtarget->isISA2_06()">; +def IsISA2_07 : Predicate<"Subtarget->isISA2_07()">; +def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">; +def HasFPU : Predicate<"Subtarget->hasFPU()">; +def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">; +def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">; + +// AIX assembler may not be modern enough to support some extended mne. +def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">, + AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>; +def IsAIX : Predicate<"Subtarget->isAIXABI()">; +def NotAIX : Predicate<"!Subtarget->isAIXABI()">; +def IsISAFuture : Predicate<"Subtarget->isISAFuture()">; +def IsNotISAFuture : Predicate<"!Subtarget->isISAFuture()">; + + // Since new processors generally contain a superset of features of those that // came before them, the idea is to make implementations of new processors // less error prone and easier to read. 
@@ -392,7 +434,7 @@ def ProcessorFeatures { FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit, + Feature64BitSupport, /* Feature64BitRegs, */ FeatureBPERMD, FeatureExtDiv, @@ -630,13 +672,13 @@ def : ProcessorModel<"970", G5Model, [Directive970, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX, - Feature64Bit /*, Feature64BitRegs */, + Feature64BitSupport /*, Feature64BitRegs */, FeatureMFTB]>; def : ProcessorModel<"g5", G5Model, [Directive970, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, FeatureFRES, FeatureFRSQRTE, - Feature64Bit /*, Feature64BitRegs */, + Feature64BitSupport /*, Feature64BitRegs */, FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"e500", PPCE500Model, [DirectiveE500, @@ -657,41 +699,41 @@ def : ProcessorModel<"a2", PPCA2Model, FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, FeatureMFTB, + Feature64BitSupport /*, Feature64BitRegs */, FeatureMFTB, FeatureISA2_06]>; def : ProcessorModel<"pwr3", G5Model, [DirectivePwr3, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, - FeatureSTFIWX, Feature64Bit]>; + FeatureSTFIWX, Feature64BitSupport]>; def : ProcessorModel<"pwr4", G5Model, [DirectivePwr4, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, - FeatureSTFIWX, Feature64Bit, FeatureMFTB]>; + FeatureSTFIWX, Feature64BitSupport, FeatureMFTB]>; def : ProcessorModel<"pwr5", G5Model, [DirectivePwr5, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, - FeatureSTFIWX, Feature64Bit, + FeatureSTFIWX, Feature64BitSupport, FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr5x", G5Model, [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, - FeatureSTFIWX, FeatureFPRND, Feature64Bit, + FeatureSTFIWX, FeatureFPRND, Feature64BitSupport, 
FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr6", G5Model, [DirectivePwr6, FeatureAltivec, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB, - FeatureFPRND, Feature64Bit /*, Feature64BitRegs */, + FeatureFPRND, Feature64BitSupport /*, Feature64BitRegs */, FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr6x", G5Model, [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB, - FeatureFPRND, Feature64Bit, + FeatureFPRND, Feature64BitSupport, FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>; def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>; @@ -709,7 +751,7 @@ def : ProcessorModel<"ppc64", G5Model, [Directive64, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX, - Feature64Bit /*, Feature64BitRegs */, + Feature64BitSupport /*, Feature64BitRegs */, FeatureMFTB]>; def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.P8Features>; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 415164fc9e2cb..89165fa8f8fdb 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -401,7 +401,7 @@ namespace { // We need to make sure that this one operand does not end up in r0 // (because we might end up lowering this as 0(%op)). 
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(/*Kind=*/1); SDLoc dl(Op); SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); SDValue NewOp = diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index a12dfae2a0d7f..5751d7dc1628b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -192,3 +192,21 @@ let Predicates = [HasVSX, IsISAFuture] in { : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), "vucmprlh $VRT, $VRA, $VRB", []>; } + +//---------------------------- Anonymous Patterns ----------------------------// + +// Load/Store VSX Vector with Right Length (Left-justified). +def : Pat<(v4i32 (int_ppc_vsx_lxvrl addr:$RA, i64:$RB)), (LXVRL $RA, $RB)>; +def : Pat<(v4i32 (int_ppc_vsx_lxvrll addr:$RA, i64:$RB)), (LXVRLL $RA, $RB)>; +def : Pat<(int_ppc_vsx_stxvrl v4i32:$XT, addr:$RA, i64:$RB), (STXVRL $XT, $RA, + $RB)>; +def : Pat<(int_ppc_vsx_stxvrll v4i32:$XT, addr:$RA, i64:$RB), (STXVRLL $XT, $RA, + $RB)>; + +// Load/Store VSX Vector pair with Right Length (Left-justified). 
+def : Pat<(v256i1 (int_ppc_vsx_lxvprl addr:$RA, i64:$RB)), (LXVPRL $RA, $RB)>; +def : Pat<(v256i1 (int_ppc_vsx_lxvprll addr:$RA, i64:$RB)), (LXVPRLL $RA, $RB)>; +def : Pat<(int_ppc_vsx_stxvprl v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRL $XTp, + $RA, $RB)>; +def : Pat<(int_ppc_vsx_stxvprll v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRLL $XTp, + $RA, $RB)>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index c12cf85113128..1c45050cdf9ca 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -713,42 +713,6 @@ let WantsParent = true in { def PDForm : ComplexPattern; } -//===----------------------------------------------------------------------===// -// PowerPC Instruction Predicate Definitions. -def In32BitMode : Predicate<"!Subtarget->isPPC64()">; -def In64BitMode : Predicate<"Subtarget->isPPC64()">; -def IsBookE : Predicate<"Subtarget->isBookE()">; -def IsNotBookE : Predicate<"!Subtarget->isBookE()">; -def HasOnlyMSYNC : Predicate<"Subtarget->hasOnlyMSYNC()">; -def HasSYNC : Predicate<"!Subtarget->hasOnlyMSYNC()">; -def IsPPC4xx : Predicate<"Subtarget->isPPC4xx()">; -def IsPPC6xx : Predicate<"Subtarget->isPPC6xx()">; -def IsE500 : Predicate<"Subtarget->isE500()">; -def HasSPE : Predicate<"Subtarget->hasSPE()">; -def HasICBT : Predicate<"Subtarget->hasICBT()">; -def HasPartwordAtomics : Predicate<"Subtarget->hasPartwordAtomics()">; -def HasQuadwordAtomics : Predicate<"Subtarget->hasQuadwordAtomics()">; -def NoNaNsFPMath - : Predicate<"Subtarget->getTargetMachine().Options.NoNaNsFPMath">; -def NaNsFPMath - : Predicate<"!Subtarget->getTargetMachine().Options.NoNaNsFPMath">; -def HasBPERMD : Predicate<"Subtarget->hasBPERMD()">; -def HasExtDiv : Predicate<"Subtarget->hasExtDiv()">; -def IsISA2_06 : Predicate<"Subtarget->isISA2_06()">; -def IsISA2_07 : Predicate<"Subtarget->isISA2_07()">; -def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">; -def HasFPU : 
Predicate<"Subtarget->hasFPU()">; -def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">; -def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">; - -// AIX assembler may not be modern enough to support some extended mne. -def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">, - AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>; -def IsAIX : Predicate<"Subtarget->isAIXABI()">; -def NotAIX : Predicate<"!Subtarget->isAIXABI()">; -def IsISAFuture : Predicate<"Subtarget->isISAFuture()">; -def IsNotISAFuture : Predicate<"!Subtarget->isISAFuture()">; - //===----------------------------------------------------------------------===// // PowerPC Multiclass Definitions. multiclass XForm_base_r3xo_r opcode, bits<10> xo, dag OOL, dag IOL, diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index f1230407b1649..85b40727ff296 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -164,8 +164,7 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM) /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. const TargetRegisterClass * -PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) - const { +PPCRegisterInfo::getPointerRegClass(unsigned Kind) const { // Note that PPCInstrInfo::foldImmediate also directly uses this Kind value // when it checks for ZERO folding. 
if (Kind == 1) { @@ -2022,9 +2021,9 @@ Register PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MCInstrDesc &MCID = TII.get(ADDriOpc); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const TargetRegisterClass *RC = getPointerRegClass(MF); + const TargetRegisterClass *RC = getPointerRegClass(); Register BaseReg = MRI.createVirtualRegister(RC); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); @@ -2052,8 +2051,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MCInstrDesc &MCID = MI.getDesc(); MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.constrainRegClass(BaseReg, - TII.getRegClass(MCID, FIOperandNum, this, MF)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum, this)); } bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 849f856b5419e..560690208f704 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -79,7 +79,7 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. 
const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override; + getPointerRegClass(unsigned Kind = 0) const override; const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 736ba1edcaea6..85e022a2ba6fc 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -54,10 +54,8 @@ PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, PPCSubtarget::PPCSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, const PPCTargetMachine &TM) - : PPCGenSubtargetInfo(TT, CPU, TuneCPU, FS), - IsPPC64(getTargetTriple().getArch() == Triple::ppc64 || - getTargetTriple().getArch() == Triple::ppc64le), - TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, TuneCPU, FS)), + : PPCGenSubtargetInfo(TT, CPU, TuneCPU, FS), TM(TM), + FrameLowering(initializeSubtargetDependencies(CPU, TuneCPU, FS)), InstrInfo(*this), TLInfo(TM, *this) { TSInfo = std::make_unique(); @@ -247,7 +245,6 @@ CodeModel::Model PPCSubtarget::getCodeModel(const TargetMachine &TM, } bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } -bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); } bool PPCSubtarget::isUsingPCRelativeCalls() const { return isPPC64() && hasPCRelativeMemops() && isELFv2ABI() && diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index c17fca7f70a3c..f275802fe1843 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -93,7 +93,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { /// Which cpu directive was used. 
unsigned CPUDirective; - bool IsPPC64; bool IsLittleEndian; POPCNTDKind HasPOPCNTD; @@ -167,10 +166,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); public: - /// isPPC64 - Return true if we are generating code for 64-bit pointer mode. - /// - bool isPPC64() const; - // useSoftFloat - Return true if soft-float option is turned on. bool useSoftFloat() const { if (isAIXABI() && !HasHardFloat) diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index ae92d5eab20cd..000d29610678f 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -149,58 +149,6 @@ LLVMInitializePowerPCTarget() { initializePPCAIXAsmPrinterPass(PR); } -static bool isLittleEndianTriple(const Triple &T) { - return T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle; -} - -/// Return the datalayout string of a subtarget. -static std::string getDataLayoutString(const Triple &T) { - bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le; - std::string Ret; - - // Most PPC* platforms are big endian, PPC(64)LE is little endian. - if (isLittleEndianTriple(T)) - Ret = "e"; - else - Ret = "E"; - - Ret += DataLayout::getManglingComponent(T); - - // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit - // pointers. - if (!is64Bit || T.getOS() == Triple::Lv2) - Ret += "-p:32:32"; - - // If the target ABI uses function descriptors, then the alignment of function - // pointers depends on the alignment used to emit the descriptor. Otherwise, - // function pointers are aligned to 32 bits because the instructions must be. - if ((T.getArch() == Triple::ppc64 && !T.isPPC64ELFv2ABI())) { - Ret += "-Fi64"; - } else if (T.isOSAIX()) { - Ret += is64Bit ? 
"-Fi64" : "-Fi32"; - } else { - Ret += "-Fn32"; - } - - // Note, the alignment values for f64 and i64 on ppc64 in Darwin - // documentation are wrong; these are correct (i.e. "what gcc does"). - Ret += "-i64:64"; - - // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones. - if (is64Bit) - Ret += "-i128:128-n32:64"; - else - Ret += "-n32"; - - // Specify the vector alignment explicitly. For v256i1 and v512i1, the - // calculated alignment would be 256*alignment(i1) and 512*alignment(i1), - // which is 256 and 512 bytes - way over aligned. - if (is64Bit && (T.isOSAIX() || T.isOSLinux())) - Ret += "-S128-v256:256:256-v512:512:512"; - - return Ret; -} - static std::string computeFSAdditions(StringRef FS, CodeGenOptLevel OL, const Triple &TT) { std::string FullFS = std::string(FS); @@ -348,13 +296,13 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, getDataLayoutString(TT), TT, CPU, + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, computeFSAdditions(FS, OL, TT), Options, getEffectiveRelocModel(TT, RM), getEffectivePPCCodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), TargetABI(computeTargetABI(TT, Options)), - Endianness(isLittleEndianTriple(TT) ? Endian::LITTLE : Endian::BIG) { + Endianness(TT.isLittleEndian() ? 
Endian::LITTLE : Endian::BIG) { initAsmInfo(); } diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index cd8392849ac40..2b5f18d611524 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -946,6 +946,11 @@ struct RISCVOperand final : public MCParsedAsmOperand { return isUImmPred([](int64_t Imm) { return 4 == Imm; }); } + bool isImm5Zibi() const { + return isUImmPred( + [](int64_t Imm) { return (Imm != 0 && isUInt<5>(Imm)) || Imm == -1; }); + } + bool isSImm5Plus1() const { return isSImmPred( [](int64_t Imm) { return Imm != INT64_MIN && isInt<5>(Imm - 1); }); @@ -1643,6 +1648,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, "operand must be a valid system register " "name or an integer in the range"); } + case Match_InvalidImm5Zibi: + return generateImmOutOfRangeError( + Operands, ErrorInfo, -1, (1 << 5) - 1, + "immediate must be non-zero in the range"); case Match_InvalidVTypeI: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return generateVTypeError(ErrorLoc); diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 61b86abdc4ca9..ff07122b61378 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -194,12 +194,24 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodeGPRX1RegisterClass(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createReg(RISCV::X1)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeSPRegisterClass(MCInst &Inst, const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createReg(RISCV::X2)); return MCDisassembler::Success; } +static DecodeStatus 
DecodeGPRX5RegisterClass(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createReg(RISCV::X5)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { @@ -408,6 +420,18 @@ static DecodeStatus decodeVMaskReg(MCInst &Inst, uint32_t RegNo, return MCDisassembler::Success; } +static DecodeStatus decodeImmThreeOperand(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createImm(3)); + return MCDisassembler::Success; +} + +static DecodeStatus decodeImmFourOperand(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createImm(4)); + return MCDisassembler::Success; +} + template static DecodeStatus decodeUImmOperand(MCInst &Inst, uint32_t Imm, int64_t Address, @@ -491,6 +515,14 @@ static DecodeStatus decodeUImmPlus1Operand(MCInst &Inst, uint32_t Imm, return MCDisassembler::Success; } +static DecodeStatus decodeImmZibiOperand(MCInst &Inst, uint32_t Imm, + int64_t Address, + const MCDisassembler *Decoder) { + assert(isUInt<5>(Imm) && "Invalid immediate"); + Inst.addOperand(MCOperand::createImm(Imm ? 
Imm : -1LL)); + return MCDisassembler::Success; +} + template static DecodeStatus decodeSImmOperand(MCInst &Inst, uint32_t Imm, int64_t Address, @@ -571,46 +603,6 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, return decodeZcmpRlist(Inst, Imm, Address, Decoder); } -static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint16_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5); - [[maybe_unused]] DecodeStatus Result = - DecodeGPRX1X5RegisterClass(Inst, Rs1, Address, Decoder); - assert(Result == MCDisassembler::Success && "Invalid register"); - return MCDisassembler::Success; -} - -static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - uint32_t Rd1 = fieldFromInstruction(Insn, 7, 5); - uint32_t Rs1 = fieldFromInstruction(Insn, 15, 5); - uint32_t Rd2 = fieldFromInstruction(Insn, 20, 5); - uint32_t UImm2 = fieldFromInstruction(Insn, 25, 2); - if (!Check(S, DecodeGPRRegisterClass(Inst, Rd1, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeGPRRegisterClass(Inst, Rd2, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeGPRRegisterClass(Inst, Rs1, Address, Decoder))) - return MCDisassembler::Fail; - [[maybe_unused]] DecodeStatus Result = - decodeUImmOperand<2>(Inst, UImm2, Address, Decoder); - assert(Result == MCDisassembler::Success && "Invalid immediate"); - - // Disassemble the final operand which is implicit. 
- unsigned Opcode = Inst.getOpcode(); - bool IsWordOp = (Opcode == RISCV::TH_LWD || Opcode == RISCV::TH_LWUD || - Opcode == RISCV::TH_SWD); - if (IsWordOp) - Inst.addOperand(MCOperand::createImm(3)); - else - Inst.addOperand(MCOperand::createImm(4)); - - return S; -} - #include "RISCVGenDisassemblerTables.inc" namespace { diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 7df1b7e580002..4330d4e91e0ee 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -738,12 +738,17 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { // TODO: Use constant pool for complex constants. Register DstReg = MI.getOperand(0).getReg(); const APFloat &FPimm = MI.getOperand(1).getFPImm()->getValueAPF(); - APInt Imm = FPimm.bitcastToAPInt(); unsigned Size = MRI->getType(DstReg).getSizeInBits(); if (Size == 16 || Size == 32 || (Size == 64 && Subtarget->is64Bit())) { - Register GPRReg = MRI->createVirtualRegister(&RISCV::GPRRegClass); - if (!materializeImm(GPRReg, Imm.getSExtValue(), MIB)) - return false; + Register GPRReg; + if (FPimm.isPosZero()) { + GPRReg = RISCV::X0; + } else { + GPRReg = MRI->createVirtualRegister(&RISCV::GPRRegClass); + APInt Imm = FPimm.bitcastToAPInt(); + if (!materializeImm(GPRReg, Imm.getSExtValue(), MIB)) + return false; + } unsigned Opcode = Size == 64 ? RISCV::FMV_D_X : Size == 32 ? RISCV::FMV_W_X @@ -756,7 +761,7 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { assert(Size == 64 && !Subtarget->is64Bit() && "Unexpected size or subtarget"); - if (Imm.isNonNegative() && Imm.isZero()) { + if (FPimm.isPosZero()) { // Optimize +0.0 to use fcvt.d.w MachineInstrBuilder FCVT = MIB.buildInstr(RISCV::FCVT_D_W, {DstReg}, {Register(RISCV::X0)}) @@ -771,6 +776,7 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { // Split into two pieces and build through the stack. 
Register GPRRegHigh = MRI->createVirtualRegister(&RISCV::GPRRegClass); Register GPRRegLow = MRI->createVirtualRegister(&RISCV::GPRRegClass); + APInt Imm = FPimm.bitcastToAPInt(); if (!materializeImm(GPRRegHigh, Imm.extractBits(32, 32).getSExtValue(), MIB)) return false; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 16f34a89a52ec..9ce0ee1be7ea7 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -572,7 +572,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) .legalFor(ST.hasStdExtF(), {s32}) .legalFor(ST.hasStdExtD(), {s64}) .legalFor(ST.hasStdExtZfh(), {s16}) - .lowerFor({s32, s64, s128}); + .customFor(!ST.is64Bit(), {s32}) + .customFor(ST.is64Bit(), {s32, s64}) + .lowerFor({s64, s128}); getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) .legalFor(ST.hasStdExtF(), {{sXLen, s32}}) @@ -1358,7 +1360,16 @@ bool RISCVLegalizerInfo::legalizeCustom( return false; case TargetOpcode::G_ABS: return Helper.lowerAbsToMaxNeg(MI); - // TODO: G_FCONSTANT + case TargetOpcode::G_FCONSTANT: { + const APFloat &FVal = MI.getOperand(1).getFPImm()->getValueAPF(); + + // Convert G_FCONSTANT to G_CONSTANT. 
+ Register DstReg = MI.getOperand(0).getReg(); + MIRBuilder.buildConstant(DstReg, FVal.bitcastToAPInt()); + + MI.eraseFromParent(); + return true; + } case TargetOpcode::G_CONSTANT: { const Function &F = MF.getFunction(); // TODO: if PSI and BFI are present, add " || diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index fcea23a5275c0..70b7c430c410e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -341,6 +341,7 @@ enum OperandType : unsigned { OPERAND_UIMM64, OPERAND_THREE, OPERAND_FOUR, + OPERAND_IMM5_ZIBI, OPERAND_SIMM5, OPERAND_SIMM5_NONZERO, OPERAND_SIMM5_PLUS1, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 717fba68b48ed..6d587e6f167fc 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -97,6 +97,10 @@ class RISCVMCCodeEmitter : public MCCodeEmitter { SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint64_t getImmOpValueZibi(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint64_t getImmOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -559,6 +563,19 @@ RISCVMCCodeEmitter::getImmOpValueAsrN(const MCInst &MI, unsigned OpNo, return getImmOpValue(MI, OpNo, Fixups, STI); } +uint64_t +RISCVMCCodeEmitter::getImmOpValueZibi(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + assert(MO.isImm() && "Zibi operand must be an immediate"); + int64_t Res = MO.getImm(); + if (Res == -1) + return 0; + + return Res; +} + uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { 
diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp index 1c7aa738f6215..51180f548ca6d 100644 --- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ -89,7 +89,7 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); Register X0Reg; - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); if (RC && RC->contains(RISCV::X0)) { X0Reg = RISCV::X0; } else if (RC && RC->contains(RISCV::X0_W)) { diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 063963d4ec36b..95703e33926c5 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -78,6 +78,12 @@ def FeatureStdExtE : RISCVExtension<2, 0, "Embedded Instruction Set with 16 GPRs">, RISCVExtensionBitmask<0, 4>; +def FeatureStdExtZibi + : RISCVExperimentalExtension<0, 1, "Branch with Immediate">; +def HasStdExtZibi : Predicate<"Subtarget->hasStdExtZibi()">, + AssemblerPredicate<(all_of FeatureStdExtZibi), + "'Zibi' (Branch with Immediate)">; + def FeatureStdExtZic64b : RISCVExtension<1, 0, "Cache Block Size Is 64 Bytes">; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index c7f15415ebb91..dda6023b37f7b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3204,9 +3204,7 @@ static bool isWorthFoldingIntoRegRegScale(const RISCVSubtarget &Subtarget, // If we have a SHXADD instruction, prefer that over reassociating an ADDI. 
assert(Shift.getOpcode() == ISD::SHL); unsigned ShiftAmt = Shift.getConstantOperandVal(1); - if ((ShiftAmt <= 3 && - (Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa())) || - (ShiftAmt >= 4 && ShiftAmt <= 7 && Subtarget.hasVendorXqciac())) + if (Subtarget.hasShlAdd(ShiftAmt)) return false; // All users of the ADDI should be load/store. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 037eec05e4301..d98872c484d0b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -378,13 +378,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::ROTL, ISD::ROTR}, XLenVT, Expand); } - // With Zbb we have an XLen rev8 instruction, but not GREVI. So we'll - // pattern match it directly in isel. setOperationAction(ISD::BSWAP, XLenVT, - (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb() || - Subtarget.hasVendorXTHeadBb()) - ? Legal - : Expand); + Subtarget.hasREV8Like() ? Legal : Expand); if ((Subtarget.hasVendorXCVbitmanip() || Subtarget.hasVendorXqcibm()) && !Subtarget.is64Bit()) { @@ -403,12 +398,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, Legal); } - if (Subtarget.hasStdExtZbb() || - (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) { + if (Subtarget.hasCTZLike()) { if (Subtarget.is64Bit()) setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); } else { setOperationAction(ISD::CTTZ, XLenVT, Expand); + } + + if (!Subtarget.hasCPOPLike()) { // TODO: These should be set to LibCall, but this currently breaks // the Linux kernel build. See #101786. Lacks i128 tests, too. 
if (Subtarget.is64Bit()) @@ -418,8 +415,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::i64, Expand); } - if (Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() || - (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) { + if (Subtarget.hasCLZLike()) { // We need the custom lowering to make sure that the resulting sequence // for the 32bit case is efficient on 64bit targets. // Use default promotion for i32 without Zbb. @@ -2158,13 +2154,11 @@ bool RISCVTargetLowering::signExtendConstant(const ConstantInt *CI) const { } bool RISCVTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { - return Subtarget.hasStdExtZbb() || - (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit()); + return Subtarget.hasCTZLike(); } bool RISCVTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { - return Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() || - (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit()); + return Subtarget.hasCLZLike(); } bool RISCVTargetLowering::isMaskAndCmp0FoldingBeneficial( @@ -2753,7 +2747,7 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const { case MVT::i8: case MVT::i16: case MVT::i32: - return true; + return Subtarget.hasVInstructions(); case MVT::i64: return Subtarget.hasVInstructionsI64(); case MVT::f16: @@ -9151,10 +9145,22 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::ADD, DL, VT, CondV, DAG.getAllOnesConstant(DL, VT)); return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(FalseV)); } - // (select c, y, 0) -> -c & y - if (isNullConstant(FalseV) && (!HasCZero || isSimm12Constant(TrueV))) { - SDValue Neg = DAG.getNegative(CondV, DL, VT); - return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(TrueV)); + if (isNullConstant(FalseV)) { + // (select c, (1 << ShAmount) + 1, 0) -> (c << ShAmount) + c + if (auto *TrueC = dyn_cast(TrueV)) { + uint64_t TrueM1 = TrueC->getZExtValue() - 1; + if 
(isPowerOf2_64(TrueM1)) { + unsigned ShAmount = Log2_64(TrueM1); + if (Subtarget.hasShlAdd(ShAmount)) + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, + DAG.getConstant(ShAmount, DL, VT), CondV); + } + } + // (select c, y, 0) -> -c & y + if (!HasCZero || isSimm12Constant(TrueV)) { + SDValue Neg = DAG.getNegative(CondV, DL, VT); + return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(TrueV)); + } } } @@ -15350,11 +15356,9 @@ static SDValue combineBinOpToReduce(SDNode *N, SelectionDAG &DAG, // (SLLI (QC.SHLADD x, y, c1 - c0), c0), if 4 <= (c1-c0) <=31. static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - const bool HasStdExtZba = Subtarget.hasStdExtZba(); - const bool HasVendorXAndesPerf = Subtarget.hasVendorXAndesPerf(); - const bool HasVendorXqciac = Subtarget.hasVendorXqciac(); - // Perform this optimization only in the zba/xandesperf/xqciac extension. - if (!HasStdExtZba && !HasVendorXAndesPerf && !HasVendorXqciac) + // Perform this optimization only in the zba/xandesperf/xqciac/xtheadba + // extension. + if (!Subtarget.hasShlAdd(3)) return SDValue(); // Skip for vector types and larger types. @@ -15380,16 +15384,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, return SDValue(); int64_t Diff = std::abs(C0 - C1); - bool IsShXaddDiff = Diff == 1 || Diff == 2 || Diff == 3; - bool HasShXadd = HasStdExtZba || HasVendorXAndesPerf; - - // Skip if SH1ADD/SH2ADD/SH3ADD are not applicable. - if ((!IsShXaddDiff && HasShXadd && !HasVendorXqciac) || - (IsShXaddDiff && !HasShXadd && HasVendorXqciac)) - return SDValue(); - - // Skip if QC_SHLADD is not applicable. - if (Diff == 0 || Diff > 31) + if (!Subtarget.hasShlAdd(Diff)) return SDValue(); // Build nodes. 
@@ -15446,7 +15441,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, static SDValue combineShlAddIAdd(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { // Perform this optimization only in the zba extension. - if (!ReassocShlAddiAdd || !Subtarget.hasStdExtZba()) + if (!ReassocShlAddiAdd || !Subtarget.hasShlAdd(3)) return SDValue(); // Skip for vector types and larger types. @@ -15828,7 +15823,8 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); // fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1) if (isNullConstant(N0) && N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && - isNullConstant(N1.getOperand(1))) { + isNullConstant(N1.getOperand(1)) && + N1.getValueType() == N1.getOperand(0).getValueType()) { ISD::CondCode CCVal = cast(N1.getOperand(2))->get(); if (CCVal == ISD::SETLT) { SDLoc DL(N); @@ -16375,17 +16371,13 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - const bool HasShlAdd = Subtarget.hasStdExtZba() || - Subtarget.hasVendorXTHeadBa() || - Subtarget.hasVendorXAndesPerf(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. // We're adding additional uses of X here, and in principle, we should be freezing // X before doing so. However, adding freeze here causes real regressions, and no // other target properly freezes X in these cases either. 
SDValue X = N->getOperand(0); - if (HasShlAdd) { + if (Subtarget.hasShlAdd(3)) { for (uint64_t Divisor : {3, 5, 9}) { if (MulAmt % Divisor != 0) continue; @@ -18840,6 +18832,8 @@ static SDValue tryFoldSelectIntoOp(SDNode *N, SelectionDAG &DAG, case ISD::ADD: case ISD::OR: case ISD::XOR: + case ISD::UMIN: + case ISD::UMAX: break; } @@ -18949,7 +18943,7 @@ static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG, // Replace (setcc eq (and x, C)) with (setcc ne (and x, C))) to generate // BEXTI, where C is power of 2. - if (Subtarget.hasStdExtZbs() && VT.isScalarInteger() && + if (Subtarget.hasBEXTILike() && VT.isScalarInteger() && (Subtarget.hasCZEROLike() || Subtarget.hasVendorXTHeadCondMov())) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); @@ -21331,14 +21325,8 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift( auto *C1 = dyn_cast(N0->getOperand(1)); auto *C2 = dyn_cast(N->getOperand(1)); - bool IsShXAdd = - (Subtarget.hasStdExtZba() || Subtarget.hasVendorXAndesPerf()) && C2 && - C2->getZExtValue() >= 1 && C2->getZExtValue() <= 3; - bool IsQCShlAdd = Subtarget.hasVendorXqciac() && C2 && - C2->getZExtValue() >= 4 && C2->getZExtValue() <= 31; - // Bail if we might break a sh{1,2,3}add/qc.shladd pattern. 
- if ((IsShXAdd || IsQCShlAdd) && N->hasOneUse() && + if (C2 && Subtarget.hasShlAdd(C2->getZExtValue()) && N->hasOneUse() && N->user_begin()->getOpcode() == ISD::ADD && !isUsedByLdSt(*N->user_begin(), nullptr) && !isa(N->user_begin()->getOperand(1))) @@ -21580,6 +21568,16 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.sext(BitWidth); break; } + case RISCVISD::SHL_ADD: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + unsigned ShAmt = Op.getConstantOperandVal(1); + Known <<= ShAmt; + Known.Zero.setLowBits(ShAmt); // the <<= operator left these bits unknown + Known2 = DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1); + Known = KnownBits::add(Known, Known2); + break; + } case RISCVISD::CTZW: { KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); unsigned PossibleTZ = Known2.trunc(32).countMaxTrailingZeros(); @@ -23259,6 +23257,10 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, if (VA.isRegLoc()) { // Queue up the argument copies and emit them at the end. 
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue)); + + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EmitCallSiteInfo) + CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i); } else { assert(VA.isMemLoc() && "Argument not register or memory"); assert(!IsTailCall && "Tail call not allowed if stack is used " @@ -23360,9 +23362,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CLI.CFIType) Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); - if (MF.getTarget().Options.EmitCallGraphSection && CB && - CB->isIndirectCall()) - DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); return Ret; } @@ -23371,10 +23371,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CLI.CFIType) Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); - if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) - DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); - DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); Glue = Chain.getValue(1); // Mark the end of the call, which is glued to the call itself. @@ -24396,7 +24394,7 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, return true; // Optimize the MUL to (SH*ADD x, (SLLI x, bits)) if Imm is not simm12. 
- if (Subtarget.hasStdExtZba() && !Imm.isSignedIntN(12) && + if (Subtarget.hasShlAdd(3) && !Imm.isSignedIntN(12) && ((Imm - 2).isPowerOf2() || (Imm - 4).isPowerOf2() || (Imm - 8).isPowerOf2())) return true; @@ -24839,12 +24837,16 @@ bool RISCVTargetLowering::areTwoSDNodeTargetMMOFlagsMergeable( } bool RISCVTargetLowering::isCtpopFast(EVT VT) const { - if (VT.isScalableVector()) - return isTypeLegal(VT) && Subtarget.hasStdExtZvbb(); - if (VT.isFixedLengthVector() && Subtarget.hasStdExtZvbb()) - return true; - return Subtarget.hasStdExtZbb() && - (VT == MVT::i32 || VT == MVT::i64 || VT.isFixedLengthVector()); + if (VT.isVector()) { + EVT SVT = VT.getVectorElementType(); + // If the element type is legal we can use cpop.v if it is enabled. + if (isLegalElementTypeForRVV(SVT)) + return Subtarget.hasStdExtZvbb(); + // Don't consider it fast if the type needs to be legalized or scalarized. + return false; + } + + return Subtarget.hasCPOPLike() && (VT == MVT::i32 || VT == MVT::i64); } unsigned RISCVTargetLowering::getCustomCtpopCost(EVT VT, @@ -24937,8 +24939,8 @@ RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, bool RISCVTargetLowering::shouldFoldSelectWithSingleBitTest( EVT VT, const APInt &AndMask) const { - if (Subtarget.hasCZEROLike()) - return !Subtarget.hasStdExtZbs() && AndMask.ugt(1024); + if (Subtarget.hasCZEROLike() || Subtarget.hasVendorXTHeadCondMov()) + return !Subtarget.hasBEXTILike() && AndMask.ugt(1024); return TargetLowering::shouldFoldSelectWithSingleBitTest(VT, AndMask); } diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index c2667b0e7c9e4..2afd77a96373b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -498,6 +498,22 @@ class RVInstB funct3, RISCVOpcode opcode, dag outs, dag ins, let Inst{6-0} = opcode.Value; } +class RVInstBIMM funct3, RISCVOpcode opcode, dag outs, dag ins, + string opcodestr, string argstr> + : RVInst { + 
bits<12> imm12; + bits<5> cimm; + bits<5> rs1; + let Inst{31} = imm12{11}; + let Inst{30-25} = imm12{9-4}; + let Inst{24-20} = cimm; + let Inst{19-15} = rs1; + let Inst{14-12} = funct3; + let Inst{11-8} = imm12{3-0}; + let Inst{7} = imm12{10}; + let Inst{6-0} = opcode.Value; +} + class RVInstU : RVInst { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index d0bb57a3eaa13..f816112f70140 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4492,7 +4492,7 @@ void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(DestReg, RegState::Kill) .addImm(ShiftAmount) .setMIFlag(Flag); - } else if (STI.hasStdExtZba() && + } else if (STI.hasShlAdd(3) && ((Amount % 3 == 0 && isPowerOf2_64(Amount / 3)) || (Amount % 5 == 0 && isPowerOf2_64(Amount / 5)) || (Amount % 9 == 0 && isPowerOf2_64(Amount / 9)))) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 92552b36aa0b9..3529d8f4799ab 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -44,9 +44,6 @@ def SDT_RISCVIntUnaryOpW : SDTypeProfile<1, 1, [ def SDT_RISCVIntBinOpW : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64> ]>; -def SDT_RISCVIntShiftDOpW : SDTypeProfile<1, 3, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>, SDTCisVT<3, i64> -]>; // Target-independent nodes, but with target-specific formats. 
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart, @@ -2329,6 +2326,7 @@ include "RISCVInstrInfoZimop.td" include "RISCVInstrInfoZicbo.td" include "RISCVInstrInfoZicond.td" include "RISCVInstrInfoZilsd.td" +include "RISCVInstrInfoZibi.td" // Scalar FP include "RISCVInstrInfoF.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 414e093510607..3d9737e3645d5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -401,6 +401,7 @@ foreach Ext = DExts in { defm : PatFprFpr_m; defm : PatFprFpr_m; defm : PatFprFpr_m; + let Predicates = Ext.Predicates in def : Pat<(f64 (fcanonicalize FPR64:$rs1)), (FMIN_D $rs1, $rs1)>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 84a75666e5f36..2c1cf77acff56 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -655,6 +655,7 @@ foreach Ext = FExts in { defm : PatFprFpr_m; defm : PatFprFpr_m; defm : PatFprFpr_m; + let Predicates = Ext.Predicates in def : Pat<(f32 (fcanonicalize FPR32:$rs1)), (FMIN_S $rs1, $rs1)>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index 49c9bdd83d3f6..b37ceaaee9cf4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -44,6 +44,7 @@ def ImmThreeAsmOperand : AsmOperandClass { def immthree : RISCVOp { let ParserMatchClass = ImmThreeAsmOperand; let OperandType = "OPERAND_THREE"; + let DecoderMethod = "decodeImmThreeOperand"; } def ImmFourAsmOperand : AsmOperandClass { @@ -56,6 +57,7 @@ def ImmFourAsmOperand : AsmOperandClass { def immfour : RISCVOp { let ParserMatchClass = ImmFourAsmOperand; let OperandType = "OPERAND_FOUR"; + let DecoderMethod = "decodeImmFourOperand"; } //===----------------------------------------------------------------------===// @@ -161,9 +163,9 @@ 
class THLoadPair funct5, string opcodestr, Operand consttype> (ins GPR:$rs1, uimm2:$uimm2, consttype:$const3or4), opcodestr, "$rd, $rs2, (${rs1}), $uimm2, $const3or4"> { bits<2> uimm2; + bits<0> const3or4; let Inst{31-27} = funct5; let Inst{26-25} = uimm2; - let DecoderMethod = "decodeXTHeadMemPair"; let Constraints = "@earlyclobber $rd,@earlyclobber $rs2"; } @@ -173,9 +175,9 @@ class THStorePair funct5, string opcodestr, Operand consttype> (ins GPR:$rd, GPR:$rs2, GPR:$rs1, uimm2:$uimm2, consttype:$const3or4), opcodestr, "$rd, $rs2, (${rs1}), $uimm2, $const3or4"> { bits<2> uimm2; + bits<0> const3or4; let Inst{31-27} = funct5; let Inst{26-25} = uimm2; - let DecoderMethod = "decodeXTHeadMemPair"; } let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index b3554ba81e387..014da990a0146 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -380,6 +380,7 @@ foreach Ext = ZfhExts in { defm : PatFprFpr_m; defm : PatFprFpr_m; defm : PatFprFpr_m; + let Predicates = Ext.Predicates in def : Pat<(f16 (fcanonicalize FPR16:$rs1)), (FMIN_H $rs1, $rs1)>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZibi.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZibi.td new file mode 100644 index 0000000000000..1570355e3da54 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZibi.td @@ -0,0 +1,44 @@ +//===-- RISCVInstrInfoZibi.td - 'Zibi' instructions --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This file describes the RISC-V instructions for 'Zibi' (branch with imm). 
+/// +//===----------------------------------------------------------------------===// + +// A 5-bit unsigned immediate representing 1-31 and -1. 00000 represents -1. +def imm5_zibi : RISCVOp, ImmLeaf(Imm)) || Imm == -1; +}]> { + let ParserMatchClass = ImmAsmOperand<"", 5, "Zibi">; + let EncoderMethod = "getImmOpValueZibi"; + let DecoderMethod = "decodeImmZibiOperand"; + let MCOperandPredicate = [{ + int64_t Imm; + if (!MCOp.evaluateAsConstantImm(Imm)) + return false; + return (Imm >= 1 && Imm <= 31) || Imm == -1; + }]; + let OperandType = "OPERAND_IMM5_ZIBI"; +} + +class Branch_imm funct3, string opcodestr> + : RVInstBIMM, + Sched<[WriteJmp, ReadJmp]> { + let isBranch = 1; + let isTerminator = 1; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +let Predicates = [HasStdExtZibi] in { + def BEQI : Branch_imm<0b010, "beqi">; + def BNEI : Branch_imm<0b011, "bnei">; +} // Predicates = [HasStdExtZibi] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td index 50ebaa9951979..efd06c29dc99f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td @@ -12,12 +12,12 @@ class RVC_SSInst rs1val, RegisterClass reg_class, string opcodestr> : RVInst16<(outs), (ins reg_class:$rs1), opcodestr, "$rs1", [], InstFormatOther> { + bits<0> rs1; let Inst{15-13} = 0b011; let Inst{12} = 0; let Inst{11-7} = rs1val; let Inst{6-2} = 0b00000; let Inst{1-0} = 0b01; - let DecoderMethod = "decodeCSSPushPopchk"; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 2810139bf52ea..67726db504122 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -123,8 +123,7 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { } const TargetRegisterClass * - getPointerRegClass(const 
MachineFunction &MF, - unsigned Kind = 0) const override { + getPointerRegClass(unsigned Kind = 0) const override { return &RISCV::GPRRegClass; } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 50e76df56e575..7dffa63d85505 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -186,6 +186,20 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { return HasStdExtZfhmin || HasStdExtZfbfmin; } + bool hasCLZLike() const { + return HasStdExtZbb || HasVendorXTHeadBb || + (HasVendorXCVbitmanip && !IsRV64); + } + bool hasCTZLike() const { + return HasStdExtZbb || (HasVendorXCVbitmanip && !IsRV64); + } + bool hasCPOPLike() const { + return HasStdExtZbb || (HasVendorXCVbitmanip && !IsRV64); + } + bool hasREV8Like() const { + return HasStdExtZbb || HasStdExtZbkb || HasVendorXTHeadBb; + } + bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; } bool hasCZEROLike() const { @@ -198,6 +212,14 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { hasShortForwardBranchOpt(); } + bool hasShlAdd(int64_t ShAmt) const { + if (ShAmt <= 0) + return false; + if (ShAmt <= 3) + return HasStdExtZba || HasVendorXAndesPerf || HasVendorXTHeadBa; + return ShAmt <= 31 && HasVendorXqciac; + } + bool is64Bit() const { return IsRV64; } MVT getXLenVT() const { return is64Bit() ? 
MVT::i64 : MVT::i32; diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 460bb33f2553a..f81b1e1260ee3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -141,39 +141,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVAsmPrinterPass(*PR); } -static std::string computeDataLayout(const Triple &TT, - const TargetOptions &Opts) { - std::string Ret; - - if (TT.isLittleEndian()) - Ret += "e"; - else - Ret += "E"; - - Ret += "-m:e"; - - // Pointer and integer sizes. - if (TT.isArch64Bit()) { - Ret += "-p:64:64-i64:64-i128:128"; - Ret += "-n32:64"; - } else { - assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported"); - Ret += "-p:32:32-i64:64"; - Ret += "-n32"; - } - - // Stack alignment based on ABI. - StringRef ABI = Opts.MCOptions.getABIName(); - if (ABI == "ilp32e") - Ret += "-S32"; - else if (ABI == "lp64e") - Ret += "-S64"; - else - Ret += "-S128"; - - return Ret; -} - static Reloc::Model getEffectiveRelocModel(const Triple &TT, std::optional RM) { return RM.value_or(Reloc::Static); @@ -185,9 +152,10 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT, Options), TT, CPU, FS, - Options, getEffectiveRelocModel(TT, RM), - getEffectiveCodeModel(CM, CodeModel::Small), OL), + : CodeGenTargetMachineImpl( + T, TT.computeDataLayout(Options.MCOptions.getABIName()), TT, CPU, FS, + Options, getEffectiveRelocModel(TT, RM), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique()) { initAsmInfo(); @@ -195,6 +163,9 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, setMachineOutliner(true); setSupportsDefaultOutlining(true); + // RISC-V supports the debug entry values. 
+ setSupportsDebugEntryValues(true); + if (TT.isOSFuchsia() && !TT.isArch64Bit()) report_fatal_error("Fuchsia is only supported for 64-bit"); @@ -427,6 +398,7 @@ class RISCVPassConfig : public TargetPassConfig { void addPreRegAlloc() override; void addPostRegAlloc() override; void addFastRegAlloc() override; + bool addILPOpts() override; std::unique_ptr getCSEConfig() const override; }; @@ -612,9 +584,6 @@ void RISCVPassConfig::addMachineSSAOptimization() { TargetPassConfig::addMachineSSAOptimization(); - if (EnableMachineCombiner) - addPass(&MachineCombinerID); - if (TM->getTargetTriple().isRISCV64()) { addPass(createRISCVOptWInstrsPass()); } @@ -649,6 +618,13 @@ void RISCVPassConfig::addPostRegAlloc() { addPass(createRISCVRedundantCopyEliminationPass()); } +bool RISCVPassConfig::addILPOpts() { + if (EnableMachineCombiner) + addPass(&MachineCombinerID); + + return true; +} + void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM, OptimizationLevel Level) { diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 1ca513214f67c..a06faa414a2ef 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -289,9 +289,7 @@ bool RISCVTTIImpl::hasActiveVectorLength() const { TargetTransformInfo::PopcntSupportKind RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit()) - ? TTI::PSK_FastHardware - : TTI::PSK_Software; + return ST->hasCPOPLike() ? 
TTI::PSK_FastHardware : TTI::PSK_Software; } InstructionCost RISCVTTIImpl::getPartialReductionCost( diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 6bd7d51daff69..47e0a250d285a 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -141,6 +141,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase { return false; } + bool shouldConsiderVectorizationRegPressure() const override { return true; } + InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 29526cf5a5273..a1134663c0e7a 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -10,9 +10,19 @@ // instructions are inserted. // // The purpose of this optimization is to make the VL argument, for instructions -// that have a VL argument, as small as possible. This is implemented by -// visiting each instruction in reverse order and checking that if it has a VL -// argument, whether the VL can be reduced. +// that have a VL argument, as small as possible. +// +// This is split into a sparse dataflow analysis where we determine what VL is +// demanded by each instruction first, and then afterwards try to reduce the VL +// of each instruction if it demands less than its VL operand. 
+// +// The analysis is explained in more detail in the 2025 EuroLLVM Developers' +// Meeting talk "Accidental Dataflow Analysis: Extending the RISC-V VL +// Optimizer", which is available on YouTube at +// https://www.youtube.com/watch?v=Mfb5fRSdJAc +// +// The slides for the talk are available at +// https://llvm.org/devmtg/2025-04/slides/technical_talk/lau_accidental_dataflow.pdf // //===---------------------------------------------------------------------===// @@ -30,6 +40,27 @@ using namespace llvm; namespace { +/// Wrapper around MachineOperand that defaults to immediate 0. +struct DemandedVL { + MachineOperand VL; + DemandedVL() : VL(MachineOperand::CreateImm(0)) {} + DemandedVL(MachineOperand VL) : VL(VL) {} + static DemandedVL vlmax() { + return DemandedVL(MachineOperand::CreateImm(RISCV::VLMaxSentinel)); + } + bool operator!=(const DemandedVL &Other) const { + return !VL.isIdenticalTo(Other.VL); + } + + DemandedVL max(const DemandedVL &X) const { + if (RISCV::isVLKnownLE(VL, X.VL)) + return X; + if (RISCV::isVLKnownLE(X.VL, VL)) + return *this; + return DemandedVL::vlmax(); + } +}; + class RISCVVLOptimizer : public MachineFunctionPass { const MachineRegisterInfo *MRI; const MachineDominatorTree *MDT; @@ -51,17 +82,25 @@ class RISCVVLOptimizer : public MachineFunctionPass { StringRef getPassName() const override { return PASS_NAME; } private: - std::optional - getMinimumVLForUser(const MachineOperand &UserOp) const; - /// Returns the largest common VL MachineOperand that may be used to optimize - /// MI. Returns std::nullopt if it failed to find a suitable VL. - std::optional checkUsers(const MachineInstr &MI) const; + DemandedVL getMinimumVLForUser(const MachineOperand &UserOp) const; + /// Returns true if the users of \p MI have compatible EEWs and SEWs. 
+ bool checkUsers(const MachineInstr &MI) const; bool tryReduceVL(MachineInstr &MI) const; bool isCandidate(const MachineInstr &MI) const; + void transfer(const MachineInstr &MI); /// For a given instruction, records what elements of it are demanded by /// downstream users. - DenseMap> DemandedVLs; + DenseMap DemandedVLs; + SetVector Worklist; + + /// \returns all vector virtual registers that \p MI uses. + auto virtual_vec_uses(const MachineInstr &MI) const { + return make_filter_range(MI.uses(), [this](const MachineOperand &MO) { + return MO.isReg() && MO.getReg().isVirtual() && + RISCVRegisterInfo::isRVVRegClass(MRI->getRegClass(MO.getReg())); + }); + } }; /// Represents the EMUL and EEW of a MachineOperand. @@ -847,10 +886,15 @@ static std::optional getOperandInfo(const MachineOperand &MO) { return OperandInfo(getEMULEqualsEEWDivSEWTimesLMUL(*Log2EEW, MI), *Log2EEW); } +static bool isTupleInsertInstr(const MachineInstr &MI); + /// Return true if this optimization should consider MI for VL reduction. This /// white-list approach simplifies this optimization for instructions that may /// have more complex semantics with relation to how it uses VL. 
static bool isSupportedInstr(const MachineInstr &MI) { + if (MI.isPHI() || MI.isFullCopy() || isTupleInsertInstr(MI)) + return true; + const RISCVVPseudosTable::PseudoInfo *RVV = RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); @@ -1348,21 +1392,24 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return true; } -std::optional +DemandedVL RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { const MachineInstr &UserMI = *UserOp.getParent(); const MCInstrDesc &Desc = UserMI.getDesc(); + if (UserMI.isPHI() || UserMI.isFullCopy() || isTupleInsertInstr(UserMI)) + return DemandedVLs.lookup(&UserMI); + if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) { LLVM_DEBUG(dbgs() << " Abort due to lack of VL, assume that" " use VLMAX\n"); - return std::nullopt; + return DemandedVL::vlmax(); } if (RISCVII::readsPastVL( TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) { LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); - return std::nullopt; + return DemandedVL::vlmax(); } unsigned VLOpNum = RISCVII::getVLOpNum(Desc); @@ -1376,11 +1423,10 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { if (UserOp.isTied()) { assert(UserOp.getOperandNo() == UserMI.getNumExplicitDefs() && RISCVII::isFirstDefTiedToFirstUse(UserMI.getDesc())); - auto DemandedVL = DemandedVLs.lookup(&UserMI); - if (!DemandedVL || !RISCV::isVLKnownLE(*DemandedVL, VLOp)) { + if (!RISCV::isVLKnownLE(DemandedVLs.lookup(&UserMI).VL, VLOp)) { LLVM_DEBUG(dbgs() << " Abort because user is passthru in " "instruction with demanded tail\n"); - return std::nullopt; + return DemandedVL::vlmax(); } } @@ -1393,11 +1439,8 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { // If we know the demanded VL of UserMI, then we can reduce the VL it // requires. 
- if (auto DemandedVL = DemandedVLs.lookup(&UserMI)) { - assert(isCandidate(UserMI)); - if (RISCV::isVLKnownLE(*DemandedVL, VLOp)) - return DemandedVL; - } + if (RISCV::isVLKnownLE(DemandedVLs.lookup(&UserMI).VL, VLOp)) + return DemandedVLs.lookup(&UserMI); return VLOp; } @@ -1450,22 +1493,23 @@ static bool isSegmentedStoreInstr(const MachineInstr &MI) { } } -std::optional -RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { - std::optional CommonVL; - SmallSetVector Worklist; +bool RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { + if (MI.isPHI() || MI.isFullCopy() || isTupleInsertInstr(MI)) + return true; + + SmallSetVector OpWorklist; SmallPtrSet PHISeen; for (auto &UserOp : MRI->use_operands(MI.getOperand(0).getReg())) - Worklist.insert(&UserOp); + OpWorklist.insert(&UserOp); - while (!Worklist.empty()) { - MachineOperand &UserOp = *Worklist.pop_back_val(); + while (!OpWorklist.empty()) { + MachineOperand &UserOp = *OpWorklist.pop_back_val(); const MachineInstr &UserMI = *UserOp.getParent(); LLVM_DEBUG(dbgs() << " Checking user: " << UserMI << "\n"); if (UserMI.isFullCopy() && UserMI.getOperand(0).getReg().isVirtual()) { LLVM_DEBUG(dbgs() << " Peeking through uses of COPY\n"); - Worklist.insert_range(llvm::make_pointer_range( + OpWorklist.insert_range(llvm::make_pointer_range( MRI->use_operands(UserMI.getOperand(0).getReg()))); continue; } @@ -1481,8 +1525,8 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { // whole register group). 
if (!isTupleInsertInstr(CandidateMI) && !isSegmentedStoreInstr(CandidateMI)) - return std::nullopt; - Worklist.insert(&UseOp); + return false; + OpWorklist.insert(&UseOp); } continue; } @@ -1492,28 +1536,14 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { if (!PHISeen.insert(&UserMI).second) continue; LLVM_DEBUG(dbgs() << " Peeking through uses of PHI\n"); - Worklist.insert_range(llvm::make_pointer_range( + OpWorklist.insert_range(llvm::make_pointer_range( MRI->use_operands(UserMI.getOperand(0).getReg()))); continue; } - auto VLOp = getMinimumVLForUser(UserOp); - if (!VLOp) - return std::nullopt; - - // Use the largest VL among all the users. If we cannot determine this - // statically, then we cannot optimize the VL. - if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, *VLOp)) { - CommonVL = *VLOp; - LLVM_DEBUG(dbgs() << " User VL is: " << VLOp << "\n"); - } else if (!RISCV::isVLKnownLE(*VLOp, *CommonVL)) { - LLVM_DEBUG(dbgs() << " Abort because cannot determine a common VL\n"); - return std::nullopt; - } - if (!RISCVII::hasSEWOp(UserMI.getDesc().TSFlags)) { LLVM_DEBUG(dbgs() << " Abort due to lack of SEW operand\n"); - return std::nullopt; + return false; } std::optional ConsumerInfo = getOperandInfo(UserOp); @@ -1522,7 +1552,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { LLVM_DEBUG(dbgs() << " Abort due to unknown operand information.\n"); LLVM_DEBUG(dbgs() << " ConsumerInfo is: " << ConsumerInfo << "\n"); LLVM_DEBUG(dbgs() << " ProducerInfo is: " << ProducerInfo << "\n"); - return std::nullopt; + return false; } if (!OperandInfo::areCompatible(*ProducerInfo, *ConsumerInfo)) { @@ -1531,11 +1561,11 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { << " Abort due to incompatible information for EMUL or EEW.\n"); LLVM_DEBUG(dbgs() << " ConsumerInfo is: " << ConsumerInfo << "\n"); LLVM_DEBUG(dbgs() << " ProducerInfo is: " << ProducerInfo << "\n"); - return std::nullopt; + return false; } } - return CommonVL; + return true; 
} bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { @@ -1551,9 +1581,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { return false; } - auto CommonVL = DemandedVLs.lookup(&MI); - if (!CommonVL) - return false; + auto *CommonVL = &DemandedVLs.at(&MI).VL; assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) && "Expected VL to be an Imm or virtual Reg"); @@ -1564,7 +1592,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); if (RISCVInstrInfo::isFaultOnlyFirstLoad(*VLMI) && !MDT->dominates(VLMI, &MI)) - CommonVL = VLMI->getOperand(RISCVII::getVLOpNum(VLMI->getDesc())); + CommonVL = &VLMI->getOperand(RISCVII::getVLOpNum(VLMI->getDesc())); } if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) { @@ -1599,6 +1627,24 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { return true; } +static bool isPhysical(const MachineOperand &MO) { + return MO.isReg() && MO.getReg().isPhysical(); +} + +/// Look through \p MI's operands and propagate what it demands to its uses. +void RISCVVLOptimizer::transfer(const MachineInstr &MI) { + if (!isSupportedInstr(MI) || !checkUsers(MI) || any_of(MI.defs(), isPhysical)) + DemandedVLs[&MI] = DemandedVL::vlmax(); + + for (const MachineOperand &MO : virtual_vec_uses(MI)) { + const MachineInstr *Def = MRI->getVRegDef(MO.getReg()); + DemandedVL Prev = DemandedVLs[Def]; + DemandedVLs[Def] = DemandedVLs[Def].max(getMinimumVLForUser(MO)); + if (DemandedVLs[Def] != Prev) + Worklist.insert(Def); + } +} + bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -1614,15 +1660,18 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { assert(DemandedVLs.empty()); - // For each instruction that defines a vector, compute what VL its - // downstream users demand. + // For each instruction that defines a vector, propagate the VL it + // uses to its inputs. 
for (MachineBasicBlock *MBB : post_order(&MF)) { assert(MDT->isReachableFromEntry(MBB)); - for (MachineInstr &MI : reverse(*MBB)) { - if (!isCandidate(MI)) - continue; - DemandedVLs.insert({&MI, checkUsers(MI)}); - } + for (MachineInstr &MI : reverse(*MBB)) + Worklist.insert(&MI); + } + + while (!Worklist.empty()) { + const MachineInstr *MI = Worklist.front(); + Worklist.remove(MI); + transfer(*MI); } // Then go through and see if we can reduce the VL of any instructions to diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 62651185137c9..ffba2843bde1f 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -455,8 +455,7 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { True->getOperand(1).setReg(MI.getOperand(2).getReg()); // If True is masked then its passthru needs to be in VRNoV0. MRI->constrainRegClass(True->getOperand(1).getReg(), - TII->getRegClass(True->getDesc(), 1, TRI, - *True->getParent()->getParent())); + TII->getRegClass(True->getDesc(), 1, TRI)); } MI.setDesc(TII->get(NewOpc)); @@ -674,10 +673,9 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { SrcPassthru.setReg(Passthru.getReg()); // If Src is masked then its passthru needs to be in VRNoV0. 
if (Passthru.getReg() != RISCV::NoRegister) - MRI->constrainRegClass(Passthru.getReg(), - TII->getRegClass(Src->getDesc(), - SrcPassthru.getOperandNo(), TRI, - *Src->getParent()->getParent())); + MRI->constrainRegClass( + Passthru.getReg(), + TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo(), TRI)); } if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) { diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index c2c08f8831307..d76180ce97e9e 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -232,6 +232,11 @@ namespace SpecConstantOpOperands { #include "SPIRVGenTables.inc" } // namespace SpecConstantOpOperands +namespace FPEncoding { +#define GET_FPEncoding_DECL +#include "SPIRVGenTables.inc" +} // namespace FPEncoding + struct ExtendedBuiltin { StringRef Name; InstructionSet::InstructionSet Set; diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index cfe24c84941a9..115766ce886c7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -203,6 +203,18 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeFloat(uint32_t Width, }); } +SPIRVType * +SPIRVGlobalRegistry::getOpTypeFloat(uint32_t Width, + MachineIRBuilder &MIRBuilder, + SPIRV::FPEncoding::FPEncoding FPEncode) { + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeFloat) + .addDef(createTypeVReg(MIRBuilder)) + .addImm(Width) + .addImm(FPEncode); + }); +} + SPIRVType *SPIRVGlobalRegistry::getOpTypeVoid(MachineIRBuilder &MIRBuilder) { return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { return MIRBuilder.buildInstr(SPIRV::OpTypeVoid) @@ -1041,8 +1053,14 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType( return Width == 1 ? 
getOpTypeBool(MIRBuilder) : getOpTypeInt(Width, MIRBuilder, false); } - if (Ty->isFloatingPointTy()) - return getOpTypeFloat(Ty->getPrimitiveSizeInBits(), MIRBuilder); + if (Ty->isFloatingPointTy()) { + if (Ty->isBFloatTy()) { + return getOpTypeFloat(Ty->getPrimitiveSizeInBits(), MIRBuilder, + SPIRV::FPEncoding::BFloat16KHR); + } else { + return getOpTypeFloat(Ty->getPrimitiveSizeInBits(), MIRBuilder); + } + } if (Ty->isVoidTy()) return getOpTypeVoid(MIRBuilder); if (Ty->isVectorTy()) { diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index 7ef812828b7cc..a648defa0a888 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -438,6 +438,9 @@ class SPIRVGlobalRegistry : public SPIRVIRMapping { SPIRVType *getOpTypeFloat(uint32_t Width, MachineIRBuilder &MIRBuilder); + SPIRVType *getOpTypeFloat(uint32_t Width, MachineIRBuilder &MIRBuilder, + SPIRV::FPEncoding::FPEncoding FPEncode); + SPIRVType *getOpTypeVoid(MachineIRBuilder &MIRBuilder); SPIRVType *getOpTypeVector(uint32_t NumElems, SPIRVType *ElemType, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 8d10cd0ffb3dd..496dcba17c10d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -167,7 +167,7 @@ def OpTypeVoid: Op<19, (outs TYPE:$type), (ins), "$type = OpTypeVoid">; def OpTypeBool: Op<20, (outs TYPE:$type), (ins), "$type = OpTypeBool">; def OpTypeInt: Op<21, (outs TYPE:$type), (ins i32imm:$width, i32imm:$signedness), "$type = OpTypeInt $width $signedness">; -def OpTypeFloat: Op<22, (outs TYPE:$type), (ins i32imm:$width), +def OpTypeFloat: Op<22, (outs TYPE:$type), (ins i32imm:$width, variable_ops), "$type = OpTypeFloat $width">; def OpTypeVector: Op<23, (outs TYPE:$type), (ins TYPE:$compType, i32imm:$compCount), "$type = OpTypeVector $compType $compCount">; diff --git 
a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index d2824ee2d2caf..ed933f872d136 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -210,6 +210,7 @@ def CooperativeMatrixLayoutOperand : OperandCategory; def CooperativeMatrixOperandsOperand : OperandCategory; def SpecConstantOpOperandsOperand : OperandCategory; def MatrixMultiplyAccumulateOperandsOperand : OperandCategory; +def FPEncodingOperand : OperandCategory; //===----------------------------------------------------------------------===// // Definition of the Environments @@ -1996,3 +1997,28 @@ defm MatrixAPackedFloat16INTEL : MatrixMultiplyAccumulateOperandsOperand<0x400, defm MatrixBPackedFloat16INTEL : MatrixMultiplyAccumulateOperandsOperand<0x800, [SPV_INTEL_subgroup_matrix_multiply_accumulate]>; defm MatrixAPackedBFloat16INTEL : MatrixMultiplyAccumulateOperandsOperand<0x1000, [SPV_INTEL_subgroup_matrix_multiply_accumulate]>; defm MatrixBPackedBFloat16INTEL : MatrixMultiplyAccumulateOperandsOperand<0x2000, [SPV_INTEL_subgroup_matrix_multiply_accumulate]>; + +//===----------------------------------------------------------------------===// +// Multiclass used to define FPEncoding enum values and at the +// same time SymbolicOperand entries with extensions. 
+//===----------------------------------------------------------------------===// +def FPEncoding : GenericEnum, Operand { + let FilterClass = "FPEncoding"; + let NameField = "Name"; + let ValueField = "Value"; + let PrintMethod = !strconcat("printSymbolicOperand"); +} + +class FPEncoding value> { + string Name = name; + bits<32> Value = value; +} + +multiclass FPEncodingOperand value, list reqExtensions>{ + def NAME : FPEncoding; + defm : SymbolicOperandWithRequirements< + FPEncodingOperand, value, NAME, 0, 0, + reqExtensions, [], []>; +} + +defm BFloat16KHR : FPEncodingOperand<0, []>; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 0eac43d8469df..9f6f9c7225357 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -60,30 +60,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() { initializeSPIRVStripConvergentIntrinsicsPass(PR); } -static std::string computeDataLayout(const Triple &TT) { - const auto Arch = TT.getArch(); - // TODO: this probably needs to be revisited: - // Logical SPIR-V has no pointer size, so any fixed pointer size would be - // wrong. The choice to default to 32 or 64 is just motivated by another - // memory model used for graphics: PhysicalStorageBuffer64. But it shouldn't - // mean anything. 
- if (Arch == Triple::spirv32) - return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-" - "v256:256-v512:512-v1024:1024-n8:16:32:64-G1"; - if (Arch == Triple::spirv) - return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-" - "v512:512-v1024:1024-n8:16:32:64-G10"; - if (TT.getVendor() == Triple::VendorType::AMD && - TT.getOS() == Triple::OSType::AMDHSA) - return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-" - "v512:512-v1024:1024-n32:64-S32-G1-P4-A0"; - if (TT.getVendor() == Triple::VendorType::Intel) - return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-" - "v512:512-v1024:1024-n8:16:32:64-G1-P9-A0"; - return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-" - "v512:512-v1024:1024-n8:16:32:64-G1"; -} - static Reloc::Model getEffectiveRelocModel(std::optional RM) { if (!RM) return Reloc::PIC_; @@ -99,7 +75,7 @@ SPIRVTargetMachine::SPIRVTargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT), TT, CPU, FS, Options, + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique()), diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index cd0f649912980..e28f4457263f4 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -38,7 +38,7 @@ static cl::opt void SparcInstrInfo::anchor() {} SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST) - : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(), + : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(ST), Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp 
b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp index e4db27a63076d..0a14746f587bb 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -31,7 +31,8 @@ static cl::opt ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false), cl::desc("Reserve application registers (%g2-%g4)")); -SparcRegisterInfo::SparcRegisterInfo() : SparcGenRegisterInfo(SP::O7) {} +SparcRegisterInfo::SparcRegisterInfo(const SparcSubtarget &STI) + : SparcGenRegisterInfo(SP::O7), Is64Bit(STI.is64Bit()) {} const MCPhysReg* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { @@ -111,11 +112,10 @@ bool SparcRegisterInfo::isReservedReg(const MachineFunction &MF, return getReservedRegs(MF)[Reg]; } -const TargetRegisterClass* -SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { - const SparcSubtarget &Subtarget = MF.getSubtarget(); - return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass; +const TargetRegisterClass * +SparcRegisterInfo::getPointerRegClass(unsigned Kind) const { + assert(Kind == 0 && "this should only be used for default cases"); + return Is64Bit ? &SP::I64RegsRegClass : &SP::IntRegsRegClass; } static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II, diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/llvm/lib/Target/Sparc/SparcRegisterInfo.h index eae859ce1a519..abd8baeff56a2 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.h +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.h @@ -19,8 +19,14 @@ #include "SparcGenRegisterInfo.inc" namespace llvm { +class SparcSubtarget; + struct SparcRegisterInfo : public SparcGenRegisterInfo { - SparcRegisterInfo(); +private: + const bool Is64Bit; + +public: + explicit SparcRegisterInfo(const SparcSubtarget &STI); /// Code Generation virtual methods... 
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; @@ -32,8 +38,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const override; bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const; - const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const override; + const TargetRegisterClass *getPointerRegClass(unsigned Kind) const override; bool eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp index 754c8f63ca4ec..27ab57c11cf71 100644 --- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp @@ -38,39 +38,6 @@ static cl::opt BranchRelaxation("sparc-enable-branch-relax", cl::Hidden, cl::init(true), cl::desc("Relax out of range conditional branches")); -static std::string computeDataLayout(const Triple &T) { - const bool is64Bit = T.isSPARC64(); - - // Sparc is typically big endian, but some are little. - std::string Ret = T.getArch() == Triple::sparcel ? "e" : "E"; - Ret += "-m:e"; - - // Some ABIs have 32bit pointers. - if (!is64Bit) - Ret += "-p:32:32"; - - // Alignments for 64 bit integers. - Ret += "-i64:64"; - - // Alignments for 128 bit integers. - // This is not specified in the ABI document but is the de facto standard. - Ret += "-i128:128"; - - // On SparcV9 128 floats are aligned to 128 bits, on others only to 64. - // On SparcV9 registers can hold 64 or 32 bits, on others only 32. 
- if (is64Bit) - Ret += "-n32:64"; - else - Ret += "-f128:64-n32"; - - if (is64Bit) - Ret += "-S128"; - else - Ret += "-S64"; - - return Ret; -} - static Reloc::Model getEffectiveRelocModel(std::optional RM) { return RM.value_or(Reloc::Static); } @@ -111,7 +78,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, const Triple &TT, std::optional CM, CodeGenOptLevel OL, bool JIT) : CodeGenTargetMachineImpl( - T, computeDataLayout(TT), TT, CPU, FS, Options, + T, TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveSparcCodeModel(CM, getEffectiveRelocModel(RM), TT.isSPARC64(), JIT), diff --git a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index 0b91aba67694f..a724c5a7f97c8 100644 --- a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -20,6 +20,7 @@ #include using namespace llvm; +using namespace llvm::MCD; #define DEBUG_TYPE "systemz-disassembler" diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index 34888f44aa221..34d58e05ff3e4 100644 --- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -115,12 +115,11 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { } bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const { - const MachineFunction &MF = *MI->getParent()->getParent(); const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); const MCInstrDesc &MID = MI->getDesc(); unsigned Count = 0; for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) { - const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI, MF); + const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI); if (RC == nullptr) continue; if (OpIdx >= MID.getNumDefs() && diff --git 
a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 6f146b67f8566..a05fdc74e6366 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1851,7 +1851,7 @@ bool SystemZDAGToDAGISel::SelectInlineAsmMemoryOperand( if (selectBDXAddr(Form, DispRange, Op, Base, Disp, Index)) { const TargetRegisterClass *TRC = - Subtarget->getRegisterInfo()->getPointerRegClass(*MF); + Subtarget->getRegisterInfo()->getPointerRegClass(); SDLoc DL(Base); SDValue RC = CurDAG->getTargetConstant(TRC->getID(), DL, MVT::i32); diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h index 460be432811a4..b1de145db3d31 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -135,8 +135,7 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo { /// This is currently only used by LOAD_STACK_GUARD, which requires a non-%r0 /// register, hence ADDR64. const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind=0) const override { + getPointerRegClass(unsigned Kind = 0) const override { return &SystemZ::ADDR64BitRegClass; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index ece8928accd0c..3d0c04b574933 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -54,47 +54,6 @@ LLVMInitializeSystemZTarget() { initializeSystemZCopyPhysRegsPass(PR); } -static std::string computeDataLayout(const Triple &TT) { - std::string Ret; - - // Big endian. - Ret += "E"; - - // Data mangling. - Ret += DataLayout::getManglingComponent(TT); - - // Special features for z/OS. - if (TT.isOSzOS()) { - if (TT.isArch64Bit()) { - // Custom address space for ptr32. 
- Ret += "-p1:32:32"; - } - } - - // Make sure that global data has at least 16 bits of alignment by - // default, so that we can refer to it using LARL. We don't have any - // special requirements for stack variables though. - Ret += "-i1:8:16-i8:8:16"; - - // 64-bit integers are naturally aligned. - Ret += "-i64:64"; - - // 128-bit floats are aligned only to 64 bits. - Ret += "-f128:64"; - - // The DataLayout string always holds a vector alignment of 64 bits, see - // comment in clang/lib/Basic/Targets/SystemZ.h. - Ret += "-v128:64"; - - // We prefer 16 bits of aligned for all globals; see above. - Ret += "-a:8:16"; - - // Integer registers are 32 or 64 bits. - Ret += "-n32:64"; - - return Ret; -} - static std::unique_ptr createTLOF(const Triple &TT) { if (TT.isOSzOS()) return std::make_unique(); @@ -163,7 +122,7 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT, std::optional CM, CodeGenOptLevel OL, bool JIT) : CodeGenTargetMachineImpl( - T, computeDataLayout(TT), TT, CPU, FS, Options, + T, TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveSystemZCodeModel(CM, getEffectiveRelocModel(RM), JIT), OL), diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp index f381b7d321598..99e1f61c088eb 100644 --- a/llvm/lib/Target/VE/VERegisterInfo.cpp +++ b/llvm/lib/Target/VE/VERegisterInfo.cpp @@ -93,8 +93,7 @@ BitVector VERegisterInfo::getReservedRegs(const MachineFunction &MF) const { } const TargetRegisterClass * -VERegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { +VERegisterInfo::getPointerRegClass(unsigned Kind) const { return &VE::I64RegClass; } diff --git a/llvm/lib/Target/VE/VERegisterInfo.h b/llvm/lib/Target/VE/VERegisterInfo.h index 3f6feedf42534..999dc856c9bd5 100644 --- a/llvm/lib/Target/VE/VERegisterInfo.h +++ b/llvm/lib/Target/VE/VERegisterInfo.h @@ -31,8 +31,7 @@ struct VERegisterInfo : public VEGenRegisterInfo { BitVector 
getReservedRegs(const MachineFunction &MF) const override; - const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const override; + const TargetRegisterClass *getPointerRegClass(unsigned Kind) const override; bool eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp index 14b8e330d87a4..dc9ca48cc221b 100644 --- a/llvm/lib/Target/VE/VETargetMachine.cpp +++ b/llvm/lib/Target/VE/VETargetMachine.cpp @@ -35,38 +35,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() { initializeVEDAGToDAGISelLegacyPass(PR); } -static std::string computeDataLayout(const Triple &T) { - // Aurora VE is little endian - std::string Ret = "e"; - - // Use ELF mangling - Ret += "-m:e"; - - // Alignments for 64 bit integers. - Ret += "-i64:64"; - - // VE supports 32 bit and 64 bits integer on registers - Ret += "-n32:64"; - - // Stack alignment is 128 bits - Ret += "-S128"; - - // Vector alignments are 64 bits - // Need to define all of them. Otherwise, each alignment becomes - // the size of each data by default. 
- Ret += "-v64:64:64"; // for v2f32 - Ret += "-v128:64:64"; - Ret += "-v256:64:64"; - Ret += "-v512:64:64"; - Ret += "-v1024:64:64"; - Ret += "-v2048:64:64"; - Ret += "-v4096:64:64"; - Ret += "-v8192:64:64"; - Ret += "-v16384:64:64"; // for v256f64 - - return Ret; -} - static Reloc::Model getEffectiveRelocModel(std::optional RM) { return RM.value_or(Reloc::Static); } @@ -91,7 +59,7 @@ VETargetMachine::VETargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT), TT, CPU, FS, Options, + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(createTLOF()), diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 2f36e26066d81..27f7e1ada1250 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -278,7 +278,7 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, DebugLoc DL; const TargetRegisterClass *PtrRC = - MRI.getTargetRegisterInfo()->getPointerRegClass(MF); + MRI.getTargetRegisterInfo()->getPointerRegClass(); unsigned SPReg = getSPReg(MF); if (StackSize) SPReg = MRI.createVirtualRegister(PtrRC); @@ -349,7 +349,7 @@ void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF, SPReg = FI->getBasePointerVreg(); } else if (StackSize) { const TargetRegisterClass *PtrRC = - MRI.getTargetRegisterInfo()->getPointerRegClass(MF); + MRI.getTargetRegisterInfo()->getPointerRegClass(); Register OffsetReg = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, InsertPt, DL, TII->get(getOpcConst(MF)), OffsetReg) .addImm(StackSize); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def index 1eae3586d16b8..23108e429eda8 100644 --- 
a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -28,6 +28,7 @@ HANDLE_NODETYPE(BR_IF) HANDLE_NODETYPE(BR_TABLE) HANDLE_NODETYPE(DOT) HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) +HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) HANDLE_NODETYPE(SHUFFLE) HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index fc852d0a12e14..2541b0433ab59 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -110,7 +110,7 @@ void WebAssemblyDAGToDAGISel::PreprocessISelDAG() { } static SDValue getTagSymNode(int Tag, SelectionDAG *DAG) { - assert(Tag == WebAssembly::CPP_EXCEPTION || WebAssembly::C_LONGJMP); + assert(Tag == WebAssembly::CPP_EXCEPTION || Tag == WebAssembly::C_LONGJMP); auto &MF = DAG->getMachineFunction(); const auto &TLI = DAG->getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG->getDataLayout()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index fe100dab427ef..64b9dc31f75b7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -418,28 +418,34 @@ MVT WebAssemblyTargetLowering::getPointerMemTy(const DataLayout &DL, bool WebAssemblyTargetLowering::shouldExpandPartialReductionIntrinsic( const IntrinsicInst *I) const { - if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add) + if (I->getIntrinsicID() != Intrinsic::vector_partial_reduce_add) return true; EVT VT = EVT::getEVT(I->getType()); + if (VT.getSizeInBits() > 128) + return true; + auto Op1 = I->getOperand(1); if (auto *InputInst = dyn_cast(Op1)) { - if (InstructionOpcodeToISD(InputInst->getOpcode()) != ISD::MUL) - return true; - - if (isa(InputInst->getOperand(0)) && - 
isa(InputInst->getOperand(1))) { - // dot only supports signed inputs but also support lowering unsigned. - if (cast(InputInst->getOperand(0))->getOpcode() != - cast(InputInst->getOperand(1))->getOpcode()) - return true; - - EVT Op1VT = EVT::getEVT(Op1->getType()); - if (Op1VT.getVectorElementType() == VT.getVectorElementType() && - ((VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()) || - (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount()))) - return false; + unsigned Opcode = InstructionOpcodeToISD(InputInst->getOpcode()); + if (Opcode == ISD::MUL) { + if (isa(InputInst->getOperand(0)) && + isa(InputInst->getOperand(1))) { + // dot only supports signed inputs but also support lowering unsigned. + if (cast(InputInst->getOperand(0))->getOpcode() != + cast(InputInst->getOperand(1))->getOpcode()) + return true; + + EVT Op1VT = EVT::getEVT(Op1->getType()); + if (Op1VT.getVectorElementType() == VT.getVectorElementType() && + ((VT.getVectorElementCount() * 2 == + Op1VT.getVectorElementCount()) || + (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount()))) + return false; + } + } else if (ISD::isExtOpcode(Opcode)) { + return false; } } return true; @@ -2111,83 +2117,98 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, // extmul and adds. 
SDValue performLowerPartialReduction(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN); - if (N->getConstantOperandVal(0) != - Intrinsic::experimental_vector_partial_reduce_add) + if (N->getConstantOperandVal(0) != Intrinsic::vector_partial_reduce_add) return SDValue(); assert(N->getValueType(0) == MVT::v4i32 && "can only support v4i32"); SDLoc DL(N); - SDValue Mul = N->getOperand(2); - assert(Mul->getOpcode() == ISD::MUL && "expected mul input"); - - SDValue ExtendLHS = Mul->getOperand(0); - SDValue ExtendRHS = Mul->getOperand(1); - assert((ISD::isExtOpcode(ExtendLHS.getOpcode()) && - ISD::isExtOpcode(ExtendRHS.getOpcode())) && - "expected widening mul"); - assert(ExtendLHS.getOpcode() == ExtendRHS.getOpcode() && - "expected mul to use the same extend for both operands"); - - SDValue ExtendInLHS = ExtendLHS->getOperand(0); - SDValue ExtendInRHS = ExtendRHS->getOperand(0); - bool IsSigned = ExtendLHS->getOpcode() == ISD::SIGN_EXTEND; - - if (ExtendInLHS->getValueType(0) == MVT::v8i16) { - if (IsSigned) { - // i32x4.dot_i16x8_s - SDValue Dot = DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, - ExtendInLHS, ExtendInRHS); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Dot); - } - unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_U; - unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_U; + SDValue Input = N->getOperand(2); + if (Input->getOpcode() == ISD::MUL) { + SDValue ExtendLHS = Input->getOperand(0); + SDValue ExtendRHS = Input->getOperand(1); + assert((ISD::isExtOpcode(ExtendLHS.getOpcode()) && + ISD::isExtOpcode(ExtendRHS.getOpcode())) && + "expected widening mul or add"); + assert(ExtendLHS.getOpcode() == ExtendRHS.getOpcode() && + "expected binop to use the same extend for both operands"); + + SDValue ExtendInLHS = ExtendLHS->getOperand(0); + SDValue ExtendInRHS = ExtendRHS->getOperand(0); + bool IsSigned = ExtendLHS->getOpcode() == ISD::SIGN_EXTEND; + unsigned LowOpc = + IsSigned ? 
WebAssemblyISD::EXTEND_LOW_S : WebAssemblyISD::EXTEND_LOW_U; + unsigned HighOpc = IsSigned ? WebAssemblyISD::EXTEND_HIGH_S + : WebAssemblyISD::EXTEND_HIGH_U; + SDValue LowLHS; + SDValue LowRHS; + SDValue HighLHS; + SDValue HighRHS; + + auto AssignInputs = [&](MVT VT) { + LowLHS = DAG.getNode(LowOpc, DL, VT, ExtendInLHS); + LowRHS = DAG.getNode(LowOpc, DL, VT, ExtendInRHS); + HighLHS = DAG.getNode(HighOpc, DL, VT, ExtendInLHS); + HighRHS = DAG.getNode(HighOpc, DL, VT, ExtendInRHS); + }; - // (add (add (extmul_low_sx lhs, rhs), (extmul_high_sx lhs, rhs))) - SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v4i32, ExtendInLHS); - SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v4i32, ExtendInRHS); - SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v4i32, ExtendInLHS); - SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v4i32, ExtendInRHS); + if (ExtendInLHS->getValueType(0) == MVT::v8i16) { + if (IsSigned) { + // i32x4.dot_i16x8_s + SDValue Dot = DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, + ExtendInLHS, ExtendInRHS); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Dot); + } - SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v4i32, LowLHS, LowRHS); - SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v4i32, HighLHS, HighRHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, MulLow, MulHigh); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + // (add (add (extmul_low_sx lhs, rhs), (extmul_high_sx lhs, rhs))) + MVT VT = MVT::v4i32; + AssignInputs(VT); + SDValue MulLow = DAG.getNode(ISD::MUL, DL, VT, LowLHS, LowRHS); + SDValue MulHigh = DAG.getNode(ISD::MUL, DL, VT, HighLHS, HighRHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, MulLow, MulHigh); + return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Add); + } else { + assert(ExtendInLHS->getValueType(0) == MVT::v16i8 && + "expected v16i8 input types"); + AssignInputs(MVT::v8i16); + // Lower to a wider tree, using twice the operations compared to above. 
+ if (IsSigned) { + // Use two dots + SDValue DotLHS = + DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, LowLHS, LowRHS); + SDValue DotRHS = + DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, HighLHS, HighRHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, DotLHS, DotRHS); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + } + + SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS); + SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS); + + SDValue AddLow = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, + MVT::v4i32, MulLow); + SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, + MVT::v4i32, MulHigh); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + } } else { - assert(ExtendInLHS->getValueType(0) == MVT::v16i8 && - "expected v16i8 input types"); - // Lower to a wider tree, using twice the operations compared to above. - if (IsSigned) { - // Use two dots - unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_S; - unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_S; - SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInLHS); - SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInRHS); - SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInLHS); - SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInRHS); - SDValue DotLHS = - DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, LowLHS, LowRHS); - SDValue DotRHS = - DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, HighLHS, HighRHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, DotLHS, DotRHS); + // Accumulate the input using extadd_pairwise. + assert(ISD::isExtOpcode(Input.getOpcode()) && "expected extend"); + bool IsSigned = Input->getOpcode() == ISD::SIGN_EXTEND; + unsigned PairwiseOpc = IsSigned ? 
WebAssemblyISD::EXT_ADD_PAIRWISE_S + : WebAssemblyISD::EXT_ADD_PAIRWISE_U; + SDValue ExtendIn = Input->getOperand(0); + if (ExtendIn->getValueType(0) == MVT::v8i16) { + SDValue Add = DAG.getNode(PairwiseOpc, DL, MVT::v4i32, ExtendIn); return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); } - unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_U; - unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_U; - SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInLHS); - SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInRHS); - SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInLHS); - SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInRHS); - - SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS); - SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS); - - SDValue AddLow = - DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, MVT::v4i32, MulLow); - SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, - MVT::v4i32, MulHigh); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh); + assert(ExtendIn->getValueType(0) == MVT::v16i8 && + "expected v16i8 input types"); + SDValue Add = + DAG.getNode(PairwiseOpc, DL, MVT::v4i32, + DAG.getNode(PairwiseOpc, DL, MVT::v8i16, ExtendIn)); return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 3c26b453c4482..d8948ad2df037 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1454,12 +1454,13 @@ def : Pat<(t1.vt (bitconvert (t2.vt V128:$v))), (t1.vt V128:$v)>; // Extended pairwise addition def extadd_pairwise_u : SDNode<"WebAssemblyISD::EXT_ADD_PAIRWISE_U", extend_t>; +def extadd_pairwise_s : SDNode<"WebAssemblyISD::EXT_ADD_PAIRWISE_S", extend_t>; -defm "" : SIMDConvert; defm "" : SIMDConvert; -defm "" : 
SIMDConvert; defm "" : SIMDConvert; @@ -1468,6 +1469,10 @@ def : Pat<(v4i32 (int_wasm_extadd_pairwise_unsigned (v8i16 V128:$in))), (extadd_pairwise_u_I32x4 V128:$in)>; def : Pat<(v8i16 (int_wasm_extadd_pairwise_unsigned (v16i8 V128:$in))), (extadd_pairwise_u_I16x8 V128:$in)>; +def : Pat<(v4i32 (int_wasm_extadd_pairwise_signed (v8i16 V128:$in))), + (extadd_pairwise_s_I32x4 V128:$in)>; +def : Pat<(v8i16 (int_wasm_extadd_pairwise_signed (v16i8 V128:$in))), + (extadd_pairwise_s_I16x8 V128:$in)>; // f64x2 <-> f32x4 conversions def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 18886ba570681..ebb5f555df67a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -117,7 +117,7 @@ bool WebAssemblyRegisterInfo::eliminateFrameIndex( if (FrameOffset) { // Create i32/64.add SP, offset and make it the operand. const TargetRegisterClass *PtrRC = - MRI.getTargetRegisterInfo()->getPointerRegClass(MF); + MRI.getTargetRegisterInfo()->getPointerRegClass(); Register OffsetOp = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssemblyFrameLowering::getOpcConst(MF)), @@ -149,10 +149,8 @@ WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const { } const TargetRegisterClass * -WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { +WebAssemblyRegisterInfo::getPointerRegClass(unsigned Kind) const { assert(Kind == 0 && "Only one kind of pointer on WebAssembly"); - if (MF.getSubtarget().hasAddr64()) - return &WebAssembly::I64RegClass; - return &WebAssembly::I32RegClass; + return TT.getArch() == Triple::wasm64 ? 
&WebAssembly::I64RegClass + : &WebAssembly::I32RegClass; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h index d875e4b93603b..3a73ff6b1b3b0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h @@ -42,8 +42,7 @@ class WebAssemblyRegisterInfo final : public WebAssemblyGenRegisterInfo { Register getFrameRegister(const MachineFunction &MF) const override; const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; + getPointerRegClass(unsigned Kind = 0) const override; // This does not apply to wasm. const uint32_t *getNoPreservedMask() const override { return nullptr; } }; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 6827ee6527947..a9c638cde1259 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -196,19 +196,9 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl( - T, - TT.isArch64Bit() - ? (TT.isOSEmscripten() ? "e-m:e-p:64:64-p10:8:8-p20:8:8-i64:64-" - "i128:128-f128:64-n32:64-S128-ni:1:10:20" - : "e-m:e-p:64:64-p10:8:8-p20:8:8-i64:64-" - "i128:128-n32:64-S128-ni:1:10:20") - : (TT.isOSEmscripten() ? 
"e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-" - "i128:128-f128:64-n32:64-S128-ni:1:10:20" - : "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-" - "i128:128-n32:64-S128-ni:1:10:20"), - TT, CPU, FS, Options, getEffectiveRelocModel(RM, TT), - getEffectiveCodeModel(CM, CodeModel::Large), OL), + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, + getEffectiveRelocModel(RM, TT), + getEffectiveCodeModel(CM, CodeModel::Large), OL), TLOF(new WebAssemblyTargetObjectFile()), UsesMultivalueABI(Options.MCOptions.getABIName() == "experimental-mv") { // WebAssembly type-checks instructions, but a noreturn function with a return diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 0eefd3e2b3500..92a9812df2127 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -316,7 +316,13 @@ InstructionCost WebAssemblyTTIImpl::getPartialReductionCost( if (CostKind != TTI::TCK_RecipThroughput) return Invalid; - InstructionCost Cost(TTI::TCC_Basic); + if (Opcode != Instruction::Add) + return Invalid; + + EVT AccumEVT = EVT::getEVT(AccumType); + // TODO: Add i64 accumulator. + if (AccumEVT != MVT::i32) + return Invalid; // Possible options: // - i16x8.extadd_pairwise_i8x16_sx @@ -324,23 +330,26 @@ InstructionCost WebAssemblyTTIImpl::getPartialReductionCost( // - i32x4.dot_i16x8_s // Only try to support dot, for now. 
- if (Opcode != Instruction::Add) + EVT InputEVT = EVT::getEVT(InputTypeA); + if (!((InputEVT == MVT::i16 && VF.getFixedValue() == 8) || + (InputEVT == MVT::i8 && VF.getFixedValue() == 16))) { return Invalid; + } - if (!BinOp || *BinOp != Instruction::Mul) + if (OpAExtend == TTI::PR_None) return Invalid; - if (InputTypeA != InputTypeB) - return Invalid; + InstructionCost Cost(TTI::TCC_Basic); + if (!BinOp) + return Cost; if (OpAExtend != OpBExtend) return Invalid; - EVT InputEVT = EVT::getEVT(InputTypeA); - EVT AccumEVT = EVT::getEVT(AccumType); + if (*BinOp != Instruction::Mul) + return Invalid; - // TODO: Add i64 accumulator. - if (AccumEVT != MVT::i32) + if (InputTypeA != InputTypeB) return Invalid; // Signed inputs can lower to dot diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 7fe58539cd4ec..2c752457d165e 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -98,10 +98,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .widenScalarToNextPow2(0, /*Min=*/8) .clampScalar(0, s8, sMaxScalar); - getActionDefinitionsBuilder( - {G_LROUND, G_LLROUND, G_FCOS, G_FCOSH, G_FACOS, G_FSIN, G_FSINH, - G_FASIN, G_FTAN, G_FTANH, G_FATAN, G_FATAN2, G_FPOW, G_FEXP, - G_FEXP2, G_FEXP10, G_FLOG, G_FLOG2, G_FLOG10, G_FPOWI, G_FSINCOS}) + getActionDefinitionsBuilder({G_LROUND, G_LLROUND, G_FCOS, G_FCOSH, G_FACOS, + G_FSIN, G_FSINH, G_FASIN, G_FTAN, G_FTANH, + G_FATAN, G_FATAN2, G_FPOW, G_FEXP, G_FEXP2, + G_FEXP10, G_FLOG, G_FLOG2, G_FLOG10, G_FPOWI, + G_FSINCOS, G_FCEIL, G_FFLOOR}) .libcall(); getActionDefinitionsBuilder(G_FSQRT) @@ -580,7 +581,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .lower(); // fp intrinsics - getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN) + getActionDefinitionsBuilder({G_INTRINSIC_ROUNDEVEN, G_INTRINSIC_TRUNC}) .scalarize(0) .minScalar(0, LLT::scalar(32)) .libcall(); diff --git 
a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index ff22ee8c86fac..a7734e9200a19 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -478,9 +478,9 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) { Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 || Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri || Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi || - Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 || - Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TAILJMPr64_REX || - Opc == X86::TAILJMPm64_REX; + Opc == X86::TCRETURN_WINmi64 || Opc == X86::TCRETURNri64 || + Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURNri64_ImpCall || + Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX; } void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) { diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index 3ac7c8222b54b..d2e35277419f7 100644 --- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -388,7 +388,7 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, MachineMemOperand *SMMO = *StoreInst->memoperands_begin(); Register Reg1 = MRI->createVirtualRegister( - TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent()))); + TII->getRegClass(TII->get(NLoadOpcode), 0, TRI)); MachineInstr *NewLoad = BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), Reg1) @@ -553,8 +553,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { } unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { - const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI, - *LoadInst->getParent()->getParent()); + const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI); return TRI->getRegSizeInBits(*TRC) / 8; } 
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index 4ea30de78402f..c0c7f5adf06ef 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -174,7 +174,8 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { return true; } -static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { +static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, + const X86Subtarget &ST) { uint64_t TSFlags = MI.getDesc().TSFlags; // Check for EVEX instructions only. @@ -239,14 +240,14 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { return I->NewOpc; }; - // Redundant NDD ops cannot be safely compressed if either: - // - the legacy op would introduce a partial write that BreakFalseDeps - // identified as a potential stall, or - // - the op is writing to a subregister of a live register, i.e. the - // full (zeroed) result is used. - // Both cases are indicated by an implicit def of the superregister. + Register Dst = MI.getOperand(0).getReg(); if (IsRedundantNDD) { - Register Dst = MI.getOperand(0).getReg(); + // Redundant NDD ops cannot be safely compressed if either: + // - the legacy op would introduce a partial write that BreakFalseDeps + // identified as a potential stall, or + // - the op is writing to a subregister of a live register, i.e. the + // full (zeroed) result is used. + // Both cases are indicated by an implicit def of the superregister. 
if (Dst && (X86::GR16RegClass.contains(Dst) || X86::GR8RegClass.contains(Dst))) { Register Super = getX86SubSuperRegister(Dst, 64); @@ -260,6 +261,33 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { if (!X86EnableAPXForRelocation) assert(!isAddMemInstrWithRelocation(MI) && "Unexpected NDD instruction with relocation!"); + } else if (Opc == X86::ADD32ri_ND || Opc == X86::ADD64ri32_ND || + Opc == X86::ADD32rr_ND || Opc == X86::ADD64rr_ND) { + // Non-redundant NDD ADD can be compressed to LEA when: + // - No EGPR register used and + // - EFLAGS is dead. + if (!usesExtendedRegister(MI) && + MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) { + Register Src1 = MI.getOperand(1).getReg(); + const MachineOperand &Src2 = MI.getOperand(2); + bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND; + const MCInstrDesc &NewDesc = + ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r); + if (Is32BitReg) + Src1 = getX86SubSuperRegister(Src1, 64); + MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst) + .addReg(Src1) + .addImm(1); + if (Opc == X86::ADD32ri_ND || Opc == X86::ADD64ri32_ND) + MIB.addReg(0).add(Src2); + else if (Is32BitReg) + MIB.addReg(getX86SubSuperRegister(Src2.getReg(), 64)).addImm(0); + else + MIB.add(Src2).addImm(0); + MIB.addReg(0); + MI.removeFromParent(); + return true; + } } // NonNF -> NF only if it's not a compressible NDD instruction and eflags is @@ -318,8 +346,8 @@ bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { // Traverse the basic block. 
- for (MachineInstr &MI : MBB) - Changed |= CompressEVEXImpl(MI, ST); + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) + Changed |= CompressEVEXImpl(MI, MBB, ST); } LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";); return Changed; diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 93e55ca5fabf9..339e2f3b7209e 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -175,8 +175,7 @@ class InstrReplacerDstCOPY : public InstrConverterBase { const DebugLoc &DL = MI->getDebugLoc(); Register Reg = MRI->createVirtualRegister( - TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), - *MBB->getParent())); + TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo())); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) Bld.add(MO); diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 9457e718de699..4a9b824b0db14 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -276,8 +276,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::TCRETURNdi64cc: case X86::TCRETURNri64: case X86::TCRETURNri64_ImpCall: - case X86::TCRETURNmi64: { - bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; + case X86::TCRETURNmi64: + case X86::TCRETURN_WINmi64: { + bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64 || + Opcode == X86::TCRETURN_WINmi64; MachineOperand &JumpTarget = MBBI->getOperand(0); MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 
X86::AddrNumOperands : 1); @@ -341,7 +343,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MIB.addImm(MBBI->getOperand(2).getImm()); } - } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) { + } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64 || + Opcode == X86::TCRETURN_WINmi64) { unsigned Op = (Opcode == X86::TCRETURNmi) ? X86::TAILJMPm : (IsX64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a293b4c87cfe4..08c9d738baceb 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2402,7 +2402,7 @@ static bool isTailCallOpcode(unsigned Opc) { Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 || - Opc == X86::TCRETURNmi64; + Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURN_WINmi64; } void X86FrameLowering::emitEpilogue(MachineFunction &MF, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3631016b0f5c7..f81efdc6414aa 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41567,6 +41567,17 @@ static SDValue combineX86ShufflesRecursively( resolveTargetShuffleInputsAndMask(Ops, Mask); } + // Handle the all undef/zero/ones cases. + if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) + return DAG.getUNDEF(RootVT); + if (all_of(Mask, [](int Idx) { return Idx < 0; })) + return getZeroVector(RootVT, Subtarget, DAG, DL); + if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) && + !llvm::is_contained(Mask, SM_SentinelZero)) + return getOnesVector(RootVT, DAG, DL); + + assert(!Ops.empty() && "Shuffle with no inputs detected"); + // We can only combine unary and binary shuffle mask cases. 
if (Ops.size() <= 2) { // Minor canonicalization of the accumulated shuffle mask to make it easier @@ -48396,13 +48407,17 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, MVT SrcVT = Src.getSimpleValueType(); APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits()); - // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then - // peek through and adjust the TEST bit. + // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded + // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit. if (Src.getOpcode() == ISD::SHL) { if (std::optional ShiftAmt = DAG.getValidShiftAmount(Src)) { Src = Src.getOperand(0); BitMask.lshrInPlace(*ShiftAmt); } + } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT ExtVT = cast(Src.getOperand(1))->getVT(); + Src = Src.getOperand(0); + BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits()); } SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src, diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 5a0df058b27f6..af7a33abaf758 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1364,15 +1364,19 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), // There wouldn't be enough scratch registers for base+index. 
def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off), (TCRETURNmi64 addr:$dst, timm:$off)>, - Requires<[In64BitMode, NotUseIndirectThunkCalls]>; + Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls]>; + +def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off), + (TCRETURN_WINmi64 addr:$dst, timm:$off)>, + Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls]>; def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>, - Requires<[In64BitMode, UseIndirectThunkCalls]>; + Requires<[In64BitMode, IsNotWin64CCFunc, UseIndirectThunkCalls]>; def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, timm:$off)>, - Requires<[Not64BitMode, UseIndirectThunkCalls]>; + Requires<[Not64BitMode, IsNotWin64CCFunc, UseIndirectThunkCalls]>; def : Pat<(X86tcret (i64 tglobaladdr:$dst), timm:$off), (TCRETURNdi64 tglobaladdr:$dst, timm:$off)>, @@ -2215,7 +2219,7 @@ let Predicates = [HasZU] in { def : Pat<(i64 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))), (SUBREG_TO_REG (i64 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>; } - + // mul reg, imm def : Pat<(mul GR16:$src1, imm:$src2), (IMUL16rri GR16:$src1, imm:$src2)>; diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index 139aedd473ebc..d962bfff1444d 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -372,6 +372,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, def TCRETURNmi64 : PseudoI<(outs), (ins i64mem_TC:$dst, i32imm:$offset), []>, Sched<[WriteJumpLd]>; + def TCRETURN_WINmi64 : PseudoI<(outs), + (ins i64mem_w64TC:$dst, i32imm:$offset), + []>, Sched<[WriteJumpLd]>; def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_brtarget:$dst), []>, Sched<[WriteJump]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index f109e29c0bff0..58d526269ff3c 100644 --- 
a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -94,9 +94,8 @@ X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) const TargetRegisterClass * X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI, - const MachineFunction &MF) const { - auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI, MF); + const TargetRegisterInfo *TRI) const { + auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI); // If the target does not have egpr, then r16-r31 will be resereved for all // instructions. if (!RC || !Subtarget.hasEGPR()) @@ -7249,8 +7248,8 @@ static void updateOperandRegConstraints(MachineFunction &MF, if (!Reg.isVirtual()) continue; - auto *NewRC = MRI.constrainRegClass( - Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF)); + auto *NewRC = + MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI)); if (!NewRC) { LLVM_DEBUG( dbgs() << "WARNING: Unable to update register constraint for operand " @@ -7348,7 +7347,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned SrcIdx = (Imm >> 6) & 3; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) { @@ -7373,7 +7372,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. 
if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = @@ -7392,7 +7391,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // table twice. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = @@ -7527,7 +7526,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool NarrowToMOV32rm = false; if (Size) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. @@ -8495,7 +8494,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && @@ -8606,7 +8605,7 @@ bool X86InstrInfo::unfoldMemoryOperand( // Emit the store instruction. 
if (UnfoldStore) { - const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); + const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); unsigned Alignment = std::max(TRI.getSpillSize(*DstRC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; @@ -8638,7 +8637,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); unsigned NumDefs = MCID.NumDefs; std::vector AddrOps; std::vector BeforeOps; @@ -8689,7 +8688,7 @@ bool X86InstrInfo::unfoldMemoryOperand( std::vector VTs; const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { - DstRC = getRegClass(MCID, 0, &RI, MF); + DstRC = getRegClass(MCID, 0, &RI); VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index f087b7f20ff67..86133b3d969b1 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -248,8 +248,7 @@ class X86InstrInfo final : public X86GenInstrInfo { /// GR*_NOREX2RegClass (Returned register class) const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI, - const MachineFunction &MF) const override; + const TargetRegisterInfo *TRI) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. 
As /// such, whenever a client has an instance of instruction info, it should diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td index 53a6b7c4c4c92..80843f6bb80e6 100644 --- a/llvm/lib/Target/X86/X86InstrOperands.td +++ b/llvm/lib/Target/X86/X86InstrOperands.td @@ -141,6 +141,11 @@ def i64mem_TC : X86MemOperand<"printqwordmem", X86Mem64AsmOperand, 64> { ptr_rc_tailcall, i32imm, SEGMENT_REG); } +def i64mem_w64TC : X86MemOperand<"printqwordmem", X86Mem64AsmOperand, 64> { + let MIOperandInfo = (ops GR64_TCW64, i8imm, + GR64_TCW64, i32imm, SEGMENT_REG); +} + // Special parser to detect 16-bit mode to select 16-bit displacement. def X86AbsMemMode16AsmOperand : AsmOperandClass { let Name = "AbsMemMode16"; diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 8dcd4b8cf7551..167bed132cd12 100644 --- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -339,7 +339,6 @@ int X86OptimizeLEAPass::calcInstrDist(const MachineInstr &First, bool X86OptimizeLEAPass::chooseBestLEA( const SmallVectorImpl &List, const MachineInstr &MI, MachineInstr *&BestLEA, int64_t &AddrDispShift, int &Dist) { - const MachineFunction *MF = MI.getParent()->getParent(); const MCInstrDesc &Desc = MI.getDesc(); int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) + X86II::getOperandBias(Desc); @@ -360,7 +359,7 @@ bool X86OptimizeLEAPass::chooseBestLEA( // example MOV8mr_NOREX. We could constrain the register class of the LEA // def to suit MI, however since this case is very rare and hard to // reproduce in a test it's just more reliable to skip the LEA. 
- if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI) != MRI->getRegClass(DefMI->getOperand(0).getReg())) continue; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 9ec04e740a08b..c47bb3e67e625 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -61,6 +61,7 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT) // Cache some information. Is64Bit = TT.isArch64Bit(); + IsTarget64BitLP64 = Is64Bit && !TT.isX32(); IsWin64 = Is64Bit && TT.isOSWindows(); IsUEFI64 = Is64Bit && TT.isUEFI(); @@ -192,13 +193,11 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, } const TargetRegisterClass * -X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { - const X86Subtarget &Subtarget = MF.getSubtarget(); +X86RegisterInfo::getPointerRegClass(unsigned Kind) const { switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. - if (Subtarget.isTarget64BitLP64()) + if (IsTarget64BitLP64) return &X86::GR64RegClass; // If the target is 64bit but we have been told to use 32bit addresses, // we can still use 64-bit register as long as we know the high bits @@ -206,16 +205,16 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, // Reflect that in the returned register class. return Is64Bit ? &X86::LOW32_ADDR_ACCESSRegClass : &X86::GR32RegClass; case 1: // Normal GPRs except the stack pointer (for encoding reasons). - if (Subtarget.isTarget64BitLP64()) + if (IsTarget64BitLP64) return &X86::GR64_NOSPRegClass; // NOSP does not contain RIP, so no special case here. return &X86::GR32_NOSPRegClass; case 2: // NOREX GPRs. 
- if (Subtarget.isTarget64BitLP64()) + if (IsTarget64BitLP64) return &X86::GR64_NOREXRegClass; return &X86::GR32_NOREXRegClass; case 3: // NOREX GPRs except the stack pointer (for encoding reasons). - if (Subtarget.isTarget64BitLP64()) + if (IsTarget64BitLP64) return &X86::GR64_NOREX_NOSPRegClass; // NOSP does not contain RIP, so no special case here. return &X86::GR32_NOREX_NOSPRegClass; @@ -1010,6 +1009,7 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg( case X86::TCRETURNri64: case X86::TCRETURNri64_ImpCall: case X86::TCRETURNmi64: + case X86::TCRETURN_WINmi64: case X86::EH_RETURN: case X86::EH_RETURN64: { LiveRegUnits LRU(*this); diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index d022e5ab87945..e646591663aca 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -28,6 +28,9 @@ class X86RegisterInfo final : public X86GenRegisterInfo { /// bool Is64Bit; + /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? + bool IsTarget64BitLP64; + /// IsWin64 - Is the target on of win64 flavours /// bool IsWin64; @@ -78,8 +81,7 @@ class X86RegisterInfo final : public X86GenRegisterInfo { /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; + getPointerRegClass(unsigned Kind = 0) const override; /// getCrossCopyRegClass - Returns a legal register class to copy a register /// in the specified class to or from. 
Returns NULL if it is possible to copy diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 4cc456ece77e0..e0b3b61e29175 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -836,13 +836,12 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( /// a way to unfold into a newly created vreg rather than requiring a register /// input. static const TargetRegisterClass * -getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII, - unsigned Opcode) { +getRegClassForUnfoldedLoad(const X86InstrInfo &TII, unsigned Opcode) { unsigned Index; unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold( Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index); const MCInstrDesc &MCID = TII.get(UnfoldedOpc); - return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF); + return TII.getRegClass(MCID, Index, &TII.getRegisterInfo()); } void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( @@ -893,11 +892,12 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( case X86::TAILJMPm64_REX: case X86::TAILJMPm: case X86::TCRETURNmi64: + case X86::TCRETURN_WINmi64: case X86::TCRETURNmi: { // Use the generic unfold logic now that we know we're dealing with // expected instructions. // FIXME: We don't have test coverage for all of these! 
- auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode()); + auto *UnfoldedRC = getRegClassForUnfoldedLoad(*TII, MI.getOpcode()); if (!UnfoldedRC) { LLVM_DEBUG(dbgs() << "ERROR: Unable to unfold load from instruction:\n"; diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 6d9c6cdedd9e5..babbe95cc7808 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -125,54 +125,6 @@ static std::unique_ptr createTLOF(const Triple &TT) { return std::make_unique(); } -static std::string computeDataLayout(const Triple &TT) { - // X86 is little endian - std::string Ret = "e"; - - Ret += DataLayout::getManglingComponent(TT); - // X86 and x32 have 32 bit pointers. - if (!TT.isArch64Bit() || TT.isX32()) - Ret += "-p:32:32"; - - // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers. - Ret += "-p270:32:32-p271:32:32-p272:64:64"; - - // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. - // 128 bit integers are not specified in the 32-bit ABIs but are used - // internally for lowering f128, so we match the alignment to that. - if (TT.isArch64Bit() || TT.isOSWindows()) - Ret += "-i64:64-i128:128"; - else if (TT.isOSIAMCU()) - Ret += "-i64:32-f64:32"; - else - Ret += "-i128:128-f64:32:64"; - - // Some ABIs align long double to 128 bits, others to 32. - if (TT.isOSIAMCU()) - ; // No f80 - else if (TT.isArch64Bit() || TT.isOSDarwin() || TT.isWindowsMSVCEnvironment()) - Ret += "-f80:128"; - else - Ret += "-f80:32"; - - if (TT.isOSIAMCU()) - Ret += "-f128:32"; - - // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. - if (TT.isArch64Bit()) - Ret += "-n8:16:32:64"; - else - Ret += "-n8:16:32"; - - // The stack is aligned to 32 bits on some ABIs and 128 bits on others. 
- if ((!TT.isArch64Bit() && TT.isOSWindows()) || TT.isOSIAMCU()) - Ret += "-a:0:32-S32"; - else - Ret += "-S128"; - - return Ret; -} - static Reloc::Model getEffectiveRelocModel(const Triple &TT, bool JIT, std::optional RM) { bool is64Bit = TT.getArch() == Triple::x86_64; @@ -236,7 +188,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT), TT, CPU, FS, Options, + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(TT, JIT, RM), getEffectiveX86CodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) { diff --git a/llvm/lib/Target/Xtensa/CMakeLists.txt b/llvm/lib/Target/Xtensa/CMakeLists.txt index 4fc1ba6dfa650..c698b42b00d10 100644 --- a/llvm/lib/Target/Xtensa/CMakeLists.txt +++ b/llvm/lib/Target/Xtensa/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_target(XtensaCodeGen SelectionDAG Support Target + TargetParser XtensaDesc XtensaInfo diff --git a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp index 396b00f7b8628..7d22a0489a951 100644 --- a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp +++ b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/Endian.h" using namespace llvm; +using namespace llvm::MCD; #define DEBUG_TYPE "Xtensa-disassembler" diff --git a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp index c9f1ca8b46dab..72cb61b5e864e 100644 --- a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp +++ b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp @@ -32,13 +32,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXtensaTarget() { initializeXtensaAsmPrinterPass(PR); } -static std::string computeDataLayout(const Triple &TT, StringRef CPU, - const TargetOptions &Options, - bool 
IsLittle) { - std::string Ret = "e-m:e-p:32:32-i8:8:32-i16:16:32-i64:64-n32"; - return Ret; -} - static Reloc::Model getEffectiveRelocModel(bool JIT, std::optional RM) { if (!RM || JIT) @@ -53,8 +46,7 @@ XtensaTargetMachine::XtensaTargetMachine(const Target &T, const Triple &TT, std::optional CM, CodeGenOptLevel OL, bool JIT, bool IsLittle) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT, CPU, Options, IsLittle), - TT, CPU, FS, Options, + : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(JIT, RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique()) { diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt index 5eecfbf80b2f7..e1a30199e1ade 100644 --- a/llvm/lib/TargetParser/CMakeLists.txt +++ b/llvm/lib/TargetParser/CMakeLists.txt @@ -26,6 +26,7 @@ add_llvm_component_library(LLVMTargetParser SubtargetFeature.cpp TargetParser.cpp Triple.cpp + TargetDataLayout.cpp X86TargetParser.cpp XtensaTargetParser.cpp diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 63848160636a2..a5bdc9dd38848 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1302,16 +1302,17 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, case 26: CPU = "znver5"; *Type = X86::AMDFAM1AH; - if (Model <= 0x77) { + if (Model <= 0x4f || (Model >= 0x60 && Model <= 0x77) || + (Model >= 0xd0 && Model <= 0xd7)) { // Models 00h-0Fh (Breithorn). // Models 10h-1Fh (Breithorn-Dense). // Models 20h-2Fh (Strix 1). // Models 30h-37h (Strix 2). // Models 38h-3Fh (Strix 3). // Models 40h-4Fh (Granite Ridge). - // Models 50h-5Fh (Weisshorn). // Models 60h-6Fh (Krackan1). // Models 70h-77h (Sarlak). + // Models D0h-D7h (Annapurna). 
CPU = "znver5"; *Subtype = X86::AMDFAM1AH_ZNVER5; break; // "znver5" @@ -2049,6 +2050,11 @@ StringMap sys::getHostCPUFeatures() { Features["rdpru"] = HasExtLeaf8 && ((EBX >> 4) & 1); Features["wbnoinvd"] = HasExtLeaf8 && ((EBX >> 9) & 1); + bool HasExtLeaf21 = MaxExtLevel >= 0x80000021 && + !getX86CpuIDAndInfo(0x80000021, &EAX, &EBX, &ECX, &EDX); + // AMD cpuid bit for prefetchi is different from Intel + Features["prefetchi"] = HasExtLeaf21 && ((EAX >> 20) & 1); + bool HasLeaf7 = MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); @@ -2131,7 +2137,7 @@ StringMap sys::getHostCPUFeatures() { Features["avxneconvert"] = HasLeaf7Subleaf1 && ((EDX >> 5) & 1) && HasAVXSave; Features["amx-complex"] = HasLeaf7Subleaf1 && ((EDX >> 8) & 1) && HasAMXSave; Features["avxvnniint16"] = HasLeaf7Subleaf1 && ((EDX >> 10) & 1) && HasAVXSave; - Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1); + Features["prefetchi"] |= HasLeaf7Subleaf1 && ((EDX >> 14) & 1); Features["usermsr"] = HasLeaf7Subleaf1 && ((EDX >> 15) & 1); bool HasAVX10 = HasLeaf7Subleaf1 && ((EDX >> 19) & 1); bool HasAPXF = HasLeaf7Subleaf1 && ((EDX >> 21) & 1); diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp new file mode 100644 index 0000000000000..e222588ea389b --- /dev/null +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -0,0 +1,629 @@ +//===--- TargetDataLayout.cpp - Map Triple to LLVM data layout string -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/ARMTargetParser.h"
+#include "llvm/TargetParser/Triple.h"
+#include
+using namespace llvm;
+
+static StringRef getManglingComponent(const Triple &T) {
+  if (T.isOSBinFormatGOFF())
+    return "-m:l";
+  if (T.isOSBinFormatMachO())
+    return "-m:o";
+  if ((T.isOSWindows() || T.isUEFI()) && T.isOSBinFormatCOFF())
+    return T.getArch() == Triple::x86 ? "-m:x" : "-m:w";
+  if (T.isOSBinFormatXCOFF())
+    return "-m:a";
+  return "-m:e";
+}
+
+static std::string computeARMDataLayout(const Triple &TT, StringRef ABIName) {
+  auto ABI = ARM::computeTargetABI(TT, ABIName);
+  std::string Ret;
+
+  if (TT.isLittleEndian())
+    // Little endian.
+    Ret += "e";
+  else
+    // Big endian.
+    Ret += "E";
+
+  Ret += getManglingComponent(TT);
+
+  // Pointers are 32 bits and aligned to 32 bits.
+  Ret += "-p:32:32";
+
+  // Function pointers are aligned to 8 bits (because the LSB stores the
+  // ARM/Thumb state).
+  Ret += "-Fi8";
+
+  // ABIs other than APCS have 64 bit integers with natural alignment.
+  if (ABI != ARM::ARM_ABI_APCS)
+    Ret += "-i64:64";
+
+  // We have 64 bits floats. The APCS ABI requires them to be aligned to 32
+  // bits, others to 64 bits. We always try to align to 64 bits.
+  if (ABI == ARM::ARM_ABI_APCS)
+    Ret += "-f64:32:64";
+
+  // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
+  // to 64. We always try to give them natural alignment.
+  if (ABI == ARM::ARM_ABI_APCS)
+    Ret += "-v64:32:64-v128:32:128";
+  else if (ABI != ARM::ARM_ABI_AAPCS16)
+    Ret += "-v128:64:128";
+
+  // Try to align aggregates to 32 bits (the default is 64 bits, which has no
+  // particular hardware support on 32-bit ARM).
+  Ret += "-a:0:32";
+
+  // Integer registers are 32 bits.
+ Ret += "-n32"; + + // The stack is 64 bit aligned on AAPCS and 32 bit aligned everywhere else. + if (ABI == ARM::ARM_ABI_AAPCS16) + Ret += "-S128"; + else if (ABI == ARM::ARM_ABI_AAPCS) + Ret += "-S64"; + else + Ret += "-S32"; + + return Ret; +} + +// Helper function to build a DataLayout string +static std::string computeAArch64DataLayout(const Triple &TT) { + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::aarch64_32) + return "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-" + "n32:64-S128-Fn32"; + return "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-" + "Fn32"; + } + if (TT.isOSBinFormatCOFF()) + return "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:" + "128-n32:64-S128-Fn32"; + std::string Endian = TT.isLittleEndian() ? "e" : "E"; + std::string Ptr32 = TT.getEnvironment() == Triple::GNUILP32 ? "-p:32:32" : ""; + return Endian + "-m:e" + Ptr32 + + "-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-" + "n32:64-S128-Fn32"; +} + +// DataLayout: little or big endian +static std::string computeBPFDataLayout(const Triple &TT) { + if (TT.getArch() == Triple::bpfeb) + return "E-m:e-p:64:64-i64:64-i128:128-n32:64-S128"; + else + return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"; +} + +static std::string computeCSKYDataLayout(const Triple &TT) { + // CSKY is always 32-bit target with the CSKYv2 ABI as prefer now. + // It's a 4-byte aligned stack with ELF mangling only. + // Only support little endian for now. + // TODO: Add support for big endian. 
+ return "e-m:e-S32-p:32:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:32" + "-v128:32:32-a:0:32-Fi32-n32"; +} + +static std::string computeLoongArchDataLayout(const Triple &TT) { + if (TT.isLoongArch64()) + return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"; + assert(TT.isLoongArch32() && "only LA32 and LA64 are currently supported"); + return "e-m:e-p:32:32-i64:64-n32-S128"; +} + +static std::string computeM68kDataLayout(const Triple &TT) { + std::string Ret = ""; + // M68k is Big Endian + Ret += "E"; + + // FIXME how to wire it with the used object format? + Ret += "-m:e"; + + // M68k pointers are always 32 bit wide even for 16-bit CPUs. + // The ABI only specifies 16-bit alignment. + // On at least the 68020+ with a 32-bit bus, there is a performance benefit + // to having 32-bit alignment. + Ret += "-p:32:16:32"; + + // Bytes do not require special alignment, words are word aligned and + // long words are word aligned at minimum. + Ret += "-i8:8:8-i16:16:16-i32:16:32"; + + // FIXME no floats at the moment + + // The registers can hold 8, 16, 32 bits + Ret += "-n8:16:32"; + + Ret += "-a:0:16-S16"; + + return Ret; +} + +namespace { +enum class MipsABI { Unknown, O32, N32, N64 }; +} + +// FIXME: This duplicates MipsABIInfo::computeTargetABI, but duplicating this is +// preferable to violating layering rules. Ideally that information should live +// in LLVM TargetParser, but for now we just duplicate some ABI name string +// logic for simplicity. 
+static MipsABI getMipsABI(const Triple &TT, StringRef ABIName) { + if (ABIName.starts_with("o32")) + return MipsABI::O32; + if (ABIName.starts_with("n32")) + return MipsABI::N32; + if (ABIName.starts_with("n64")) + return MipsABI::N64; + if (TT.isABIN32()) + return MipsABI::N32; + assert(ABIName.empty() && "Unknown ABI option for MIPS"); + + if (TT.isMIPS64()) + return MipsABI::N64; + return MipsABI::O32; +} + +static std::string computeMipsDataLayout(const Triple &TT, StringRef ABIName) { + std::string Ret; + MipsABI ABI = getMipsABI(TT, ABIName); + + // There are both little and big endian mips. + if (TT.isLittleEndian()) + Ret += "e"; + else + Ret += "E"; + + if (ABI == MipsABI::O32) + Ret += "-m:m"; + else + Ret += "-m:e"; + + // Pointers are 32 bit on some ABIs. + if (ABI != MipsABI::N64) + Ret += "-p:32:32"; + + // 8 and 16 bit integers only need to have natural alignment, but try to + // align them to 32 bits. 64 bit integers have natural alignment. + Ret += "-i8:8:32-i16:16:32-i64:64"; + + // 32 bit registers are always available and the stack is at least 64 bit + // aligned. On N64 64 bit registers are also available and the stack is + // 128 bit aligned. + if (ABI == MipsABI::N64 || ABI == MipsABI::N32) + Ret += "-i128:128-n32:64-S128"; + else + Ret += "-n32-S64"; + + return Ret; +} + +static std::string computePowerDataLayout(const Triple &T) { + bool is64Bit = T.isPPC64(); + std::string Ret; + + // Most PPC* platforms are big endian, PPC(64)LE is little endian. + if (T.isLittleEndian()) + Ret = "e"; + else + Ret = "E"; + + Ret += getManglingComponent(T); + + // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit + // pointers. + if (!is64Bit || T.getOS() == Triple::Lv2) + Ret += "-p:32:32"; + + // If the target ABI uses function descriptors, then the alignment of function + // pointers depends on the alignment used to emit the descriptor. Otherwise, + // function pointers are aligned to 32 bits because the instructions must be. 
+  if ((T.getArch() == Triple::ppc64 && !T.isPPC64ELFv2ABI())) {
+    Ret += "-Fi64";
+  } else if (T.isOSAIX()) {
+    Ret += is64Bit ? "-Fi64" : "-Fi32";
+  } else {
+    Ret += "-Fn32";
+  }
+
+  // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+  // documentation are wrong; these are correct (i.e. "what gcc does").
+  Ret += "-i64:64";
+
+  // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
+  if (is64Bit)
+    Ret += "-i128:128-n32:64";
+  else
+    Ret += "-n32";
+
+  // Specify the vector alignment explicitly. For v256i1 and v512i1, the
+  // calculated alignment would be 256*alignment(i1) and 512*alignment(i1),
+  // which is 256 and 512 bytes - way over aligned.
+  if (is64Bit && (T.isOSAIX() || T.isOSLinux()))
+    Ret += "-S128-v256:256:256-v512:512:512";
+
+  return Ret;
+}
+
+static std::string computeAMDDataLayout(const Triple &TT) {
+  if (TT.getArch() == Triple::r600) {
+    // 32-bit pointers.
+    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
+  }
+
+  // 32-bit private, local, and region pointers. 64-bit global, constant and
+  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
+  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
+  // (address space 7), and 128-bit non-integral buffer resources (address
+  // space 8) which cannot be non-trivially accessed by LLVM memory operations
+  // like getelementptr.
+  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
+         "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
+         "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
+         "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
+}
+
+static std::string computeRISCVDataLayout(const Triple &TT, StringRef ABIName) {
+  std::string Ret;
+
+  if (TT.isLittleEndian())
+    Ret += "e";
+  else
+    Ret += "E";
+
+  Ret += "-m:e";
+
+  // Pointer and integer sizes.
+ if (TT.isRISCV64()) { + Ret += "-p:64:64-i64:64-i128:128"; + Ret += "-n32:64"; + } else { + assert(TT.isRISCV32() && "only RV32 and RV64 are currently supported"); + Ret += "-p:32:32-i64:64"; + Ret += "-n32"; + } + + // Stack alignment based on ABI. + StringRef ABI = ABIName; + if (ABI == "ilp32e") + Ret += "-S32"; + else if (ABI == "lp64e") + Ret += "-S64"; + else + Ret += "-S128"; + + return Ret; +} + +static std::string computeSparcDataLayout(const Triple &T) { + const bool Is64Bit = T.isSPARC64(); + + // Sparc is typically big endian, but some are little. + std::string Ret = T.getArch() == Triple::sparcel ? "e" : "E"; + Ret += "-m:e"; + + // Some ABIs have 32bit pointers. + if (!Is64Bit) + Ret += "-p:32:32"; + + // Alignments for 64 bit integers. + Ret += "-i64:64"; + + // Alignments for 128 bit integers. + // This is not specified in the ABI document but is the de facto standard. + Ret += "-i128:128"; + + // On SparcV9 128 floats are aligned to 128 bits, on others only to 64. + // On SparcV9 registers can hold 64 or 32 bits, on others only 32. + if (Is64Bit) + Ret += "-n32:64"; + else + Ret += "-f128:64-n32"; + + if (Is64Bit) + Ret += "-S128"; + else + Ret += "-S64"; + + return Ret; +} + +static std::string computeSystemZDataLayout(const Triple &TT) { + std::string Ret; + + // Big endian. + Ret += "E"; + + // Data mangling. + Ret += getManglingComponent(TT); + + // Special features for z/OS. + if (TT.isOSzOS()) { + // Custom address space for ptr32. + Ret += "-p1:32:32"; + } + + // Make sure that global data has at least 16 bits of alignment by + // default, so that we can refer to it using LARL. We don't have any + // special requirements for stack variables though. + Ret += "-i1:8:16-i8:8:16"; + + // 64-bit integers are naturally aligned. + Ret += "-i64:64"; + + // 128-bit floats are aligned only to 64 bits. + Ret += "-f128:64"; + + // The DataLayout string always holds a vector alignment of 64 bits, see + // comment in clang/lib/Basic/Targets/SystemZ.h. 
+  Ret += "-v128:64";
+
+  // We prefer 16 bits of alignment for all globals; see above.
+  Ret += "-a:8:16";
+
+  // Integer registers are 32 or 64 bits.
+  Ret += "-n32:64";
+
+  return Ret;
+}
+
+static std::string computeX86DataLayout(const Triple &TT) {
+  bool Is64Bit = TT.getArch() == Triple::x86_64;
+
+  // X86 is little endian
+  std::string Ret = "e";
+
+  Ret += getManglingComponent(TT);
+  // X86 and x32 have 32 bit pointers.
+  if (!Is64Bit || TT.isX32())
+    Ret += "-p:32:32";
+
+  // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers.
+  Ret += "-p270:32:32-p271:32:32-p272:64:64";
+
+  // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+  // 128 bit integers are not specified in the 32-bit ABIs but are used
+  // internally for lowering f128, so we match the alignment to that.
+  if (Is64Bit || TT.isOSWindows())
+    Ret += "-i64:64-i128:128";
+  else if (TT.isOSIAMCU())
+    Ret += "-i64:32-f64:32";
+  else
+    Ret += "-i128:128-f64:32:64";
+
+  // Some ABIs align long double to 128 bits, others to 32.
+  if (TT.isOSIAMCU())
+    ; // No f80
+  else if (Is64Bit || TT.isOSDarwin() || TT.isWindowsMSVCEnvironment())
+    Ret += "-f80:128";
+  else
+    Ret += "-f80:32";
+
+  if (TT.isOSIAMCU())
+    Ret += "-f128:32";
+
+  // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+  if (Is64Bit)
+    Ret += "-n8:16:32:64";
+  else
+    Ret += "-n8:16:32";
+
+  // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+  if ((!Is64Bit && TT.isOSWindows()) || TT.isOSIAMCU())
+    Ret += "-a:0:32-S32";
+  else
+    Ret += "-S128";
+
+  return Ret;
+}
+
+static std::string computeNVPTXDataLayout(const Triple &T, StringRef ABIName) {
+  bool Is64Bit = T.getArch() == Triple::nvptx64;
+  std::string Ret = "e";
+
+  // Tensor Memory (addrspace:6) is always 32-bits.
+  // Distributed Shared Memory (addrspace:7) follows shared memory
+  // (addrspace:3).
+  if (!Is64Bit)
+    Ret += "-p:32:32-p6:32:32-p7:32:32";
+  else if (ABIName == "shortptr")
+    Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32";
+  else
+    Ret += "-p6:32:32";
+
+  Ret += "-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64";
+
+  return Ret;
+}
+
+static std::string computeSPIRVDataLayout(const Triple &TT) {
+  const auto Arch = TT.getArch();
+  // TODO: this probably needs to be revisited:
+  // Logical SPIR-V has no pointer size, so any fixed pointer size would be
+  // wrong. The choice to default to 32 or 64 is just motivated by another
+  // memory model used for graphics: PhysicalStorageBuffer64. But it shouldn't
+  // mean anything.
+  if (Arch == Triple::spirv32)
+    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-"
+           "v256:256-v512:512-v1024:1024-n8:16:32:64-G1";
+  if (Arch == Triple::spirv)
+    return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-"
+           "v512:512-v1024:1024-n8:16:32:64-G10";
+  if (TT.getVendor() == Triple::VendorType::AMD &&
+      TT.getOS() == Triple::OSType::AMDHSA)
+    return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-"
+           "v512:512-v1024:1024-n32:64-S32-G1-P4-A0";
+  if (TT.getVendor() == Triple::VendorType::Intel)
+    return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-"
+           "v512:512-v1024:1024-n8:16:32:64-G1-P9-A0";
+  return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-"
+         "v512:512-v1024:1024-n8:16:32:64-G1";
+}
+
+static std::string computeLanaiDataLayout() {
+  // Data layout (keep in sync with clang/lib/Basic/Targets.cpp)
+  return "E"        // Big endian
+         "-m:e"     // ELF name mangling
+         "-p:32:32" // 32-bit pointers, 32 bit aligned
+         "-i64:64"  // 64 bit integers, 64 bit aligned
+         "-a:0:32"  // 32 bit alignment of objects of aggregate type
+         "-n32"     // 32 bit native integer width
+         "-S64";    // 64 bit natural stack alignment
+}
+
+static std::string computeWebAssemblyDataLayout(const Triple &TT) {
+  return TT.getArch() == Triple::wasm64
+             ?
(TT.isOSEmscripten() ? "e-m:e-p:64:64-p10:8:8-p20:8:8-i64:64-" + "i128:128-f128:64-n32:64-S128-ni:1:10:20" + : "e-m:e-p:64:64-p10:8:8-p20:8:8-i64:64-" + "i128:128-n32:64-S128-ni:1:10:20") + : (TT.isOSEmscripten() ? "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-" + "i128:128-f128:64-n32:64-S128-ni:1:10:20" + : "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-" + "i128:128-n32:64-S128-ni:1:10:20"); +} + +static std::string computeVEDataLayout(const Triple &T) { + // Aurora VE is little endian + std::string Ret = "e"; + + // Use ELF mangling + Ret += "-m:e"; + + // Alignments for 64 bit integers. + Ret += "-i64:64"; + + // VE supports 32 bit and 64 bits integer on registers + Ret += "-n32:64"; + + // Stack alignment is 128 bits + Ret += "-S128"; + + // Vector alignments are 64 bits + // Need to define all of them. Otherwise, each alignment becomes + // the size of each data by default. + Ret += "-v64:64:64"; // for v2f32 + Ret += "-v128:64:64"; + Ret += "-v256:64:64"; + Ret += "-v512:64:64"; + Ret += "-v1024:64:64"; + Ret += "-v2048:64:64"; + Ret += "-v4096:64:64"; + Ret += "-v8192:64:64"; + Ret += "-v16384:64:64"; // for v256f64 + + return Ret; +} + +std::string Triple::computeDataLayout(StringRef ABIName) const { + switch (getArch()) { + case Triple::arm: + case Triple::armeb: + case Triple::thumb: + case Triple::thumbeb: + return computeARMDataLayout(*this, ABIName); + case Triple::aarch64: + case Triple::aarch64_be: + case Triple::aarch64_32: + return computeAArch64DataLayout(*this); + case Triple::arc: + return "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-" + "f32:32:32-i64:32-f64:32-a:0:32-n32"; + case Triple::avr: + return "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8:16-a:8"; + case Triple::bpfel: + case Triple::bpfeb: + return computeBPFDataLayout(*this); + case Triple::csky: + return computeCSKYDataLayout(*this); + case Triple::dxil: + return "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-" + "f32:32-f64:64-n8:16:32:64"; + case Triple::hexagon: + return 
"e-m:e-p:32:32:32-a:0-n16:32-" + "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-" + "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"; + case Triple::loongarch32: + case Triple::loongarch64: + return computeLoongArchDataLayout(*this); + case Triple::m68k: + return computeM68kDataLayout(*this); + case Triple::mips: + case Triple::mipsel: + case Triple::mips64: + case Triple::mips64el: + return computeMipsDataLayout(*this, ABIName); + case Triple::msp430: + return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; + case Triple::ppc: + case Triple::ppcle: + case Triple::ppc64: + case Triple::ppc64le: + return computePowerDataLayout(*this); + case Triple::r600: + case Triple::amdgcn: + return computeAMDDataLayout(*this); + case Triple::riscv32: + case Triple::riscv64: + case Triple::riscv32be: + case Triple::riscv64be: + return computeRISCVDataLayout(*this, ABIName); + case Triple::sparc: + case Triple::sparcv9: + case Triple::sparcel: + return computeSparcDataLayout(*this); + case Triple::systemz: + return computeSystemZDataLayout(*this); + case Triple::tce: + case Triple::tcele: + case Triple::x86: + case Triple::x86_64: + return computeX86DataLayout(*this); + case Triple::xcore: + case Triple::xtensa: + return "e-m:e-p:32:32-i8:8:32-i16:16:32-i64:64-n32"; + case Triple::nvptx: + case Triple::nvptx64: + return computeNVPTXDataLayout(*this, ABIName); + case Triple::spir: + case Triple::spir64: + case Triple::spirv: + case Triple::spirv32: + case Triple::spirv64: + return computeSPIRVDataLayout(*this); + case Triple::lanai: + return computeLanaiDataLayout(); + case Triple::wasm32: + case Triple::wasm64: + return computeWebAssemblyDataLayout(*this); + case Triple::ve: + return computeVEDataLayout(*this); + + case Triple::amdil: + case Triple::amdil64: + case Triple::hsail: + case Triple::hsail64: + case Triple::kalimba: + case Triple::shave: + case Triple::renderscript32: + case Triple::renderscript64: + // These are all virtual ISAs 
with no LLVM backend, and therefore no fixed + // LLVM data layout. + return ""; + + case Triple::UnknownArch: + return ""; + } + llvm_unreachable("Invalid arch"); +} diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 2194ef4df14d6..acc8a2d83b8a0 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -425,6 +425,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; Features["bf16-cvt-insts"] = true; + Features["bf16-pk-insts"] = true; Features["fp8-conversion-insts"] = true; Features["fp8e5m3-insts"] = true; Features["permlane16-swap"] = true; diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index ac93f748ce65c..28a89a8f87dbd 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -356,9 +356,9 @@ void coro::Shape::invalidateCoroutine( // present. for (AnyCoroSuspendInst *CS : CoroSuspends) { CS->replaceAllUsesWith(PoisonValue::get(CS->getType())); - CS->eraseFromParent(); if (auto *CoroSave = CS->getCoroSave()) CoroSave->eraseFromParent(); + CS->eraseFromParent(); } CoroSuspends.clear(); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 7f9693169af0c..fd35de571a0d9 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -316,6 +316,9 @@ class CallsiteContextGraph { /// Node in the Callsite Context Graph struct ContextNode { + // Assigned to nodes as they are created, useful for debugging. + unsigned NodeId = 0; + // Keep this for now since in the IR case where we have an Instruction* it // is not as immediately discoverable. Used for printing richer information // when dumping graph. 
@@ -760,6 +763,7 @@ class CallsiteContextGraph { auto *NewNode = NodeOwner.back().get(); if (F) NodeToCallingFunc[NewNode] = F; + NewNode->NodeId = NodeOwner.size(); return NewNode; } @@ -2977,6 +2981,7 @@ void CallsiteContextGraph::ContextNode::print( OS << "\n"; } } + OS << "\tNodeId: " << NodeId << "\n"; OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; OS << "\tContextIds:"; // Make a copy of the computed context ids that we can sort for stability. @@ -2988,14 +2993,24 @@ void CallsiteContextGraph::ContextNode::print( OS << "\n"; OS << "\tCalleeEdges:\n"; for (auto &Edge : CalleeEdges) - OS << "\t\t" << *Edge << "\n"; + OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId + << ")\n"; OS << "\tCallerEdges:\n"; for (auto &Edge : CallerEdges) - OS << "\t\t" << *Edge << "\n"; + OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId + << ")\n"; if (!Clones.empty()) { - OS << "\tClones: " << llvm::interleaved(Clones) << "\n"; + OS << "\tClones: "; + bool First = true; + for (auto *C : Clones) { + if (!First) + OS << ", "; + First = false; + OS << C << " NodeId: " << C->NodeId; + } + OS << "\n"; } else if (CloneOf) { - OS << "\tClone of " << CloneOf << "\n"; + OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n"; } } @@ -3149,7 +3164,7 @@ struct DOTGraphTraits *> static std::string getNodeLabel(NodeRef Node, GraphType G) { std::string LabelString = (Twine("OrigId: ") + (Node->IsAllocation ? 
"Alloc" : "") + - Twine(Node->OrigStackOrAllocId)) + Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId)) .str(); LabelString += "\n"; if (Node->hasCall()) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index f9155cc660317..00951fde0cf8a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -2002,6 +2002,16 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I)) return FoldedFAdd; + // B = fadd A, 0.0 + // Z = Op B + // can be transformed into + // Z = Op A + // Where Op is such that we can ignore sign of 0 in fadd + Value *A; + if (match(&I, m_OneUse(m_FAdd(m_Value(A), m_AnyZeroFP()))) && + canIgnoreSignBitOfZero(*I.use_begin())) + return replaceInstUsesWith(I, A); + // (-X) + Y --> Y - X Value *X, *Y; if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y)))) @@ -2731,6 +2741,24 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { return BinaryOperator::CreateSub(X, Not); } + // min(X+1, Y) - min(X, Y) --> zext X < Y + // Replacing a sub and at least one min with an icmp + // and a zext is a potential improvement. + if (match(Op0, m_c_SMin(m_NSWAddLike(m_Value(X), m_One()), m_Value(Y))) && + match(Op1, m_c_SMin(m_Specific(X), m_Specific(Y))) && + I.getType()->getScalarSizeInBits() != 1 && + (Op0->hasOneUse() || Op1->hasOneUse())) { + Value *Cond = Builder.CreateICmpSLT(X, Y); + return new ZExtInst(Cond, I.getType()); + } + if (match(Op0, m_c_UMin(m_NUWAddLike(m_Value(X), m_One()), m_Value(Y))) && + match(Op1, m_c_UMin(m_Specific(X), m_Specific(Y))) && + I.getType()->getScalarSizeInBits() != 1 && + (Op0->hasOneUse() || Op1->hasOneUse())) { + Value *Cond = Builder.CreateICmpULT(X, Y); + return new ZExtInst(Cond, I.getType()); + } + // Optimize pointer differences into the same array into a size. 
Consider: // &A[10] - &A[0]: we should compile this to "10". Value *LHSOp, *RHSOp; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 33b66aeaffe60..17cf4154f8dbd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3385,12 +3385,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // TODO: apply range metadata for range check patterns? } - // Separate storage assumptions apply to the underlying allocations, not any - // particular pointer within them. When evaluating the hints for AA purposes - // we getUnderlyingObject them; by precomputing the answers here we can - // avoid having to do so repeatedly there. for (unsigned Idx = 0; Idx < II->getNumOperandBundles(); Idx++) { OperandBundleUse OBU = II->getOperandBundleAt(Idx); + + // Separate storage assumptions apply to the underlying allocations, not + // any particular pointer within them. When evaluating the hints for AA + // purposes we getUnderlyingObject them; by precomputing the answers here + // we can avoid having to do so repeatedly there. if (OBU.getTagName() == "separate_storage") { assert(OBU.Inputs.size() == 2); auto MaybeSimplifyHint = [&](const Use &U) { @@ -3404,6 +3405,32 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { MaybeSimplifyHint(OBU.Inputs[0]); MaybeSimplifyHint(OBU.Inputs[1]); } + + // Try to remove redundant alignment assumptions. + if (OBU.getTagName() == "align" && OBU.Inputs.size() == 2) { + RetainedKnowledge RK = getKnowledgeFromOperandInAssume( + *cast(II), II->arg_size() + Idx); + if (!RK || RK.AttrKind != Attribute::Alignment || + !isPowerOf2_64(RK.ArgValue) || !isa(RK.IRArgValue)) + continue; + + // Don't try to remove align assumptions for pointers derived from + // arguments. We might lose information if the function gets inline and + // the align argument attribute disappears. 
+ Value *UO = getUnderlyingObject(RK.WasOn); + if (!UO || isa(UO)) + continue; + + // Compute known bits for the pointer, passing nullptr as context to + // avoid computeKnownBits using the assumption we are about to remove + // for reasoning. + KnownBits Known = computeKnownBits(RK.WasOn, /*CtxI=*/nullptr); + unsigned TZ = std::min(Known.countMinTrailingZeros(), + Value::MaxAlignmentExponent); + if ((1ULL << TZ) < RK.ArgValue) + continue; + return CallBase::removeOperandBundle(II, OBU.getTagID()); + } } // Convert nonnull assume like: @@ -3925,6 +3952,19 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } break; } + case Intrinsic::get_active_lane_mask: { + const APInt *Op0, *Op1; + if (match(II->getOperand(0), m_StrictlyPositive(Op0)) && + match(II->getOperand(1), m_APInt(Op1))) { + Type *OpTy = II->getOperand(0)->getType(); + return replaceInstUsesWith( + *II, Builder.CreateIntrinsic( + II->getType(), Intrinsic::get_active_lane_mask, + {Constant::getNullValue(OpTy), + ConstantInt::get(OpTy, Op1->usub_sat(*Op0))})); + } + break; + } default: { // Handle target specific intrinsics std::optional V = targetInstCombineIntrinsic(*II); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index ccf918f0b6dbe..9ca8194b44f8f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -977,8 +977,7 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { // trunc ( OP i8 C1, V1) to i1 -> icmp eq V1, log_2(C1) iff C1 is power of 2 if (DestWidth == 1 && match(Src, m_Shr(m_Power2(C1), m_Value(V1)))) { Value *Right = ConstantInt::get(V1->getType(), C1->countr_zero()); - Value *Icmp = Builder.CreateICmpEQ(V1, Right); - return replaceInstUsesWith(Trunc, Icmp); + return new ICmpInst(ICmpInst::ICMP_EQ, V1, Right); } // OP = { lshr, ashr } @@ -986,8 +985,15 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { // 
power of 2 if (DestWidth == 1 && match(Src, m_Shr(m_LowBitMask(C1), m_Value(V1)))) { Value *Right = ConstantInt::get(V1->getType(), C1->countr_one()); - Value *Icmp = Builder.CreateICmpULT(V1, Right); - return replaceInstUsesWith(Trunc, Icmp); + return new ICmpInst(ICmpInst::ICMP_ULT, V1, Right); + } + + // OP = { lshr, ashr } + // trunc ( OP i8 C1, V1) to i1 -> icmp ugt V1, cttz(C1) - 1 iff (C1) is + // negative power of 2 + if (DestWidth == 1 && match(Src, m_Shr(m_NegatedPower2(C1), m_Value(V1)))) { + Value *Right = ConstantInt::get(V1->getType(), C1->countr_zero()); + return new ICmpInst(ICmpInst::ICMP_UGE, V1, Right); } return Changed ? &Trunc : nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 4b10586616c29..53e77e6cc5c31 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -107,8 +107,8 @@ isOnlyCopiedFromConstantMemory(AAResults *AA, AllocaInst *V, // a load (but one that potentially returns the value itself), so we can // ignore it if we know that the value isn't captured. 
bool NoCapture = Call->doesNotCapture(DataOpNo); - if ((Call->onlyReadsMemory() && (Call->use_empty() || NoCapture)) || - (Call->onlyReadsMemory(DataOpNo) && NoCapture)) + if (NoCapture && + (Call->onlyReadsMemory() || Call->onlyReadsMemory(DataOpNo))) continue; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index ed9a0be6981fa..15e7172c6ce12 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -60,17 +60,18 @@ bool InstCombinerImpl::foldDeadPhiWeb(PHINode &PN) { SmallVector Stack; SmallPtrSet Visited; Stack.push_back(&PN); + Visited.insert(&PN); while (!Stack.empty()) { PHINode *Phi = Stack.pop_back_val(); - if (!Visited.insert(Phi).second) - continue; - // Early stop if the set of PHIs is large - if (Visited.size() == 16) - return false; for (User *Use : Phi->users()) { - if (PHINode *PhiUse = dyn_cast(Use)) + if (PHINode *PhiUse = dyn_cast(Use)) { + if (!Visited.insert(PhiUse).second) + continue; + // Early stop if the set of PHIs is large + if (Visited.size() >= 16) + return false; Stack.push_back(PhiUse); - else + } else return false; } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 9467463d39c0e..8f9d0bf6240d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1153,6 +1153,38 @@ static Value *foldAbsDiff(ICmpInst *Cmp, Value *TVal, Value *FVal, return Builder.CreateBinaryIntrinsic(Intrinsic::abs, TI, Builder.getTrue()); } + // Match: (A > B) ? (A - B) : (0 - (A - B)) --> abs(A - B) + if (Pred == CmpInst::ICMP_SGT && + match(TI, m_NSWSub(m_Specific(A), m_Specific(B))) && + match(FI, m_Neg(m_Specific(TI)))) { + return Builder.CreateBinaryIntrinsic(Intrinsic::abs, TI, + Builder.getFalse()); + } + + // Match: (A < B) ? 
(0 - (A - B)) : (A - B) --> abs(A - B) + if (Pred == CmpInst::ICMP_SLT && + match(FI, m_NSWSub(m_Specific(A), m_Specific(B))) && + match(TI, m_Neg(m_Specific(FI)))) { + return Builder.CreateBinaryIntrinsic(Intrinsic::abs, FI, + Builder.getFalse()); + } + + // Match: (A > B) ? (0 - (B - A)) : (B - A) --> abs(B - A) + if (Pred == CmpInst::ICMP_SGT && + match(FI, m_NSWSub(m_Specific(B), m_Specific(A))) && + match(TI, m_Neg(m_Specific(FI)))) { + return Builder.CreateBinaryIntrinsic(Intrinsic::abs, FI, + Builder.getFalse()); + } + + // Match: (A < B) ? (B - A) : (0 - (B - A)) --> abs(B - A) + if (Pred == CmpInst::ICMP_SLT && + match(TI, m_NSWSub(m_Specific(B), m_Specific(A))) && + match(FI, m_Neg(m_Specific(TI)))) { + return Builder.CreateBinaryIntrinsic(Intrinsic::abs, TI, + Builder.getFalse()); + } + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index c2f045a2ab02e..f0ddd5ca94c5a 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -144,6 +144,8 @@ static cl::opt MaxArraySize("instcombine-maxarray-size", cl::init(1024), cl::desc("Maximum array size considered when doing a combine")); +extern cl::opt ProfcheckDisableMetadataFixes; + // FIXME: Remove this flag when it is no longer necessary to convert // llvm.dbg.declare to avoid inaccurate debug info. Setting this to false // increases variable availability at the cost of accuracy. Variables that @@ -1361,6 +1363,10 @@ Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, if (!LHSIsSelect && !RHSIsSelect) return nullptr; + SelectInst *SI = ProfcheckDisableMetadataFixes + ? nullptr + : cast(LHSIsSelect ? 
LHS : RHS); + FastMathFlags FMF; BuilderTy::FastMathFlagGuard Guard(Builder); if (isa(&I)) { @@ -1381,15 +1387,14 @@ Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, // We need an 'add' and exactly 1 arm of the select to have been simplified. if (Opcode != Instruction::Add || (!True && !False) || (True && False)) return nullptr; - Value *N; if (True && match(FVal, m_Neg(m_Value(N)))) { Value *Sub = Builder.CreateSub(Z, N); - return Builder.CreateSelect(Cond, True, Sub, I.getName()); + return Builder.CreateSelect(Cond, True, Sub, I.getName(), SI); } if (False && match(TVal, m_Neg(m_Value(N)))) { Value *Sub = Builder.CreateSub(Z, N); - return Builder.CreateSelect(Cond, Sub, False, I.getName()); + return Builder.CreateSelect(Cond, Sub, False, I.getName(), SI); } return nullptr; }; @@ -1425,9 +1430,9 @@ Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, if (!True || !False) return nullptr; - Value *SI = Builder.CreateSelect(Cond, True, False); - SI->takeName(&I); - return SI; + Value *NewSI = Builder.CreateSelect(Cond, True, False, I.getName(), SI); + NewSI->takeName(&I); + return NewSI; } /// Freely adapt every user of V as-if V was changed to !V. diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index ecb2f2dbc552b..c86092bd51eda 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -848,13 +848,12 @@ bool MemProfUsePass::annotateGlobalVariables( // So we just print out the static data section prefix in LLVM_DEBUG. 
if (Record && Record->AccessCount > 0) { ++NumOfMemProfHotGlobalVars; - GVar.setSectionPrefix("hot"); - Changed = true; + Changed |= GVar.setSectionPrefix("hot"); LLVM_DEBUG(dbgs() << "Global variable " << Name << " is annotated as hot\n"); } else if (DataAccessProf->isKnownColdSymbol(Name)) { ++NumOfMemProfColdGlobalVars; - GVar.setSectionPrefix("unlikely"); + Changed |= GVar.setSectionPrefix("unlikely"); Changed = true; LLVM_DEBUG(dbgs() << "Global variable " << Name << " is annotated as unlikely\n"); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 9899a2aae2b15..7933604b8ac25 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3684,6 +3684,15 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_mmx_packssdw: return Intrinsic::x86_mmx_packssdw; + + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packusdw_512: + return Intrinsic::x86_avx512_packssdw_512; + + case Intrinsic::x86_avx512_packsswb_512: + case Intrinsic::x86_avx512_packuswb_512: + return Intrinsic::x86_avx512_packsswb_512; + default: llvm_unreachable("unexpected intrinsic id"); } @@ -3696,6 +3705,8 @@ struct MemorySanitizerVisitor : public InstVisitor { // Shadow is propagated with the signed variant of the same intrinsic applied // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer). // MMXEltSizeInBits is used only for x86mmx arguments. 
+ // + // TODO: consider using GetMinMaxUnsigned() to handle saturation precisely void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned MMXEltSizeInBits = 0) { assert(I.arg_size() == 2); @@ -4900,6 +4911,69 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } + // Handle llvm.x86.avx512.* instructions that take a vector of floating-point + // values and perform an operation whose shadow propagation should be handled + // as all-or-nothing [*], with masking provided by a vector and a mask + // supplied as an integer. + // + // [*] if all bits of a vector element are initialized, the output is fully + // initialized; otherwise, the output is fully uninitialized + // + // e.g., <16 x float> @llvm.x86.avx512.rsqrt14.ps.512 + // (<16 x float>, <16 x float>, i16) + // A WriteThru Mask + // + // <2 x double> @llvm.x86.avx512.rcp14.pd.128 + // (<2 x double>, <2 x double>, i8) + // + // Dst[i] = Mask[i] ? some_op(A[i]) : WriteThru[i] + // Dst_shadow[i] = Mask[i] ? all_or_nothing(A_shadow[i]) : WriteThru_shadow[i] + void handleAVX512VectorGenericMaskedFP(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + assert(I.arg_size() == 3); + Value *A = I.getOperand(0); + Value *WriteThrough = I.getOperand(1); + Value *Mask = I.getOperand(2); + + assert(isFixedFPVector(A)); + assert(isFixedFPVector(WriteThrough)); + + [[maybe_unused]] unsigned ANumElements = + cast(A->getType())->getNumElements(); + unsigned OutputNumElements = + cast(WriteThrough->getType())->getNumElements(); + assert(ANumElements == OutputNumElements); + + assert(Mask->getType()->isIntegerTy()); + // Some bits of the mask might be unused, but check them all anyway + // (typically the mask is an integer constant). + insertCheckShadowOf(Mask, &I); + + // The mask has 1 bit per element of A, but a minimum of 8 bits. 
+ if (Mask->getType()->getScalarSizeInBits() == 8 && ANumElements < 8) + Mask = IRB.CreateTrunc(Mask, Type::getIntNTy(*MS.C, ANumElements)); + assert(Mask->getType()->getScalarSizeInBits() == ANumElements); + + assert(I.getType() == WriteThrough->getType()); + + Mask = IRB.CreateBitCast( + Mask, FixedVectorType::get(IRB.getInt1Ty(), OutputNumElements)); + + Value *AShadow = getShadow(A); + + // All-or-nothing shadow + AShadow = IRB.CreateSExt(IRB.CreateICmpNE(AShadow, getCleanShadow(AShadow)), + AShadow->getType()); + + Value *WriteThroughShadow = getShadow(WriteThrough); + + Value *Shadow = IRB.CreateSelect(Mask, AShadow, WriteThroughShadow); + setShadow(&I, Shadow); + + setOriginForNaryOp(I); + } + // For sh.* compiler intrinsics: // llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round // (<8 x half>, <8 x half>, <8 x half>, i8, i32) @@ -5554,6 +5628,7 @@ struct MemorySanitizerVisitor : public InstVisitor { handleVectorShiftIntrinsic(I, /* Variable */ true); break; + // Pack with Signed/Unsigned Saturation case Intrinsic::x86_sse2_packsswb_128: case Intrinsic::x86_sse2_packssdw_128: case Intrinsic::x86_sse2_packuswb_128: @@ -5562,6 +5637,15 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_avx2_packssdw: case Intrinsic::x86_avx2_packuswb: case Intrinsic::x86_avx2_packusdw: + // e.g., <64 x i8> @llvm.x86.avx512.packsswb.512 + // (<32 x i16> %a, <32 x i16> %b) + // <32 x i16> @llvm.x86.avx512.packssdw.512 + // (<16 x i32> %a, <16 x i32> %b) + // Note: AVX512 masked variants are auto-upgraded by LLVM. 
+ case Intrinsic::x86_avx512_packsswb_512: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packuswb_512: + case Intrinsic::x86_avx512_packusdw_512: handleVectorPackIntrinsic(I); break; @@ -6070,6 +6154,108 @@ struct MemorySanitizerVisitor : public InstVisitor { break; } + // AVX512/AVX10 Reciprocal + // <16 x float> @llvm.x86.avx512.rsqrt14.ps.512 + // (<16 x float>, <16 x float>, i16) + // <8 x float> @llvm.x86.avx512.rsqrt14.ps.256 + // (<8 x float>, <8 x float>, i8) + // <4 x float> @llvm.x86.avx512.rsqrt14.ps.128 + // (<4 x float>, <4 x float>, i8) + // + // <8 x double> @llvm.x86.avx512.rsqrt14.pd.512 + // (<8 x double>, <8 x double>, i8) + // <4 x double> @llvm.x86.avx512.rsqrt14.pd.256 + // (<4 x double>, <4 x double>, i8) + // <2 x double> @llvm.x86.avx512.rsqrt14.pd.128 + // (<2 x double>, <2 x double>, i8) + // + // <32 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.512 + // (<32 x bfloat>, <32 x bfloat>, i32) + // <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.256 + // (<16 x bfloat>, <16 x bfloat>, i16) + // <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.128 + // (<8 x bfloat>, <8 x bfloat>, i8) + // + // <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512 + // (<32 x half>, <32 x half>, i32) + // <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256 + // (<16 x half>, <16 x half>, i16) + // <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128 + // (<8 x half>, <8 x half>, i8) + // + // TODO: 3-operand variants are not handled: + // <2 x double> @llvm.x86.avx512.rsqrt14.sd + // (<2 x double>, <2 x double>, <2 x double>, i8) + // <4 x float> @llvm.x86.avx512.rsqrt14.ss + // (<4 x float>, <4 x float>, <4 x float>, i8) + // <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh + // (<8 x half>, <8 x half>, <8 x half>, i8) + case Intrinsic::x86_avx512_rsqrt14_ps_512: + case Intrinsic::x86_avx512_rsqrt14_ps_256: + case Intrinsic::x86_avx512_rsqrt14_ps_128: + case Intrinsic::x86_avx512_rsqrt14_pd_512: + case Intrinsic::x86_avx512_rsqrt14_pd_256: + case 
Intrinsic::x86_avx512_rsqrt14_pd_128: + case Intrinsic::x86_avx10_mask_rsqrt_bf16_512: + case Intrinsic::x86_avx10_mask_rsqrt_bf16_256: + case Intrinsic::x86_avx10_mask_rsqrt_bf16_128: + case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_512: + case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_256: + case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_128: + handleAVX512VectorGenericMaskedFP(I); + break; + + // AVX512/AVX10 Reciprocal Square Root + // <16 x float> @llvm.x86.avx512.rcp14.ps.512 + // (<16 x float>, <16 x float>, i16) + // <8 x float> @llvm.x86.avx512.rcp14.ps.256 + // (<8 x float>, <8 x float>, i8) + // <4 x float> @llvm.x86.avx512.rcp14.ps.128 + // (<4 x float>, <4 x float>, i8) + // + // <8 x double> @llvm.x86.avx512.rcp14.pd.512 + // (<8 x double>, <8 x double>, i8) + // <4 x double> @llvm.x86.avx512.rcp14.pd.256 + // (<4 x double>, <4 x double>, i8) + // <2 x double> @llvm.x86.avx512.rcp14.pd.128 + // (<2 x double>, <2 x double>, i8) + // + // <32 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.512 + // (<32 x bfloat>, <32 x bfloat>, i32) + // <16 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.256 + // (<16 x bfloat>, <16 x bfloat>, i16) + // <8 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.128 + // (<8 x bfloat>, <8 x bfloat>, i8) + // + // <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512 + // (<32 x half>, <32 x half>, i32) + // <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256 + // (<16 x half>, <16 x half>, i16) + // <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128 + // (<8 x half>, <8 x half>, i8) + // + // TODO: 3-operand variants are not handled: + // <2 x double> @llvm.x86.avx512.rcp14.sd + // (<2 x double>, <2 x double>, <2 x double>, i8) + // <4 x float> @llvm.x86.avx512.rcp14.ss + // (<4 x float>, <4 x float>, <4 x float>, i8) + // <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh + // (<8 x half>, <8 x half>, <8 x half>, i8) + case Intrinsic::x86_avx512_rcp14_ps_512: + case Intrinsic::x86_avx512_rcp14_ps_256: + case Intrinsic::x86_avx512_rcp14_ps_128: + case 
Intrinsic::x86_avx512_rcp14_pd_512: + case Intrinsic::x86_avx512_rcp14_pd_256: + case Intrinsic::x86_avx512_rcp14_pd_128: + case Intrinsic::x86_avx10_mask_rcp_bf16_512: + case Intrinsic::x86_avx10_mask_rcp_bf16_256: + case Intrinsic::x86_avx10_mask_rcp_bf16_128: + case Intrinsic::x86_avx512fp16_mask_rcp_ph_512: + case Intrinsic::x86_avx512fp16_mask_rcp_ph_256: + case Intrinsic::x86_avx512fp16_mask_rcp_ph_128: + handleAVX512VectorGenericMaskedFP(I); + break; + // AVX512 FP16 Arithmetic case Intrinsic::x86_avx512fp16_mask_add_sh_round: case Intrinsic::x86_avx512fp16_mask_sub_sh_round: diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index e157cc9212769..092a0fb264c28 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -474,7 +474,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, - LicmAllowSpeculation, HasCoroSuspendInst); + LicmAllowSpeculation); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -892,7 +892,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE, bool LoopNestMode, - bool AllowSpeculation, bool HasCoroSuspendInst) { + bool AllowSpeculation) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -925,7 +925,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // TODO: It may be safe to hoist if we are hoisting to a conditional block // and we have accurately duplicated the control flow from the loop header // to that block. 
- if (CurLoop->hasLoopInvariantOperands(&I, HasCoroSuspendInst) && + if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, ORE, Preheader->getTerminator(), AC, @@ -975,7 +975,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop); }; if ((IsInvariantStart(I) || isGuard(&I)) && - CurLoop->hasLoopInvariantOperands(&I, HasCoroSuspendInst) && + CurLoop->hasLoopInvariantOperands(&I) && MustExecuteWithoutWritesBefore(I)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); @@ -1705,10 +1705,7 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // time in isGuaranteedToExecute if we don't actually have anything to // drop. It is a compile time optimization, not required for correctness. !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) { - if (ProfcheckDisableMetadataFixes) - I.dropUBImplyingAttrsAndMetadata(); - else - I.dropUBImplyingAttrsAndMetadata({LLVMContext::MD_prof}); + I.dropUBImplyingAttrsAndMetadata(); } if (isa(I)) diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index e3ef9d8680b53..1a279b6198182 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -167,17 +167,15 @@ static cl::opt FilterSameScaledReg( " with the same ScaledReg and Scale")); static cl::opt PreferredAddresingMode( - "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), - cl::desc("A flag that overrides the target's preferred addressing mode."), - cl::values(clEnumValN(TTI::AMK_None, - "none", - "Don't prefer any addressing mode"), - clEnumValN(TTI::AMK_PreIndexed, - "preindexed", - "Prefer pre-indexed addressing mode"), - clEnumValN(TTI::AMK_PostIndexed, - "postindexed", - "Prefer post-indexed 
addressing mode"))); + "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), + cl::desc("A flag that overrides the target's preferred addressing mode."), + cl::values( + clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), + clEnumValN(TTI::AMK_PreIndexed, "preindexed", + "Prefer pre-indexed addressing mode"), + clEnumValN(TTI::AMK_PostIndexed, "postindexed", + "Prefer post-indexed addressing mode"), + clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes"))); static cl::opt ComplexityLimit( "lsr-complexity-limit", cl::Hidden, @@ -1404,7 +1402,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, // for now LSR only handles innermost loops). if (AR->getLoop() != L) { // If the AddRec exists, consider it's register free and leave it alone. - if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed) + if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed)) return; // It is bad to allow LSR for current loop to add induction variables @@ -1427,9 +1425,9 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) // If the step size matches the base offset, we could use pre-indexed // addressing. - if ((AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() && + if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() && Step->getAPInt() == F.BaseOffset.getFixedValue()) || - (AMK == TTI::AMK_PostIndexed && !isa(Start) && + ((AMK & TTI::AMK_PostIndexed) && !isa(Start) && SE->isLoopInvariant(Start, L))) LoopCost = 0; } @@ -4147,7 +4145,7 @@ void LSRInstance::GenerateConstantOffsetsImpl( // means that a single pre-indexed access can be generated to become the new // base pointer for each iteration of the loop, resulting in no extra add/sub // instructions for pointer updating. 
- if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) { + if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) { const APInt *StepInt; if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) { int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue() @@ -5437,7 +5435,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl &Solution, // This can sometimes (notably when trying to favour postinc) lead to // sub-optimial decisions. There it is best left to the cost modelling to // get correct. - if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) { + if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) { int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); for (const SCEV *Reg : ReqRegs) { if ((F.ScaledReg && F.ScaledReg == Reg) || diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index e05625344ee29..2ee91a9b40026 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -308,6 +308,9 @@ class StructurizeCFG { void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB); + bool isHoistableInstruction(Instruction *I, BasicBlock *BB, + BasicBlock *HoistTo); + void orderNodes(); void analyzeLoops(RegionNode *N); @@ -415,11 +418,21 @@ class StructurizeCFGLegacyPass : public RegionPass { } // end anonymous namespace +char StructurizeCFGLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg", + "Structurize the CFG", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) +INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg", + "Structurize the CFG", false, false) + /// Checks whether an instruction is zero cost instruction and checks if the /// operands are from different BB. 
If so, this instruction can be coalesced /// if its hoisted to predecessor block. So, this returns true. -static bool isHoistableInstruction(Instruction *I, BasicBlock *BB, - const TargetTransformInfo *TTI) { +bool StructurizeCFG::isHoistableInstruction(Instruction *I, BasicBlock *BB, + BasicBlock *HoistTo) { if (I->getParent() != BB || isa(I)) return false; @@ -432,10 +445,11 @@ static bool isHoistableInstruction(Instruction *I, BasicBlock *BB, if (CostVal != 0) return false; - // Check if any operands are instructions defined in the same block. + // Check if all operands are available at the hoisting destination. for (auto &Op : I->operands()) { if (auto *OpI = dyn_cast(Op)) { - if (OpI->getParent() == BB) + // Operand must dominate the hoisting destination. + if (!DT->dominates(OpI->getParent(), HoistTo)) return false; } } @@ -443,16 +457,6 @@ static bool isHoistableInstruction(Instruction *I, BasicBlock *BB, return true; } -char StructurizeCFGLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg", - "Structurize the CFG", false, false) -INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) -INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg", - "Structurize the CFG", false, false) - /// Structurization can introduce unnecessary VGPR copies due to register /// coalescing interference. 
For example, if the Else block has a zero-cost /// instruction and the Then block modifies the VGPR value, only one value is @@ -478,7 +482,7 @@ void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, for (PHINode &Phi : ElseSucc->phis()) { Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB); auto *Inst = dyn_cast(ElseVal); - if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI)) + if (!Inst || !isHoistableInstruction(Inst, ElseBB, CommonDominator)) continue; Inst->removeFromParent(); Inst->insertInto(CommonDominator, Term->getIterator()); diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index e411d68570096..f367ca2fdf56b 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_component_library(LLVMTransformUtils CtorUtils.cpp CountVisits.cpp Debugify.cpp + DebugSSAUpdater.cpp DeclareRuntimeLibcalls.cpp DemoteRegToStack.cpp DXILUpgrade.cpp diff --git a/llvm/lib/Transforms/Utils/DebugSSAUpdater.cpp b/llvm/lib/Transforms/Utils/DebugSSAUpdater.cpp new file mode 100644 index 0000000000000..c0e7609176a83 --- /dev/null +++ b/llvm/lib/Transforms/Utils/DebugSSAUpdater.cpp @@ -0,0 +1,390 @@ +//===- DebugSSAUpdater.cpp - Debug Variable SSA Update Tool ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the DebugSSAUpdater class. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/DebugSSAUpdater.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Transforms/Utils/SSAUpdaterImpl.h" + +using namespace llvm; + +#define DEBUG_TYPE "debug-ssa-updater" + +void DbgValueDef::print(raw_ostream &OS) const { + OS << "DbgVal{ "; + if (IsUndef) { + OS << "undef }"; + return; + } + if (Phi) { + OS << *Phi << "}"; + return; + } + OS << (IsMemory ? "Mem: " : "Def: ") << *Locations << " - " << *Expression + << " }"; +} + +void DbgSSAPhi::print(raw_ostream &OS) const { + OS << "DbgPhi "; + for (auto &[BB, DV] : IncomingValues) + OS << "[" << BB->BB.getName() << ", " << DV << "] "; +} + +using AvailableValsTy = DenseMap; + +DebugSSAUpdater::DebugSSAUpdater(SmallVectorImpl *NewPHI) + : InsertedPHIs(NewPHI) {} + +void DebugSSAUpdater::initialize() { AV.clear(); } + +bool DebugSSAUpdater::hasValueForBlock(DbgSSABlock *BB) const { + return AV.count(BB); +} + +DbgValueDef DebugSSAUpdater::findValueForBlock(DbgSSABlock *BB) const { + return AV.lookup(BB); +} + +void DebugSSAUpdater::addAvailableValue(DbgSSABlock *BB, DbgValueDef DV) { + AV[BB] = DV; +} + +DbgValueDef DebugSSAUpdater::getValueAtEndOfBlock(DbgSSABlock *BB) { + DbgValueDef Res = getValueAtEndOfBlockInternal(BB); + return Res; +} + +DbgValueDef DebugSSAUpdater::getValueInMiddleOfBlock(DbgSSABlock *BB) { + // If there is no definition of the renamed variable in this block, just use + // 'getValueAtEndOfBlock' to do our work. + if (!hasValueForBlock(BB)) + return getValueAtEndOfBlock(BB); + + // Otherwise, we have the hard case. Get the live-in values for each + // predecessor. 
+ SmallVector, 8> PredValues; + DbgValueDef SingularValue; + + bool IsFirstPred = true; + for (DbgSSABlock *PredBB : BB->predecessors()) { + DbgValueDef PredVal = getValueAtEndOfBlock(PredBB); + PredValues.push_back(std::make_pair(PredBB, PredVal)); + + // Compute SingularValue. + if (IsFirstPred) { + SingularValue = PredVal; + IsFirstPred = false; + } else if (!PredVal.agreesWith(SingularValue)) + SingularValue = DbgValueDef(); + } + + // If there are no predecessors, just return undef. + if (PredValues.empty()) + return DbgValueDef(); + + // Otherwise, if all the merged values are the same, just use it. + if (!SingularValue.IsUndef) + return SingularValue; + + // Ok, we have no way out, insert a new one now. + DbgSSAPhi *InsertedPHI = BB->newPHI(); + + // Fill in all the predecessors of the PHI. + for (const auto &PredValue : PredValues) + InsertedPHI->addIncoming(PredValue.first, PredValue.second); + + // See if the PHI node can be merged to a single value. This can happen in + // loop cases when we get a PHI of itself and one other value. + + // If the client wants to know about all new instructions, tell it. 
+ if (InsertedPHIs) + InsertedPHIs->push_back(InsertedPHI); + + LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n"); + return InsertedPHI; +} + +DbgSSABlock *DbgSSABlockSuccIterator::operator*() { + return Updater.getDbgSSABlock(*SuccIt); +} +DbgSSABlock *DbgSSABlockPredIterator::operator*() { + return Updater.getDbgSSABlock(*PredIt); +} + +namespace llvm { + +template <> class SSAUpdaterTraits { +public: + using BlkT = DbgSSABlock; + using ValT = DbgValueDef; + using PhiT = DbgSSAPhi; + using BlkSucc_iterator = DbgSSABlockSuccIterator; + + static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); } + static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); } + + class PHI_iterator { + private: + DbgSSAPhi *PHI; + unsigned Idx; + + public: + explicit PHI_iterator(DbgSSAPhi *P) // begin iterator + : PHI(P), Idx(0) {} + PHI_iterator(DbgSSAPhi *P, bool) // end iterator + : PHI(P), Idx(PHI->getNumIncomingValues()) {} + + PHI_iterator &operator++() { + ++Idx; + return *this; + } + bool operator==(const PHI_iterator &X) const { return Idx == X.Idx; } + bool operator!=(const PHI_iterator &X) const { return !operator==(X); } + + DbgValueDef getIncomingValue() { return PHI->getIncomingValue(Idx); } + DbgSSABlock *getIncomingBlock() { return PHI->getIncomingBlock(Idx); } + }; + + static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } + static PHI_iterator PHI_end(PhiT *PHI) { return PHI_iterator(PHI, true); } + + /// FindPredecessorBlocks - Put the predecessors of BB into the Preds + /// vector. + static void FindPredecessorBlocks(DbgSSABlock *BB, + SmallVectorImpl *Preds) { + for (auto PredIt = BB->pred_begin(); PredIt != BB->pred_end(); ++PredIt) + Preds->push_back(*PredIt); + } + + /// GetPoisonVal - Get an undefined value of the same type as the value + /// being handled. 
+ static DbgValueDef GetPoisonVal(DbgSSABlock *BB, DebugSSAUpdater *Updater) { + return DbgValueDef(); + } + + /// CreateEmptyPHI - Create a new debug PHI entry for the specified block. + static DbgSSAPhi *CreateEmptyPHI(DbgSSABlock *BB, unsigned NumPreds, + DebugSSAUpdater *Updater) { + DbgSSAPhi *PHI = BB->newPHI(); + return PHI; + } + + /// AddPHIOperand - Add the specified value as an operand of the PHI for + /// the specified predecessor block. + static void AddPHIOperand(DbgSSAPhi *PHI, DbgValueDef Val, + DbgSSABlock *Pred) { + PHI->addIncoming(Pred, Val); + } + + /// ValueIsPHI - Check if a value is a PHI. + static DbgSSAPhi *ValueIsPHI(DbgValueDef Val, DebugSSAUpdater *Updater) { + return Val.Phi; + } + + /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source + /// operands, i.e., it was just added. + static DbgSSAPhi *ValueIsNewPHI(DbgValueDef Val, DebugSSAUpdater *Updater) { + DbgSSAPhi *PHI = ValueIsPHI(Val, Updater); + if (PHI && PHI->getNumIncomingValues() == 0) + return PHI; + return nullptr; + } + + /// GetPHIValue - For the specified PHI instruction, return the value + /// that it defines. + static DbgValueDef GetPHIValue(DbgSSAPhi *PHI) { return PHI; } +}; + +} // end namespace llvm + +/// Check to see if AvailableVals has an entry for the specified BB and if so, +/// return it. If not, construct SSA form by first calculating the required +/// placement of PHIs and then inserting new PHIs where needed. 
+DbgValueDef DebugSSAUpdater::getValueAtEndOfBlockInternal(DbgSSABlock *BB) { + if (AV.contains(BB)) + return AV[BB]; + + SSAUpdaterImpl Impl(this, &AV, InsertedPHIs); + return Impl.GetValue(BB); +} + +bool isContained(DIScope *Inner, DIScope *Outer) { + if (Inner == Outer) + return true; + if (!Inner->getScope()) + return false; + return isContained(Inner->getScope(), Outer); +} + +void DbgValueRangeTable::addVariable(Function *F, DebugVariableAggregate DVA) { + const DILocalVariable *Var = DVA.getVariable(); + const DILocation *InlinedAt = DVA.getInlinedAt(); + + DenseMap> BlockDbgRecordValues; + DenseSet HasAnyInstructionsInScope; + int NumRecordsFound = 0; + DbgVariableRecord *LastRecordFound = nullptr; + bool DeclareRecordFound = false; + + LLVM_DEBUG(dbgs() << "Finding variable info for " << *Var << " at " + << InlinedAt << "\n"); + + for (auto &BB : *F) { + auto &DbgRecordValues = BlockDbgRecordValues[&BB]; + bool FoundInstructionInScope = false; + for (auto &I : BB) { + LLVM_DEBUG(dbgs() << "Instruction: '" << I << "'\n"); + + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { + if (DVR.getVariable() == Var && + DVR.getDebugLoc().getInlinedAt() == InlinedAt) { + assert(!DVR.isDbgAssign() && "No support for #dbg_assign yet."); + if (DVR.isDbgDeclare()) + DeclareRecordFound = true; + ++NumRecordsFound; + LastRecordFound = &DVR; + DbgRecordValues.push_back(&DVR); + } + } + if (!FoundInstructionInScope && I.getDebugLoc()) { + if (I.getDebugLoc().getInlinedAt() == InlinedAt && + isContained(cast(I.getDebugLoc().getScope()), + Var->getScope())) { + FoundInstructionInScope = true; + HasAnyInstructionsInScope.insert(&BB); + } + } + } + LLVM_DEBUG(dbgs() << "DbgRecordValues found in '" << BB.getName() << "':\n"; + for_each(DbgRecordValues, [](auto *DV) { DV->dump(); })); + } + + if (!NumRecordsFound) { + LLVM_DEBUG(dbgs() << "No dbg_records found for variable!\n"); + return; + } + + // Now that we have all the DbgValues, we can start defining 
available values + // for each block. The end goal is to have, for every block with any + // instructions in scope, a LiveIn value. + // Currently we anticipate that either a variable has a set of #dbg_values, in + // which case we need a complete SSA liveness analysis to determine live-in + // values per-block, or a variable has a single #dbg_declare. + if (DeclareRecordFound) { + // FIXME: This should be changed for fragments! + LLVM_DEBUG(dbgs() << "Single location found for variable!\n"); + assert(NumRecordsFound == 1 && + "Found multiple records for a #dbg_declare variable!"); + OrigSingleLocVariableValueTable[DVA] = DbgValueDef(LastRecordFound); + return; + } + + // We don't have a single location for the variable's entire scope, so instead + // we must now perform a liveness analysis to create a location list. + DenseMap LiveInMap; + SmallVector HypotheticalPHIs; + DebugSSAUpdater SSAUpdater(&HypotheticalPHIs); + SSAUpdater.initialize(); + for (auto &[BB, DVs] : BlockDbgRecordValues) { + auto *DbgBB = SSAUpdater.getDbgSSABlock(BB); + if (DVs.empty()) + continue; + auto *LastValueInBlock = DVs.back(); + LLVM_DEBUG(dbgs() << "Last value in " << BB->getName() << ": " + << *LastValueInBlock << "\n"); + SSAUpdater.addAvailableValue(DbgBB, DbgValueDef(LastValueInBlock)); + } + + for (BasicBlock &BB : *F) { + if (!HasAnyInstructionsInScope.contains(&BB)) { + LLVM_DEBUG(dbgs() << "Skipping finding debug ranges for '" << BB.getName() + << "' due to no in-scope instructions.\n"); + continue; + } + LLVM_DEBUG(dbgs() << "Finding live-in value for '" << BB.getName() + << "'...\n"); + DbgValueDef LiveValue = + SSAUpdater.getValueInMiddleOfBlock(SSAUpdater.getDbgSSABlock(&BB)); + LLVM_DEBUG(dbgs() << "Found live-in: " << LiveValue << "\n"); + auto HasValidValue = [](DbgValueDef DV) { + return !DV.IsUndef && DV.Phi == nullptr; + }; + + SmallVector BlockDbgRanges; + BasicBlock::iterator LastIt = BB.begin(); + for (auto *DVR : BlockDbgRecordValues[&BB]) { + // Create a range 
that ends as of DVR. + BasicBlock::iterator DVRStartIt = + const_cast(DVR->getInstruction())->getIterator(); + if (HasValidValue(LiveValue)) + BlockDbgRanges.push_back({LastIt, DVRStartIt, LiveValue}); + LiveValue = DbgValueDef(DVR); + LastIt = DVRStartIt; + } + + // After considering all in-block debug values, if any, create a range + // covering the remainder of the block. + if (HasValidValue(LiveValue)) + BlockDbgRanges.push_back({LastIt, BB.end(), LiveValue}); + LLVM_DEBUG(dbgs() << "Create set of ranges with " << BlockDbgRanges.size() + << " entries!\n"); + if (!BlockDbgRanges.empty()) + OrigVariableValueRangeTable[DVA].append(BlockDbgRanges); + } +} + +void DbgValueRangeTable::printValues(DebugVariableAggregate DVA, + raw_ostream &OS) { + OS << "Variable Table for '" << DVA.getVariable()->getName() << "' (at " + << DVA.getInlinedAt() << "):\n"; + if (!hasVariableEntry(DVA)) { + OS << " Empty!\n"; + return; + } + if (hasSingleLocEntry(DVA)) { + OS << " SingleLoc: " << OrigSingleLocVariableValueTable[DVA] << "\n"; + return; + } + OS << " LocRange:\n"; + for (DbgRangeEntry RangeEntry : OrigVariableValueRangeTable[DVA]) { + OS << " ("; + if (RangeEntry.Start == RangeEntry.Start->getParent()->begin() && + RangeEntry.End == RangeEntry.Start->getParent()->end()) { + OS << RangeEntry.Start->getParent()->getName(); + } else { + OS << RangeEntry.Start->getParent()->getName() << ": " + << *RangeEntry.Start << ", "; + if (RangeEntry.End == RangeEntry.Start->getParent()->end()) + OS << ".."; + else + OS << *RangeEntry.End; + } + OS << ") [" << RangeEntry.Value << "]\n"; + } +} + +SSAValueNameMap::ValueID SSAValueNameMap::addValue(Value *V) { + auto ExistingID = ValueToIDMap.find(V); + if (ExistingID != ValueToIDMap.end()) + return ExistingID->second; + // First, get a new ID and Map V to it. + ValueID NewID = NextID++; + ValueToIDMap.insert({V, NewID}); + // Then, get the name string for V and map NewID to it. 
+ assert(!ValueIDToNameMap.contains(NewID) && + "New value ID already maps to a name?"); + std::string &ValueText = ValueIDToNameMap[NewID]; + raw_string_ostream Stream(ValueText); + V->printAsOperand(Stream, true); + return NewID; +} diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2cfd70a1746c8..57dc1b38b8ec3 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3342,8 +3342,11 @@ void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt, // retain their original debug locations (DILocations) and debug intrinsic // instructions. // - // Doing so would degrade the debugging experience and adversely affect the - // accuracy of profiling information. + // Doing so would degrade the debugging experience. + // + // FIXME: Issue #152767: debug info should also be the same as the + // original branch, **if** the user explicitly indicated that (for sampling + // PGO) // // Currently, when hoisting the instructions, we take the following actions: // - Remove their debug intrinsic instructions. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b172ef6ba0803..7b1a7ce6995f8 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -804,26 +804,51 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) { return LatchBR; } -/// Return the estimated trip count for any exiting branch which dominates -/// the loop latch. 
-static std::optional getEstimatedTripCount(BranchInst *ExitingBranch, - Loop *L, - uint64_t &OrigExitWeight) { +struct DbgLoop { + const Loop *L; + explicit DbgLoop(const Loop *L) : L(L) {} +}; + +#ifndef NDEBUG +static inline raw_ostream &operator<<(raw_ostream &OS, DbgLoop D) { + OS << "function "; + D.L->getHeader()->getParent()->printAsOperand(OS, /*PrintType=*/false); + return OS << " " << *D.L; +} +#endif // NDEBUG + +static std::optional estimateLoopTripCount(Loop *L) { + // Currently we take the estimate exit count only from the loop latch, + // ignoring other exiting blocks. This can overestimate the trip count + // if we exit through another exit, but can never underestimate it. + // TODO: incorporate information from other exits + BranchInst *ExitingBranch = getExpectedExitLoopLatchBranch(L); + if (!ExitingBranch) { + LLVM_DEBUG(dbgs() << "estimateLoopTripCount: Failed to find exiting " + << "latch branch of required form in " << DbgLoop(L) + << "\n"); + return std::nullopt; + } + // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. uint64_t LoopWeight, ExitWeight; - if (!extractBranchWeights(*ExitingBranch, LoopWeight, ExitWeight)) + if (!extractBranchWeights(*ExitingBranch, LoopWeight, ExitWeight)) { + LLVM_DEBUG(dbgs() << "estimateLoopTripCount: Failed to extract branch " + << "weights for " << DbgLoop(L) << "\n"); return std::nullopt; + } if (L->contains(ExitingBranch->getSuccessor(1))) std::swap(LoopWeight, ExitWeight); - if (!ExitWeight) + if (!ExitWeight) { // Don't have a way to return predicated infinite + LLVM_DEBUG(dbgs() << "estimateLoopTripCount: Failed because of zero exit " + << "probability for " << DbgLoop(L) << "\n"); return std::nullopt; - - OrigExitWeight = ExitWeight; + } // Estimated exit count is a ratio of the loop weight by the weight of the // edge exiting the loop, rounded to nearest. 
@@ -834,43 +859,102 @@ static std::optional getEstimatedTripCount(BranchInst *ExitingBranch, return std::numeric_limits::max(); // Estimated trip count is one plus estimated exit count. - return ExitCount + 1; + uint64_t TC = ExitCount + 1; + LLVM_DEBUG(dbgs() << "estimateLoopTripCount: Estimated trip count of " << TC + << " for " << DbgLoop(L) << "\n"); + return TC; } std::optional llvm::getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight) { - // Currently we take the estimate exit count only from the loop latch, - // ignoring other exiting blocks. This can overestimate the trip count - // if we exit through another exit, but can never underestimate it. - // TODO: incorporate information from other exits - if (BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L)) { - uint64_t ExitWeight; - if (std::optional EstTripCount = - getEstimatedTripCount(LatchBranch, L, ExitWeight)) { - if (EstimatedLoopInvocationWeight) - *EstimatedLoopInvocationWeight = ExitWeight; - return *EstTripCount; - } + // If EstimatedLoopInvocationWeight, we do not support this loop if + // getExpectedExitLoopLatchBranch returns nullptr. + // + // FIXME: Also, this is a stop-gap solution for nested loops. It avoids + // mistaking LLVMLoopEstimatedTripCount metadata to be for an outer loop when + // it was created for an inner loop. The problem is that loop metadata is + // attached to the branch instruction in the loop latch block, but that can be + // shared by the loops. A solution is to attach loop metadata to loop headers + // instead, but that would be a large change to LLVM. + // + // Until that happens, we work around the problem as follows. + // getExpectedExitLoopLatchBranch (which also guards + // setLoopEstimatedTripCount) returns nullptr for a loop unless the loop has + // one latch and that latch has exactly two successors one of which is an exit + // from the loop. 
If the latch is shared by nested loops, then that condition + might hold for the inner loop but cannot hold for the outer loop: + - Because the latch is shared, it must have at least two successors: the + inner loop header and the outer loop header, which is also an exit for + the inner loop. That satisfies the condition for the inner loop. + - To satisfy the condition for the outer loop, the latch must have a third + successor that is an exit for the outer loop. But that violates the + condition for both loops. + BranchInst *ExitingBranch = getExpectedExitLoopLatchBranch(L); + if (!ExitingBranch) + return std::nullopt; + + // If requested, either compute *EstimatedLoopInvocationWeight or return + // nullopt if cannot. + // + // TODO: Eventually, once all passes have migrated away from setting branch + // weights to indicate estimated trip counts, this function will drop the + // EstimatedLoopInvocationWeight parameter. + if (EstimatedLoopInvocationWeight) { + uint64_t LoopWeight = 0, ExitWeight = 0; // Inits expected to be unused. + if (!extractBranchWeights(*ExitingBranch, LoopWeight, ExitWeight)) + return std::nullopt; + if (L->contains(ExitingBranch->getSuccessor(1))) + std::swap(LoopWeight, ExitWeight); + if (!ExitWeight) + return std::nullopt; + *EstimatedLoopInvocationWeight = ExitWeight; } - return std::nullopt; + + // Return the estimated trip count from metadata unless the metadata is + // missing or has no value. + if (auto TC = getOptionalIntLoopAttribute(L, LLVMLoopEstimatedTripCount)) { + LLVM_DEBUG(dbgs() << "getLoopEstimatedTripCount: " + << LLVMLoopEstimatedTripCount << " metadata has trip " + << "count of " << *TC << " for " << DbgLoop(L) << "\n"); + return TC; + } + + // Estimate the trip count from latch branch weights. 
+ return estimateLoopTripCount(L); } -bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount, - unsigned EstimatedloopInvocationWeight) { - // At the moment, we currently support changing the estimate trip count of - // the latch branch only. We could extend this API to manipulate estimated - // trip counts for any exit. +bool llvm::setLoopEstimatedTripCount( + Loop *L, unsigned EstimatedTripCount, + std::optional EstimatedloopInvocationWeight) { + // If EstimatedLoopInvocationWeight, we do not support this loop if + // getExpectedExitLoopLatchBranch returns nullptr. + // + // FIXME: See comments in getLoopEstimatedTripCount for why this is required + // here regardless of EstimatedLoopInvocationWeight. BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); if (!LatchBranch) return false; + // Set the metadata. + addStringMetadataToLoop(L, LLVMLoopEstimatedTripCount, EstimatedTripCount); + + // At the moment, we currently support changing the estimated trip count in + // the latch branch's branch weights only. We could extend this API to + // manipulate estimated trip counts for any exit. + // + // TODO: Eventually, once all passes have migrated away from setting branch + // weights to indicate estimated trip counts, we will not set branch weights + // here at all. + if (!EstimatedloopInvocationWeight) + return true; + // Calculate taken and exit weights. 
unsigned LatchExitWeight = 0; unsigned BackedgeTakenWeight = 0; - if (EstimatedTripCount > 0) { - LatchExitWeight = EstimatedloopInvocationWeight; + if (EstimatedTripCount != 0) { + LatchExitWeight = *EstimatedloopInvocationWeight; BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight; } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 84485176ad4ff..af216cd9214bf 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -19,8 +19,10 @@ #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Casting.h" @@ -284,6 +286,58 @@ static Value *simplifyInstruction(SCCPSolver &Solver, return Sub; } + // Relax range checks. + if (auto *ICmp = dyn_cast(&Inst)) { + Value *X; + auto MatchTwoInstructionExactRangeCheck = + [&]() -> std::optional { + const APInt *RHSC; + if (!match(ICmp->getOperand(1), m_APInt(RHSC))) + return std::nullopt; + + Value *LHS = ICmp->getOperand(0); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + const APInt *Offset; + if (match(LHS, m_OneUse(m_AddLike(m_Value(X), m_APInt(Offset))))) + return ConstantRange::makeExactICmpRegion(Pred, *RHSC).sub(*Offset); + // Match icmp eq/ne X & NegPow2, C + if (ICmp->isEquality()) { + const APInt *Mask; + if (match(LHS, m_OneUse(m_And(m_Value(X), m_NegatedPower2(Mask)))) && + RHSC->countr_zero() >= Mask->countr_zero()) { + ConstantRange CR(*RHSC, *RHSC - *Mask); + return Pred == ICmpInst::ICMP_EQ ? CR : CR.inverse(); + } + } + return std::nullopt; + }; + + if (auto CR = MatchTwoInstructionExactRangeCheck()) { + ConstantRange LRange = GetRange(X); + // Early exit if we know nothing about X. 
+ if (LRange.isFullSet()) + return nullptr; + // We are allowed to refine the comparison to either true or false for out + // of range inputs. Here we refine the comparison to true, i.e. we relax + // the range check. + auto NewCR = CR->exactUnionWith(LRange.inverse()); + // TODO: Check if we can narrow the range check to an equality test. + // E.g, for X in [0, 4), X - 3 u< 2 -> X == 3 + if (!NewCR) + return nullptr; + ICmpInst::Predicate Pred; + APInt RHS; + // Check if we can represent NewCR as an icmp predicate. + if (NewCR->getEquivalentICmp(Pred, RHS)) { + IRBuilder Builder(&Inst); + Value *NewICmp = + Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), RHS)); + InsertedValues.insert(NewICmp); + return NewICmp; + } + } + } + return nullptr; } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 28befd0aa1ce8..45cee1e7da625 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2222,20 +2222,11 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, // Get the backedge taken count and truncate or extended to the AR type. Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty); - Value *MulV, *OfMul; - if (Step->isOne()) { - // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't - // needed, there is never an overflow, so to avoid artificially inflating - // the cost of the check, directly emit the optimized IR. 
- MulV = TruncTripCount; - OfMul = ConstantInt::getFalse(MulV->getContext()); - } else { - CallInst *Mul = Builder.CreateIntrinsic(Intrinsic::umul_with_overflow, Ty, - {AbsStep, TruncTripCount}, - /*FMFSource=*/nullptr, "mul"); - MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); - OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); - } + CallInst *Mul = Builder.CreateIntrinsic(Intrinsic::umul_with_overflow, Ty, + {AbsStep, TruncTripCount}, + /*FMFSource=*/nullptr, "mul"); + Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); + Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); Value *Add = nullptr, *Sub = nullptr; bool NeedPosCheck = !SE.isKnownNegative(Step); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 970f85378d3d2..a1f759dd1df83 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -203,6 +203,8 @@ static cl::opt MaxJumpThreadingLiveBlocks( cl::desc("Limit number of blocks a define in a threaded block is allowed " "to be live in")); +extern cl::opt ProfcheckDisableMetadataFixes; + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -330,6 +332,17 @@ class SimplifyCFGOpt { } }; +// we synthesize a || b as select a, true, b +// we synthesize a && b as select a, b, false +// this function determines if SI is playing one of those roles. +[[maybe_unused]] bool +isSelectInRoleOfConjunctionOrDisjunction(const SelectInst *SI) { + return ((isa(SI->getTrueValue()) && + (dyn_cast(SI->getTrueValue())->isOne())) || + (isa(SI->getFalseValue()) && + (dyn_cast(SI->getFalseValue())->isNullValue()))); +} + } // end anonymous namespace /// Return true if all the PHI nodes in the basic block \p BB @@ -3379,7 +3392,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, // hoisting above. 
for (auto &I : make_early_inc_range(*ThenBB)) { if (!SpeculatedStoreValue || &I != SpeculatedStore) { - I.setDebugLoc(DebugLoc::getDropped()); + I.dropLocation(); } I.dropUBImplyingAttrsAndMetadata(); @@ -4031,6 +4044,7 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, // Try to update branch weights. uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight; + SmallVector MDWeights; if (extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight)) { SmallVector NewWeights; @@ -4061,7 +4075,7 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, // Halve the weights if any of them cannot fit in an uint32_t fitWeights(NewWeights); - SmallVector MDWeights(NewWeights.begin(), NewWeights.end()); + append_range(MDWeights, NewWeights); setBranchWeights(PBI, MDWeights[0], MDWeights[1], /*IsExpected=*/false); // TODO: If BB is reachable from all paths through PredBlock, then we @@ -4098,6 +4112,13 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, Value *BICond = VMap[BI->getCondition()]; PBI->setCondition( createLogicalOp(Builder, Opc, PBI->getCondition(), BICond, "or.cond")); + if (!ProfcheckDisableMetadataFixes) + if (auto *SI = dyn_cast(PBI->getCondition())) + if (!MDWeights.empty()) { + assert(isSelectInRoleOfConjunctionOrDisjunction(SI)); + setBranchWeights(SI, MDWeights[0], MDWeights[1], + /*IsExpected=*/false); + } ++NumFoldBranchToCommonDest; return true; @@ -4438,6 +4459,20 @@ static bool mergeConditionalStoreToAddress( auto *T = SplitBlockAndInsertIfThen(CombinedPred, InsertPt, /*Unreachable=*/false, /*BranchWeights=*/nullptr, DTU); + if (hasBranchWeightMD(*PBranch) && hasBranchWeightMD(*QBranch) && + !ProfcheckDisableMetadataFixes) { + SmallVector PWeights, QWeights; + extractBranchWeights(*PBranch, PWeights); + extractBranchWeights(*QBranch, QWeights); + if (InvertPCond) + std::swap(PWeights[0], PWeights[1]); + if 
(InvertQCond) + std::swap(QWeights[0], QWeights[1]); + auto CombinedWeights = getDisjunctionWeights(PWeights, QWeights); + setBranchWeights(PostBB->getTerminator(), CombinedWeights[0], + CombinedWeights[1], + /*IsExpected=*/false); + } QB.SetInsertPoint(T); StoreInst *SI = cast(QB.CreateStore(QPHI, Address)); @@ -4796,6 +4831,18 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, fitWeights(NewWeights); setBranchWeights(PBI, NewWeights[0], NewWeights[1], /*IsExpected=*/false); + // Cond may be a select instruction with the first operand set to "true", or + // the second to "false" (see how createLogicalOp works for `and` and `or`) + if (!ProfcheckDisableMetadataFixes) + if (auto *SI = dyn_cast(Cond)) { + assert(isSelectInRoleOfConjunctionOrDisjunction(SI)); + // The select is predicated on PBICond + assert(dyn_cast(SI)->getCondition() == PBICond); + // The corresponding probabilities are what was referred to above as + // PredCommon and PredOther. + setBranchWeights(SI, PredCommon, PredOther, + /*IsExpected=*/false); + } } // OtherDest may have phi nodes. 
If so, add an entry from PBI's diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 8acebbaa5458b..4a1565977b91c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -61,6 +61,9 @@ static cl::opt OptimizeExistingHotColdNew( "optimize-existing-hot-cold-new", cl::Hidden, cl::init(false), cl::desc( "Enable optimization of existing hot/cold operator new library calls")); +static cl::opt OptimizeNoBuiltinHotColdNew( + "optimize-nobuiltin-hot-cold-new-new", cl::Hidden, cl::init(false), + cl::desc("Enable transformation of nobuiltin operator new library calls")); namespace { @@ -1723,13 +1726,11 @@ Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) { return nullptr; } -// Allow existing calls to operator new() that takes a __hot_cold_t parameter to -// be updated with a compiler-determined hot cold hint value. This is used in -// cases where the call is marked nobuiltin (because operator new called -// explicitly) and therefore cannot be replaced with a different callee. -Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI, - IRBuilderBase &B) { - if (!OptimizeHotColdNew || !OptimizeExistingHotColdNew) +// Optionally allow optimization of nobuiltin calls to operator new and its +// variants. 
+Value *LibCallSimplifier::maybeOptimizeNoBuiltinOperatorNew(CallInst *CI, + IRBuilderBase &B) { + if (!OptimizeHotColdNew) return nullptr; Function *Callee = CI->getCalledFunction(); if (!Callee) @@ -1738,6 +1739,22 @@ Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI, if (!TLI->getLibFunc(*Callee, Func)) return nullptr; switch (Func) { + case LibFunc_Znwm: + case LibFunc_ZnwmRKSt9nothrow_t: + case LibFunc_ZnwmSt11align_val_t: + case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t: + case LibFunc_Znam: + case LibFunc_ZnamRKSt9nothrow_t: + case LibFunc_ZnamSt11align_val_t: + case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t: + case LibFunc_size_returning_new: + case LibFunc_size_returning_new_aligned: + // By default normal operator new calls (not already passing a hot_cold_t + // parameter) are not mutated if the call is not marked builtin. Optionally + // enable that in cases where it is known to be safe. + if (!OptimizeNoBuiltinHotColdNew) + return nullptr; + break; case LibFunc_Znwm12__hot_cold_t: case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t: case LibFunc_ZnwmSt11align_val_t12__hot_cold_t: @@ -1748,10 +1765,15 @@ Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI, case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t: case LibFunc_size_returning_new_hot_cold: case LibFunc_size_returning_new_aligned_hot_cold: - return optimizeNew(CI, B, Func); + // If the nobuiltin call already passes a hot_cold_t parameter, allow update + // of that parameter when enabled. + if (!OptimizeExistingHotColdNew) + return nullptr; + break; default: return nullptr; } + return optimizeNew(CI, B, Func); } // When enabled, replace operator new() calls marked with a hot or cold memprof @@ -4121,9 +4143,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { // we can all non-FP calls with the StrictFP attribute to be // optimized. 
if (CI->isNoBuiltin()) { - // If this is an existing call to a hot cold operator new, we can update the - // hint parameter value, which doesn't change the callee. - return optimizeExistingHotColdNew(CI, Builder); + // Optionally update operator new calls. + return maybeOptimizeNoBuiltinOperatorNew(CI, Builder); } LibFunc Func; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 85a6fcaf3ecdd..ff35db14f7094 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1903,11 +1903,12 @@ bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved( SafetyInfo.computeLoopSafetyInfo(TheLoop); // We need to know that load will be executed before we can hoist a // copy out to run just before the first iteration. - // FIXME: Currently, other restrictions prevent us from reaching this point - // with a loop where the uncountable exit condition is determined - // by a conditional load. - assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) && - "Unhandled control flow in uncountable exit loop with side effects"); + if (!SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop)) { + reportVectorizationFailure( + "Load for uncountable exit not guaranteed to execute", + "ConditionalUncountableExitLoad", ORE, TheLoop); + return false; + } // Prohibit any potential aliasing with any instruction in the loop which // might store to memory. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b4acda80cfb93..fee201ee3523a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -393,6 +393,10 @@ static cl::opt EnableEarlyExitVectorization( cl::desc( "Enable vectorization of early exit loops with uncountable exits.")); +static cl::opt ConsiderRegPressure( + "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, + cl::desc("Discard VFs if their register pressure is too high.")); + // Likelyhood of bypassing the vectorized loop because there are zero trips left // after prolog. See `emitIterationCountCheck`. static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; @@ -3693,6 +3697,14 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF( ElementCount VF) { + if (ConsiderRegPressure.getNumOccurrences()) + return ConsiderRegPressure; + + // TODO: We should eventually consider register pressure for all targets. The + // TTI hook is temporary whilst target-specific issues are being fixed. + if (TTI.shouldConsiderVectorizationRegPressure()) + return true; + if (!useMaxBandwidth(VF.isScalable() ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector)) @@ -4308,16 +4320,10 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable - // VFs when deciding profitability. - // See related "TODO: extend to support scalable VFs." in - // selectEpilogueVectorizationFactor. - unsigned Multiplier = VF.isFixed() ? IC : 1; unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 ? 
EpilogueVectorizationMinVF : TTI.getEpilogueVectorizationMinVF(); - return estimateElementCount(VF * Multiplier, VScaleForTuning) >= - MinVFThreshold; + return estimateElementCount(VF * IC, VScaleForTuning) >= MinVFThreshold; } VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( @@ -9545,7 +9551,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { auto ResumePhiIter = find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) { return match(&R, m_VPInstruction(m_Specific(VectorTC), - m_SpecificInt(0))); + m_ZeroInt())); }); VPPhi *ResumePhi = nullptr; if (ResumePhiIter == MainScalarPH->phis().end()) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 75cace77ec534..520d2e6f1e110 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -198,6 +198,16 @@ static cl::opt MaxProfitableLoadStride( "slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable.")); +static cl::opt + DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, + cl::desc("Disable tree reordering even if it is " + "profitable. Used for testing only.")); + +static cl::opt + ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, + cl::desc("Generate strided loads even if they are not " + "profitable. Used for testing only.")); + static cl::opt ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -1916,6 +1926,19 @@ class BoUpSLP { class ShuffleCostEstimator; class ShuffleInstructionBuilder; + /// If we decide to generate strided load / store, this struct contains all + /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate + /// and analyzeConstantStrideCandidate. Note that Stride can be given either + /// as a SCEV or as a Value if it already exists. 
To get the stride in bytes, + /// StrideVal (or value obtained from StrideSCEV) has to be multiplied by the + /// size of element of FixedVectorType. + struct StridedPtrInfo { + Value *StrideVal = nullptr; + const SCEV *StrideSCEV = nullptr; + FixedVectorType *Ty = nullptr; + }; + SmallDenseMap TreeEntryToStridedPtrInfoMap; + public: /// Tracks the state we can represent the loads in the given sequence. enum class LoadsState { @@ -2211,6 +2234,11 @@ class BoUpSLP { /// TODO: If load combining is allowed in the IR optimizer, this analysis /// may not be necessary. bool isLoadCombineCandidate(ArrayRef Stores) const; + bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, + ArrayRef Order, const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + const bool IsAnyPointerUsedOutGraph, const int64_t Diff, + StridedPtrInfo &SPtrInfo) const; /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. @@ -2225,6 +2253,7 @@ class BoUpSLP { LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, SmallVectorImpl &PointerOps, + StridedPtrInfo &SPtrInfo, unsigned *BestVF = nullptr, bool TryRecursiveCheck = true) const; @@ -4469,11 +4498,10 @@ class BoUpSLP { /// Checks if the specified list of the instructions/values can be vectorized /// and fills required data before actual scheduling of the instructions. - TreeEntry::EntryState - getScalarsVectorizationState(const InstructionsState &S, ArrayRef VL, - bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, - SmallVectorImpl &PointerOps); + TreeEntry::EntryState getScalarsVectorizationState( + const InstructionsState &S, ArrayRef VL, + bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, + SmallVectorImpl &PointerOps, StridedPtrInfo &SPtrInfo); /// Maps a specific scalar to its tree entry(ies). 
SmallDenseMap> ScalarToTreeEntries; @@ -5243,6 +5271,7 @@ class BoUpSLP { // Same applies even for non-commutative cmps, because we can invert // their predicate potentially and, thus, reorder the operands. bool IsCommutativeUser = + ::isCommutative(User) || ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User); EdgeInfo EI(TE, U.getOperandNo()); if (!IsCommutativeUser && !isa(User)) { @@ -6789,12 +6818,13 @@ isMaskedLoadCompress(ArrayRef VL, ArrayRef PointerOps, /// 4. Any pointer operand is an instruction with the users outside of the /// current graph (for masked gathers extra extractelement instructions /// might be required). -static bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, - ArrayRef Order, - const TargetTransformInfo &TTI, const DataLayout &DL, - ScalarEvolution &SE, - const bool IsAnyPointerUsedOutGraph, - const int64_t Diff) { +bool BoUpSLP::isStridedLoad(ArrayRef VL, ArrayRef PointerOps, + ArrayRef Order, + const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + const bool IsAnyPointerUsedOutGraph, + const int64_t Diff, + StridedPtrInfo &SPtrInfo) const { const size_t Sz = VL.size(); const uint64_t AbsoluteDiff = std::abs(Diff); Type *ScalarTy = VL.front()->getType(); @@ -6836,17 +6866,20 @@ static bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) break; } - if (Dists.size() == Sz) + if (Dists.size() == Sz) { + Type *StrideTy = DL.getIndexType(Ptr0->getType()); + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); + SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); return true; + } } return false; } -BoUpSLP::LoadsState -BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, - SmallVectorImpl &Order, - SmallVectorImpl &PointerOps, - unsigned *BestVF, bool TryRecursiveCheck) const { +BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( + ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, + SmallVectorImpl &PointerOps, StridedPtrInfo &SPtrInfo, + 
unsigned *BestVF, bool TryRecursiveCheck) const { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller // than 8-bit. Even though we have a packed struct {} LLVM @@ -6884,9 +6917,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, Align CommonAlignment = computeCommonAlignment(VL); if (!IsSorted) { if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) { - if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) && - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order)) + if (const SCEV *Stride = + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); + Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) { + SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size()); + SPtrInfo.StrideSCEV = Stride; return LoadsState::StridedVectorize; + } } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || @@ -6930,7 +6967,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, }); if (IsPossibleStrided && isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, - IsAnyPointerUsedOutGraph, *Diff)) + IsAnyPointerUsedOutGraph, *Diff, SPtrInfo)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || @@ -7014,9 +7051,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, ArrayRef Slice = VL.slice(Cnt, VF); SmallVector Order; SmallVector PointerOps; - LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF, - /*TryRecursiveCheck=*/false); + LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order, + PointerOps, SPtrInfo, BestVF, + /*TryRecursiveCheck=*/false); // Check that the sorted loads are consecutive. if (LS == LoadsState::Gather) { if (BestVF) { @@ -7688,9 +7725,10 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, // extra analysis later, so include such nodes into a special list. 
if (TE.hasState() && TE.getOpcode() == Instruction::Load) { SmallVector PointerOps; + StridedPtrInfo SPtrInfo; OrdersType CurrentOrder; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), - CurrentOrder, PointerOps); + CurrentOrder, PointerOps, SPtrInfo); if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize || Res == LoadsState::CompressVectorize) return std::move(CurrentOrder); @@ -7770,6 +7808,9 @@ static void combineOrders(MutableArrayRef Order, } bool BoUpSLP::isProfitableToReorder() const { + if (DisableTreeReorder) + return false; + constexpr unsigned TinyVF = 2; constexpr unsigned TinyTree = 10; constexpr unsigned PhiOpsLimit = 12; @@ -9193,8 +9234,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads( // Try to build vector load. ArrayRef Values( reinterpret_cast(Slice.begin()), Slice.size()); + StridedPtrInfo SPtrInfo; LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder, - PointerOps, &BestVF); + PointerOps, SPtrInfo, &BestVF); if (LS != LoadsState::Gather || (BestVF > 1 && static_cast(NumElts) == 2 * BestVF)) { if (LS == LoadsState::ScatterVectorize) { @@ -9388,6 +9430,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads( unsigned VF = *CommonVF; OrdersType Order; SmallVector PointerOps; + StridedPtrInfo SPtrInfo; // Segmented load detected - vectorize at maximum vector factor. 
if (InterleaveFactor <= Slice.size() && TTI.isLegalInterleavedAccessType( @@ -9396,8 +9439,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( cast(Slice.front())->getAlign(), cast(Slice.front()) ->getPointerAddressSpace()) && - canVectorizeLoads(Slice, Slice.front(), Order, - PointerOps) == LoadsState::Vectorize) { + canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, + SPtrInfo) == LoadsState::Vectorize) { UserMaxVF = InterleaveFactor * VF; } else { InterleaveFactor = 0; @@ -9419,8 +9462,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads( ArrayRef VL = TE.Scalars; OrdersType Order; SmallVector PointerOps; + StridedPtrInfo SPtrInfo; LoadsState State = canVectorizeLoads( - VL, VL.front(), Order, PointerOps); + VL, VL.front(), Order, PointerOps, SPtrInfo); if (State == LoadsState::ScatterVectorize || State == LoadsState::CompressVectorize) return false; @@ -9438,11 +9482,11 @@ void BoUpSLP::tryToVectorizeGatheredLoads( [&, Slice = Slice](unsigned Idx) { OrdersType Order; SmallVector PointerOps; + StridedPtrInfo SPtrInfo; return canVectorizeLoads( Slice.slice(Idx * UserMaxVF, UserMaxVF), - Slice[Idx * UserMaxVF], Order, - PointerOps) == - LoadsState::ScatterVectorize; + Slice[Idx * UserMaxVF], Order, PointerOps, + SPtrInfo) == LoadsState::ScatterVectorize; })) UserMaxVF = MaxVF; if (Slice.size() != ConsecutiveNodesSize) @@ -9799,7 +9843,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( const InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, - SmallVectorImpl &PointerOps) { + SmallVectorImpl &PointerOps, StridedPtrInfo &SPtrInfo) { assert(S.getMainOp() && "Expected instructions with same/alternate opcodes only."); @@ -9901,7 +9945,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( }); }); }; - switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) { + switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, 
SPtrInfo)) { case LoadsState::Vectorize: return TreeEntry::Vectorize; case LoadsState::CompressVectorize: @@ -11371,8 +11415,9 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; OrdersType CurrentOrder; SmallVector PointerOps; + StridedPtrInfo SPtrInfo; TreeEntry::EntryState State = getScalarsVectorizationState( - S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps); + S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo); if (State == TreeEntry::NeedToGather) { newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); return; @@ -11532,6 +11577,7 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, // Vectorizing non-consecutive loads with `llvm.masked.gather`. TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, UserTreeIdx, ReuseShuffleIndices, CurrentOrder); + TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo; LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n"; TE->dump()); break; @@ -12920,8 +12966,9 @@ void BoUpSLP::transformNodes() { if (S.getOpcode() == Instruction::Load) { OrdersType Order; SmallVector PointerOps; - LoadsState Res = - canVectorizeLoads(Slice, Slice.front(), Order, PointerOps); + StridedPtrInfo SPtrInfo; + LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order, + PointerOps, SPtrInfo); AllStrided &= Res == LoadsState::StridedVectorize || Res == LoadsState::ScatterVectorize || Res == LoadsState::Gather; @@ -13027,10 +13074,18 @@ void BoUpSLP::transformNodes() { InstructionCost StridedCost = TTI->getStridedMemoryOpCost( Instruction::Load, VecTy, BaseLI->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI); - if (StridedCost < OriginalVecCost) + if (StridedCost < OriginalVecCost || ForceStridedLoads) { // Strided load is more profitable than consecutive load + reverse - // transform the node to strided load. 
+ Type *StrideTy = DL->getIndexType(cast(E.Scalars.front()) + ->getPointerOperand() + ->getType()); + StridedPtrInfo SPtrInfo; + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1); + SPtrInfo.Ty = VecTy; + TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo; E.State = TreeEntry::StridedVectorize; + } } break; } @@ -19471,6 +19526,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { LoadInst *LI = cast(VL0); Instruction *NewLI; + FixedVectorType *StridedLoadTy = nullptr; Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); @@ -19508,43 +19564,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *Ptr0 = cast(E->Scalars.front())->getPointerOperand(); Value *PtrN = cast(E->Scalars.back())->getPointerOperand(); PO = IsReverseOrder ? PtrN : Ptr0; - std::optional Diff = getPointersDiff( - VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE); Type *StrideTy = DL->getIndexType(PO->getType()); Value *StrideVal; - if (Diff) { - int64_t Stride = - *Diff / (static_cast(E->Scalars.size()) - 1); - StrideVal = - ConstantInt::get(StrideTy, (IsReverseOrder ? 
-1 : 1) * Stride * - DL->getTypeAllocSize(ScalarTy)); - } else { - SmallVector PointerOps(E->Scalars.size(), nullptr); - transform(E->Scalars, PointerOps.begin(), [](Value *V) { - return cast(V)->getPointerOperand(); - }); - OrdersType Order; - const SCEV *StrideSCEV = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); - assert(StrideSCEV && "At this point stride should be known"); + const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E); + StridedLoadTy = SPtrInfo.Ty; + assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry."); + unsigned StridedLoadEC = + StridedLoadTy->getElementCount().getKnownMinValue(); + + Value *Stride = SPtrInfo.StrideVal; + if (!Stride) { + const SCEV *StrideSCEV = SPtrInfo.StrideSCEV; + assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set."); SCEVExpander Expander(*SE, *DL, "strided-load-vec"); - Value *Stride = Expander.expandCodeFor( - StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint()); - Value *NewStride = - Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true); - StrideVal = Builder.CreateMul( - NewStride, - ConstantInt::get( - StrideTy, - (IsReverseOrder ? -1 : 1) * - static_cast(DL->getTypeAllocSize(ScalarTy)))); - } + Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(), + &*Builder.GetInsertPoint()); + } + Value *NewStride = + Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true); + StrideVal = Builder.CreateMul( + NewStride, ConstantInt::get( + StrideTy, (IsReverseOrder ? 
-1 : 1) * + static_cast( + DL->getTypeAllocSize(ScalarTy)))); Align CommonAlignment = computeCommonAlignment(E->Scalars); auto *Inst = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_load, - {VecTy, PO->getType(), StrideTy}, - {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()), - Builder.getInt32(E->Scalars.size())}); + {StridedLoadTy, PO->getType(), StrideTy}, + {PO, StrideVal, + Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)), + Builder.getInt32(StridedLoadEC)}); Inst->addParamAttr( /*ArgNo=*/0, Attribute::getWithAlignment(Inst->getContext(), CommonAlignment)); @@ -24386,135 +24435,134 @@ class HorizontalReduction { VectorizedTree = GetNewVectorizedTree( VectorizedTree, emitReduction(Builder, *TTI, ReductionRoot->getType())); - if (VectorizedTree) { - // Reorder operands of bool logical op in the natural order to avoid - // possible problem with poison propagation. If not possible to reorder - // (both operands are originally RHS), emit an extra freeze instruction - // for the LHS operand. - // I.e., if we have original code like this: - // RedOp1 = select i1 ?, i1 LHS, i1 false - // RedOp2 = select i1 RHS, i1 ?, i1 false - - // Then, we swap LHS/RHS to create a new op that matches the poison - // semantics of the original code. - - // If we have original code like this and both values could be poison: - // RedOp1 = select i1 ?, i1 LHS, i1 false - // RedOp2 = select i1 ?, i1 RHS, i1 false - - // Then, we must freeze LHS in the new op. 
- auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS, - Instruction *RedOp1, - Instruction *RedOp2, - bool InitStep) { - if (!AnyBoolLogicOp) - return; - if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) || - getRdxOperand(RedOp1, 0) == LHS || - isGuaranteedNotToBePoison(LHS, AC))) - return; - if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || - getRdxOperand(RedOp2, 0) == RHS || - isGuaranteedNotToBePoison(RHS, AC))) { - std::swap(LHS, RHS); - return; - } - if (LHS != VectorizedTree) - LHS = Builder.CreateFreeze(LHS); - }; - // Finish the reduction. - // Need to add extra arguments and not vectorized possible reduction - // values. - // Try to avoid dependencies between the scalar remainders after - // reductions. - auto FinalGen = - [&](ArrayRef> InstVals, - bool InitStep) { - unsigned Sz = InstVals.size(); - SmallVector> ExtraReds(Sz / 2 + - Sz % 2); - for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { - Instruction *RedOp = InstVals[I + 1].first; - Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); - Value *RdxVal1 = InstVals[I].second; - Value *StableRdxVal1 = RdxVal1; - auto It1 = TrackedVals.find(RdxVal1); - if (It1 != TrackedVals.end()) - StableRdxVal1 = It1->second; - Value *RdxVal2 = InstVals[I + 1].second; - Value *StableRdxVal2 = RdxVal2; - auto It2 = TrackedVals.find(RdxVal2); - if (It2 != TrackedVals.end()) - StableRdxVal2 = It2->second; - // To prevent poison from leaking across what used to be - // sequential, safe, scalar boolean logic operations, the - // reduction operand must be frozen. 
- FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, - RedOp, InitStep); - Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, - StableRdxVal2, "op.rdx", ReductionOps); - ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); - } - if (Sz % 2 == 1) - ExtraReds[Sz / 2] = InstVals.back(); - return ExtraReds; - }; - SmallVector> ExtraReductions; - ExtraReductions.emplace_back(cast(ReductionRoot), - VectorizedTree); - SmallPtrSet Visited; - for (ArrayRef Candidates : ReducedVals) { - for (Value *RdxVal : Candidates) { - if (!Visited.insert(RdxVal).second) - continue; - unsigned NumOps = VectorizedVals.lookup(RdxVal); - for (Instruction *RedOp : - ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps)) - ExtraReductions.emplace_back(RedOp, RdxVal); - } + + if (!VectorizedTree) { + if (!CheckForReusedReductionOps) { + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) + V.analyzedReductionRoot(cast(RdxOp)); } - // Iterate through all not-vectorized reduction values/extra arguments. - bool InitStep = true; - while (ExtraReductions.size() > 1) { - SmallVector> NewReds = - FinalGen(ExtraReductions, InitStep); - ExtraReductions.swap(NewReds); - InitStep = false; + return nullptr; + } + + // Reorder operands of bool logical op in the natural order to avoid + // possible problem with poison propagation. If not possible to reorder + // (both operands are originally RHS), emit an extra freeze instruction + // for the LHS operand. + // I.e., if we have original code like this: + // RedOp1 = select i1 ?, i1 LHS, i1 false + // RedOp2 = select i1 RHS, i1 ?, i1 false + + // Then, we swap LHS/RHS to create a new op that matches the poison + // semantics of the original code. + + // If we have original code like this and both values could be poison: + // RedOp1 = select i1 ?, i1 LHS, i1 false + // RedOp2 = select i1 ?, i1 RHS, i1 false + + // Then, we must freeze LHS in the new op. 
+ auto FixBoolLogicalOps = + [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1, + Instruction *RedOp2, bool InitStep) { + if (!AnyBoolLogicOp) + return; + if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) || + getRdxOperand(RedOp1, 0) == LHS || + isGuaranteedNotToBePoison(LHS, AC))) + return; + if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || + getRdxOperand(RedOp2, 0) == RHS || + isGuaranteedNotToBePoison(RHS, AC))) { + std::swap(LHS, RHS); + return; + } + if (LHS != VectorizedTree) + LHS = Builder.CreateFreeze(LHS); + }; + // Finish the reduction. + // Need to add extra arguments and not vectorized possible reduction values. + // Try to avoid dependencies between the scalar remainders after reductions. + auto FinalGen = [&](ArrayRef> InstVals, + bool InitStep) { + unsigned Sz = InstVals.size(); + SmallVector> ExtraReds(Sz / 2 + Sz % 2); + for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { + Instruction *RedOp = InstVals[I + 1].first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + Value *RdxVal1 = InstVals[I].second; + Value *StableRdxVal1 = RdxVal1; + auto It1 = TrackedVals.find(RdxVal1); + if (It1 != TrackedVals.end()) + StableRdxVal1 = It1->second; + Value *RdxVal2 = InstVals[I + 1].second; + Value *StableRdxVal2 = RdxVal2; + auto It2 = TrackedVals.find(RdxVal2); + if (It2 != TrackedVals.end()) + StableRdxVal2 = It2->second; + // To prevent poison from leaking across what used to be sequential, + // safe, scalar boolean logic operations, the reduction operand must be + // frozen. 
+ FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, + RedOp, InitStep); + Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, + StableRdxVal2, "op.rdx", ReductionOps); + ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); + } + if (Sz % 2 == 1) + ExtraReds[Sz / 2] = InstVals.back(); + return ExtraReds; + }; + SmallVector> ExtraReductions; + ExtraReductions.emplace_back(cast(ReductionRoot), + VectorizedTree); + SmallPtrSet Visited; + for (ArrayRef Candidates : ReducedVals) { + for (Value *RdxVal : Candidates) { + if (!Visited.insert(RdxVal).second) + continue; + unsigned NumOps = VectorizedVals.lookup(RdxVal); + for (Instruction *RedOp : + ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps)) + ExtraReductions.emplace_back(RedOp, RdxVal); } - VectorizedTree = ExtraReductions.front().second; + } + // Iterate through all not-vectorized reduction values/extra arguments. + bool InitStep = true; + while (ExtraReductions.size() > 1) { + SmallVector> NewReds = + FinalGen(ExtraReductions, InitStep); + ExtraReductions.swap(NewReds); + InitStep = false; + } + VectorizedTree = ExtraReductions.front().second; - ReductionRoot->replaceAllUsesWith(VectorizedTree); + ReductionRoot->replaceAllUsesWith(VectorizedTree); - // The original scalar reduction is expected to have no remaining - // uses outside the reduction tree itself. Assert that we got this - // correct, replace internal uses with undef, and mark for eventual - // deletion. + // The original scalar reduction is expected to have no remaining + // uses outside the reduction tree itself. Assert that we got this + // correct, replace internal uses with undef, and mark for eventual + // deletion. 
#ifndef NDEBUG - SmallPtrSet IgnoreSet; - for (ArrayRef RdxOps : ReductionOps) - IgnoreSet.insert_range(RdxOps); + SmallPtrSet IgnoreSet; + for (ArrayRef RdxOps : ReductionOps) + IgnoreSet.insert_range(RdxOps); #endif - for (ArrayRef RdxOps : ReductionOps) { - for (Value *Ignore : RdxOps) { - if (!Ignore) - continue; + for (ArrayRef RdxOps : ReductionOps) { + for (Value *Ignore : RdxOps) { + if (!Ignore) + continue; #ifndef NDEBUG - for (auto *U : Ignore->users()) { - assert(IgnoreSet.count(U) && - "All users must be either in the reduction ops list."); - } + for (auto *U : Ignore->users()) { + assert(IgnoreSet.count(U) && + "All users must be either in the reduction ops list."); + } #endif - if (!Ignore->use_empty()) { - Value *P = PoisonValue::get(Ignore->getType()); - Ignore->replaceAllUsesWith(P); - } + if (!Ignore->use_empty()) { + Value *P = PoisonValue::get(Ignore->getType()); + Ignore->replaceAllUsesWith(P); } - V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales); } - } else if (!CheckForReusedReductionOps) { - for (ReductionOpsType &RdxOps : ReductionOps) - for (Value *RdxOp : RdxOps) - V.analyzedReductionRoot(cast(RdxOp)); + V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales); } return VectorizedTree; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 16b1b539345de..30a3a01ddd949 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -343,37 +343,21 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { LastLane = 0; } - auto *LastInst = cast(get(Def, LastLane)); + // We need to construct the vector value for a single-scalar value by + // broadcasting the scalar to all lanes. + // TODO: Replace by introducing Broadcast VPInstructions. + assert(IsSingleScalar && "must be a single-scalar at this point"); // Set the insert point after the last scalarized instruction or after the // last PHI, if LastInst is a PHI. 
This ensures the insertelement sequence // will directly follow the scalar definitions. auto OldIP = Builder.saveIP(); + auto *LastInst = cast(get(Def, LastLane)); auto NewIP = isa(LastInst) ? LastInst->getParent()->getFirstNonPHIIt() : std::next(BasicBlock::iterator(LastInst)); Builder.SetInsertPoint(&*NewIP); - - // However, if we are vectorizing, we need to construct the vector values. - // If the value is known to be uniform after vectorization, we can just - // broadcast the scalar value corresponding to lane zero. Otherwise, we - // construct the vector values using insertelement instructions. Since the - // resulting vectors are stored in State, we will only generate the - // insertelements once. - Value *VectorValue = nullptr; - if (IsSingleScalar) { - VectorValue = GetBroadcastInstrs(ScalarValue); - set(Def, VectorValue); - } else { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - assert(isa(Def) && - "Explicit BuildVector recipes must have" - "handled packing for non-VPInstructions."); - // Initialize packing with insertelements to start from poison. - VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); - for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane) - VectorValue = packScalarIntoVectorizedValue(Def, VectorValue, Lane); - set(Def, VectorValue); - } + Value *VectorValue = GetBroadcastInstrs(ScalarValue); + set(Def, VectorValue); Builder.restoreIP(OldIP); return VectorValue; } @@ -1763,3 +1747,33 @@ VPCostContext::getOperandInfo(VPValue *V) const { return TTI::getOperandInfo(V->getLiveInIRValue()); } + +InstructionCost VPCostContext::getScalarizationOverhead( + Type *ResultTy, ArrayRef Operands, ElementCount VF) { + if (VF.isScalar()) + return 0; + + InstructionCost ScalarizationCost = 0; + // Compute the cost of scalarizing the result if needed. 
+ if (!ResultTy->isVoidTy()) { + for (Type *VectorTy : + to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) { + ScalarizationCost += TTI.getScalarizationOverhead( + cast(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert=*/true, + /*Extract=*/false, CostKind); + } + } + // Compute the cost of scalarizing the operands, skipping ones that do not + // require extraction/scalarization and do not incur any overhead. + SmallPtrSet UniqueOperands; + SmallVector Tys; + for (auto *Op : Operands) { + if (Op->isLiveIn() || isa(Op) || + !UniqueOperands.insert(Op).second) + continue; + Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); + } + return ScalarizationCost + + TTI.getOperandsScalarizationOverhead(Tys, CostKind); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 53291a931530f..f79855f7e2c5f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -908,6 +908,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { return R && classof(R); } + virtual VPRecipeWithIRFlags *clone() override = 0; + static inline bool classof(const VPSingleDefRecipe *U) { auto *R = dyn_cast(U); return R && classof(R); @@ -916,9 +918,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { void execute(VPTransformState &State) override = 0; /// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx. 
- std::optional - getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, - VPCostContext &Ctx) const; + InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, + VPCostContext &Ctx) const; }; /// Helper to access the operand that contains the unroll part for this recipe @@ -1061,13 +1062,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, VScale, }; -private: - typedef unsigned char OpcodeTy; - OpcodeTy Opcode; - - /// An optional name that can be used for the generated IR instruction. - const std::string Name; - /// Returns true if this VPInstruction generates scalar values for all lanes. /// Most VPInstructions generate a single value per part, either vector or /// scalar. VPReplicateRecipe takes care of generating multiple (scalar) @@ -1076,6 +1070,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, /// underlying ingredient. bool doesGeneratePerAllLanes() const; +private: + typedef unsigned char OpcodeTy; + OpcodeTy Opcode; + + /// An optional name that can be used for the generated IR instruction. + const std::string Name; + /// Returns true if we can generate a scalar for the first lane only if /// needed. bool canGenerateScalarForFirstLane() const; @@ -1085,11 +1086,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, /// existing value is returned rather than a generated one. Value *generate(VPTransformState &State); - /// Utility methods serving execute(): generates a scalar single instance of - /// the modeled instruction for a given lane. \returns the scalar generated - /// value for lane \p Lane. - Value *generatePerLane(VPTransformState &State, const VPLane &Lane); - #if !defined(NDEBUG) /// Return the number of operands determined by the opcode of the /// VPInstruction. 
Returns -1u if the number of operands cannot be determined @@ -1799,6 +1795,9 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC) + /// This recipe generates a GEP instruction. + unsigned getOpcode() const { return Instruction::GetElementPtr; } + /// Generate the gep nodes. void execute(VPTransformState &State) override; @@ -1903,6 +1902,8 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, void execute(VPTransformState &State) override; + Type *getSourceElementType() const { return IndexedTy; } + bool onlyFirstLaneUsed(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index d400ceff7797c..46ab7712e2671 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -10,6 +10,7 @@ #include "VPlan.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" +#include "VPlanHelpers.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -396,7 +397,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, /// Get the VF scaling factor applied to the recipe's output, if the recipe has /// one. -static unsigned getVFScaleFactor(VPRecipeBase *R) { +static unsigned getVFScaleFactor(VPValue *R) { if (auto *RR = dyn_cast(R)) return RR->getVFScaleFactor(); if (auto *RR = dyn_cast(R)) @@ -422,15 +423,15 @@ SmallVector llvm::calculateRegisterUsageForPlan( const SmallPtrSetImpl &ValuesToIgnore) { // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the - // recipe that is the key. - using IntervalMap = SmallDenseMap; + // VPValue that is the key. + using IntervalMap = SmallDenseMap; // Maps indices to recipes. 
SmallVector Idx2Recipe; // Marks the end of each interval. IntervalMap EndPoint; - // Saves the list of recipe indices that are used in the loop. - SmallPtrSet Ends; + // Saves the list of VPValues that are used in the loop. + SmallPtrSet Ends; // Saves the list of values that are used in the loop but are defined outside // the loop (not including non-recipe values such as arguments and // constants). @@ -441,7 +442,7 @@ SmallVector llvm::calculateRegisterUsageForPlan( // each recipe. We use RPO to ensure that defs are met before their users. We // assume that each recipe that has in-loop users starts an interval. We // record every time that an in-loop value is used, so we have a list of the - // first and last occurrences of each recipe. + // first occurences of each recipe and last occurrence of each VPValue. VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); ReversePostOrderTraversal> RPOT( LoopRegion); @@ -470,32 +471,32 @@ SmallVector llvm::calculateRegisterUsageForPlan( } // Overwrite previous end points. - EndPoint[DefR] = Idx2Recipe.size(); - Ends.insert(DefR); + EndPoint[U] = Idx2Recipe.size(); + Ends.insert(U); } } if (VPBB == LoopRegion->getExiting()) { // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the // exiting block, where their increment will get materialized eventually. for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { - if (isa(&R)) { - EndPoint[&R] = Idx2Recipe.size(); - Ends.insert(&R); + if (auto *WideIV = dyn_cast(&R)) { + EndPoint[WideIV] = Idx2Recipe.size(); + Ends.insert(WideIV); } } } } // Saves the list of intervals that end with the index in 'key'. - using RecipeList = SmallVector; - SmallDenseMap TransposeEnds; + using VPValueList = SmallVector; + SmallDenseMap TransposeEnds; // Next, we transpose the EndPoints into a multi map that holds the list of // intervals that *end* at a specific location. 
for (auto &Interval : EndPoint) TransposeEnds[Interval.second].push_back(Interval.first); - SmallPtrSet OpenIntervals; + SmallPtrSet OpenIntervals; SmallVector RUs(VFs.size()); SmallVector, 8> MaxUsages(VFs.size()); @@ -519,14 +520,16 @@ SmallVector llvm::calculateRegisterUsageForPlan( for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) { VPRecipeBase *R = Idx2Recipe[Idx]; - // Remove all of the recipes that end at this location. - RecipeList &List = TransposeEnds[Idx]; - for (VPRecipeBase *ToRemove : List) + // Remove all of the VPValues that end at this location. + VPValueList &List = TransposeEnds[Idx]; + for (VPValue *ToRemove : List) OpenIntervals.erase(ToRemove); // Ignore recipes that are never used within the loop and do not have side // effects. - if (!Ends.count(R) && !R->mayHaveSideEffects()) + if (none_of(R->definedValues(), + [&Ends](VPValue *Def) { return Ends.count(Def); }) && + !R->mayHaveSideEffects()) continue; // Skip recipes for ignored values. @@ -546,41 +549,38 @@ SmallVector llvm::calculateRegisterUsageForPlan( // there is no previous entry for ClassID. SmallMapVector RegUsage; - for (auto *R : OpenIntervals) { - // Skip recipes that weren't present in the original loop. + for (auto *VPV : OpenIntervals) { + // Skip values that weren't present in the original loop. 
// TODO: Remove after removing the legacy // LoopVectorizationCostModel::calculateRegisterUsage if (isa(R)) + VPBranchOnMaskRecipe>(VPV)) continue; if (VFs[J].isScalar() || isa(R) || - (isa(R) && - vputils::onlyScalarValuesUsed(cast(R))) || - (isa(R) && - (cast(R))->isInLoop())) { - unsigned ClassID = TTI.getRegisterClassForType( - false, TypeInfo.inferScalarType(R->getVPSingleValue())); + VPEVLBasedIVPHIRecipe, VPScalarIVStepsRecipe>(VPV) || + (isa(VPV) && vputils::onlyScalarValuesUsed(VPV)) || + (isa(VPV) && + (cast(VPV))->isInLoop())) { + unsigned ClassID = + TTI.getRegisterClassForType(false, TypeInfo.inferScalarType(VPV)); // FIXME: The target might use more than one register for the type // even in the scalar case. RegUsage[ClassID] += 1; } else { // The output from scaled phis and scaled reductions actually has // fewer lanes than the VF. - unsigned ScaleFactor = getVFScaleFactor(R); + unsigned ScaleFactor = getVFScaleFactor(VPV); ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); LLVM_DEBUG(if (VF != VFs[J]) { dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF << " for " << *R << "\n"; }); - for (VPValue *DefV : R->definedValues()) { - Type *ScalarTy = TypeInfo.inferScalarType(DefV); - unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy); - RegUsage[ClassID] += GetRegUsage(ScalarTy, VF); - } + Type *ScalarTy = TypeInfo.inferScalarType(VPV); + unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy); + RegUsage[ClassID] += GetRegUsage(ScalarTy, VF); } } @@ -593,8 +593,11 @@ SmallVector llvm::calculateRegisterUsageForPlan( LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " << OpenIntervals.size() << '\n'); - // Add the current recipe to the list of open intervals. - OpenIntervals.insert(R); + // Add used VPValues defined by the current recipe to the list of open + // intervals. 
+ for (VPValue *DefV : R->definedValues()) + if (Ends.contains(DefV)) + OpenIntervals.insert(DefV); } // We also search for instructions that are defined outside the loop, but are diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 5ad2ac6b61e05..fe59774b7c838 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -371,6 +371,13 @@ struct VPCostContext { /// legacy cost model for \p VF. Only used to check for additional VPlan /// simplifications. bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const; + + /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy + /// and \p Operands with \p VF. This is a convenience wrapper for the + /// type-based getScalarizationOverhead API. + InstructionCost getScalarizationOverhead(Type *ResultTy, + ArrayRef Operands, + ElementCount VF); }; /// This class can be used to assign names to VPValues. For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 109156c1469c5..401a2cbd9a5ca 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -149,12 +149,20 @@ struct is_zero_int { bool isValue(const APInt &C) const { return C.isZero(); } }; +struct is_one { + bool isValue(const APInt &C) const { return C.isOne(); } +}; + /// Match an integer 0 or a vector with all elements equal to 0. /// For vectors, this includes constants with undefined elements. inline int_pred_ty m_ZeroInt() { return int_pred_ty(); } +/// Match an integer 1 or a vector with all elements equal to 1. +/// For vectors, this includes constants with undefined elements. 
+inline int_pred_ty m_One() { return int_pred_ty(); } + /// Matching combinators template struct match_combine_or { LTy L; @@ -252,10 +260,9 @@ struct Recipe_match { static bool matchRecipeAndOpcode(const VPRecipeBase *R) { auto *DefR = dyn_cast(R); // Check for recipes that do not have opcodes. - if constexpr (std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -524,15 +531,24 @@ m_SpecificCmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) { } template -using GEPLikeRecipe_match = +using GEPLikeRecipe_match = match_combine_or< Recipe_match, Instruction::GetElementPtr, - /*Commutative*/ false, VPWidenRecipe, VPReplicateRecipe, - VPWidenGEPRecipe, VPInstruction>; + /*Commutative*/ false, VPReplicateRecipe, VPWidenGEPRecipe>, + match_combine_or< + VPInstruction_match, + VPInstruction_match>>; template inline GEPLikeRecipe_match m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1) { - return GEPLikeRecipe_match(Op0, Op1); + return m_CombineOr( + Recipe_match, Instruction::GetElementPtr, + /*Commutative*/ false, VPReplicateRecipe, VPWidenGEPRecipe>( + Op0, Op1), + m_CombineOr( + VPInstruction_match(Op0, Op1), + VPInstruction_match(Op0, + Op1))); } template diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bf51489543098..8e9c3db50319f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -328,7 +328,7 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, // Pick out opcode, type/ext information and use sub side effects from a widen // recipe. 
auto HandleWiden = [&](VPWidenRecipe *Widen) { - if (match(Widen, m_Sub(m_SpecificInt(0), m_VPValue(Op)))) { + if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) { Widen = dyn_cast(Op->getDefiningRecipe()); } Opcode = Widen->getOpcode(); @@ -375,9 +375,9 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) { Type *RetTy = PhiVal->getType(); - CallInst *V = Builder.CreateIntrinsic( - RetTy, Intrinsic::experimental_vector_partial_reduce_add, - {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + CallInst *V = + Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, + {PhiVal, BinOpVal}, nullptr, "partial.reduce"); State.set(this, V); } @@ -564,16 +564,6 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { } } -Value *VPInstruction::generatePerLane(VPTransformState &State, - const VPLane &Lane) { - IRBuilderBase &Builder = State.Builder; - - assert(getOpcode() == VPInstruction::PtrAdd && - "only PtrAdd opcodes are supported for now"); - return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), - State.get(getOperand(1), Lane), Name); -} - /// Create a conditional branch using \p Cond branching to the successors of \p /// VPBB. Note that the first successor is always forward (i.e. not created yet) /// while the second successor may already have been created (if it is a header @@ -988,7 +978,7 @@ Value *VPInstruction::generate(VPTransformState &State) { } } -std::optional VPRecipeWithIRFlags::getCostForRecipeWithOpcode( +InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const { Type *ScalarTy = Ctx.Types.inferScalarType(this); Type *ResultTy = VF.isVector() ? 
toVectorTy(ScalarTy, VF) : ScalarTy; @@ -1054,7 +1044,7 @@ std::optional VPRecipeWithIRFlags::getCostForRecipeWithOpcode( {TTI::OK_AnyValue, TTI::OP_None}, CtxI); } } - return std::nullopt; + llvm_unreachable("called for unsupported opcode"); } InstructionCost VPInstruction::computeCost(ElementCount VF, @@ -1069,7 +1059,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, assert(!doesGeneratePerAllLanes() && "Should only generate a vector value or single scalar, not scalars " "for all lanes."); - return *getCostForRecipeWithOpcode( + return getCostForRecipeWithOpcode( getOpcode(), vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx); } @@ -1197,24 +1187,13 @@ void VPInstruction::execute(VPTransformState &State) { "Set flags not supported for the provided opcode"); if (hasFastMathFlags()) State.Builder.setFastMathFlags(getFastMathFlags()); - bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && - (vputils::onlyFirstLaneUsed(this) || - isVectorToScalar() || isSingleScalar()); - bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); - if (GeneratesPerAllLanes) { - for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue(); - Lane != NumLanes; ++Lane) { - Value *GeneratedValue = generatePerLane(State, VPLane(Lane)); - assert(GeneratedValue && "generatePerLane must produce a value"); - State.set(this, GeneratedValue, VPLane(Lane)); - } - return; - } - Value *GeneratedValue = generate(State); if (!hasResult()) return; assert(GeneratedValue && "generate must produce a value"); + bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && + (vputils::onlyFirstLaneUsed(this) || + isVectorToScalar() || isSingleScalar()); assert((((GeneratedValue->getType()->isVectorTy() || GeneratedValue->getType()->isStructTy()) == !GeneratesPerFirstLaneOnly) || @@ -1287,6 +1266,12 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::Broadcast: case VPInstruction::ReductionStartVector: return true; + 
case VPInstruction::BuildStructVector: + case VPInstruction::BuildVector: + // Before replicating by VF, Build(Struct)Vector uses all lanes of the + // operand, after replicating its operands only the first lane is used. + // Before replicating, it will have only a single operand. + return getNumOperands() > 1; case VPInstruction::PtrAdd: return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); case VPInstruction::WidePtrAdd: @@ -2221,7 +2206,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, case Instruction::ExtractValue: case Instruction::ICmp: case Instruction::FCmp: - return *getCostForRecipeWithOpcode(getOpcode(), VF, Ctx); + return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx); default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -3147,33 +3132,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, if (VF.isScalable()) return InstructionCost::getInvalid(); - // Compute the cost of scalarizing the result and operands if needed. - InstructionCost ScalarizationCost = 0; - if (VF.isVector()) { - if (!ResultTy->isVoidTy()) { - for (Type *VectorTy : - to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) { - ScalarizationCost += Ctx.TTI.getScalarizationOverhead( - cast(VectorTy), APInt::getAllOnes(VF.getFixedValue()), - /*Insert=*/true, - /*Extract=*/false, Ctx.CostKind); - } - } - // Skip operands that do not require extraction/scalarization and do not - // incur any overhead. 
- SmallPtrSet UniqueOperands; - Tys.clear(); - for (auto *Op : ArgOps) { - if (Op->isLiveIn() || isa(Op) || - !UniqueOperands.insert(Op).second) - continue; - Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF)); - } - ScalarizationCost += - Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind); - } - - return ScalarCallCost * VF.getFixedValue() + ScalarizationCost; + return ScalarCallCost * VF.getFixedValue() + + Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF); } case Instruction::Add: case Instruction::Sub: @@ -3191,9 +3151,35 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::Xor: case Instruction::ICmp: case Instruction::FCmp: - return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), - Ctx) * + return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), + Ctx) * (isSingleScalar() ? 1 : VF.getFixedValue()); + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: { + InstructionCost ScalarCost = + getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx); + if (isSingleScalar()) + return ScalarCost; + + ScalarCost = ScalarCost * VF.getFixedValue() + + Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this), + to_vector(operands()), VF); + // If the recipe is not predicated (i.e. not in a replicate region), return + // the scalar cost. Otherwise handle predicated cost. + if (!getParent()->getParent()->isReplicator()) + return ScalarCost; + + // Account for the phi nodes that we will create. + ScalarCost += VF.getFixedValue() * + Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); + // Scale the cost by the probability of executing the predicated blocks. + // This assumes the predicated block for each vector lane is equally + // likely. 
+ ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind); + return ScalarCost; + } case Instruction::Load: case Instruction::Store: { if (isSingleScalar()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 2cac5557daeee..dcc368933f2a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1134,10 +1134,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith( Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z))); - if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) + if (match(Def, m_c_Mul(m_VPValue(A), m_One()))) return Def->replaceAllUsesWith(A); - if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0)))) + if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt()))) return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) : R.getOperand(0)); @@ -1176,16 +1176,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0. 
- if ((match(Def, - m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) || - match(Def, - m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) && + if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) || + match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) && TypeInfo.inferScalarType(Def->getOperand(1)) == TypeInfo.inferScalarType(Def)) return Def->replaceAllUsesWith(Def->getOperand(1)); - if (match(Def, m_VPInstruction( - m_VPValue(X), m_SpecificInt(1)))) { + if (match(Def, m_VPInstruction(m_VPValue(X), + m_One()))) { Type *WideStepTy = TypeInfo.inferScalarType(Def); if (TypeInfo.inferScalarType(X) != WideStepTy) X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy); @@ -1272,9 +1270,11 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_VPInstruction(m_VPValue(A))) && - vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) { - return U->usesScalars(A) || Def == U; - })) { + ((isa(A) && vputils::isSingleScalar(A)) || + (isa(A) && + cast(A)->isSingleScalar())) && + all_of(A->users(), + [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) { return Def->replaceAllUsesWith(A); } } @@ -1978,7 +1978,7 @@ struct VPCSEDenseMapInfo : public DenseMapInfo { return TypeSwitch>>(R) .Case( + VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>( [](auto *I) { return std::make_pair(false, I->getOpcode()); }) .Case([](auto *I) { return std::make_pair(true, I->getVectorIntrinsicID()); @@ -1986,12 +1986,31 @@ struct VPCSEDenseMapInfo : public DenseMapInfo { .Default([](auto *) { return std::nullopt; }); } + /// If recipe \p R will lower to a GEP with a non-i8 source element type, + /// return that source element type. + static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) { + // All VPInstructions that lower to GEPs must have the i8 source element + // type (as they are PtrAdds), so we omit it. 
+ return TypeSwitch(R) + .Case([](auto *I) -> Type * { + if (auto *GEP = dyn_cast(I->getUnderlyingValue())) + return GEP->getSourceElementType(); + return nullptr; + }) + .Case( + [](auto *I) { return I->getSourceElementType(); }) + .Default([](auto *) { return nullptr; }); + } + /// Returns true if recipe \p Def can be safely handed for CSE. static bool canHandle(const VPSingleDefRecipe *Def) { // We can extend the list of handled recipes in the future, // provided we account for the data embedded in them while checking for - // equality or hashing. - auto C = getOpcodeOrIntrinsicID(Def); + // equality or hashing. We assign VPVectorEndPointerRecipe the GEP opcode, + // as it is essentially a GEP with different semantics. + auto C = isa(Def) + ? std::make_pair(false, Instruction::GetElementPtr) + : getOpcodeOrIntrinsicID(Def); // The issue with (Insert|Extract)Value is that the index of the // insert/extract is not a proper operand in LLVM IR, and hence also not in @@ -2012,8 +2031,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo { VPTypeAnalysis TypeInfo(*Plan); hash_code Result = hash_combine( Def->getVPDefID(), getOpcodeOrIntrinsicID(Def), - TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def), - hash_combine_range(Def->operands())); + getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def), + vputils::isSingleScalar(Def), hash_combine_range(Def->operands())); if (auto *RFlags = dyn_cast(Def)) if (RFlags->hasPredicate()) return hash_combine(Result, RFlags->getPredicate()); @@ -2026,6 +2045,7 @@ struct VPCSEDenseMapInfo : public DenseMapInfo { return L == R; if (L->getVPDefID() != R->getVPDefID() || getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) || + getGEPSourceElementType(L) != getGEPSourceElementType(R) || vputils::isSingleScalar(L) != vputils::isSingleScalar(R) || !equal(L->operands(), R->operands())) return false; @@ -3695,34 +3715,39 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { 
vp_depth_first_shallow(Plan.getEntry())); auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly( vp_depth_first_shallow(LoopRegion->getEntry())); - // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes, - // excluding ones in replicate regions. Those are not materialized explicitly - // yet. Those vector users are still handled in VPReplicateRegion::execute(), - // via shouldPack(). + // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and + // VPInstructions, excluding ones in replicate regions. Those are not + // materialized explicitly yet. Those vector users are still handled in + // VPReplicateRegion::execute(), via shouldPack(). // TODO: materialize build vectors for replicating recipes in replicating // regions. - // TODO: materialize build vectors for VPInstructions. for (VPBasicBlock *VPBB : concat(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - auto *RepR = dyn_cast(&R); - auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) { + if (!isa(&R)) + continue; + auto *DefR = cast(&R); + auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) { VPRegionBlock *ParentRegion = cast(U)->getParent()->getParent(); - return !U->usesScalars(RepR) || ParentRegion != LoopRegion; + return !U->usesScalars(DefR) || ParentRegion != LoopRegion; }; - if (!RepR || RepR->isSingleScalar() || - none_of(RepR->users(), UsesVectorOrInsideReplicateRegion)) + if ((isa(DefR) && + cast(DefR)->isSingleScalar()) || + (isa(DefR) && + (vputils::onlyFirstLaneUsed(DefR) || + !cast(DefR)->doesGeneratePerAllLanes())) || + none_of(DefR->users(), UsesVectorOrInsideReplicateRegion)) continue; - Type *ScalarTy = TypeInfo.inferScalarType(RepR); + Type *ScalarTy = TypeInfo.inferScalarType(DefR); unsigned Opcode = ScalarTy->isStructTy() ? 
VPInstruction::BuildStructVector : VPInstruction::BuildVector; - auto *BuildVector = new VPInstruction(Opcode, {RepR}); - BuildVector->insertAfter(RepR); + auto *BuildVector = new VPInstruction(Opcode, {DefR}); + BuildVector->insertAfter(DefR); - RepR->replaceUsesWithIf( + DefR->replaceUsesWithIf( BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion]( VPUser &U, unsigned) { return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 1957428fab799..69452a7e37572 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -158,10 +158,10 @@ struct VPlanTransforms { /// Explicitly unroll \p Plan by \p UF. static void unrollByUF(VPlan &Plan, unsigned UF); - /// Replace each VPReplicateRecipe outside on any replicate region in \p Plan - /// with \p VF single-scalar recipes. - /// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby - /// dissolving the latter. + /// Replace each replicating VPReplicateRecipe and VPInstruction outside of + /// any replicate region in \p Plan with \p VF single-scalar recipes. + /// TODO: Also replicate VPScalarIVSteps and VPReplicateRecipes inside + /// replicate regions, thereby dissolving the latter. static void replicateByVF(VPlan &Plan, ElementCount VF); /// Optimize \p Plan based on \p BestVF and \p BestUF. 
This may restrict the diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 443df167378b0..180b1b96b6364 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -238,7 +238,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, if (Part != 1) continue; VPValue *StartV; - if (match(VPI->getOperand(2), m_SpecificInt(1))) { + if (match(VPI->getOperand(2), m_One())) { StartV = VPI->getOperand(1); } else { auto *C = VPI->clone(); @@ -463,15 +463,16 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) { VPlanTransforms::removeDeadRecipes(Plan); } -/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p -/// Def2LaneDefs to look up scalar definitions for operands of \RepR. -static VPReplicateRecipe * +/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or +/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar +/// definitions for operands of \DefR. +static VPRecipeWithIRFlags * cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, - VPReplicateRecipe *RepR, VPLane Lane, + VPRecipeWithIRFlags *DefR, VPLane Lane, const DenseMap> &Def2LaneDefs) { // Collect the operands at Lane, creating extracts as needed. SmallVector NewOps; - for (VPValue *Op : RepR->operands()) { + for (VPValue *Op : DefR->operands()) { // If Op is a definition that has been unrolled, directly use the clone for // the corresponding lane. 
auto LaneDefs = Def2LaneDefs.find(Op); @@ -501,11 +502,24 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, NewOps.push_back(Ext); } - auto *New = - new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, - /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR); - New->transferFlags(*RepR); - New->insertBefore(RepR); + VPRecipeWithIRFlags *New; + if (auto *RepR = dyn_cast(DefR)) { + // TODO: have cloning of replicate recipes also provide the desired result + // coupled with setting its operands to NewOps (deriving IsSingleScalar and + // Mask from the operands?) + New = + new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, + /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR); + } else { + assert(isa(DefR) && + "DefR must be a VPReplicateRecipe or VPInstruction"); + New = DefR->clone(); + for (const auto &[Idx, Op] : enumerate(NewOps)) { + New->setOperand(Idx, Op); + } + } + New->transferFlags(*DefR); + New->insertBefore(DefR); return New; } @@ -530,34 +544,38 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { SmallVector ToRemove; for (VPBasicBlock *VPBB : VPBBsToUnroll) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - auto *RepR = dyn_cast(&R); - if (!RepR || RepR->isSingleScalar()) + if (!isa(&R) || + (isa(&R) && + cast(&R)->isSingleScalar()) || + (isa(&R) && + !cast(&R)->doesGeneratePerAllLanes())) continue; - VPBuilder Builder(RepR); - if (RepR->getNumUsers() == 0) { - // Create single-scalar version of RepR for all lanes. + auto *DefR = cast(&R); + VPBuilder Builder(DefR); + if (DefR->getNumUsers() == 0) { + // Create single-scalar version of DefR for all lanes. for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); - RepR->eraseFromParent(); + cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs); + DefR->eraseFromParent(); continue; } - /// Create single-scalar version of RepR for all lanes. 
+ /// Create single-scalar version of DefR for all lanes. SmallVector LaneDefs; for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) LaneDefs.push_back( - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs)); + cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs)); - Def2LaneDefs[RepR] = LaneDefs; + Def2LaneDefs[DefR] = LaneDefs; /// Users that only demand the first lane can use the definition for lane /// 0. - RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) { - return U.onlyFirstLaneUsed(RepR); + DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) { + return U.onlyFirstLaneUsed(DefR); }); - // Update each build vector user that currently has RepR as its only + // Update each build vector user that currently has DefR as its only // operand, to have all LaneDefs as its operands. - for (VPUser *U : to_vector(RepR->users())) { + for (VPUser *U : to_vector(DefR->users())) { auto *VPI = dyn_cast(U); if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector && VPI->getOpcode() != VPInstruction::BuildStructVector)) @@ -569,7 +587,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { for (VPValue *LaneDef : drop_begin(LaneDefs)) VPI->addOperand(LaneDef); } - ToRemove.push_back(RepR); + ToRemove.push_back(DefR); } } for (auto *R : reverse(ToRemove)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index c6c1ef3369825..ddc4ad1977401 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -65,10 +65,9 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { VPValue *A, *B; using namespace VPlanPatternMatch; - if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1)))) + if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One()))) return B == Plan.getTripCount() && - (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), - m_SpecificInt(1), + (match(A, 
m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_One(), m_Specific(&Plan.getVF()))) || IsWideCanonicalIV(A)); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index a84b6f59971c9..0ef933f596604 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2014,12 +2014,19 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType()); uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1; + uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy); + Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits); + Value *Mask = ConstantInt::get(PackedTy, EltBitMask); for (User *U : Ext->users()) { auto *Extract = cast(U); uint64_t Idx = cast(Extract->getIndexOperand())->getZExtValue(); - Value *LShr = Builder.CreateLShr(ScalarV, Idx * SrcEltSizeInBits); - Value *And = Builder.CreateAnd(LShr, EltBitMask); + uint64_t ShiftAmt = + DL->isBigEndian() + ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits) + : (Idx * SrcEltSizeInBits); + Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt); + Value *And = Builder.CreateAnd(LShr, Mask); U->replaceAllUsesWith(And); } return true; diff --git a/llvm/test/Analysis/BasicAA/featuretest.ll b/llvm/test/Analysis/BasicAA/featuretest.ll index e4cb009f0c633..04c4725d26c1d 100644 --- a/llvm/test/Analysis/BasicAA/featuretest.ll +++ b/llvm/test/Analysis/BasicAA/featuretest.ll @@ -15,24 +15,14 @@ declare void @llvm.assume(i1) ; operations on another array. Important for scientific codes. 
; define i32 @different_array_test(i64 %A, i64 %B) { -; NO_ASSUME-LABEL: @different_array_test( -; NO_ASSUME-NEXT: [[ARRAY11:%.*]] = alloca [100 x i32], align 4 -; NO_ASSUME-NEXT: [[ARRAY22:%.*]] = alloca [200 x i32], align 4 -; NO_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAY11]], i32 4) ] -; NO_ASSUME-NEXT: call void @external(ptr nonnull [[ARRAY11]]) -; NO_ASSUME-NEXT: call void @external(ptr nonnull [[ARRAY22]]) -; NO_ASSUME-NEXT: [[POINTER2:%.*]] = getelementptr i32, ptr [[ARRAY22]], i64 [[B:%.*]] -; NO_ASSUME-NEXT: store i32 7, ptr [[POINTER2]], align 4 -; NO_ASSUME-NEXT: ret i32 0 -; -; USE_ASSUME-LABEL: @different_array_test( -; USE_ASSUME-NEXT: [[ARRAY11:%.*]] = alloca [100 x i32], align 4 -; USE_ASSUME-NEXT: [[ARRAY22:%.*]] = alloca [200 x i32], align 4 -; USE_ASSUME-NEXT: call void @external(ptr nonnull [[ARRAY11]]) -; USE_ASSUME-NEXT: call void @external(ptr nonnull [[ARRAY22]]) -; USE_ASSUME-NEXT: [[POINTER2:%.*]] = getelementptr i32, ptr [[ARRAY22]], i64 [[B:%.*]] -; USE_ASSUME-NEXT: store i32 7, ptr [[POINTER2]], align 4 -; USE_ASSUME-NEXT: ret i32 0 +; CHECK-LABEL: @different_array_test( +; CHECK-NEXT: [[ARRAY11:%.*]] = alloca [100 x i32], align 4 +; CHECK-NEXT: [[ARRAY22:%.*]] = alloca [200 x i32], align 4 +; CHECK-NEXT: call void @external(ptr nonnull [[ARRAY11]]) +; CHECK-NEXT: call void @external(ptr nonnull [[ARRAY22]]) +; CHECK-NEXT: [[POINTER2:%.*]] = getelementptr i32, ptr [[ARRAY22]], i64 [[B:%.*]] +; CHECK-NEXT: store i32 7, ptr [[POINTER2]], align 4 +; CHECK-NEXT: ret i32 0 ; %Array1 = alloca i32, i32 100 %Array2 = alloca i32, i32 200 diff --git a/llvm/test/Analysis/CostModel/ARM/abs.ll b/llvm/test/Analysis/CostModel/ARM/abs.ll new file mode 100644 index 0000000000000..8c7fef3405127 --- /dev/null +++ b/llvm/test/Analysis/CostModel/ARM/abs.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output 
-mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-T1 +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-T2 +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-81 +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8-none-eabi < %s | FileCheck %s --check-prefix=CHECK-ARM + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +define void @abs() { +; CHECK-T1-LABEL: 'abs' +; CHECK-T1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) +; CHECK-T1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) +; CHECK-T1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) +; CHECK-T1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:5 SizeLat:5 for: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) +; CHECK-T1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:10 Lat:9 SizeLat:9 for: %I128 = call i128 @llvm.abs.i128(i128 undef, i1 false) +; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: ret void +; +; CHECK-T2-LABEL: 'abs' +; CHECK-T2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) +; CHECK-T2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) +; CHECK-T2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) +; CHECK-T2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:5 SizeLat:5 for: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) +; CHECK-T2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:10 Lat:9 
SizeLat:9 for: %I128 = call i128 @llvm.abs.i128(i128 undef, i1 false) +; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: ret void +; +; CHECK-81-LABEL: 'abs' +; CHECK-81-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) +; CHECK-81-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) +; CHECK-81-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) +; CHECK-81-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:5 SizeLat:5 for: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) +; CHECK-81-NEXT: Cost Model: Found costs of RThru:9 CodeSize:10 Lat:9 SizeLat:9 for: %I128 = call i128 @llvm.abs.i128(i128 undef, i1 false) +; CHECK-81-NEXT: Cost Model: Found costs of 1 for: ret void +; +; CHECK-ARM-LABEL: 'abs' +; CHECK-ARM-NEXT: Cost Model: Found costs of 3 for: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) +; CHECK-ARM-NEXT: Cost Model: Found costs of 3 for: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) +; CHECK-ARM-NEXT: Cost Model: Found costs of 3 for: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) +; CHECK-ARM-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) +; CHECK-ARM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:6 SizeLat:6 for: %I128 = call i128 @llvm.abs.i128(i128 undef, i1 false) +; CHECK-ARM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) + %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) + %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) + %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) + %I128 = call i128 @llvm.abs.i128(i128 undef, i1 false) + ret void +} + diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll 
b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll index 4db9d1bb0efaf..6e4d4e3ec515f 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll @@ -1,43 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-RECIP -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-RECIP -; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-SIZE -; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-SIZE +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @reduce_i64(i32 %arg) { -; V8M-RECIP-LABEL: 'reduce_i64' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 92 for 
instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-LABEL: 'reduce_i64' +; V8M-NEXT: Cost Model: Found costs of 2 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 8 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 20 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 44 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 92 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; -; NEON-RECIP-LABEL: 'reduce_i64' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-LABEL: 'reduce_i64' +; NEON-NEXT: Cost Model: Found costs of 3 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 29 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> 
undef) +; NEON-NEXT: Cost Model: Found costs of RThru:55 CodeSize:54 Lat:54 SizeLat:54 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:107 CodeSize:103 Lat:103 SizeLat:103 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; V8M-SIZE-LABEL: 'reduce_i64' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; NEON-SIZE-LABEL: 'reduce_i64' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: ret i32 undef +; MVE-LABEL: 'reduce_i64' +; MVE-NEXT: Cost Model: Found costs of 8 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) @@ -48,45 +39,35 @@ define i32 @reduce_i64(i32 %arg) { } define i32 @reduce_i32(i32 %arg) { -; V8M-RECIP-LABEL: 'reduce_i32' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: 
Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; NEON-RECIP-LABEL: 'reduce_i32' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; V8M-LABEL: 'reduce_i32' +; V8M-NEXT: Cost Model: Found costs of 4 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 10 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 22 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 46 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 94 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 190 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) 
+; V8M-NEXT: Cost Model: Found costs of 382 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; -; V8M-SIZE-LABEL: 'reduce_i32' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; NEON-LABEL: 'reduce_i32' +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 150 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 391 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 488 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:682 CodeSize:681 Lat:681 SizeLat:681 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; 
NEON-NEXT: Cost Model: Found costs of RThru:1070 CodeSize:1066 Lat:1066 SizeLat:1066 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; NEON-SIZE-LABEL: 'reduce_i32' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 681 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1066 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; MVE-LABEL: 'reduce_i32' +; MVE-NEXT: Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 
CodeSize:2 Lat:4 SizeLat:4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) @@ -97,30 +78,3 @@ define i32 @reduce_i32(i32 %arg) { %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ret i32 undef } - -declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) -declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) - -declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) - -declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) - -declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) diff --git 
a/llvm/test/Analysis/CostModel/ARM/reduce-and.ll b/llvm/test/Analysis/CostModel/ARM/reduce-and.ll index a220d0bacfa61..8a4407b361909 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-and.ll @@ -1,19 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=armv8a-linux-gnueabihf -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @reduce_i1(i32 %arg) { -; CHECK-LABEL: 'reduce_i1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 385 
for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; V8M-LABEL: 'reduce_i1' +; V8M-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:5 CodeSize:2 Lat:2 SizeLat:2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:9 CodeSize:2 Lat:2 SizeLat:2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:17 CodeSize:2 Lat:2 SizeLat:2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:33 CodeSize:2 Lat:2 SizeLat:2 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:65 CodeSize:2 Lat:2 SizeLat:2 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:129 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef +; +; NEON-LABEL: 'reduce_i1' +; NEON-NEXT: Cost Model: Found costs of 3 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:97 CodeSize:2 Lat:2 SizeLat:2 
for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:193 CodeSize:2 Lat:2 SizeLat:2 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:385 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; MVE-LABEL: 'reduce_i1' +; MVE-NEXT: Cost Model: Found costs of 4 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:257 CodeSize:2 Lat:2 SizeLat:2 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:513 CodeSize:2 Lat:2 SizeLat:2 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:1025 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) @@ -25,12 +49,3 @@ define i32 @reduce_i1(i32 %arg) { %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ret i32 undef } - -declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1>) -declare i1 
@llvm.vector.reduce.and.v2i1(<2 x i1>) -declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>) -declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>) -declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>) -declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>) -declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>) -declare i1 @llvm.vector.reduce.and.v128i1(<128 x i1>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-bit.ll b/llvm/test/Analysis/CostModel/ARM/reduce-bit.ll index b38660df59a3e..3c13718874e78 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-bit.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-bit.ll @@ -1,43 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=armv8a-linux-gnueabihf -mattr=+fp64 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-V8 -; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEI +; RUN: opt < %s -mtriple=armv8a-linux-gnueabihf -mattr=+fp64 -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-V8 +; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEI define void @and() { ; CHECK-V8-LABEL: 'and' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = 
call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 2 for: %v2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x 
i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 4 for: %v4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 5 for: %v8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i16 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %v4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 8 for: %v8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 9 for: %v16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i8 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %v4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 15 for: %v8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 16 for: %v16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 17 for: %v32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'and' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) -; 
CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of 1 for: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 4 for: %v2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %v4i64 = call i64 
@llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 7 for: %v4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:9 SizeLat:9 for: %v8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i16 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %v4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 15 for: %v8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:17 CodeSize:16 Lat:17 SizeLat:17 for: %v16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i8 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 7 for: %v4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %v8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 31 for: %v16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:33 CodeSize:32 Lat:33 SizeLat:33 for: %v32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: %v1i64 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) @@ -60,40 +60,40 @@ entry: define void @or() { ; CHECK-V8-LABEL: 'or' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 
@llvm.vector.reduce.or.v1i64(<1 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) 
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %v1i64 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 2 for: %v2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 4 for: %v4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 5 for: %v8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i16 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %v4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 8 for: %v8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 9 for: %v16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i8 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %v4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 15 for: %v8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 16 for: %v16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 17 for: %v32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'or' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %v1i64 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 33 for instruction: 
%v32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of 1 for: %v1i64 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 4 for: %v2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %v4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 7 for: %v4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:9 SizeLat:9 for: %v8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i16 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %v4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 15 for: %v8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:17 CodeSize:16 Lat:17 SizeLat:17 for: %v16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i8 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 7 for: %v4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %v8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 31 for: %v16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: 
Found costs of RThru:33 CodeSize:32 Lat:33 SizeLat:33 for: %v32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: %v1i64 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) @@ -116,40 +116,40 @@ entry: define void @xor() { ; CHECK-V8-LABEL: 'xor' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = 
call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 2 for: %v2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 4 for: %v4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 5 for: %v8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i16 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %v4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 8 for: %v8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 9 for: %v16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %v2i8 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %v4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs 
of 15 for: %v8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 16 for: %v16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 17 for: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'xor' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> 
undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of 1 for: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 4 for: %v2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %v4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 7 for: %v4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:9 SizeLat:9 for: %v8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i16 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %v4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 15 for: %v8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:17 CodeSize:16 Lat:17 SizeLat:17 for: %v16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; 
CHECK-MVEI-NEXT: Cost Model: Found costs of 3 for: %v2i8 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 7 for: %v4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %v8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 31 for: %v16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:33 CodeSize:32 Lat:33 SizeLat:33 for: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: %v1i64 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) @@ -169,49 +169,3 @@ entry: %v32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) ret void } - -declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>) -declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>) -declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.xor.v4i32(<4 x 
i32>) -declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) -declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>) -declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>) -declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>) -declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) -declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>) -declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>) -declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>) -declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>) -declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll index 48edae8c7d137..2bb890eee01ac 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll @@ -1,49 +1,49 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=armv8a-linux-gnueabihf -mattr=+fp64 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-V8 -; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEFP -; RUN: opt < %s 
-mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEI +; RUN: opt < %s -mtriple=armv8a-linux-gnueabihf -mattr=+fp64 -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-V8 +; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEFP +; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEI target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @fmin_strict() { ; CHECK-V8-LABEL: 'fmin_strict' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 
for instruction: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %fmin_v4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 19 for: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 43 for: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 91 for: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 2 for: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %fmin_v2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 9 for: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:94 CodeSize:67 Lat:94 SizeLat:94 for: %fmin_v4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fmin_strict' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmin_v4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 2 for: %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 5 for: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:9 for: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) +; 
CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:5 SizeLat:5 for: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmin_v2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:69 CodeSize:25 Lat:69 SizeLat:69 for: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:118 CodeSize:67 Lat:118 SizeLat:118 for: %fmin_v4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'fmin_strict' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 409 for instruction: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 545 for instruction: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; 
CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 205 for instruction: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmin_v4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:137 CodeSize:49 Lat:137 SizeLat:137 for: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:409 CodeSize:145 Lat:409 SizeLat:409 for: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:545 CodeSize:193 Lat:545 SizeLat:545 for: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:137 CodeSize:49 Lat:137 SizeLat:137 for: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:205 CodeSize:73 Lat:205 SizeLat:205 for: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmin_v2f64 = call double 
@llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:69 CodeSize:25 Lat:69 SizeLat:69 for: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:118 CodeSize:67 Lat:118 SizeLat:118 for: %fmin_v4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) @@ -61,43 +61,43 @@ define void @fmin_strict() { define void @fmin_unordered() { ; CHECK-V8-LABEL: 'fmin_unordered' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f64 = call reassoc double 
@llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %fmin_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 19 for: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 43 for: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 91 for: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 2 for: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %fmin_v2f64 = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 9 for: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:94 CodeSize:67 Lat:94 SizeLat:94 for: %fmin_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fmin_unordered' -; 
CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f64 = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmin_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 2 for: %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 5 for: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) +; 
CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:9 for: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:5 SizeLat:5 for: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmin_v2f64 = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:69 CodeSize:25 Lat:69 SizeLat:69 for: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:118 CodeSize:67 Lat:118 SizeLat:118 for: %fmin_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'fmin_unordered' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 409 for instruction: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 545 
for instruction: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 205 for instruction: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f64 = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmin_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:137 CodeSize:49 Lat:137 SizeLat:137 for: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:409 CodeSize:145 Lat:409 SizeLat:409 for: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:545 CodeSize:193 Lat:545 SizeLat:545 for: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmin_v2f32 = call reassoc float 
@llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:137 CodeSize:49 Lat:137 SizeLat:137 for: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:205 CodeSize:73 Lat:205 SizeLat:205 for: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmin_v2f64 = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:69 CodeSize:25 Lat:69 SizeLat:69 for: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:118 CodeSize:67 Lat:118 SizeLat:118 for: %fmin_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) @@ -114,43 +114,43 @@ define void @fmin_unordered() { define void @fmax_strict() { ; CHECK-V8-LABEL: 'fmax_strict' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f32 
= call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmax_v4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %fmax_v4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 19 for: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 43 for: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 91 for: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 2 for: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 9 for: %fmax_v4f64 = call 
double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:94 CodeSize:67 Lat:94 SizeLat:94 for: %fmax_v4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fmax_strict' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmax_v4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmax_v4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; 
CHECK-MVEFP-NEXT: Cost Model: Found costs of 2 for: %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 5 for: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:9 for: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:5 SizeLat:5 for: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:69 CodeSize:25 Lat:69 SizeLat:69 for: %fmax_v4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:118 CodeSize:67 Lat:118 SizeLat:118 for: %fmax_v4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'fmax_strict' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated 
cost of 409 for instruction: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 545 for instruction: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 205 for instruction: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmax_v4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmax_v4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:137 CodeSize:49 Lat:137 SizeLat:137 for: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:409 CodeSize:145 Lat:409 SizeLat:409 for: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:545 CodeSize:193 Lat:545 SizeLat:545 for: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 
RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:137 CodeSize:49 Lat:137 SizeLat:137 for: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:205 CodeSize:73 Lat:205 SizeLat:205 for: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:69 CodeSize:25 Lat:69 SizeLat:69 for: %fmax_v4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:118 CodeSize:67 Lat:118 SizeLat:118 for: %fmax_v4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) @@ -168,43 +168,43 @@ define void @fmax_strict() { define void @fmax_unordered() { ; CHECK-V8-LABEL: 'fmax_unordered' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of 7 for: %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 19 for: %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 43 for: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 91 for: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 1 for: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 2 for: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 3 for: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 1 
for: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 9 for: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:94 CodeSize:67 Lat:94 SizeLat:94 for: %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fmax_unordered' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; 
CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 2 for: %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 5 for: %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:9 for: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:5 SizeLat:5 for: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:69 CodeSize:25 Lat:69 SizeLat:69 for: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:118 CodeSize:67 Lat:118 SizeLat:118 for: %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'fmax_unordered' -; 
CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 409 for instruction: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 545 for instruction: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 205 for instruction: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:137 CodeSize:49 Lat:137 SizeLat:137 for: %fmax_v4f16 = call 
reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:409 CodeSize:145 Lat:409 SizeLat:409 for: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:545 CodeSize:193 Lat:545 SizeLat:545 for: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:137 CodeSize:49 Lat:137 SizeLat:137 for: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:205 CodeSize:73 Lat:205 SizeLat:205 for: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:35 CodeSize:13 Lat:35 SizeLat:35 for: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:69 CodeSize:25 Lat:69 SizeLat:69 for: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:118 CodeSize:67 Lat:118 SizeLat:118 for: %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) @@ -218,27 +218,3 @@ define void @fmax_unordered() { %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) ret void } - - -declare half @llvm.vector.reduce.fmin.v2f16(<2 x half>) -declare half @llvm.vector.reduce.fmin.v4f16(<4 x 
half>) -declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>) -declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>) -declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) -declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) -declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>) - - -declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>) -declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>) -declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>) -declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>) -declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) -declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) -declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-fp.ll b/llvm/test/Analysis/CostModel/ARM/reduce-fp.ll index 87de486eeb183..392ef5e694883 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-fp.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-fp.ll @@ -1,49 +1,49 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=armv8a-linux-gnueabihf -mattr=+fp64 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-V8 -; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEFP -; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEI +; RUN: opt < %s -mtriple=armv8a-linux-gnueabihf -mattr=+fp64 -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s 
--check-prefix=CHECK-V8 +; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEFP +; RUN: opt < %s -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=CHECK-MVEI target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @fadd_strict() { ; CHECK-V8-LABEL: 'fadd_strict' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; 
CHECK-V8-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:10 SizeLat:6 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:16 CodeSize:12 Lat:20 SizeLat:12 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:32 CodeSize:24 Lat:40 SizeLat:24 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:64 CodeSize:48 Lat:80 SizeLat:48 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:12 SizeLat:4 for: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:24 SizeLat:8 for: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:12 SizeLat:4 for: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:48 CodeSize:20 Lat:28 SizeLat:20 for: %fadd_v4f128 = call fp128 
@llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fadd_strict' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void 
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 6 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 12 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 24 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 2 for: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 32 for: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'fadd_strict' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 16 
for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of 4 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 8 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 16 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 32 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-MVEI-NEXT: Cost 
Model: Found costs of 4 for: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 8 for: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 16 for: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 4 for: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 8 for: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 32 for: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef) %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef) @@ -61,43 +61,43 @@ define void @fadd_strict() { define void @fadd_unordered() { ; CHECK-V8-LABEL: 'fadd_unordered' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v2f16 = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f16 = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v8f16 = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %fadd_v16f16 = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 
4 for instruction: %fadd_v2f32 = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f32 = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v8f32 = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:9 SizeLat:7 for: %fadd_v2f16 = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:20 SizeLat:16 for: %fadd_v4f16 = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:44 CodeSize:33 Lat:39 SizeLat:33 for: %fadd_v8f16 = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:92 CodeSize:66 Lat:74 SizeLat:66 for: %fadd_v16f16 = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %fadd_v2f32 = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 
0.000000e+00, <2 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:9 SizeLat:3 for: %fadd_v4f32 = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:12 SizeLat:4 for: %fadd_v8f32 = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %fadd_v2f64 = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:9 SizeLat:3 for: %fadd_v4f64 = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:52 CodeSize:30 Lat:34 SizeLat:30 for: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fadd_unordered' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f16 = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f16 = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v8f16 = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v16f16 = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; 
CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v8f32 = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v2f64 = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %fadd_v4f64 = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %fadd_v2f16 = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 6 for: %fadd_v4f16 = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:8 for: %fadd_v8f16 = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %fadd_v16f16 = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 2 for: %fadd_v2f32 = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %fadd_v4f32 = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; 
CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %fadd_v8f32 = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %fadd_v2f64 = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %fadd_v4f64 = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:76 CodeSize:40 Lat:76 SizeLat:76 for: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'fadd_unordered' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v2f16 = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %fadd_v4f16 = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 169 for instruction: %fadd_v8f16 = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 225 for instruction: %fadd_v16f16 = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v2f32 = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %fadd_v4f32 = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-MVEI-NEXT: Cost 
Model: Found an estimated cost of 85 for instruction: %fadd_v8f32 = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v2f64 = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %fadd_v4f64 = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %fadd_v2f16 = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:57 CodeSize:41 Lat:57 SizeLat:57 for: %fadd_v4f16 = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:169 CodeSize:121 Lat:169 SizeLat:169 for: %fadd_v8f16 = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:225 CodeSize:161 Lat:225 SizeLat:225 for: %fadd_v16f16 = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %fadd_v2f32 = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:57 CodeSize:41 Lat:57 SizeLat:57 for: %fadd_v4f32 = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:85 
CodeSize:61 Lat:85 SizeLat:85 for: %fadd_v8f32 = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %fadd_v2f64 = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %fadd_v4f64 = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:76 CodeSize:40 Lat:76 SizeLat:76 for: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %fadd_v2f16 = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef) %fadd_v4f16 = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef) @@ -114,43 +114,43 @@ define void @fadd_unordered() { define void @fmul_strict() { ; CHECK-V8-LABEL: 'fmul_strict' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v2f16 = call half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v4f16 = call half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fmul_v8f16 = call half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %fmul_v16f16 = call half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f32 = call float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost 
of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:10 SizeLat:6 for: %fmul_v2f16 = call half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:16 CodeSize:12 Lat:20 SizeLat:12 for: %fmul_v4f16 = call half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:32 CodeSize:24 Lat:40 SizeLat:24 for: %fmul_v8f16 = call half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:64 CodeSize:48 Lat:80 SizeLat:48 for: %fmul_v16f16 = call half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %fmul_v2f32 = call float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:12 SizeLat:4 for: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of 
RThru:16 CodeSize:8 Lat:24 SizeLat:8 for: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:12 SizeLat:4 for: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:48 CodeSize:20 Lat:28 SizeLat:20 for: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fmul_strict' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v2f16 = call half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fmul_v4f16 = call half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fmul_v8f16 = call half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %fmul_v16f16 = call half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f32 = call float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> 
undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %fmul_v2f16 = call half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 6 for: %fmul_v4f16 = call half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 12 for: %fmul_v8f16 = call half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 24 for: %fmul_v16f16 = call half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 2 for: %fmul_v2f32 = call float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 32 
for: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'fmul_strict' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f16 = call half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f16 = call half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f16 = call half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fmul_v16f16 = call half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f32 = call float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of 4 for: %fmul_v2f16 = call half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 8 for: %fmul_v4f16 = call half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 16 for: %fmul_v8f16 = call half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 32 for: %fmul_v16f16 = call half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 4 for: %fmul_v2f32 = call float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 8 for: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 16 for: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 4 for: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 8 for: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of 32 for: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %fmul_v2f16 = call half @llvm.vector.reduce.fmul.v2f16(half 0.0, <2 x half> undef) %fmul_v4f16 = call half @llvm.vector.reduce.fmul.v4f16(half 0.0, <4 x half> undef) @@ -168,43 +168,43 @@ define void @fmul_strict() { define void @fmul_unordered() { ; CHECK-V8-LABEL: 'fmul_unordered' -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v2f16 = call reassoc half 
@llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %fmul_v4f16 = call reassoc half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fmul_v8f16 = call reassoc half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %fmul_v16f16 = call reassoc half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f32 = call reassoc float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fmul_v4f32 = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v8f32 = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fmul_v4f64 = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fmul_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:9 SizeLat:7 for: %fmul_v2f16 = call reassoc half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:20 
SizeLat:16 for: %fmul_v4f16 = call reassoc half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:44 CodeSize:33 Lat:39 SizeLat:33 for: %fmul_v8f16 = call reassoc half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:92 CodeSize:66 Lat:74 SizeLat:66 for: %fmul_v16f16 = call reassoc half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %fmul_v2f32 = call reassoc float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:9 SizeLat:3 for: %fmul_v4f32 = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:12 SizeLat:4 for: %fmul_v8f32 = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %fmul_v2f64 = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:9 SizeLat:3 for: %fmul_v4f64 = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:52 CodeSize:30 Lat:34 SizeLat:30 for: %fmul_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-V8-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fmul_unordered' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v2f16 = call reassoc half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %fmul_v4f16 = call reassoc half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v8f16 = call reassoc half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmul_v16f16 = call reassoc half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f32 = call reassoc float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fmul_v8f32 = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fmul_v2f64 = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %fmul_v4f64 = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %fmul_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %fmul_v2f16 = call reassoc half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 6 for: %fmul_v4f16 = call reassoc half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:8 
CodeSize:6 Lat:8 SizeLat:8 for: %fmul_v8f16 = call reassoc half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %fmul_v16f16 = call reassoc half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 2 for: %fmul_v2f32 = call reassoc float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %fmul_v4f32 = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %fmul_v8f32 = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %fmul_v2f64 = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %fmul_v4f64 = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:76 CodeSize:40 Lat:76 SizeLat:76 for: %fmul_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEI-LABEL: 'fmul_unordered' -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fmul_v2f16 = call reassoc half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %fmul_v4f16 = call reassoc half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 169 for instruction: %fmul_v8f16 = call reassoc 
half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 225 for instruction: %fmul_v16f16 = call reassoc half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fmul_v2f32 = call reassoc float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %fmul_v4f32 = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %fmul_v8f32 = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fmul_v2f64 = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %fmul_v4f64 = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %fmul_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) -; CHECK-MVEI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %fmul_v2f16 = call reassoc half @llvm.vector.reduce.fmul.v2f16(half 0xH0000, <2 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:57 CodeSize:41 Lat:57 SizeLat:57 for: %fmul_v4f16 = call reassoc half @llvm.vector.reduce.fmul.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:169 CodeSize:121 Lat:169 SizeLat:169 for: %fmul_v8f16 = call reassoc half @llvm.vector.reduce.fmul.v8f16(half 0xH0000, <8 x half> undef) +; 
CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:225 CodeSize:161 Lat:225 SizeLat:225 for: %fmul_v16f16 = call reassoc half @llvm.vector.reduce.fmul.v16f16(half 0xH0000, <16 x half> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %fmul_v2f32 = call reassoc float @llvm.vector.reduce.fmul.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:57 CodeSize:41 Lat:57 SizeLat:57 for: %fmul_v4f32 = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:85 CodeSize:61 Lat:85 SizeLat:85 for: %fmul_v8f32 = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %fmul_v2f64 = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %fmul_v4f64 = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:76 CodeSize:40 Lat:76 SizeLat:76 for: %fmul_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; CHECK-MVEI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %fmul_v2f16 = call reassoc half @llvm.vector.reduce.fmul.v2f16(half 0.0, <2 x half> undef) %fmul_v4f16 = call reassoc half @llvm.vector.reduce.fmul.v4f16(half 0.0, <4 x half> undef) @@ -218,27 +218,3 @@ define void @fmul_unordered() { %fmul_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) ret void } - - -declare half @llvm.vector.reduce.fadd.v2f16(half, <2 x half>) -declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>) -declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>) -declare 
half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>) -declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>) -declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) -declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) -declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) -declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) -declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>) - - -declare half @llvm.vector.reduce.fmul.v2f16(half, <2 x half>) -declare half @llvm.vector.reduce.fmul.v4f16(half, <4 x half>) -declare half @llvm.vector.reduce.fmul.v8f16(half, <8 x half>) -declare half @llvm.vector.reduce.fmul.v16f16(half, <16 x half>) -declare float @llvm.vector.reduce.fmul.v2f32(float, <2 x float>) -declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>) -declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>) -declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>) -declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>) -declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-or.ll b/llvm/test/Analysis/CostModel/ARM/reduce-or.ll index a6b60103c0253..05df62d2131b0 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-or.ll @@ -1,19 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=armv8a-linux-gnueabihf -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE 
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @reduce_i1(i32 %arg) { -; CHECK-LABEL: 'reduce_i1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 385 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; V8M-LABEL: 'reduce_i1' +; V8M-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:5 CodeSize:2 Lat:2 SizeLat:2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:9 CodeSize:2 Lat:2 SizeLat:2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:17 CodeSize:2 Lat:2 SizeLat:2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; V8M-NEXT: Cost 
Model: Found costs of RThru:33 CodeSize:2 Lat:2 SizeLat:2 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:65 CodeSize:2 Lat:2 SizeLat:2 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:129 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef +; +; NEON-LABEL: 'reduce_i1' +; NEON-NEXT: Cost Model: Found costs of 3 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:97 CodeSize:2 Lat:2 SizeLat:2 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:193 CodeSize:2 Lat:2 SizeLat:2 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:385 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; MVE-LABEL: 'reduce_i1' +; MVE-NEXT: Cost Model: Found costs of 4 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; MVE-NEXT: Cost Model: 
Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:2 SizeLat:2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:257 CodeSize:2 Lat:2 SizeLat:2 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:513 CodeSize:2 Lat:2 SizeLat:2 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:1025 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) @@ -25,12 +49,3 @@ define i32 @reduce_i1(i32 %arg) { %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ret i32 undef } - -declare i1 @llvm.vector.reduce.or.v1i1(<1 x i1>) -declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>) -declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>) -declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>) -declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) -declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>) -declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>) -declare i1 @llvm.vector.reduce.or.v128i1(<128 x i1>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll index 7dcab51e0a1cf..861d825b3ecdb 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll @@ -1,34 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < 
%s | FileCheck %s --check-prefix=NEON -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of 2 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:13 SizeLat:13 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:35 CodeSize:38 Lat:35 SizeLat:35 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:79 CodeSize:86 Lat:79 SizeLat:79 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x 
i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:167 CodeSize:182 Lat:167 SizeLat:167 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 3 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 17 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 31 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 76 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 178 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost 
Model: Found an estimated cost of 212 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 824 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of 8 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:110 CodeSize:59 Lat:110 SizeLat:110 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:212 CodeSize:110 Lat:212 SizeLat:212 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:416 CodeSize:212 Lat:416 SizeLat:416 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:824 CodeSize:416 Lat:824 SizeLat:824 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) @@ -40,28 +40,28 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for 
instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i32 
@llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of 
RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) @@ -73,31 +73,31 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 
CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:379 CodeSize:442 Lat:379 SizeLat:379 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 
SizeLat:24 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; 
MVE-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) @@ -110,34 +110,34 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 
CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:379 CodeSize:442 Lat:379 SizeLat:379 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:763 CodeSize:890 Lat:763 SizeLat:763 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 150 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> 
undef) +; NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i8 
@llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) @@ -148,30 +148,3 @@ define i32 @reduce_i8(i32 %arg) { %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ret i32 undef } - -declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) -declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) - -declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) -declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) - -declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) -declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) -declare i16 
@llvm.vector.reduce.smax.v64i16(<64 x i16>) - -declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) -declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) -declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) -declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) -declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll index 617c6b4605189..d8fcc7bca9155 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll @@ -1,34 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for 
instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of 2 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:13 SizeLat:13 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:35 CodeSize:38 Lat:35 SizeLat:35 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:79 CodeSize:86 Lat:79 SizeLat:79 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:167 CodeSize:182 Lat:167 SizeLat:167 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef 
+; NEON-NEXT: Cost Model: Found costs of 3 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 17 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 31 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 76 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 178 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 824 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of 8 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:110 CodeSize:59 Lat:110 SizeLat:110 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:212 CodeSize:110 Lat:212 SizeLat:212 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:416 CodeSize:212 Lat:416 SizeLat:416 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 
x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:824 CodeSize:416 Lat:824 SizeLat:824 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) @@ -40,28 +40,28 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 
x i32> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found 
an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) @@ -73,31 +73,31 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for 
instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:379 CodeSize:442 Lat:379 SizeLat:379 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: 
Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an 
estimated cost of 48 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) @@ -110,34 +110,34 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for 
instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:379 CodeSize:442 Lat:379 SizeLat:379 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:763 CodeSize:890 Lat:763 SizeLat:763 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 150 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 
@llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) @@ -148,30 +148,3 @@ define i32 @reduce_i8(i32 %arg) { %V128 = call i8 
@llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ret i32 undef } - -declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) -declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) - -declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>) - -declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) - -declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) -declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) -declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll index 764034d18bee0..b95808a1bb4b5 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll @@ -1,34 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON -; RUN: opt -passes="print" 2>&1 -disable-output 
-mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of 2 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:13 SizeLat:13 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:35 CodeSize:38 Lat:35 SizeLat:35 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:79 CodeSize:86 Lat:79 SizeLat:79 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:167 CodeSize:182 Lat:167 
SizeLat:167 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 3 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 17 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 31 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 76 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 178 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V4 = call i64 
@llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 824 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of 8 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:110 CodeSize:59 Lat:110 SizeLat:110 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:212 CodeSize:110 Lat:212 SizeLat:212 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:416 CodeSize:212 Lat:416 SizeLat:416 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:824 CodeSize:416 Lat:824 SizeLat:824 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) @@ -40,28 +40,28 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> 
undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:4 
CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x 
i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) @@ -73,31 +73,31 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; 
V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:379 CodeSize:442 Lat:379 SizeLat:379 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: 
Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %V64 = call i16 
@llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) @@ -110,34 +110,34 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; 
V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:379 CodeSize:442 Lat:379 SizeLat:379 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:763 CodeSize:890 Lat:763 SizeLat:763 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 150 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16 = call i8 
@llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 
for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) @@ -148,30 +148,3 @@ define i32 @reduce_i8(i32 %arg) { %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ret i32 undef } - -declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>) -declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) - -declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) - -declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) - -declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) -declare i8 
@llvm.vector.reduce.umax.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll index b5431f63bdca9..055606b0f8617 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll @@ -1,34 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an 
estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of 2 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:13 SizeLat:13 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:35 CodeSize:38 Lat:35 SizeLat:35 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:79 CodeSize:86 Lat:79 SizeLat:79 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:167 CodeSize:182 Lat:167 SizeLat:167 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 3 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) 
+; NEON-NEXT: Cost Model: Found costs of 17 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 31 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 76 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of 178 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 824 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of 8 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:110 CodeSize:59 Lat:110 SizeLat:110 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:212 CodeSize:110 Lat:212 SizeLat:212 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:416 CodeSize:212 Lat:416 SizeLat:416 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:824 CodeSize:416 Lat:824 SizeLat:824 for: %V16 = 
call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) @@ -40,28 +40,28 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; 
NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; 
MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) @@ -73,31 +73,31 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: 
Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:379 CodeSize:442 Lat:379 SizeLat:379 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x 
i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; 
MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) @@ -110,34 +110,34 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found 
an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; V8M-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:19 SizeLat:19 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:43 CodeSize:50 Lat:43 SizeLat:43 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:91 CodeSize:106 Lat:91 SizeLat:91 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:187 CodeSize:218 Lat:187 SizeLat:187 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:379 CodeSize:442 Lat:379 SizeLat:379 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of RThru:763 CodeSize:890 Lat:763 SizeLat:763 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; NEON-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; NEON-NEXT: Cost Model: Found costs of 16 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 53 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of 150 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call 
i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; MVE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:31 Lat:58 SizeLat:58 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) @@ -148,30 +148,3 @@ define i32 @reduce_i8(i32 %arg) { %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ret i32 undef } - -declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) -declare i64 
@llvm.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) - -declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) - -declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) - -declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) -declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/DependenceAnalysis/ExactSIV.ll b/llvm/test/Analysis/DependenceAnalysis/ExactSIV.ll index b6b44ad4bfc53..2a809c32d7d21 100644 --- a/llvm/test/Analysis/DependenceAnalysis/ExactSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/ExactSIV.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s +; RUN: opt < %s -disable-output "-passes=print" -da-run-siv-routines-only 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-SIV-ONLY target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" 
target triple = "x86_64-apple-macosx10.6.0" @@ -25,6 +27,20 @@ define void @exact0(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact0' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [<=|<]! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -69,6 +85,20 @@ define void @exact1(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact1' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx3, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! 
+; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx3, align 4 --> Dst: %0 = load i32, ptr %arrayidx3, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx3, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -114,6 +144,20 @@ define void @exact2(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact2' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! 
+; entry: br label %for.body @@ -157,6 +201,20 @@ define void @exact3(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact3' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [>]! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -200,6 +258,20 @@ define void @exact4(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact4' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [>]! 
+; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -243,6 +315,20 @@ define void @exact5(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact5' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [=>|<]! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! 
+; entry: br label %for.body @@ -286,6 +372,20 @@ define void @exact6(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact6' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [=>|<]! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -329,6 +429,20 @@ define void @exact7(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact7' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [*|<]! 
+; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: %0 = load i32, ptr %arrayidx1, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx1, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -372,6 +486,20 @@ define void @exact8(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact8' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! 
+; entry: br label %for.body @@ -415,6 +543,20 @@ define void @exact9(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact9' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [>]! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -458,6 +600,20 @@ define void @exact10(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact10' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [>]! 
+; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -501,6 +657,20 @@ define void @exact11(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact11' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [=>|<]! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! 
+; entry: br label %for.body @@ -544,6 +714,20 @@ define void @exact12(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact12' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [=>|<]! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -587,6 +771,20 @@ define void @exact13(ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - none! ; +; CHECK-SIV-ONLY-LABEL: 'exact13' +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - flow [*|<]! 
+; CHECK-SIV-ONLY-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - confused! +; CHECK-SIV-ONLY-NEXT: Src: store i32 %0, ptr %B.addr.01, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; entry: br label %for.body @@ -609,3 +807,123 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %for.body ret void } + +;; max_i = INT64_MAX/6 // 1537228672809129301 +;; for (long long i = 0; i <= max_i; i++) { +;; A[-6*i + INT64_MAX] = 0; +;; if (i) +;; A[3*i - 2] = 1; +;; } +;; +;; FIXME: DependencyAnalsysis currently detects no dependency between +;; `A[-6*i + INT64_MAX]` and `A[3*i - 2]`, but it does exist. For example, +;; +;; | memory location | -6*i + INT64_MAX | 3*i - 2 +;; |------------------------|------------------------|----------- +;; | A[1] | i = max_i | i = 1 +;; | A[4611686018427387901] | i = 768614336404564651 | i = max_i +;; +;; Actually, +;; * 1 = -6*max_i + INT64_MAX = 3*1 - 2 +;; * 4611686018427387901 = -6*768614336404564651 + INT64_MAX = 3*max_i - 2 +;; + +define void @exact14(ptr %A) { +; CHECK-LABEL: 'exact14' +; CHECK-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1 +; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-NEXT: da analyze - none! 
+; +; CHECK-SIV-ONLY-LABEL: 'exact14' +; CHECK-SIV-ONLY-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ] + %subscript.0 = phi i64 [ 9223372036854775807, %entry ], [ %subscript.0.next, %loop.latch ] + %subscript.1 = phi i64 [ -2, %entry ], [ %subscript.1.next, %loop.latch ] + %idx.0 = getelementptr inbounds i8, ptr %A, i64 %subscript.0 + store i8 0, ptr %idx.0 + %cond.store = icmp ne i64 %i, 0 + br i1 %cond.store, label %if.store, label %loop.latch + +if.store: + %idx.1 = getelementptr inbounds i8, ptr %A, i64 %subscript.1 + store i8 1, ptr %idx.1 + br label %loop.latch + +loop.latch: + %i.inc = add nuw nsw i64 %i, 1 + %subscript.0.next = add nsw i64 %subscript.0, -6 + %subscript.1.next = add nsw i64 %subscript.1, 3 + %exitcond = icmp sgt i64 %i.inc, 1537228672809129301 + br i1 %exitcond, label %exit, label %loop.header + +exit: + ret void +} + +;; A generalized version of @exact14. +;; +;; for (long long i = 0; i <= n / 6; i++) { +;; A[-6*i + n] = 0; +;; if (i) +;; A[3*i - 2] = 1; +;; } + +define void @exact15(ptr %A, i64 %n) { +; CHECK-LABEL: 'exact15' +; CHECK-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1 +; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-NEXT: da analyze - output [*|<]! +; CHECK-NEXT: Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-NEXT: da analyze - none! 
+; +; CHECK-SIV-ONLY-LABEL: 'exact15' +; CHECK-SIV-ONLY-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; CHECK-SIV-ONLY-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-SIV-ONLY-NEXT: da analyze - output [*|<]! +; CHECK-SIV-ONLY-NEXT: Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-SIV-ONLY-NEXT: da analyze - none! +; +entry: + %bound = sdiv i64 %n, 6 + %guard = icmp sgt i64 %n, 0 + br i1 %guard, label %loop.header, label %exit + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ] + %subscript.0 = phi i64 [ %n, %entry ], [ %subscript.0.next, %loop.latch ] + %subscript.1 = phi i64 [ -2, %entry ], [ %subscript.1.next, %loop.latch ] + %idx.0 = getelementptr inbounds i8, ptr %A, i64 %subscript.0 + store i8 0, ptr %idx.0 + %cond.store = icmp ne i64 %i, 0 + br i1 %cond.store, label %if.store, label %loop.latch + +if.store: + %idx.1 = getelementptr inbounds i8, ptr %A, i64 %subscript.1 + store i8 1, ptr %idx.1 + br label %loop.latch + +loop.latch: + %i.inc = add nuw nsw i64 %i, 1 + %subscript.0.next = add nsw i64 %subscript.0, -6 + %subscript.1.next = add nsw i64 %subscript.1, 3 + %exitcond = icmp sgt i64 %i.inc, %bound + br i1 %exitcond, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll index 28035b05303db..564ce6b7d622f 100644 --- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll +++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll @@ -364,3 +364,29 @@ body: exit: ret void } + +define void @hang_due_to_unreachable_phi_inblock() personality ptr null { +bb: + br label %bb6 + +self-loop: ; 
preds = %self-loop + %dead = invoke ptr null() + to label %self-loop unwind label %bb4 + +bb4: ; preds = %self-loop + %i5 = landingpad { ptr, i32 } + cleanup + br label %bb6 + +bb6: ; preds = %bb4, %bb + %i7 = phi ptr [ null, %bb4 ], [ null, %bb ] + br label %bb8 + +bb8: ; preds = %bb8, %bb6 + %i9 = phi ptr [ null, %bb8 ], [ null, %bb6 ] + %i11 = icmp eq ptr %i9, null + br i1 %i11, label %bb12, label %bb8 + +bb12: ; preds = %bb8, %bb6 + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll b/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll index 1d34706baadeb..1e21fbf08a92f 100644 --- a/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll +++ b/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll @@ -123,3 +123,68 @@ loop: exit: ret void } + +declare void @use.i64(i64) + +define void @dividend_not_known_multiple_of_divisor(i64 %x) { +; CHECK-LABEL: 'dividend_not_known_multiple_of_divisor' +; CHECK-NEXT: Classifying expressions for: @dividend_not_known_multiple_of_divisor +; CHECK-NEXT: %mul.2 = shl i64 %x, 1 +; CHECK-NEXT: --> (2 * %x) U: [0,-1) S: [-9223372036854775808,9223372036854775807) +; CHECK-NEXT: %div.16 = lshr exact i64 %mul.2, 4 +; CHECK-NEXT: --> ((2 * %x) /u 16) U: [0,1152921504606846976) S: [0,1152921504606846976) +; CHECK-NEXT: %m2 = and i64 %div.16, 1152921504606846974 +; CHECK-NEXT: --> (2 * ((2 * %x) /u 32)) U: [0,1152921504606846975) S: [0,1152921504606846975) +; CHECK-NEXT: %m3 = mul i64 %div.16, 2 +; CHECK-NEXT: --> (2 * ((2 * %x) /u 16)) U: [0,2305843009213693951) S: [0,2305843009213693951) +; CHECK-NEXT: %m4 = udiv i64 %m3, 4 +; CHECK-NEXT: --> ((2 * ((2 * %x) /u 16)) /u 4) U: [0,576460752303423488) S: [0,576460752303423488) +; CHECK-NEXT: Determining loop execution counts for: @dividend_not_known_multiple_of_divisor +; +entry: + %mul.2 = shl i64 %x, 1 + %div.16 = lshr exact i64 %mul.2, 4 + %m2 = and i64 %div.16, 1152921504606846974 + call void @use.i64(i64 %m2) + + %m3 = mul i64 %div.16, 2 + %m4 = udiv i64 %m3, 4 
+ call void @use.i64(i64 %m4) + ret void +} + +define void @btc_depends_on_div_mul(i64 %x) { +; CHECK-LABEL: 'btc_depends_on_div_mul' +; CHECK-NEXT: Classifying expressions for: @btc_depends_on_div_mul +; CHECK-NEXT: %mul.2 = shl i64 %x, 1 +; CHECK-NEXT: --> (2 * %x) U: [0,-1) S: [-9223372036854775808,9223372036854775807) +; CHECK-NEXT: %div.16 = lshr exact i64 %mul.2, 4 +; CHECK-NEXT: --> ((2 * %x) /u 16) U: [0,1152921504606846976) S: [0,1152921504606846976) +; CHECK-NEXT: %masked = and i64 %div.16, 1152921504606846974 +; CHECK-NEXT: --> (2 * ((2 * %x) /u 32)) U: [0,1152921504606846975) S: [0,1152921504606846975) +; CHECK-NEXT: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {0,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (-2 + (2 * ((2 * %x) /u 32))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = add i64 %iv, 2 +; CHECK-NEXT: --> {2,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (2 * ((2 * %x) /u 32)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @btc_depends_on_div_mul +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-2 + (2 * ((2 * %x) /u 32))) /u 2) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 9223372036854775807 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-2 + (2 * ((2 * %x) /u 32))) /u 2) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %mul.2 = shl i64 %x, 1 + %div.16 = lshr exact i64 %mul.2, 4 + %masked = and i64 %div.16, 1152921504606846974 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + call void @use.i64(i64 %iv) + %iv.next = add i64 %iv, 2 + %ec = icmp eq i64 %iv.next, %masked + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/dse.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/dse.ll index 50ea1913b0c76..5f04f12777bd8 100644 --- 
a/llvm/test/Analysis/TypeBasedAliasAnalysis/dse.ll +++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/dse.ll @@ -1,14 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -aa-pipeline=tbaa,basic-aa -passes=dse -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; DSE should make use of TBAA. define i8 @test0_yes(ptr %a, ptr %b) nounwind { -; CHECK-LABEL: define i8 @test0_yes -; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: store i8 1, ptr [[A]], align 1, !tbaa [[TBAA3:![0-9]+]] +; CHECK-LABEL: define i8 @test0_yes( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1, !tbaa [[BAR_TBAA0:![0-9]+]] +; CHECK-NEXT: store i8 1, ptr [[A]], align 1, !tbaa [[FOO_TBAA3:![0-9]+]] ; CHECK-NEXT: ret i8 [[Y]] ; store i8 0, ptr %a, !tbaa !1 @@ -18,11 +18,11 @@ define i8 @test0_yes(ptr %a, ptr %b) nounwind { } define i8 @test0_no(ptr %a, ptr %b) nounwind { -; CHECK-LABEL: define i8 @test0_no -; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store i8 0, ptr [[A]], align 1, !tbaa [[TBAA3]] -; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1, !tbaa [[TBAA5:![0-9]+]] -; CHECK-NEXT: store i8 1, ptr [[A]], align 1, !tbaa [[TBAA3]] +; CHECK-LABEL: define i8 @test0_no( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i8 0, ptr [[A]], align 1, !tbaa [[FOO_TBAA3]] +; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1, !tbaa [[BAR_TBAA5:![0-9]+]] +; CHECK-NEXT: store i8 1, ptr [[A]], align 1, !tbaa [[FOO_TBAA3]] ; CHECK-NEXT: ret i8 [[Y]] ; store i8 0, ptr %a, !tbaa !3 @@ -32,9 +32,9 @@ define i8 @test0_no(ptr %a, ptr %b) nounwind { } define i8 @test1_yes(ptr %a, ptr %b) 
nounwind { -; CHECK-LABEL: define i8 @test1_yes -; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1, !tbaa [[TBAA8:![0-9]+]] +; CHECK-LABEL: define i8 @test1_yes( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1, !tbaa [[QUX_TBAA8:![0-9]+]] ; CHECK-NEXT: store i8 1, ptr [[A]], align 1 ; CHECK-NEXT: ret i8 [[Y]] ; @@ -45,10 +45,10 @@ define i8 @test1_yes(ptr %a, ptr %b) nounwind { } define i8 @test1_no(ptr %a, ptr %b) nounwind { -; CHECK-LABEL: define i8 @test1_no -; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-LABEL: define i8 @test1_no( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: store i8 0, ptr [[A]], align 1 -; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1, !tbaa [[TBAA10:![0-9]+]] +; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1, !tbaa [[QUX_TBAA10:![0-9]+]] ; CHECK-NEXT: store i8 1, ptr [[A]], align 1 ; CHECK-NEXT: ret i8 [[Y]] ; @@ -80,3 +80,16 @@ define i8 @test1_no(ptr %a, ptr %b) nounwind { !10 = !{ !"bar", !12} !11 = !{ !"qux", !0} !12 = !{!"different"} +;. +; CHECK: [[BAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"bar", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{} +; CHECK: [[FOO_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"foo", [[META2]]} +; CHECK: [[BAR_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[META6]] = !{!"bar", [[META7:![0-9]+]]} +; CHECK: [[META7]] = !{!"different"} +; CHECK: [[QUX_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0, i1 true} +; CHECK: [[META9]] = !{!"qux", [[META2]]} +; CHECK: [[QUX_TBAA10]] = !{[[META9]], [[META9]], i64 0, i1 false} +;. 
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll index d896a1b164844..685c0159dd21d 100644 --- a/llvm/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll +++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -aa-pipeline=tbaa,basic-aa -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MEMDEP ; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='gvn' -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MEMSSA @@ -11,8 +11,8 @@ define void @yes(i1 %c, ptr %p, ptr %p1, ptr %q) nounwind { ; CHECK-MEMDEP-LABEL: define void @yes( ; CHECK-MEMDEP-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P1:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-MEMDEP-NEXT: [[ENTRY:.*:]] -; CHECK-MEMDEP-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-MEMDEP-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-MEMDEP-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[RED_TBAA0:![0-9]+]] +; CHECK-MEMDEP-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[BLU_TBAA3:![0-9]+]] ; CHECK-MEMDEP-NEXT: br i1 [[C]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]] ; CHECK-MEMDEP: [[IF_THEN]]: ; CHECK-MEMDEP-NEXT: store i32 0, ptr [[Q]], align 4 @@ -23,11 +23,11 @@ define void @yes(i1 %c, ptr %p, ptr %p1, ptr %q) nounwind { ; CHECK-MEMSSA-LABEL: define void @yes( ; CHECK-MEMSSA-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P1:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-MEMSSA-NEXT: [[ENTRY:.*:]] -; CHECK-MEMSSA-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-MEMSSA-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-MEMSSA-NEXT: store i32 0, ptr [[P]], align 4, 
!tbaa [[RED_TBAA0:![0-9]+]] +; CHECK-MEMSSA-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[BLU_TBAA3:![0-9]+]] ; CHECK-MEMSSA-NEXT: br i1 [[C]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]] ; CHECK-MEMSSA: [[IF_THEN]]: -; CHECK-MEMSSA-NEXT: [[T:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[TBAA0]] +; CHECK-MEMSSA-NEXT: [[T:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[RED_TBAA0]] ; CHECK-MEMSSA-NEXT: store i32 [[T]], ptr [[Q]], align 4 ; CHECK-MEMSSA-NEXT: ret void ; CHECK-MEMSSA: [[IF_ELSE]]: @@ -56,15 +56,15 @@ define void @watch_out_for_type_change(i1 %c, ptr %p, ptr %p1, ptr %q) nounwind ; CHECK-LABEL: define void @watch_out_for_type_change( ; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P1:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[RED_TBAA0:![0-9]+]] +; CHECK-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[BLU_TBAA3:![0-9]+]] ; CHECK-NEXT: br i1 [[C]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]] ; CHECK: [[IF_THEN]]: -; CHECK-NEXT: [[T:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[T:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[OUTER_SPACE_TBAA5:![0-9]+]] ; CHECK-NEXT: store i32 [[T]], ptr [[Q]], align 4 ; CHECK-NEXT: ret void ; CHECK: [[IF_ELSE]]: -; CHECK-NEXT: [[U:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: [[U:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[BRICK_RED_TBAA8:![0-9]+]] ; CHECK-NEXT: store i32 [[U]], ptr [[Q]], align 4 ; CHECK-NEXT: ret void ; @@ -91,29 +91,29 @@ define void @watch_out_for_another_type_change(i1 %c, ptr %p, ptr %p1, ptr %q) n ; CHECK-MEMDEP-LABEL: define void @watch_out_for_another_type_change( ; CHECK-MEMDEP-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P1:%.*]], ptr [[Q:%.*]]) #[[ATTR0]] { ; CHECK-MEMDEP-NEXT: [[ENTRY:.*:]] -; 
CHECK-MEMDEP-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[TBAA0]] -; CHECK-MEMDEP-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[TBAA3]] +; CHECK-MEMDEP-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[RED_TBAA0]] +; CHECK-MEMDEP-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[BLU_TBAA3]] ; CHECK-MEMDEP-NEXT: br i1 [[C]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]] ; CHECK-MEMDEP: [[IF_THEN]]: ; CHECK-MEMDEP-NEXT: store i32 0, ptr [[Q]], align 4 ; CHECK-MEMDEP-NEXT: ret void ; CHECK-MEMDEP: [[IF_ELSE]]: -; CHECK-MEMDEP-NEXT: [[U:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[TBAA5]] +; CHECK-MEMDEP-NEXT: [[U:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[OUTER_SPACE_TBAA5]] ; CHECK-MEMDEP-NEXT: store i32 [[U]], ptr [[Q]], align 4 ; CHECK-MEMDEP-NEXT: ret void ; ; CHECK-MEMSSA-LABEL: define void @watch_out_for_another_type_change( ; CHECK-MEMSSA-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P1:%.*]], ptr [[Q:%.*]]) #[[ATTR0]] { ; CHECK-MEMSSA-NEXT: [[ENTRY:.*:]] -; CHECK-MEMSSA-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[TBAA0]] -; CHECK-MEMSSA-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[TBAA3]] +; CHECK-MEMSSA-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[RED_TBAA0]] +; CHECK-MEMSSA-NEXT: store i32 1, ptr [[P1]], align 4, !tbaa [[BLU_TBAA3]] ; CHECK-MEMSSA-NEXT: br i1 [[C]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]] ; CHECK-MEMSSA: [[IF_THEN]]: -; CHECK-MEMSSA-NEXT: [[T:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[TBAA8]] +; CHECK-MEMSSA-NEXT: [[T:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[BRICK_RED_TBAA8]] ; CHECK-MEMSSA-NEXT: store i32 [[T]], ptr [[Q]], align 4 ; CHECK-MEMSSA-NEXT: ret void ; CHECK-MEMSSA: [[IF_ELSE]]: -; CHECK-MEMSSA-NEXT: [[U:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[TBAA5]] +; CHECK-MEMSSA-NEXT: [[U:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[OUTER_SPACE_TBAA5]] ; CHECK-MEMSSA-NEXT: store i32 [[U]], ptr [[Q]], align 4 ; CHECK-MEMSSA-NEXT: ret void ; @@ -144,25 +144,25 @@ if.else: !8 = !{!"brick red", !5} !9 = !{!"observable 
universe"} ;. -; CHECK-MEMDEP: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-MEMDEP: [[RED_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK-MEMDEP: [[META1]] = !{!"red", [[META2:![0-9]+]]} ; CHECK-MEMDEP: [[META2]] = !{} -; CHECK-MEMDEP: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK-MEMDEP: [[BLU_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} ; CHECK-MEMDEP: [[META4]] = !{!"blu", [[META2]]} -; CHECK-MEMDEP: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK-MEMDEP: [[OUTER_SPACE_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} ; CHECK-MEMDEP: [[META6]] = !{!"outer space", [[META7:![0-9]+]]} ; CHECK-MEMDEP: [[META7]] = !{!"observable universe"} -; CHECK-MEMDEP: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +; CHECK-MEMDEP: [[BRICK_RED_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} ; CHECK-MEMDEP: [[META9]] = !{!"brick red", [[META1]]} ;. -; CHECK-MEMSSA: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-MEMSSA: [[RED_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK-MEMSSA: [[META1]] = !{!"red", [[META2:![0-9]+]]} ; CHECK-MEMSSA: [[META2]] = !{} -; CHECK-MEMSSA: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK-MEMSSA: [[BLU_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} ; CHECK-MEMSSA: [[META4]] = !{!"blu", [[META2]]} -; CHECK-MEMSSA: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK-MEMSSA: [[OUTER_SPACE_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} ; CHECK-MEMSSA: [[META6]] = !{!"outer space", [[META7:![0-9]+]]} ; CHECK-MEMSSA: [[META7]] = !{!"observable universe"} -; CHECK-MEMSSA: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +; CHECK-MEMSSA: [[BRICK_RED_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} ; CHECK-MEMSSA: [[META9]] = !{!"brick red", [[META1]]} ;. 
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll index 47dd886bb9f17..f605b516e019e 100644 --- a/llvm/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll +++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -aa-pipeline=tbaa,basic-aa -passes=memcpyopt,instcombine < %s | FileCheck %s target datalayout = "e-p:64:64:64" @@ -7,10 +7,12 @@ target datalayout = "e-p:64:64:64" ; it has a TBAA tag which declares that it is unrelated. define void @foo(ptr nocapture %p, ptr nocapture %q, ptr nocapture %s) nounwind { -; CHECK: @foo -; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(16) %p, ptr noundef nonnull align 1 dereferenceable(16) %q, i64 16, i1 false), !tbaa !0 -; CHECK-NEXT: store i8 2, ptr %s, align 1, !tbaa [[TAGA:!.*]] -; CHECK-NEXT: ret void +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ptr captures(none) [[P:%.*]], ptr captures(none) [[Q:%.*]], ptr captures(none) [[S:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(16) [[P]], ptr noundef nonnull align 1 dereferenceable(16) [[Q]], i64 16, i1 false), !tbaa [[B_TBAA0:![0-9]+]] +; CHECK-NEXT: store i8 2, ptr [[S]], align 1, !tbaa [[A_TBAA3:![0-9]+]] +; CHECK-NEXT: ret void +; tail call void @llvm.memcpy.p0.p0.i64(ptr %p, ptr %q, i64 16, i1 false), !tbaa !2 store i8 2, ptr %s, align 1, !tbaa !1 tail call void @llvm.memcpy.p0.p0.i64(ptr %q, ptr %p, i64 16, i1 false), !tbaa !2 @@ -19,10 +21,15 @@ define void @foo(ptr nocapture %p, ptr nocapture %q, ptr nocapture %s) nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind -; CHECK: [[TAGA]] = !{[[TYPEA:!.*]], [[TYPEA]], i64 0} -; 
CHECK: [[TYPEA]] = !{!"A", !{{.*}}} !0 = !{!"tbaa root"} !1 = !{!3, !3, i64 0} !2 = !{!4, !4, i64 0} !3 = !{!"A", !0} !4 = !{!"B", !0} +;. +; CHECK: [[B_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"B", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"tbaa root"} +; CHECK: [[A_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"A", [[META2]]} +;. diff --git a/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll b/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll index 0bcdfed808814..a4667ab62f789 100644 --- a/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll +++ b/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll @@ -1,9 +1,10 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S < %s | FileCheck %s define <4 x i32> @load(ptr nocapture readonly %a0) !dbg !8 { -; CHECK-LABEL: @load( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0:%.*]], i32 16, <4 x i1> , <4 x i32> undef), !dbg [[DBG19:![0-9]+]], !tbaa [[TBAA20:![0-9]+]] +; CHECK-LABEL: define <4 x i32> @load( +; CHECK-SAME: ptr readonly captures(none) [[A0:%.*]]) !dbg [[DBG8:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0]], i32 16, <4 x i1> , <4 x i32> undef), !dbg [[DBG19:![0-9]+]], !tbaa [[CHAR_TBAA20:![0-9]+]] ; CHECK-NEXT: ret <4 x i32> [[V0]], !dbg [[DBG23:![0-9]+]] ; entry: @@ -12,9 +13,10 @@ entry: } define void @store(<4 x i32> %a0, ptr nocapture %a1) !dbg !24 { -; CHECK-LABEL: @store( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], i32 16, <4 x i1> ), !dbg [[DBG30:![0-9]+]], !tbaa [[TBAA20]] +; CHECK-LABEL: define void @store( +; CHECK-SAME: <4 x i32> [[A0:%.*]], ptr captures(none) [[A1:%.*]]) !dbg [[DBG24:![0-9]+]] { +; CHECK-NEXT: 
[[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0]], ptr [[A1]], i32 16, <4 x i1> ), !dbg [[DBG30:![0-9]+]], !tbaa [[CHAR_TBAA20]] ; CHECK-NEXT: ret void, !dbg [[DBG31:![0-9]+]] ; entry: @@ -23,9 +25,10 @@ entry: } define <4 x i32> @gather(<4 x ptr> %a0) !dbg !32 { -; CHECK-LABEL: @gather( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[A0:%.*]], i32 16, <4 x i1> , <4 x i32> undef), !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA20]] +; CHECK-LABEL: define <4 x i32> @gather( +; CHECK-SAME: <4 x ptr> [[A0:%.*]]) !dbg [[DBG32:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[A0]], i32 16, <4 x i1> , <4 x i32> undef), !dbg [[DBG35:![0-9]+]], !tbaa [[CHAR_TBAA20]] ; CHECK-NEXT: ret <4 x i32> [[V0]], !dbg [[DBG36:![0-9]+]] ; entry: @@ -34,9 +37,10 @@ entry: } define void @scatter(<4 x i32> %a0, <4 x ptr> %a1) !dbg !37 { -; CHECK-LABEL: @scatter( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[A0:%.*]], <4 x ptr> [[A1:%.*]], i32 16, <4 x i1> ), !dbg [[DBG41:![0-9]+]], !tbaa [[TBAA20]] +; CHECK-LABEL: define void @scatter( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x ptr> [[A1:%.*]]) !dbg [[DBG37:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[A0]], <4 x ptr> [[A1]], i32 16, <4 x i1> ), !dbg [[DBG41:![0-9]+]], !tbaa [[CHAR_TBAA20]] ; CHECK-NEXT: ret void, !dbg [[DBG42:![0-9]+]] ; entry: @@ -45,9 +49,10 @@ entry: } define <4 x i32> @expandload(ptr nocapture readonly %a0) !dbg !43 { -; CHECK-LABEL: @expandload( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr [[A0:%.*]], <4 x i1> , <4 x i32> undef), !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA50:![0-9]+]] +; CHECK-LABEL: define <4 x i32> @expandload( +; CHECK-SAME: ptr readonly captures(none) [[A0:%.*]]) !dbg [[DBG43:![0-9]+]] 
{ +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr [[A0]], <4 x i1> , <4 x i32> undef), !dbg [[DBG49:![0-9]+]], !tbaa [[INT_TBAA50:![0-9]+]] ; CHECK-NEXT: ret <4 x i32> [[V0]], !dbg [[DBG52:![0-9]+]] ; entry: @@ -56,9 +61,10 @@ entry: } define void @compressstore(<4 x i32> %a0, ptr nocapture %a1) !dbg !53 { -; CHECK-LABEL: @compressstore( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], <4 x i1> ), !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA50]] +; CHECK-LABEL: define void @compressstore( +; CHECK-SAME: <4 x i32> [[A0:%.*]], ptr captures(none) [[A1:%.*]]) !dbg [[DBG53:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[A0]], ptr [[A1]], <4 x i1> ), !dbg [[DBG59:![0-9]+]], !tbaa [[INT_TBAA50]] ; CHECK-NEXT: ret void, !dbg [[DBG60:![0-9]+]] ; entry: diff --git a/llvm/test/Bitcode/upgrade-vector-partial-reduce-add-intrinsic.ll b/llvm/test/Bitcode/upgrade-vector-partial-reduce-add-intrinsic.ll new file mode 100644 index 0000000000000..623faad4406a1 --- /dev/null +++ b/llvm/test/Bitcode/upgrade-vector-partial-reduce-add-intrinsic.ll @@ -0,0 +1,24 @@ +; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s + +define <4 x i32> @partial_reduce_add_fixed(<16 x i32> %a) { +; CHECK-LABEL: @partial_reduce_add_fixed +; CHECK: %res = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> zeroinitializer, <16 x i32> %a) + + %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> zeroinitializer, <16 x i32> %a) + ret <4 x i32> %res +} + + +define @partial_reduce_add_scalable( %a) { +; CHECK-LABEL: @partial_reduce_add_scalable +; CHECK: %res = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( zeroinitializer, %a) + + %res = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( zeroinitializer, %a) + ret %res +} + +declare <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>) +; CHECK-DAG: declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>) + +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) +; CHECK-DAG: declare @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index 675c953fb1a84..0f75887d79cab 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1,4 +1,5 @@ -; RUN: llc -O0 -aarch64-enable-atomic-cfg-tidy=0 -mattr=+lse -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s +; RUN: llc -O0 -aarch64-enable-atomic-cfg-tidy=0 -mattr=+lse -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - -use-constant-int-for-fixed-length-splat=false 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CV +; RUN: llc -O0 -aarch64-enable-atomic-cfg-tidy=0 -mattr=+lse -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - -use-constant-int-for-fixed-length-splat 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CI ; RUN: llc -O3 -aarch64-enable-atomic-cfg-tidy=0 -mattr=+lse -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=O3 ; This file checks that the translation from llvm IR to generic MachineInstr @@ -1701,13 +1702,19 @@ define i32 @test_constantaggzerovector_v1s32(i32 %arg){ } define i32 @test_constantdatavector_v1s32(i32 %arg){ -; CHECK-LABEL: name: test_constantdatavector_v1s32 -; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $w0 -; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 -; CHECK-NOT: G_MERGE_VALUES -; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[C0]] -; CHECK-NOT: G_MERGE_VALUES -; CHECK: G_ADD [[ARG]], [[COPY]] +; CHECK-CV-LABEL: name: test_constantdatavector_v1s32 +; CHECK-CV: 
[[ARG:%[0-9]+]]:_(s32) = COPY $w0 +; CHECK-CV: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 +; CHECK-CV-NOT: G_MERGE_VALUES +; CHECK-CV: [[COPY:%[0-9]+]]:_(s32) = COPY [[C0]] +; CHECK-CV-NOT: G_MERGE_VALUES +; CHECK-CV: G_ADD [[ARG]], [[COPY]] +; +; CHECK-CI-LABEL: name: test_constantdatavector_v1s32 +; CHECK-CI: [[ARG:%[0-9]+]]:_(s32) = COPY $w0 +; CHECK-CI: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 +; CHECK-CI-NOT: G_MERGE_VALUES +; CHECK-CI: G_ADD [[ARG]], [[C0]] %vec = insertelement <1 x i32> undef, i32 %arg, i32 0 %add = add <1 x i32> %vec, %res = extractelement <1 x i32> %add, i32 0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-addv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-addv.mir index ae08cd9d5bfef..3e4a856aed2ec 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-addv.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-addv.mir @@ -15,7 +15,7 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $q3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $q4 ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<4 x s32>), [[COPY2]](<4 x s32>), [[COPY3]](<4 x s32>), [[COPY4]](<4 x s32>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.experimental.vector.partial.reduce.add), [[COPY]](<4 x s32>), [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.vector.partial.reduce.add), [[COPY]](<4 x s32>), [[CONCAT_VECTORS]](<16 x s32>) ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[INT]](<4 x s32>) ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -25,7 +25,7 @@ body: | %4:_(<4 x s32>) = COPY $q3 %5:_(<4 x s32>) = COPY $q4 %1:_(<16 x s32>) = G_CONCAT_VECTORS %2:_(<4 x s32>), %3:_(<4 x s32>), %4:_(<4 x s32>), %5:_(<4 x s32>) - %6:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.experimental.vector.partial.reduce.add), %0:_(<4 x s32>), %1:_(<16 x s32>) + %6:_(<4 x s32>) = 
G_INTRINSIC intrinsic(@llvm.vector.partial.reduce.add), %0:_(<4 x s32>), %1:_(<16 x s32>) %7:_(s32) = G_VECREDUCE_ADD %6:_(<4 x s32>) $w0 = COPY %7:_(s32) RET_ReallyLR implicit $w0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-one-by-n-vector-ptr-add.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-one-by-n-vector-ptr-add.ll index 870f893cbef39..78b0c720d51a3 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-one-by-n-vector-ptr-add.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-one-by-n-vector-ptr-add.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -O0 -global-isel -mtriple aarch64 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -O0 -global-isel -mtriple aarch64 -stop-after=irtranslator -verify-machineinstrs %s -o - -use-constant-int-for-fixed-length-splat=false | FileCheck %s --check-prefix=CHECK +; RUN: llc -O0 -global-isel -mtriple aarch64 -stop-after=irtranslator -verify-machineinstrs %s -o - -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefix=CHECK-CI ; Make sure we treat <1 x N> getelementptrs like scalar getelementptrs. 
@@ -9,15 +10,26 @@ define <1 x ptr> @one_elt_vector_ptr_add_non_vector_idx(<1 x ptr> %vec) { ; CHECK-LABEL: name: one_elt_vector_ptr_add_non_vector_idx ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $d0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $d0 - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32) - ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[SEXT]](s64) - ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0) - ; CHECK: $d0 = COPY [[COPY2]](p0) - ; CHECK: RET_ReallyLR implicit $d0 + ; CHECK-NEXT: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $d0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[SEXT]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: $d0 = COPY [[COPY2]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 + ; + ; CHECK-CI-LABEL: name: one_elt_vector_ptr_add_non_vector_idx + ; CHECK-CI: bb.1 (%ir-block.0): + ; CHECK-CI-NEXT: liveins: $d0 + ; CHECK-CI-NEXT: {{ $}} + ; CHECK-CI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $d0 + ; CHECK-CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK-CI-NEXT: $d0 = COPY [[PTR_ADD]](p0) + ; CHECK-CI-NEXT: RET_ReallyLR implicit $d0 %ptr_add = getelementptr i8, <1 x ptr> %vec, <1 x i32> ret <1 x ptr> %ptr_add } @@ -28,15 +40,26 @@ define <1 x ptr> @one_elt_vector_ptr_add_non_vector_idx(<1 x ptr> %vec) { define <1 x ptr> @one_elt_vector_ptr_add_non_vector_ptr(ptr %vec) { ; CHECK-LABEL: name: one_elt_vector_ptr_add_non_vector_ptr ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 1 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32) - ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[SEXT]](s64) - ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0) - ; CHECK: $d0 = COPY [[COPY2]](p0) - ; CHECK: RET_ReallyLR implicit $d0 + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[SEXT]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: $d0 = COPY [[COPY2]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 + ; + ; CHECK-CI-LABEL: name: one_elt_vector_ptr_add_non_vector_ptr + ; CHECK-CI: bb.1 (%ir-block.0): + ; CHECK-CI-NEXT: liveins: $x0 + ; CHECK-CI-NEXT: {{ $}} + ; CHECK-CI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK-CI-NEXT: $d0 = COPY [[PTR_ADD]](p0) + ; CHECK-CI-NEXT: RET_ReallyLR implicit $d0 %ptr_add = getelementptr i8, ptr %vec, <1 x i32> ret <1 x ptr> %ptr_add } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-splat-vector.ll b/llvm/test/CodeGen/AArch64/GlobalISel/select-splat-vector.ll index 770035c34812c..8705e3aa2c305 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-splat-vector.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -mtriple aarch64 -mattr=+sve -aarch64-enable-gisel-sve=1 | FileCheck %s --check-prefixes=CHECK,CHECK-SDAG +; RUN: llc < %s -mtriple aarch64 -mattr=+sve -aarch64-enable-gisel-sve=1 
-use-constant-int-for-scalable-splat | FileCheck %s --check-prefixes=CHECK,CHECK-SDAG ; RUN: llc < %s -mtriple aarch64 -mattr=+sve -global-isel -aarch64-enable-gisel-sve=1 | FileCheck %s --check-prefixes=CHECK,CHECK-GS - -; REQUIRES: asserts +; RUN: llc < %s -mtriple aarch64 -mattr=+sve -global-isel -aarch64-enable-gisel-sve=1 -use-constant-int-for-scalable-splat | FileCheck %s --check-prefixes=CHECK,CHECK-GS ;; add define @addnxv2i64( %a, %b) { diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index bc675343adc08..d9180a28bd40b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -493,3 +493,154 @@ entry: ret i128 %arg1 } +define i16 @addv_zero_lanes_v4i16(ptr %arr) { +; CHECK-SD-LABEL: addv_zero_lanes_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldrb w0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addv_zero_lanes_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: addv h0, v0.4h +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %v = load i64, ptr %arr + %and = and i64 %v, 255 + %vec = bitcast i64 %and to <4 x i16> + %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %vec) + ret i16 %r +} + +define i8 @addv_zero_lanes_v8i8(ptr %arr) { +; CHECK-SD-LABEL: addv_zero_lanes_v8i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldrb w0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addv_zero_lanes_v8i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: addv b0, v0.8b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %v = load i64, ptr %arr + %and = and i64 %v, 255 + %vec = bitcast i64 %and to <8 x i8> + %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %vec) + ret i8 %r +} + +define i8 @addv_zero_lanes_negative_v8i8(ptr %arr) { +; CHECK-LABEL: addv_zero_lanes_negative_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: and x8, 
x8, #0x100 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %v = load i64, ptr %arr + %and = and i64 %v, 256 + %vec = bitcast i64 %and to <8 x i8> + %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %vec) + ret i8 %r +} + + +define i8 @addv_zero_lanes_v16i8(ptr %arr) { +; CHECK-SD-LABEL: addv_zero_lanes_v16i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ldrb w8, [x0] +; CHECK-SD-NEXT: mov v0.d[0], x8 +; CHECK-SD-NEXT: addv b0, v0.16b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addv_zero_lanes_v16i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], xzr +; CHECK-GI-NEXT: addv b0, v0.16b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %v = load i128, ptr %arr + %and = and i128 %v, 255 + %vec = bitcast i128 %and to <16 x i8> + %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %vec) + ret i8 %r +} + +define i16 @addv_zero_lanes_v8i16(ptr %arr) { +; CHECK-SD-LABEL: addv_zero_lanes_v8i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ldrh w8, [x0] +; CHECK-SD-NEXT: mov v0.d[0], x8 +; CHECK-SD-NEXT: addv h0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addv_zero_lanes_v8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldrh w8, [x0] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], xzr +; CHECK-GI-NEXT: addv h0, v0.8h +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %v = load i128, ptr %arr + %and = and i128 %v, u0xFFFF + %vec = bitcast i128 %and to <8 x i16> + %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %vec) + ret i16 %r +} + +define i32 @addv_zero_lanes_v4i32(ptr %arr) { +; CHECK-SD-LABEL: addv_zero_lanes_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ldr w8, [x0] +; CHECK-SD-NEXT: mov v0.d[0], x8 +; 
CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addv_zero_lanes_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], xzr +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %v = load i128, ptr %arr + %and = and i128 %v, u0xFFFFFFFF + %vec = bitcast i128 %and to <4 x i32> + %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec) + ret i32 %r +} + +define i32 @addv_zero_lanes_v2i32(ptr %arr) { +; CHECK-SD-LABEL: addv_zero_lanes_v2i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr w0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addv_zero_lanes_v2i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %v = load i64, ptr %arr + %and = and i64 %v, u0xFFFFFFFF + %vec = bitcast i64 %and to <2 x i32> + %r = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %vec) + ret i32 %r +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll b/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll index be07404f4b2fc..dd0152c3f4c08 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=aarch64-none-linux -tail-dup-size=4 < %s | FileCheck %s --check-prefix=CHECK-O2 ; RUN: llc -mtriple=aarch64-none-linux -tail-dup-placement-threshold=4 < %s | FileCheck %s --check-prefix=CHECK-O2 -; RUN: llc -mtriple=aarch64-none-linux -tail-dup-placement-threshold=6 < %s | FileCheck %s --check-prefix=CHECK-O3 +; RUN: llc -mtriple=aarch64-none-linux -tail-dup-placement-threshold=6 < %s | FileCheck %s --check-prefix=CHECK-O2-6 %a = type { ptr, i32, %b } %b = type { %c } @@ -29,31 +29,53 @@ define dso_local void @testcase(ptr nocapture %arg){ ; CHECK-O2-NEXT: .LBB0_3: // %if.end ; CHECK-O2-NEXT: adrp x9, 
global_int ; CHECK-O2-NEXT: add x2, x8, #16 -; CHECK-O2-NEXT: mov w0, #10 +; CHECK-O2-NEXT: mov w0, #10 // =0xa ; CHECK-O2-NEXT: ldr w1, [x9, :lo12:global_int] ; CHECK-O2-NEXT: b externalfunc ; ; CHECK-O3-LABEL: testcase: ; CHECK-O3: // %bb.0: // %entry -; CHECK-O3-NEXT: adrp x8, global_ptr -; CHECK-O3-NEXT: ldr x9, [x8, :lo12:global_ptr] +; CHECK-O3-NEXT: adrp x8, .L_MergedGlobals+8 +; CHECK-O3-NEXT: ldr x9, [x8, :lo12:.L_MergedGlobals+8] ; CHECK-O3-NEXT: cbz x9, .LBB0_2 ; CHECK-O3-NEXT: // %bb.1: // %if.then ; CHECK-O3-NEXT: ldr x9, [x9] ; CHECK-O3-NEXT: str x9, [x0] -; CHECK-O3-NEXT: ldr x8, [x8, :lo12:global_ptr] -; CHECK-O3-NEXT: adrp x9, global_int +; CHECK-O3-NEXT: ldr x8, [x8, :lo12:.L_MergedGlobals+8] +; CHECK-O3-NEXT: adrp x9, .L_MergedGlobals ; CHECK-O3-NEXT: add x2, x8, #16 -; CHECK-O3-NEXT: mov w0, #10 -; CHECK-O3-NEXT: ldr w1, [x9, :lo12:global_int] +; CHECK-O3-NEXT: mov w0, #10 // =0xa +; CHECK-O3-NEXT: ldr w1, [x9, :lo12:.L_MergedGlobals] ; CHECK-O3-NEXT: b externalfunc ; CHECK-O3-NEXT: .LBB0_2: ; CHECK-O3-NEXT: mov x8, xzr -; CHECK-O3-NEXT: adrp x9, global_int +; CHECK-O3-NEXT: adrp x9, .L_MergedGlobals ; CHECK-O3-NEXT: add x2, x8, #16 -; CHECK-O3-NEXT: mov w0, #10 -; CHECK-O3-NEXT: ldr w1, [x9, :lo12:global_int] +; CHECK-O3-NEXT: mov w0, #10 // =0xa +; CHECK-O3-NEXT: ldr w1, [x9, :lo12:.L_MergedGlobals] ; CHECK-O3-NEXT: b externalfunc +; +; CHECK-O2-6-LABEL: testcase: +; CHECK-O2-6: // %bb.0: // %entry +; CHECK-O2-6-NEXT: adrp x8, global_ptr +; CHECK-O2-6-NEXT: ldr x9, [x8, :lo12:global_ptr] +; CHECK-O2-6-NEXT: cbz x9, .LBB0_2 +; CHECK-O2-6-NEXT: // %bb.1: // %if.then +; CHECK-O2-6-NEXT: ldr x9, [x9] +; CHECK-O2-6-NEXT: str x9, [x0] +; CHECK-O2-6-NEXT: ldr x8, [x8, :lo12:global_ptr] +; CHECK-O2-6-NEXT: adrp x9, global_int +; CHECK-O2-6-NEXT: add x2, x8, #16 +; CHECK-O2-6-NEXT: mov w0, #10 // =0xa +; CHECK-O2-6-NEXT: ldr w1, [x9, :lo12:global_int] +; CHECK-O2-6-NEXT: b externalfunc +; CHECK-O2-6-NEXT: .LBB0_2: +; CHECK-O2-6-NEXT: mov x8, xzr +; 
CHECK-O2-6-NEXT: adrp x9, global_int +; CHECK-O2-6-NEXT: add x2, x8, #16 +; CHECK-O2-6-NEXT: mov w0, #10 // =0xa +; CHECK-O2-6-NEXT: ldr w1, [x9, :lo12:global_int] +; CHECK-O2-6-NEXT: b externalfunc entry: %0 = load ptr, ptr @global_ptr, align 8 %cmp.not = icmp eq ptr %0, null diff --git a/llvm/test/CodeGen/AArch64/abds-neg.ll b/llvm/test/CodeGen/AArch64/abds-neg.ll index 02c76ba7343a0..37319642f5b34 100644 --- a/llvm/test/CodeGen/AArch64/abds-neg.ll +++ b/llvm/test/CodeGen/AArch64/abds-neg.ll @@ -73,8 +73,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; CHECK-LABEL: abd_ext_i16_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: subs w8, w1, w8 -; CHECK-NEXT: cneg w0, w8, ge +; CHECK-NEXT: subs w8, w8, w1 +; CHECK-NEXT: cneg w0, w8, gt ; CHECK-NEXT: ret %aext = sext i16 %a to i64 %bext = sext i32 %b to i64 @@ -104,8 +104,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: abd_ext_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: subs w8, w1, w0 -; CHECK-NEXT: cneg w0, w8, ge +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: cneg w0, w8, gt ; CHECK-NEXT: ret %aext = sext i32 %a to i64 %bext = sext i32 %b to i64 @@ -119,9 +119,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; CHECK-LABEL: abd_ext_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w1 -; CHECK-NEXT: subs w8, w8, w0 -; CHECK-NEXT: cneg w0, w8, ge +; CHECK-NEXT: subs w8, w0, w1, sxth +; CHECK-NEXT: cneg w0, w8, gt ; CHECK-NEXT: ret %aext = sext i32 %a to i64 %bext = sext i16 %b to i64 @@ -135,8 +134,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: abd_ext_i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: subs w8, w1, w0 -; CHECK-NEXT: cneg w0, w8, ge +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: cneg w0, w8, gt ; CHECK-NEXT: ret %aext = sext i32 %a to i64 %bext = sext i32 %b to 
i64 @@ -150,8 +149,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: abd_ext_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: cneg x0, x8, ge +; CHECK-NEXT: subs x8, x0, x1 +; CHECK-NEXT: cneg x0, x8, gt ; CHECK-NEXT: ret %aext = sext i64 %a to i128 %bext = sext i64 %b to i128 @@ -165,8 +164,8 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: abd_ext_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: cneg x0, x8, ge +; CHECK-NEXT: subs x8, x0, x1 +; CHECK-NEXT: cneg x0, x8, gt ; CHECK-NEXT: ret %aext = sext i64 %a to i128 %bext = sext i64 %b to i128 diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll index bf52e71ec21fe..1ef1c1c68c7bb 100644 --- a/llvm/test/CodeGen/AArch64/abds.ll +++ b/llvm/test/CodeGen/AArch64/abds.ll @@ -112,8 +112,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; CHECK-LABEL: abd_ext_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w1 -; CHECK-NEXT: subs w8, w0, w8 +; CHECK-NEXT: subs w8, w0, w1, sxth ; CHECK-NEXT: cneg w0, w8, le ; CHECK-NEXT: ret %aext = sext i32 %a to i64 @@ -497,13 +496,9 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind { define i64 @vector_legalized(i16 %a, i16 %b) { ; CHECK-LABEL: vector_legalized: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: sxth w8, w0 ; CHECK-NEXT: subs w8, w8, w1, sxth -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: cneg w8, w8, mi -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: add x0, x9, x8 +; CHECK-NEXT: cneg w0, w8, mi ; CHECK-NEXT: ret %ea = sext i16 %a to i32 %eb = sext i16 %b to i32 diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll index 400031b64cb84..79fc12ea76f63 100644 --- a/llvm/test/CodeGen/AArch64/abdu-neg.ll +++ 
b/llvm/test/CodeGen/AArch64/abdu-neg.ll @@ -73,8 +73,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; CHECK-LABEL: abd_ext_i16_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: subs w8, w1, w8 -; CHECK-NEXT: cneg w0, w8, hs +; CHECK-NEXT: subs w8, w8, w1 +; CHECK-NEXT: cneg w0, w8, hi ; CHECK-NEXT: ret %aext = zext i16 %a to i64 %bext = zext i32 %b to i64 @@ -104,8 +104,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: abd_ext_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: subs w8, w1, w0 -; CHECK-NEXT: cneg w0, w8, hs +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: cneg w0, w8, hi ; CHECK-NEXT: ret %aext = zext i32 %a to i64 %bext = zext i32 %b to i64 @@ -119,9 +119,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; CHECK-LABEL: abd_ext_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w1, #0xffff -; CHECK-NEXT: subs w8, w8, w0 -; CHECK-NEXT: cneg w0, w8, hs +; CHECK-NEXT: subs w8, w0, w1, uxth +; CHECK-NEXT: cneg w0, w8, hi ; CHECK-NEXT: ret %aext = zext i32 %a to i64 %bext = zext i16 %b to i64 @@ -135,8 +134,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: abd_ext_i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: subs w8, w1, w0 -; CHECK-NEXT: cneg w0, w8, hs +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: cneg w0, w8, hi ; CHECK-NEXT: ret %aext = zext i32 %a to i64 %bext = zext i32 %b to i64 @@ -150,8 +149,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: abd_ext_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: cneg x0, x8, hs +; CHECK-NEXT: subs x8, x0, x1 +; CHECK-NEXT: cneg x0, x8, hi ; CHECK-NEXT: ret %aext = zext i64 %a to i128 %bext = zext i64 %b to i128 @@ -165,8 +164,8 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) 
nounwind { define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: abd_ext_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: cneg x0, x8, hs +; CHECK-NEXT: subs x8, x0, x1 +; CHECK-NEXT: cneg x0, x8, hi ; CHECK-NEXT: ret %aext = zext i64 %a to i128 %bext = zext i64 %b to i128 diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index 8d2b0b0742d7d..6db7693fb3a1c 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -112,8 +112,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; CHECK-LABEL: abd_ext_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w1, #0xffff -; CHECK-NEXT: subs w8, w0, w8 +; CHECK-NEXT: subs w8, w0, w1, uxth ; CHECK-NEXT: cneg w0, w8, ls ; CHECK-NEXT: ret %aext = zext i32 %a to i64 @@ -362,13 +361,9 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { define i64 @vector_legalized(i16 %a, i16 %b) { ; CHECK-LABEL: vector_legalized: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: and w8, w0, #0xffff ; CHECK-NEXT: subs w8, w8, w1, uxth -; CHECK-NEXT: cneg w8, w8, mi -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: add x0, x9, x8 +; CHECK-NEXT: cneg w0, w8, mi ; CHECK-NEXT: ret %ea = zext i16 %a to i32 %eb = zext i16 %b to i32 diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll index 7f1cb0df049b1..aa070b7886ba5 100644 --- a/llvm/test/CodeGen/AArch64/adds_cmn.ll +++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll @@ -4,10 +4,8 @@ define { i32, i32 } @adds_cmn(i32 noundef %x, i32 noundef %y) { ; CHECK-LABEL: adds_cmn: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmn w0, w1 -; CHECK-NEXT: add w1, w0, w1 -; CHECK-NEXT: cset w8, lo -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: adds w1, w0, w1 +; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, 
i32 %y) diff --git a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll index c2e54d3d39394..6ab2d7c2d7857 100644 --- a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll +++ b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll @@ -24,7 +24,6 @@ define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %m ; SVE-NEXT: shl v1.4h, v1.4h, #15 ; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; SVE-NEXT: cmlt v1.4h, v1.4h, #0 ; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0 ; SVE-NEXT: st1h { z0.h }, p0, [x0] ; SVE-NEXT: ret @@ -41,7 +40,6 @@ define void @test_masked_store_success_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %m ; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 ; SVE-NEXT: shl v1.4s, v1.4s, #31 -; SVE-NEXT: cmlt v1.4s, v1.4s, #0 ; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; SVE-NEXT: st1w { z0.s }, p0, [x0] ; SVE-NEXT: ret @@ -63,8 +61,6 @@ define void @test_masked_store_success_v4i64(<4 x i64> %x, ptr %ptr, <4 x i1> %m ; SVE-NEXT: ushll v2.2d, v2.2s, #0 ; SVE-NEXT: shl v3.2d, v3.2d, #63 ; SVE-NEXT: shl v2.2d, v2.2d, #63 -; SVE-NEXT: cmlt v3.2d, v3.2d, #0 -; SVE-NEXT: cmlt v2.2d, v2.2d, #0 ; SVE-NEXT: cmpne p1.d, p0/z, z3.d, #0 ; SVE-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; SVE-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3] @@ -82,7 +78,6 @@ define void @test_masked_store_success_v4f16(<4 x half> %x, ptr %ptr, <4 x i1> % ; SVE-NEXT: shl v1.4h, v1.4h, #15 ; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; SVE-NEXT: cmlt v1.4h, v1.4h, #0 ; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0 ; SVE-NEXT: st1h { z0.h }, p0, [x0] ; SVE-NEXT: ret @@ -99,7 +94,6 @@ define void @test_masked_store_success_v4f32(<4 x float> %x, ptr %ptr, <4 x i1> ; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 ; SVE-NEXT: shl v1.4s, v1.4s, #31 -; SVE-NEXT: cmlt v1.4s, v1.4s, #0 ; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; SVE-NEXT: st1w { z0.s }, p0, 
[x0] ; SVE-NEXT: ret @@ -121,8 +115,6 @@ define void @test_masked_store_success_v4f64(<4 x double> %x, ptr %ptr, <4 x i1> ; SVE-NEXT: ushll v2.2d, v2.2s, #0 ; SVE-NEXT: shl v3.2d, v3.2d, #63 ; SVE-NEXT: shl v2.2d, v2.2d, #63 -; SVE-NEXT: cmlt v3.2d, v3.2d, #0 -; SVE-NEXT: cmlt v2.2d, v2.2d, #0 ; SVE-NEXT: cmpne p1.d, p0/z, z3.d, #0 ; SVE-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; SVE-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3] @@ -140,7 +132,6 @@ define void @test_masked_store_success_v8i8(<8 x i8> %x, ptr %ptr, <8 x i1> %mas ; SVE-NEXT: shl v1.8b, v1.8b, #7 ; SVE-NEXT: ptrue p0.b, vl8 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; SVE-NEXT: cmlt v1.8b, v1.8b, #0 ; SVE-NEXT: cmpne p0.b, p0/z, z1.b, #0 ; SVE-NEXT: st1b { z0.b }, p0, [x0] ; SVE-NEXT: ret @@ -157,7 +148,6 @@ define void @test_masked_store_success_v8i16(<8 x i16> %x, ptr %ptr, <8 x i1> %m ; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 ; SVE-NEXT: shl v1.8h, v1.8h, #15 -; SVE-NEXT: cmlt v1.8h, v1.8h, #0 ; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0 ; SVE-NEXT: st1h { z0.h }, p0, [x0] ; SVE-NEXT: ret @@ -180,8 +170,6 @@ define void @test_masked_store_success_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %m ; SVE-NEXT: ushll v2.4s, v2.4h, #0 ; SVE-NEXT: shl v3.4s, v3.4s, #31 ; SVE-NEXT: shl v2.4s, v2.4s, #31 -; SVE-NEXT: cmlt v3.4s, v3.4s, #0 -; SVE-NEXT: cmlt v2.4s, v2.4s, #0 ; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0 ; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0 ; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] @@ -219,12 +207,8 @@ define void @test_masked_store_success_v8i64(<8 x i64> %x, ptr %ptr, <8 x i1> %m ; SVE-NEXT: shl v4.2d, v4.2d, #63 ; SVE-NEXT: shl v5.2d, v5.2d, #63 ; SVE-NEXT: shl v6.2d, v6.2d, #63 -; SVE-NEXT: shl v7.2d, v7.2d, #63 -; SVE-NEXT: cmlt v4.2d, v4.2d, #0 -; SVE-NEXT: cmlt v5.2d, v5.2d, #0 -; SVE-NEXT: cmlt v6.2d, v6.2d, #0 ; SVE-NEXT: cmpne p1.d, p0/z, z4.d, #0 -; SVE-NEXT: cmlt v4.2d, v7.2d, #0 +; SVE-NEXT: shl v4.2d, v7.2d, #63 ; SVE-NEXT: cmpne p2.d, p0/z, z5.d, #0 ; SVE-NEXT: 
cmpne p3.d, p0/z, z6.d, #0 ; SVE-NEXT: cmpne p0.d, p0/z, z4.d, #0 @@ -247,7 +231,6 @@ define void @test_masked_store_success_v8f16(<8 x half> %x, ptr %ptr, <8 x i1> % ; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 ; SVE-NEXT: shl v1.8h, v1.8h, #15 -; SVE-NEXT: cmlt v1.8h, v1.8h, #0 ; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0 ; SVE-NEXT: st1h { z0.h }, p0, [x0] ; SVE-NEXT: ret @@ -270,8 +253,6 @@ define void @test_masked_store_success_v8f32(<8 x float> %x, ptr %ptr, <8 x i1> ; SVE-NEXT: ushll v2.4s, v2.4h, #0 ; SVE-NEXT: shl v3.4s, v3.4s, #31 ; SVE-NEXT: shl v2.4s, v2.4s, #31 -; SVE-NEXT: cmlt v3.4s, v3.4s, #0 -; SVE-NEXT: cmlt v2.4s, v2.4s, #0 ; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0 ; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0 ; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] @@ -309,12 +290,8 @@ define void @test_masked_store_success_v8f64(<8 x double> %x, ptr %ptr, <8 x i1> ; SVE-NEXT: shl v4.2d, v4.2d, #63 ; SVE-NEXT: shl v5.2d, v5.2d, #63 ; SVE-NEXT: shl v6.2d, v6.2d, #63 -; SVE-NEXT: shl v7.2d, v7.2d, #63 -; SVE-NEXT: cmlt v4.2d, v4.2d, #0 -; SVE-NEXT: cmlt v5.2d, v5.2d, #0 -; SVE-NEXT: cmlt v6.2d, v6.2d, #0 ; SVE-NEXT: cmpne p1.d, p0/z, z4.d, #0 -; SVE-NEXT: cmlt v4.2d, v7.2d, #0 +; SVE-NEXT: shl v4.2d, v7.2d, #63 ; SVE-NEXT: cmpne p2.d, p0/z, z5.d, #0 ; SVE-NEXT: cmpne p3.d, p0/z, z6.d, #0 ; SVE-NEXT: cmpne p0.d, p0/z, z4.d, #0 @@ -336,7 +313,6 @@ define void @test_masked_store_success_v16i8(<16 x i8> %x, ptr %ptr, <16 x i1> % ; SVE-NEXT: shl v1.16b, v1.16b, #7 ; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; SVE-NEXT: cmlt v1.16b, v1.16b, #0 ; SVE-NEXT: cmpne p0.b, p0/z, z1.b, #0 ; SVE-NEXT: st1b { z0.b }, p0, [x0] ; SVE-NEXT: ret @@ -357,8 +333,6 @@ define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1> ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 ; SVE-NEXT: shl v3.8h, v3.8h, #15 ; SVE-NEXT: shl v2.8h, v2.8h, #15 -; SVE-NEXT: cmlt v3.8h, v3.8h, #0 -; SVE-NEXT: cmlt v2.8h, v2.8h, #0 
; SVE-NEXT: cmpne p1.h, p0/z, z3.h, #0 ; SVE-NEXT: cmpne p0.h, p0/z, z2.h, #0 ; SVE-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] @@ -391,13 +365,9 @@ define void @test_masked_store_success_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> ; SVE-NEXT: ushll v7.4s, v7.4h, #0 ; SVE-NEXT: ushll v5.4s, v5.4h, #0 ; SVE-NEXT: shl v4.4s, v4.4s, #31 -; SVE-NEXT: cmlt v6.4s, v6.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z6.s, #0 ; SVE-NEXT: shl v7.4s, v7.4s, #31 ; SVE-NEXT: shl v5.4s, v5.4s, #31 -; SVE-NEXT: cmlt v4.4s, v4.4s, #0 -; SVE-NEXT: cmpne p1.s, p0/z, z6.s, #0 -; SVE-NEXT: cmlt v7.4s, v7.4s, #0 -; SVE-NEXT: cmlt v5.4s, v5.4s, #0 ; SVE-NEXT: cmpne p2.s, p0/z, z7.s, #0 ; SVE-NEXT: cmpne p3.s, p0/z, z5.s, #0 ; SVE-NEXT: cmpne p0.s, p0/z, z4.s, #0 @@ -479,8 +449,6 @@ define void @test_masked_store_success_v32i8(<32 x i8> %x, ptr %ptr, <32 x i1> % ; SVE-NEXT: mov w8, #16 // =0x10 ; SVE-NEXT: shl v2.16b, v2.16b, #7 ; SVE-NEXT: shl v3.16b, v3.16b, #7 -; SVE-NEXT: cmlt v2.16b, v2.16b, #0 -; SVE-NEXT: cmlt v3.16b, v3.16b, #0 ; SVE-NEXT: cmpne p1.b, p0/z, z3.b, #0 ; SVE-NEXT: cmpne p0.b, p0/z, z2.b, #0 ; SVE-NEXT: st1b { z1.b }, p1, [x0, x8] @@ -565,12 +533,8 @@ define void @test_masked_store_success_v32i16(<32 x i16> %x, ptr %ptr, <32 x i1> ; SVE-NEXT: shl v4.8h, v4.8h, #15 ; SVE-NEXT: shl v5.8h, v5.8h, #15 ; SVE-NEXT: shl v6.8h, v6.8h, #15 -; SVE-NEXT: shl v7.8h, v7.8h, #15 -; SVE-NEXT: cmlt v4.8h, v4.8h, #0 -; SVE-NEXT: cmlt v5.8h, v5.8h, #0 -; SVE-NEXT: cmlt v6.8h, v6.8h, #0 ; SVE-NEXT: cmpne p1.h, p0/z, z4.h, #0 -; SVE-NEXT: cmlt v4.8h, v7.8h, #0 +; SVE-NEXT: shl v4.8h, v7.8h, #15 ; SVE-NEXT: cmpne p2.h, p0/z, z5.h, #0 ; SVE-NEXT: cmpne p3.h, p0/z, z6.h, #0 ; SVE-NEXT: cmpne p0.h, p0/z, z4.h, #0 @@ -595,144 +559,140 @@ define void @test_masked_store_success_v64i8(<64 x i8> %x, ptr %ptr, <64 x i1> % ; SVE-NEXT: .cfi_offset w29, -16 ; SVE-NEXT: ldr w8, [sp, #216] ; SVE-NEXT: ldr w9, [sp, #344] -; SVE-NEXT: fmov s7, w1 +; SVE-NEXT: fmov s6, w1 ; SVE-NEXT: ldr w11, [sp, #88] ; SVE-NEXT: ldr 
w10, [sp, #224] ; SVE-NEXT: ptrue p0.b, vl16 -; SVE-NEXT: fmov s4, w8 -; SVE-NEXT: fmov s5, w9 +; SVE-NEXT: fmov s5, w8 +; SVE-NEXT: fmov s4, w9 ; SVE-NEXT: ldr w8, [sp, #352] -; SVE-NEXT: fmov s6, w11 +; SVE-NEXT: fmov s7, w11 ; SVE-NEXT: ldr w9, [sp, #96] -; SVE-NEXT: mov v7.b[1], w2 +; SVE-NEXT: mov v6.b[1], w2 ; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 ; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 ; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; SVE-NEXT: mov v4.b[1], w10 -; SVE-NEXT: mov v5.b[1], w8 +; SVE-NEXT: mov v5.b[1], w10 +; SVE-NEXT: mov v4.b[1], w8 ; SVE-NEXT: ldr w8, [sp, #232] -; SVE-NEXT: mov v6.b[1], w9 +; SVE-NEXT: mov v7.b[1], w9 ; SVE-NEXT: ldr w9, [sp, #360] ; SVE-NEXT: ldr w10, [sp, #112] -; SVE-NEXT: mov v7.b[2], w3 -; SVE-NEXT: mov v4.b[2], w8 +; SVE-NEXT: mov v6.b[2], w3 +; SVE-NEXT: mov v5.b[2], w8 ; SVE-NEXT: ldr w8, [sp, #104] -; SVE-NEXT: mov v5.b[2], w9 +; SVE-NEXT: mov v4.b[2], w9 ; SVE-NEXT: ldr w9, [sp, #368] -; SVE-NEXT: mov v6.b[2], w8 +; SVE-NEXT: mov v7.b[2], w8 ; SVE-NEXT: ldr w8, [sp, #240] -; SVE-NEXT: mov v7.b[3], w4 -; SVE-NEXT: mov v4.b[3], w8 -; SVE-NEXT: mov v5.b[3], w9 +; SVE-NEXT: mov v6.b[3], w4 +; SVE-NEXT: mov v5.b[3], w8 +; SVE-NEXT: mov v4.b[3], w9 ; SVE-NEXT: ldr w8, [sp, #248] ; SVE-NEXT: ldr w9, [sp, #376] -; SVE-NEXT: mov v6.b[3], w10 +; SVE-NEXT: mov v7.b[3], w10 ; SVE-NEXT: ldr w10, [sp, #120] -; SVE-NEXT: mov v7.b[4], w5 -; SVE-NEXT: mov v4.b[4], w8 -; SVE-NEXT: mov v5.b[4], w9 +; SVE-NEXT: mov v6.b[4], w5 +; SVE-NEXT: mov v5.b[4], w8 +; SVE-NEXT: mov v4.b[4], w9 ; SVE-NEXT: ldr w8, [sp, #256] ; SVE-NEXT: ldr w9, [sp, #384] -; SVE-NEXT: mov v6.b[4], w10 +; SVE-NEXT: mov v7.b[4], w10 ; SVE-NEXT: ldr w10, [sp, #128] -; SVE-NEXT: mov v7.b[5], w6 -; SVE-NEXT: mov v4.b[5], w8 -; SVE-NEXT: mov v5.b[5], w9 +; SVE-NEXT: mov v6.b[5], w6 +; SVE-NEXT: mov v5.b[5], w8 +; SVE-NEXT: mov v4.b[5], w9 ; SVE-NEXT: ldr w8, [sp, #264] ; SVE-NEXT: ldr w9, [sp, #392] -; 
SVE-NEXT: mov v6.b[5], w10 +; SVE-NEXT: mov v7.b[5], w10 ; SVE-NEXT: ldr w10, [sp, #136] -; SVE-NEXT: mov v7.b[6], w7 -; SVE-NEXT: mov v4.b[6], w8 -; SVE-NEXT: mov v5.b[6], w9 +; SVE-NEXT: mov v6.b[6], w7 +; SVE-NEXT: mov v5.b[6], w8 +; SVE-NEXT: mov v4.b[6], w9 ; SVE-NEXT: ldr w8, [sp, #272] ; SVE-NEXT: ldr w9, [sp, #400] -; SVE-NEXT: mov v6.b[6], w10 +; SVE-NEXT: mov v7.b[6], w10 ; SVE-NEXT: ldr w10, [sp, #144] -; SVE-NEXT: mov v4.b[7], w8 +; SVE-NEXT: mov v5.b[7], w8 ; SVE-NEXT: ldr w8, [sp, #16] -; SVE-NEXT: mov v5.b[7], w9 +; SVE-NEXT: mov v4.b[7], w9 ; SVE-NEXT: ldr w9, [sp, #280] -; SVE-NEXT: mov v6.b[7], w10 -; SVE-NEXT: mov v7.b[7], w8 +; SVE-NEXT: mov v7.b[7], w10 +; SVE-NEXT: mov v6.b[7], w8 ; SVE-NEXT: ldr w10, [sp, #408] ; SVE-NEXT: ldr w8, [sp, #152] -; SVE-NEXT: mov v4.b[8], w9 +; SVE-NEXT: mov v5.b[8], w9 ; SVE-NEXT: ldr w9, [sp, #24] -; SVE-NEXT: mov v5.b[8], w10 +; SVE-NEXT: mov v4.b[8], w10 ; SVE-NEXT: ldr w10, [sp, #288] -; SVE-NEXT: mov v6.b[8], w8 -; SVE-NEXT: mov v7.b[8], w9 +; SVE-NEXT: mov v7.b[8], w8 +; SVE-NEXT: mov v6.b[8], w9 ; SVE-NEXT: ldr w8, [sp, #416] ; SVE-NEXT: ldr w9, [sp, #160] -; SVE-NEXT: mov v4.b[9], w10 +; SVE-NEXT: mov v5.b[9], w10 ; SVE-NEXT: ldr w10, [sp, #32] -; SVE-NEXT: mov v5.b[9], w8 +; SVE-NEXT: mov v4.b[9], w8 ; SVE-NEXT: ldr w8, [sp, #296] -; SVE-NEXT: mov v6.b[9], w9 -; SVE-NEXT: mov v7.b[9], w10 +; SVE-NEXT: mov v7.b[9], w9 +; SVE-NEXT: mov v6.b[9], w10 ; SVE-NEXT: ldr w9, [sp, #424] ; SVE-NEXT: ldr w10, [sp, #168] -; SVE-NEXT: mov v4.b[10], w8 +; SVE-NEXT: mov v5.b[10], w8 ; SVE-NEXT: ldr w8, [sp, #40] -; SVE-NEXT: mov v5.b[10], w9 +; SVE-NEXT: mov v4.b[10], w9 ; SVE-NEXT: ldr w9, [sp, #304] -; SVE-NEXT: mov v6.b[10], w10 -; SVE-NEXT: mov v7.b[10], w8 +; SVE-NEXT: mov v7.b[10], w10 +; SVE-NEXT: mov v6.b[10], w8 ; SVE-NEXT: ldr w10, [sp, #432] ; SVE-NEXT: ldr w8, [sp, #176] -; SVE-NEXT: mov v4.b[11], w9 +; SVE-NEXT: mov v5.b[11], w9 ; SVE-NEXT: ldr w9, [sp, #48] -; SVE-NEXT: mov v5.b[11], w10 +; SVE-NEXT: mov 
v4.b[11], w10 ; SVE-NEXT: ldr w10, [sp, #312] -; SVE-NEXT: mov v6.b[11], w8 -; SVE-NEXT: mov v7.b[11], w9 +; SVE-NEXT: mov v7.b[11], w8 +; SVE-NEXT: mov v6.b[11], w9 ; SVE-NEXT: ldr w8, [sp, #440] ; SVE-NEXT: ldr w9, [sp, #184] -; SVE-NEXT: mov v4.b[12], w10 +; SVE-NEXT: mov v5.b[12], w10 ; SVE-NEXT: ldr w10, [sp, #56] -; SVE-NEXT: mov v5.b[12], w8 +; SVE-NEXT: mov v4.b[12], w8 ; SVE-NEXT: ldr w8, [sp, #320] -; SVE-NEXT: mov v6.b[12], w9 -; SVE-NEXT: mov v7.b[12], w10 +; SVE-NEXT: mov v7.b[12], w9 +; SVE-NEXT: mov v6.b[12], w10 ; SVE-NEXT: ldr w9, [sp, #448] ; SVE-NEXT: ldr w10, [sp, #192] -; SVE-NEXT: mov v4.b[13], w8 +; SVE-NEXT: mov v5.b[13], w8 ; SVE-NEXT: ldr w8, [sp, #64] -; SVE-NEXT: mov v5.b[13], w9 +; SVE-NEXT: mov v4.b[13], w9 ; SVE-NEXT: ldr w9, [sp, #328] -; SVE-NEXT: mov v6.b[13], w10 -; SVE-NEXT: mov v7.b[13], w8 +; SVE-NEXT: mov v7.b[13], w10 +; SVE-NEXT: mov v6.b[13], w8 ; SVE-NEXT: ldr w10, [sp, #456] ; SVE-NEXT: ldr w8, [sp, #200] -; SVE-NEXT: mov v4.b[14], w9 +; SVE-NEXT: mov v5.b[14], w9 ; SVE-NEXT: ldr w9, [sp, #72] -; SVE-NEXT: mov v5.b[14], w10 +; SVE-NEXT: mov v4.b[14], w10 ; SVE-NEXT: ldr w10, [sp, #336] -; SVE-NEXT: mov v6.b[14], w8 -; SVE-NEXT: mov v7.b[14], w9 +; SVE-NEXT: mov v7.b[14], w8 +; SVE-NEXT: mov v6.b[14], w9 ; SVE-NEXT: ldr w8, [sp, #464] ; SVE-NEXT: ldr w9, [sp, #208] -; SVE-NEXT: mov v4.b[15], w10 +; SVE-NEXT: mov v5.b[15], w10 ; SVE-NEXT: ldr w10, [sp, #80] -; SVE-NEXT: mov v5.b[15], w8 +; SVE-NEXT: mov v4.b[15], w8 ; SVE-NEXT: mov w8, #32 // =0x20 -; SVE-NEXT: mov v6.b[15], w9 -; SVE-NEXT: mov v7.b[15], w10 +; SVE-NEXT: mov v7.b[15], w9 +; SVE-NEXT: mov v6.b[15], w10 ; SVE-NEXT: mov w9, #48 // =0x30 -; SVE-NEXT: shl v4.16b, v4.16b, #7 ; SVE-NEXT: shl v5.16b, v5.16b, #7 -; SVE-NEXT: shl v6.16b, v6.16b, #7 +; SVE-NEXT: shl v4.16b, v4.16b, #7 ; SVE-NEXT: shl v7.16b, v7.16b, #7 -; SVE-NEXT: cmlt v4.16b, v4.16b, #0 -; SVE-NEXT: cmlt v5.16b, v5.16b, #0 -; SVE-NEXT: cmlt v6.16b, v6.16b, #0 -; SVE-NEXT: cmpne p1.b, p0/z, z4.b, #0 
-; SVE-NEXT: cmlt v4.16b, v7.16b, #0 -; SVE-NEXT: cmpne p2.b, p0/z, z5.b, #0 -; SVE-NEXT: cmpne p3.b, p0/z, z6.b, #0 -; SVE-NEXT: cmpne p0.b, p0/z, z4.b, #0 +; SVE-NEXT: cmpne p1.b, p0/z, z5.b, #0 +; SVE-NEXT: shl v5.16b, v6.16b, #7 +; SVE-NEXT: cmpne p2.b, p0/z, z4.b, #0 +; SVE-NEXT: cmpne p3.b, p0/z, z7.b, #0 +; SVE-NEXT: cmpne p0.b, p0/z, z5.b, #0 ; SVE-NEXT: st1b { z2.b }, p1, [x0, x8] ; SVE-NEXT: mov w8, #16 // =0x10 ; SVE-NEXT: st1b { z3.b }, p2, [x0, x9] @@ -755,7 +715,6 @@ define void @test_masked_store_success_invert_mask_v4i32(<4 x i32> %x, ptr %ptr, ; SVE-NEXT: eor v1.8b, v1.8b, v2.8b ; SVE-NEXT: ushll v1.4s, v1.4h, #0 ; SVE-NEXT: shl v1.4s, v1.4s, #31 -; SVE-NEXT: cmlt v1.4s, v1.4s, #0 ; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; SVE-NEXT: st1w { z0.s }, p0, [x0] ; SVE-NEXT: ret @@ -947,29 +906,27 @@ define void @test_masked_store_multiple_v8i32(<8 x i32> %x, <8 x i32> %y, ptr %p ; SVE-LABEL: test_masked_store_multiple_v8i32: ; SVE: // %bb.0: ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b -; SVE-NEXT: zip1 v4.8b, v4.8b, v0.8b +; SVE-NEXT: zip1 v6.8b, v5.8b, v0.8b +; SVE-NEXT: zip2 v7.8b, v4.8b, v0.8b ; SVE-NEXT: mov x8, #4 // =0x4 -; SVE-NEXT: zip1 v7.8b, v5.8b, v0.8b ; SVE-NEXT: zip2 v5.8b, v5.8b, v0.8b +; SVE-NEXT: zip1 v4.8b, v4.8b, v0.8b ; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 ; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ushll v6.4s, v6.4h, #0 -; SVE-NEXT: ushll v4.4s, v4.4h, #0 ; SVE-NEXT: ushll v7.4s, v7.4h, #0 ; SVE-NEXT: ushll v5.4s, v5.4h, #0 +; SVE-NEXT: ushll v4.4s, v4.4h, #0 ; SVE-NEXT: shl v6.4s, v6.4s, #31 -; SVE-NEXT: shl v4.4s, v4.4s, #31 ; SVE-NEXT: shl v7.4s, v7.4s, #31 ; SVE-NEXT: shl v5.4s, v5.4s, #31 +; SVE-NEXT: shl v4.4s, v4.4s, #31 ; SVE-NEXT: cmlt v6.4s, v6.4s, #0 -; SVE-NEXT: cmlt v4.4s, v4.4s, #0 -; SVE-NEXT: cmlt v7.4s, v7.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z7.s, #0 ; SVE-NEXT: cmlt v5.4s, v5.4s, #0 -; SVE-NEXT: cmpne p1.s, p0/z, z6.s, #0 -; SVE-NEXT: ldp q6, q16, [x1] +; SVE-NEXT: ldp 
q7, q16, [x1] ; SVE-NEXT: cmpne p0.s, p0/z, z4.s, #0 -; SVE-NEXT: bif v2.16b, v6.16b, v7.16b +; SVE-NEXT: bif v2.16b, v7.16b, v6.16b ; SVE-NEXT: bif v3.16b, v16.16b, v5.16b ; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] ; SVE-NEXT: st1w { z0.s }, p0, [x0] @@ -987,74 +944,70 @@ define void @test_masked_store_multiple_v8i32(<8 x i32> %x, <8 x i32> %y, ptr %p define void @test_masked_store_multiple_v8i64(<8 x i64> %x, <8 x i64> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { ; SVE-LABEL: test_masked_store_multiple_v8i64: ; SVE: // %bb.0: -; SVE-NEXT: ldp d16, d18, [sp] -; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: ldp d16, d17, [sp] +; SVE-NEXT: ptrue p1.d, vl2 +; SVE-NEXT: mov x9, #4 // =0x4 ; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 ; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 -; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; SVE-NEXT: mov x8, #6 // =0x6 -; SVE-NEXT: mov x9, #4 // =0x4 ; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; SVE-NEXT: mov b17, v16.b[4] -; SVE-NEXT: mov b19, v16.b[2] -; SVE-NEXT: mov b20, v16.b[6] -; SVE-NEXT: mov b21, v16.b[0] -; SVE-NEXT: mov b22, v18.b[4] -; SVE-NEXT: mov b23, v18.b[6] -; SVE-NEXT: mov b24, v18.b[0] -; SVE-NEXT: mov b25, v18.b[2] -; SVE-NEXT: mov v17.b[4], v16.b[5] -; SVE-NEXT: mov v19.b[4], v16.b[3] -; SVE-NEXT: mov v20.b[4], v16.b[7] -; SVE-NEXT: mov v21.b[4], v16.b[1] -; SVE-NEXT: mov v22.b[4], v18.b[5] -; SVE-NEXT: mov v23.b[4], v18.b[7] -; SVE-NEXT: mov v24.b[4], v18.b[1] -; SVE-NEXT: mov v25.b[4], v18.b[3] -; SVE-NEXT: ushll v17.2d, v17.2s, #0 -; SVE-NEXT: ushll v18.2d, v21.2s, #0 -; SVE-NEXT: ushll v21.2d, v24.2s, #0 -; SVE-NEXT: shl v16.2d, v17.2d, #63 +; SVE-NEXT: mov x8, #6 // =0x6 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: mov b18, v16.b[0] +; SVE-NEXT: mov b19, v16.b[4] +; SVE-NEXT: mov b20, v17.b[4] +; SVE-NEXT: mov b21, v16.b[6] +; SVE-NEXT: mov b22, v16.b[2] +; SVE-NEXT: mov b23, v17.b[6] +; SVE-NEXT: mov b24, v17.b[2] +; SVE-NEXT: mov b25, v17.b[0] +; SVE-NEXT: mov 
v18.b[4], v16.b[1] +; SVE-NEXT: mov v19.b[4], v16.b[5] +; SVE-NEXT: mov v20.b[4], v17.b[5] +; SVE-NEXT: mov v21.b[4], v16.b[7] +; SVE-NEXT: mov v22.b[4], v16.b[3] +; SVE-NEXT: mov v23.b[4], v17.b[7] +; SVE-NEXT: mov v24.b[4], v17.b[3] +; SVE-NEXT: mov v25.b[4], v17.b[1] +; SVE-NEXT: ushll v18.2d, v18.2s, #0 ; SVE-NEXT: ushll v17.2d, v19.2s, #0 -; SVE-NEXT: ushll v19.2d, v20.2s, #0 -; SVE-NEXT: ushll v20.2d, v22.2s, #0 -; SVE-NEXT: shl v18.2d, v18.2d, #63 -; SVE-NEXT: ushll v22.2d, v25.2s, #0 -; SVE-NEXT: shl v21.2d, v21.2d, #63 -; SVE-NEXT: cmlt v16.2d, v16.2d, #0 +; SVE-NEXT: ushll v19.2d, v21.2s, #0 +; SVE-NEXT: ushll v21.2d, v25.2s, #0 +; SVE-NEXT: shl v16.2d, v18.2d, #63 +; SVE-NEXT: ushll v18.2d, v20.2s, #0 +; SVE-NEXT: ushll v20.2d, v23.2s, #0 ; SVE-NEXT: shl v17.2d, v17.2d, #63 ; SVE-NEXT: shl v19.2d, v19.2d, #63 +; SVE-NEXT: shl v21.2d, v21.2d, #63 +; SVE-NEXT: cmpne p0.d, p1/z, z16.d, #0 +; SVE-NEXT: ushll v16.2d, v22.2s, #0 +; SVE-NEXT: shl v18.2d, v18.2d, #63 +; SVE-NEXT: ushll v22.2d, v24.2s, #0 ; SVE-NEXT: shl v20.2d, v20.2d, #63 -; SVE-NEXT: cmlt v18.2d, v18.2d, #0 -; SVE-NEXT: shl v22.2d, v22.2d, #63 -; SVE-NEXT: cmlt v21.2d, v21.2d, #0 -; SVE-NEXT: cmpne p1.d, p0/z, z16.d, #0 -; SVE-NEXT: ushll v16.2d, v23.2s, #0 -; SVE-NEXT: cmlt v17.2d, v17.2d, #0 -; SVE-NEXT: cmlt v19.2d, v19.2d, #0 -; SVE-NEXT: cmlt v20.2d, v20.2d, #0 +; SVE-NEXT: cmpne p2.d, p1/z, z17.d, #0 +; SVE-NEXT: cmpne p3.d, p1/z, z19.d, #0 ; SVE-NEXT: shl v16.2d, v16.2d, #63 -; SVE-NEXT: cmpne p2.d, p0/z, z17.d, #0 -; SVE-NEXT: cmpne p3.d, p0/z, z19.d, #0 -; SVE-NEXT: ldp q17, q19, [x1, #32] -; SVE-NEXT: cmpne p0.d, p0/z, z18.d, #0 -; SVE-NEXT: cmlt v16.2d, v16.2d, #0 -; SVE-NEXT: bif v6.16b, v17.16b, v20.16b -; SVE-NEXT: cmlt v20.2d, v22.2d, #0 -; SVE-NEXT: ldp q17, q18, [x1] -; SVE-NEXT: st1d { z2.d }, p1, [x0, x9, lsl #3] -; SVE-NEXT: mov v2.16b, v16.16b +; SVE-NEXT: cmlt v17.2d, v18.2d, #0 +; SVE-NEXT: shl v18.2d, v22.2d, #63 +; SVE-NEXT: ldp q19, q22, [x1, #32] +; SVE-NEXT: cmpne 
p1.d, p1/z, z16.d, #0 +; SVE-NEXT: cmlt v16.2d, v20.2d, #0 +; SVE-NEXT: cmlt v20.2d, v21.2d, #0 +; SVE-NEXT: cmlt v18.2d, v18.2d, #0 +; SVE-NEXT: bif v6.16b, v19.16b, v17.16b +; SVE-NEXT: ldp q17, q19, [x1] +; SVE-NEXT: st1d { z2.d }, p2, [x0, x9, lsl #3] +; SVE-NEXT: mov x9, #2 // =0x2 ; SVE-NEXT: st1d { z3.d }, p3, [x0, x8, lsl #3] -; SVE-NEXT: mov v3.16b, v21.16b +; SVE-NEXT: mov v2.16b, v16.16b +; SVE-NEXT: mov v3.16b, v20.16b +; SVE-NEXT: st1d { z1.d }, p1, [x0, x9, lsl #3] +; SVE-NEXT: mov v1.16b, v18.16b ; SVE-NEXT: st1d { z0.d }, p0, [x0] -; SVE-NEXT: mov v0.16b, v20.16b -; SVE-NEXT: mov x9, #2 // =0x2 -; SVE-NEXT: st1d { z1.d }, p2, [x0, x9, lsl #3] -; SVE-NEXT: bsl v2.16b, v7.16b, v19.16b +; SVE-NEXT: bsl v2.16b, v7.16b, v22.16b ; SVE-NEXT: bsl v3.16b, v4.16b, v17.16b -; SVE-NEXT: bsl v0.16b, v5.16b, v18.16b +; SVE-NEXT: bsl v1.16b, v5.16b, v19.16b ; SVE-NEXT: stp q6, q2, [x1, #32] -; SVE-NEXT: stp q3, q0, [x1] +; SVE-NEXT: stp q3, q1, [x1] ; SVE-NEXT: ret %load = load <8 x i64>, ptr %ptr1, align 32 %load2 = load <8 x i64>, ptr %ptr2, align 32 @@ -1073,7 +1026,6 @@ define void @test_masked_store_unaligned_v4i32(<4 x i32> %data, ptr %ptr, <4 x i ; SVE-NEXT: add x8, x0, #1 ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 ; SVE-NEXT: shl v1.4s, v1.4s, #31 -; SVE-NEXT: cmlt v1.4s, v1.4s, #0 ; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; SVE-NEXT: st1w { z0.s }, p0, [x8] ; SVE-NEXT: ret @@ -1098,8 +1050,6 @@ define void @test_masked_store_unaligned_v4i64(<4 x i64> %data, ptr %ptr, <4 x i ; SVE-NEXT: ushll v2.2d, v2.2s, #0 ; SVE-NEXT: shl v3.2d, v3.2d, #63 ; SVE-NEXT: shl v2.2d, v2.2d, #63 -; SVE-NEXT: cmlt v3.2d, v3.2d, #0 -; SVE-NEXT: cmlt v2.2d, v2.2d, #0 ; SVE-NEXT: cmpne p1.d, p0/z, z3.d, #0 ; SVE-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; SVE-NEXT: st1d { z1.d }, p1, [x8] @@ -1127,8 +1077,6 @@ define void @test_masked_store_unaligned_v8i32(<8 x i32> %data, ptr %ptr, <8 x i ; SVE-NEXT: ushll v2.4s, v2.4h, #0 ; SVE-NEXT: shl v3.4s, v3.4s, #31 ; SVE-NEXT: shl v2.4s, v2.4s, 
#31 -; SVE-NEXT: cmlt v3.4s, v3.4s, #0 -; SVE-NEXT: cmlt v2.4s, v2.4s, #0 ; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0 ; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0 ; SVE-NEXT: st1w { z0.s }, p1, [x8] @@ -1168,12 +1116,8 @@ define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i ; SVE-NEXT: shl v4.2d, v4.2d, #63 ; SVE-NEXT: shl v5.2d, v5.2d, #63 ; SVE-NEXT: shl v6.2d, v6.2d, #63 -; SVE-NEXT: shl v7.2d, v7.2d, #63 -; SVE-NEXT: cmlt v4.2d, v4.2d, #0 -; SVE-NEXT: cmlt v5.2d, v5.2d, #0 -; SVE-NEXT: cmlt v6.2d, v6.2d, #0 ; SVE-NEXT: cmpne p1.d, p0/z, z4.d, #0 -; SVE-NEXT: cmlt v4.2d, v7.2d, #0 +; SVE-NEXT: shl v4.2d, v7.2d, #63 ; SVE-NEXT: cmpne p2.d, p0/z, z5.d, #0 ; SVE-NEXT: cmpne p3.d, p0/z, z6.d, #0 ; SVE-NEXT: cmpne p0.d, p0/z, z4.d, #0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll index 11cf4c31936d8..ebb2da9a3edd2 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -45,10 +45,10 @@ define i32 @cdotp_i8_rot0( %a, %b) { ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], 
[[IMAG_MUL_NEG]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -71,10 +71,10 @@ define i32 @cdotp_i8_rot0( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -96,10 +96,10 @@ vector.body: ; preds = %vector.body, %entry %b.real.ext = sext %b.real to %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.real.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.imag.ext %imag.mul.neg = sub zeroinitializer, %imag.mul - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, 
%imag.mul.neg) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -146,9 +146,9 @@ define i32 @cdotp_i8_rot90( %a, %b) { ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -171,9 +171,9 @@ define i32 @cdotp_i8_rot90( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -195,9 +195,9 @@ vector.body: ; preds = %vector.body, %entry %b.real.ext = sext %b.real to %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.imag.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.real.ext - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -244,9 +244,9 @@ define i32 @cdotp_i8_rot180( %a, %b) { ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call 
@llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -269,9 +269,9 @@ define i32 @cdotp_i8_rot180( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -293,9 +293,9 @@ vector.body: ; preds = %vector.body, %entry %b.real.ext = sext %b.real to %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.real.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.imag.ext - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + %partial.reduce.sub = call 
@llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -343,9 +343,9 @@ define i32 @cdotp_i8_rot270( %a, %b) { ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -369,9 +369,9 @@ define i32 @cdotp_i8_rot270( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] ; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -394,9 +394,9 @@ vector.body: ; preds = %vector.body, %entry %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.imag.ext %real.mul.neg = sub zeroinitializer, %real.mul - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) %imag.mul = mul %b.imag.ext, %a.real.ext - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -443,10 +443,10 @@ define i64 @cdotp_i16_rot0( %a, %b) { ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( 
[[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) @@ -469,10 +469,10 @@ define i64 @cdotp_i16_rot0( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) @@ -494,10 +494,10 @@ vector.body: ; preds = %vector.body, %entry %b.real.ext = sext %b.real to %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.real.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.imag.ext 
%imag.mul.neg = sub zeroinitializer, %imag.mul - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul.neg) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul.neg) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -544,9 +544,9 @@ define i64 @cdotp_i16_rot90( %a, %b) { ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) @@ -569,9 +569,9 @@ define i64 @cdotp_i16_rot90( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-NOSVE-NEXT: 
[[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) @@ -593,9 +593,9 @@ vector.body: ; preds = %vector.body, %entry %b.real.ext = sext %b.real to %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.imag.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.real.ext - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -642,9 +642,9 @@ define i64 @cdotp_i16_rot180( %a, %b) { ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( 
[[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) @@ -667,9 +667,9 @@ define i64 @cdotp_i16_rot180( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) @@ -691,9 +691,9 @@ vector.body: ; preds = %vector.body, %entry %b.real.ext = sext %b.real to %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.real.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.imag.ext - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( 
%real.mul.reduced, %imag.mul) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -741,9 +741,9 @@ define i64 @cdotp_i16_rot270( %a, %b) { ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) @@ -767,9 +767,9 @@ define i64 @cdotp_i16_rot270( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] ; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] 
= call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) @@ -792,9 +792,9 @@ vector.body: ; preds = %vector.body, %entry %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.imag.ext %real.mul.neg = sub zeroinitializer, %real.mul - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul.neg) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul.neg) %imag.mul = mul %b.imag.ext, %a.real.ext - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -822,10 +822,10 @@ define i32 @not_cdotp( %a, %b) { ; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] ; CHECK-SVE2-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] -; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) ; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE2: [[MIDDLE_BLOCK]]: ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -849,10 +849,10 @@ define i32 @not_cdotp( %a, %b) { ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] ; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -876,10 +876,10 @@ define i32 @not_cdotp( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] ; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) @@ -902,10 +902,10 @@ vector.body: ; preds = %vector.body, %entry %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.real.ext %real.mul.neg = sub zeroinitializer, %real.mul - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) %imag.mul = mul %b.imag.ext, %a.imag.ext %imag.mul.neg = sub zeroinitializer, %imag.mul - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -931,10 +931,10 @@ define i16 @invalid_type( %a, %b) { ; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-SVE2-NEXT: 
[[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE2: [[MIDDLE_BLOCK]]: ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) @@ -957,10 +957,10 @@ define i16 @invalid_type( %a, %b) { ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: 
[[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) @@ -983,10 +983,10 @@ define i16 @invalid_type( %a, %b) { ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) @@ -1008,10 +1008,10 @@ vector.body: ; preds = %vector.body, %entry %b.real.ext = sext %b.real to %b.imag.ext = sext %b.imag to %real.mul = mul %b.real.ext, %a.real.ext - %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( %vec.phi, %real.mul) + %real.mul.reduced = call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32( %vec.phi, %real.mul) %imag.mul = mul %b.imag.ext, %a.imag.ext %imag.mul.neg = sub zeroinitializer, %imag.mul - %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( %real.mul.reduced, %imag.mul.neg) + %partial.reduce.sub = call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32( %real.mul.reduced, %imag.mul.neg) br i1 true, label 
%middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -1037,10 +1037,10 @@ define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) { ; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> ; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> ; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) ; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] -; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE2: [[MIDDLE_BLOCK]]: ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) @@ -1063,10 +1063,10 @@ define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) { ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE-NEXT: 
[[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) @@ -1089,10 +1089,10 @@ define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) { ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] -; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x 
i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) @@ -1114,10 +1114,10 @@ vector.body: ; preds = %vector.body, %entry %b.real.ext = sext <16 x i8> %b.real to <16 x i32> %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32> %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext - %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul) + %real.mul.reduced = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul) %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul - %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg) + %partial.reduce.sub = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -1125,11 +1125,11 @@ middle.block: ; preds = %vector.body ret i32 %0 } -declare @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(, ) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) -declare @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) +declare @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32(, ) +declare @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) +declare @llvm.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) -declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>) +declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>) declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare i32 
@llvm.vector.reduce.add.nxv4i32() diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll index faefaf9bad7b1..d258ae0b376a1 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll @@ -38,12 +38,12 @@ define i32 @cdotp_i8_rot0( %a0, %b0, [[B1_IMAG]] to ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = mul nsw [[B0_IMAG_EXT]], [[A0_IMAG_EXT]] ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = mul nsw [[B1_IMAG_EXT]], [[A1_IMAG_EXT]] -; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP0]]) -; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI25]], [[TMP1]]) +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP0]]) +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI25]], [[TMP1]]) ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = sub nsw zeroinitializer, [[TMP2]] ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = sub nsw zeroinitializer, [[TMP3]] -; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE33]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP4]]) -; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE34]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE32]], [[TMP5]]) +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE33]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP4]]) +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE34]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE32]], [[TMP5]]) ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE2: [[MIDDLE_BLOCK]]: ; CHECK-SVE2-NEXT: [[BIN_RDX:%.*]] = add 
[[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]] @@ -81,12 +81,12 @@ define i32 @cdotp_i8_rot0( %a0, %b0, [[B1_IMAG]] to ; CHECK-SVE-NEXT: [[TMP2:%.*]] = mul nsw [[B0_IMAG_EXT]], [[A0_IMAG_EXT]] ; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nsw [[B1_IMAG_EXT]], [[A1_IMAG_EXT]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP0]]) -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI25]], [[TMP1]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP0]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI25]], [[TMP1]]) ; CHECK-SVE-NEXT: [[TMP4:%.*]] = sub nsw zeroinitializer, [[TMP2]] ; CHECK-SVE-NEXT: [[TMP5:%.*]] = sub nsw zeroinitializer, [[TMP3]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE33]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP4]]) -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE34]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE32]], [[TMP5]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE33]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP4]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE34]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE32]], [[TMP5]]) ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-SVE: [[MIDDLE_BLOCK]]: ; CHECK-SVE-NEXT: [[BIN_RDX:%.*]] = add [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]] @@ -124,12 +124,12 @@ define i32 @cdotp_i8_rot0( %a0, %b0, [[B1_IMAG]] to ; CHECK-NOSVE-NEXT: [[TMP2:%.*]] = mul nsw [[B0_IMAG_EXT]], [[A0_IMAG_EXT]] ; CHECK-NOSVE-NEXT: [[TMP3:%.*]] = mul nsw [[B1_IMAG_EXT]], [[A1_IMAG_EXT]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call 
@llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP0]]) -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI25]], [[TMP1]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP0]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI25]], [[TMP1]]) ; CHECK-NOSVE-NEXT: [[TMP4:%.*]] = sub nsw zeroinitializer, [[TMP2]] ; CHECK-NOSVE-NEXT: [[TMP5:%.*]] = sub nsw zeroinitializer, [[TMP3]] -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE33]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP4]]) -; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE34]] = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE32]], [[TMP5]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE33]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP4]]) +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE34]] = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE32]], [[TMP5]]) ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] ; CHECK-NOSVE: [[MIDDLE_BLOCK]]: ; CHECK-NOSVE-NEXT: [[BIN_RDX:%.*]] = add [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]] @@ -166,12 +166,12 @@ vector.body: ; preds = %vector.body, %entry %b1.imag.ext = sext %b1.imag to %24 = mul nsw %b0.imag.ext, %a0.imag.ext %25 = mul nsw %b1.imag.ext, %a1.imag.ext - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %18) - %partial.reduce32 = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi25, %19) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %18) + %partial.reduce32 = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi25, %19) %26 = 
sub nsw zeroinitializer, %24 %27 = sub nsw zeroinitializer, %25 - %partial.reduce33 = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %partial.reduce, %26) - %partial.reduce34 = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %partial.reduce32, %27) + %partial.reduce33 = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %partial.reduce, %26) + %partial.reduce34 = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %partial.reduce32, %27) br i1 true, label %middle.block, label %vector.body middle.block: ; preds = %vector.body @@ -180,11 +180,11 @@ middle.block: ; preds = %vector.body ret i32 %29 } -declare @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(, ) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) -declare @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) +declare @llvm.vector.partial.reduce.add.nxv8i16.nxv16i32(, ) +declare @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) +declare @llvm.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) -declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>) +declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>) declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare i32 @llvm.vector.reduce.add.nxv4i32() diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index c739be95cd243..d547b6bec5b83 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -505,3 +505,125 @@ entry: %s = call <4 x i128> @llvm.ctpop(<4 x i128> %d) ret <4 x i128> %s } + +define i8 @i8(i8 %x) { +; CHECK-SD-LABEL: i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: cnt v0.8b, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def 
$w0 killed $w0 def $x0 +; CHECK-GI-NEXT: and x8, x0, #0xff +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: cnt v0.8b, v0.8b +; CHECK-GI-NEXT: uaddlv h0, v0.8b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %s = call i8 @llvm.ctpop.i8(i8 %x) + ret i8 %s +} + +define i16 @i16_mask(i16 %x) { +; CHECK-SD-LABEL: i16_mask: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: cnt v0.8b, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i16_mask: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: and x8, x8, #0xffff +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: cnt v0.8b, v0.8b +; CHECK-GI-NEXT: uaddlv h0, v0.8b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %and = and i16 %x, 255 + %s = call i16 @llvm.ctpop.i16(i16 %and) + ret i16 %s +} + +define i32 @i32_mask(i32 %x) { +; CHECK-SD-LABEL: i32_mask: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: cnt v0.8b, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i32_mask: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: cnt v0.8b, v0.8b +; CHECK-GI-NEXT: uaddlv h0, v0.8b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %and = and i32 %x, 255 + %s = call i32 @llvm.ctpop.i32(i32 %and) + ret i32 %s +} + +define i32 @i32_mask_negative(i32 %x) { +; CHECK-SD-LABEL: i32_mask_negative: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: cnt v0.8b, v0.8b +; CHECK-SD-NEXT: addv b0, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i32_mask_negative: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: cnt v0.8b, v0.8b +; CHECK-GI-NEXT: uaddlv 
h0, v0.8b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %and = and i32 %x, 65535 + %s = call i32 @llvm.ctpop.i32(i32 %and) + ret i32 %s +} + +define i128 @i128_mask(i128 %x) { +; CHECK-SD-LABEL: i128_mask: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: and x8, x0, #0xff +; CHECK-SD-NEXT: mov x1, xzr +; CHECK-SD-NEXT: mov v0.d[0], x8 +; CHECK-SD-NEXT: cnt v0.16b, v0.16b +; CHECK-SD-NEXT: addv b0, v0.16b +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i128_mask: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and x8, x0, #0xff +; CHECK-GI-NEXT: mov x1, xzr +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], xzr +; CHECK-GI-NEXT: cnt v0.16b, v0.16b +; CHECK-GI-NEXT: uaddlv h0, v0.16b +; CHECK-GI-NEXT: mov w0, v0.s[0] +; CHECK-GI-NEXT: ret +entry: + %and = and i128 %x, 255 + %s = call i128 @llvm.ctpop.i128(i128 %and) + ret i128 %s +} diff --git a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll index 9da6f583cec01..3ea1a01cfc977 100644 --- a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll +++ b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll @@ -94,16 +94,10 @@ define double @scvtf_f64i32_neg(<4 x i32> %x) { ret double %conv } -; This test does not give the indended result of scvtf d0, s0 -; This is due to the input being loaded as a 2 item vector and -; therefore using vector inputs that do not match the pattern -; This test will be fixed in a future revision define <1 x double> @scvtf_f64i32_simple(<1 x i32> %x) { ; CHECK-LABEL: scvtf_f64i32_simple: ; CHECK: // %bb.0: -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: scvtf d0, s0 ; CHECK-NEXT: ret ; ; CHECK-NO-FPRCVT-LABEL: scvtf_f64i32_simple: @@ -315,16 +309,10 @@ define double @ucvtf_f64i32_neg(<4 x i32> %x) { ret double %conv } -; This test does not give the indended result of ucvtf d0, s0 -; 
This is due to the input being loaded as a 2 item vector and -; therefore using vector inputs that do not match the pattern -; This test will be fixed in a future revision define <1 x double> @ucvtf_f64i32_simple(<1 x i32> %x) { ; CHECK-LABEL: ucvtf_f64i32_simple: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ucvtf d0, s0 ; CHECK-NEXT: ret ; ; CHECK-NO-FPRCVT-LABEL: ucvtf_f64i32_simple: @@ -449,3 +437,24 @@ define <1 x float> @ucvtf_f32i64_simple(<1 x i64> %x) { %conv = uitofp <1 x i64> %x to <1 x float> ret <1 x float> %conv } + +define <1 x double> @uitofp_sext_v2i32_extract_lane0(<2 x i32> %x) { +; CHECK-LABEL: uitofp_sext_v2i32_extract_lane0: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +; +; CHECK-NO-FPRCVT-LABEL: uitofp_sext_v2i32_extract_lane0: +; CHECK-NO-FPRCVT: // %bb.0: +; CHECK-NO-FPRCVT-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NO-FPRCVT-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NO-FPRCVT-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NO-FPRCVT-NEXT: ret + %wide = sext <2 x i32> %x to <2 x i64> + %fpv2 = uitofp <2 x i64> %wide to <2 x double> + %lane0 = shufflevector <2 x double> %fpv2, <2 x double> poison, <1 x i32> zeroinitializer + ret <1 x double> %lane0 +} + diff --git a/llvm/test/CodeGen/AArch64/global-merge-external.ll b/llvm/test/CodeGen/AArch64/global-merge-external.ll new file mode 100644 index 0000000000000..7f2b1e38ea4fc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/global-merge-external.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc %s -o - -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-O2 +; RUN: llc -O3 %s -o - -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-O3 + +target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +@global0 = dso_local local_unnamed_addr global i32 0, align 4 +@global1 = dso_local local_unnamed_addr global i32 0, align 4 + +define dso_local i32 @func() { +; CHECK-O2-LABEL: func: +; CHECK-O2: // %bb.0: // %entry +; CHECK-O2-NEXT: adrp x8, global0 +; CHECK-O2-NEXT: adrp x9, global1 +; CHECK-O2-NEXT: ldr w8, [x8, :lo12:global0] +; CHECK-O2-NEXT: ldr w9, [x9, :lo12:global1] +; CHECK-O2-NEXT: add w0, w9, w8 +; CHECK-O2-NEXT: ret +; +; CHECK-O3-LABEL: func: +; CHECK-O3: // %bb.0: // %entry +; CHECK-O3-NEXT: adrp x8, .L_MergedGlobals +; CHECK-O3-NEXT: add x8, x8, :lo12:.L_MergedGlobals +; CHECK-O3-NEXT: ldp w9, w8, [x8] +; CHECK-O3-NEXT: add w0, w8, w9 +; CHECK-O3-NEXT: ret +entry: + %0 = load i32, ptr @global0, align 4 + %1 = load i32, ptr @global1, align 4 + %add = add nsw i32 %1, %0 + ret i32 %add +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/global-merge-minsize.ll b/llvm/test/CodeGen/AArch64/global-merge-minsize.ll index 8f569ecd9e634..f952580ba4540 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-minsize.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-minsize.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc %s -o - -verify-machineinstrs | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -7,12 +8,13 @@ target triple = "aarch64" @global1 = dso_local local_unnamed_addr global i32 0, align 4 define dso_local i32 @func() minsize optsize { -; CHECK-LABEL: @func -; CHECK: adrp x8, .L_MergedGlobals -; CHECK-NEXT: add x8, x8, :lo12:.L_MergedGlobals -; CHECK-NEXT: ldp w9, w8, [x8] -; CHECK-NEXT: add w0, w8, w9 -; CHECK-NEXT: ret +; CHECK-LABEL: func: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, .L_MergedGlobals +; CHECK-NEXT: add x8, x8, 
:lo12:.L_MergedGlobals +; CHECK-NEXT: ldp w9, w8, [x8] +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret entry: %0 = load i32, ptr @global0, align 4 %1 = load i32, ptr @global1, align 4 diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll index cdf2a962f9322..ca16df3c09ade 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -367,7 +367,6 @@ define i32 @ctz_v16i1(<16 x i1> %a) { ; NONSTREAMING-NEXT: shl v0.16b, v0.16b, #7 ; NONSTREAMING-NEXT: ptrue p0.b, vl16 ; NONSTREAMING-NEXT: ptrue p1.b -; NONSTREAMING-NEXT: cmlt v0.16b, v0.16b, #0 ; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: cntp x0, p0, p0.b @@ -396,7 +395,6 @@ define i32 @ctz_v16i1_poison(<16 x i1> %a) { ; NONSTREAMING-NEXT: shl v0.16b, v0.16b, #7 ; NONSTREAMING-NEXT: ptrue p0.b, vl16 ; NONSTREAMING-NEXT: ptrue p1.b -; NONSTREAMING-NEXT: cmlt v0.16b, v0.16b, #0 ; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: cntp x0, p0, p0.b @@ -425,7 +423,6 @@ define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) { ; NONSTREAMING-NEXT: shl v0.16b, v0.16b, #7 ; NONSTREAMING-NEXT: ptrue p0.b, vl16 ; NONSTREAMING-NEXT: ptrue p1.b -; NONSTREAMING-NEXT: cmlt v0.16b, v0.16b, #0 ; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: incp x0, p0.b @@ -453,7 +450,6 @@ define i32 @ctz_v8i1(<8 x i1> %a) { ; NONSTREAMING-NEXT: shl v0.8b, v0.8b, #7 ; NONSTREAMING-NEXT: ptrue p0.b, vl8 ; NONSTREAMING-NEXT: ptrue p1.b -; NONSTREAMING-NEXT: cmlt v0.8b, v0.8b, #0 ; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: cntp x0, p0, p0.b @@ -482,7 +478,6 @@ define i32 @ctz_v8i1_poison(<8 x i1> %a) { ; NONSTREAMING-NEXT: shl v0.8b, v0.8b, #7 ; 
NONSTREAMING-NEXT: ptrue p0.b, vl8 ; NONSTREAMING-NEXT: ptrue p1.b -; NONSTREAMING-NEXT: cmlt v0.8b, v0.8b, #0 ; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: cntp x0, p0, p0.b @@ -511,7 +506,6 @@ define i32 @ctz_v4i1(<4 x i1> %a) { ; NONSTREAMING-NEXT: shl v0.4h, v0.4h, #15 ; NONSTREAMING-NEXT: ptrue p0.h, vl4 ; NONSTREAMING-NEXT: ptrue p1.h -; NONSTREAMING-NEXT: cmlt v0.4h, v0.4h, #0 ; NONSTREAMING-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: cntp x0, p0, p0.h @@ -540,7 +534,6 @@ define i32 @ctz_v4i1_poison(<4 x i1> %a) { ; NONSTREAMING-NEXT: shl v0.4h, v0.4h, #15 ; NONSTREAMING-NEXT: ptrue p0.h, vl4 ; NONSTREAMING-NEXT: ptrue p1.h -; NONSTREAMING-NEXT: cmlt v0.4h, v0.4h, #0 ; NONSTREAMING-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: cntp x0, p0, p0.h @@ -569,7 +562,6 @@ define i32 @ctz_v2i1(<2 x i1> %a) { ; NONSTREAMING-NEXT: shl v0.2s, v0.2s, #31 ; NONSTREAMING-NEXT: ptrue p0.s, vl2 ; NONSTREAMING-NEXT: ptrue p1.s -; NONSTREAMING-NEXT: cmlt v0.2s, v0.2s, #0 ; NONSTREAMING-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: cntp x0, p0, p0.s @@ -598,7 +590,6 @@ define i32 @ctz_v2i1_poison(<2 x i1> %a) { ; NONSTREAMING-NEXT: shl v0.2s, v0.2s, #31 ; NONSTREAMING-NEXT: ptrue p0.s, vl2 ; NONSTREAMING-NEXT: ptrue p1.s -; NONSTREAMING-NEXT: cmlt v0.2s, v0.2s, #0 ; NONSTREAMING-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b ; NONSTREAMING-NEXT: cntp x0, p0, p0.s diff --git a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll index 474a9d1003e8c..46bd414568e05 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll @@ -142,7 +142,6 @@ define <16 x i1> @match_v16i8_v8i8(<16 x i8> %op1, <8 x i8> 
%op2, <16 x i1> %mas ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov z1.d, d1 -; CHECK-NEXT: cmlt v2.16b, v2.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 ; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff @@ -159,7 +158,6 @@ define <16 x i1> @match_v16i8_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %m ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: cmlt v2.16b, v2.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 ; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff @@ -177,7 +175,6 @@ define <8 x i1> @match_v8i8_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) # ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.d, d1 -; CHECK-NEXT: cmlt v2.8b, v2.8b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 ; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff @@ -206,7 +203,6 @@ define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: shl v2.8h, v2.8h, #15 -; CHECK-NEXT: cmlt v2.8h, v2.8h, #0 ; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 ; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff @@ -225,7 +221,6 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: cmlt v2.8b, v2.8b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 ; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff diff --git 
a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll index 8b8a3e430df69..1207eaa2612a3 100644 --- a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll +++ b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -O3 -mtriple arm64-linux -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-ASM ; This test checks that nomerge correctly prevents the traps from being merged ; in the compiled code. @@ -9,36 +10,43 @@ ; Function Attrs: noinline nounwind uwtable define dso_local void @f8(i32 noundef %i, i32 noundef %k) #0 { +; CHECK-ASM-LABEL: f8: +; CHECK-ASM: // %bb.0: // %entry +; CHECK-ASM-NEXT: sub sp, sp, #16 +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: .cfi_remember_state +; CHECK-ASM-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-ASM-NEXT: sxtw x8, w0 +; CHECK-ASM-NEXT: stp w1, w0, [sp, #8] +; CHECK-ASM-NEXT: cmp x8, #10 +; CHECK-ASM-NEXT: b.hi .LBB0_5 +; CHECK-ASM-NEXT: // %bb.1: // %entry +; CHECK-ASM-NEXT: mov w9, #10 // =0xa +; CHECK-ASM-NEXT: sub x9, x9, x8 +; CHECK-ASM-NEXT: cbz x9, .LBB0_5 +; CHECK-ASM-NEXT: // %bb.2: +; CHECK-ASM-NEXT: ldrsw x9, [sp, #8] +; CHECK-ASM-NEXT: adrp x10, .L_MergedGlobals +; CHECK-ASM-NEXT: add x10, x10, :lo12:.L_MergedGlobals +; CHECK-ASM-NEXT: strb wzr, [x10, x8] +; CHECK-ASM-NEXT: cmp x9, #10 +; CHECK-ASM-NEXT: b.hi .LBB0_6 +; CHECK-ASM-NEXT: // %bb.3: +; CHECK-ASM-NEXT: mov w8, #10 // =0xa +; CHECK-ASM-NEXT: sub x8, x8, x9 +; CHECK-ASM-NEXT: cbz x8, .LBB0_6 +; CHECK-ASM-NEXT: // %bb.4: +; CHECK-ASM-NEXT: add x8, x10, x9 +; CHECK-ASM-NEXT: strb wzr, [x8, #10] +; CHECK-ASM-NEXT: add sp, sp, #16 +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 0 +; CHECK-ASM-NEXT: ret +; CHECK-ASM-NEXT: .LBB0_5: // %trap +; CHECK-ASM-NEXT: .cfi_restore_state +; CHECK-ASM-NEXT: brk #0x1 +; CHECK-ASM-NEXT: .LBB0_6: // %trap3 +; CHECK-ASM-NEXT: 
brk #0x1 entry: -; CHECK-ASM: cmp x8, #10 -; CHECK-ASM: b.hi .LBB0_5 -; CHECK-ASM: // %bb.1: // %entry -; CHECK-ASM: mov w9, #10 // =0xa -; CHECK-ASM: sub x9, x9, x8 -; CHECK-ASM: cbz x9, .LBB0_5 -; CHECK-ASM: // %bb.2: -; CHECK-ASM: ldrsw x9, [sp, #8] -; CHECK-ASM: adrp x10, B -; CHECK-ASM: add x10, x10, :lo12:B -; CHECK-ASM: strb wzr, [x10, x8] -; CHECK-ASM: cmp x9, #10 -; CHECK-ASM: b.hi .LBB0_6 -; CHECK-ASM: // %bb.3: -; CHECK-ASM: mov w8, #10 // =0xa -; CHECK-ASM: sub x8, x8, x9 -; CHECK-ASM: cbz x8, .LBB0_6 -; CHECK-ASM: // %bb.4: -; CHECK-ASM: adrp x8, B2 -; CHECK-ASM: add x8, x8, :lo12:B2 -; CHECK-ASM: strb wzr, [x8, x9] -; CHECK-ASM: add sp, sp, #16 -; CHECK-ASM: .cfi_def_cfa_offset 0 -; CHECK-ASM: ret -; CHECK-ASM: .LBB0_5: // %trap -; CHECK-ASM: .cfi_restore_state -; CHECK-ASM: brk #0x1 -; CHECK-ASM: .LBB0_6: // %trap3 -; CHECK-ASM: brk #0x1 %i.addr = alloca i32, align 4 %k.addr = alloca i32, align 4 store i32 %i, ptr %i.addr, align 4 diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 2d81a264e02bc..c38516fc57bbd 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -26,7 +26,7 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = zext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -96,7 +96,7 @@ vector.body: %load2 = load <16 x i8>, ptr %gep2, align 16 %load2.wide = zext <16 x i8> %load2 to <16 x i32> %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide - %partial.reduce = tail call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) %index.next = add nuw i64 %index, 16 %cmp = icmp eq i64 %index.next, 16 br i1 %cmp, label %end, label %vector.body @@ -133,7 +133,7 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide - %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult) + %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult) ret <2 x i32> %partial.reduce } @@ -160,7 +160,7 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { %u.wide = sext <16 x i8> %u to <16 x i32> %s.wide = sext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -192,7 +192,7 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { %u.wide = sext <8 x i8> %u to <8 x i32> %s.wide = sext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide - %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult) + %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult) ret <2 x i32> %partial.reduce } @@ -228,7 +228,7 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = sext <16 x i8> %s to <16 
x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -307,7 +307,7 @@ vector.body: %load2 = load <16 x i8>, ptr %gep2, align 16 %load2.wide = zext <16 x i8> %load2 to <16 x i32> %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) %index.next = add nuw i64 %index, 16 %cmp = icmp eq i64 %index.next, 16 br i1 %cmp, label %end, label %vector.body @@ -358,7 +358,7 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = sext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide - %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult) + %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult) ret <2 x i32> %partial.reduce } @@ -394,7 +394,7 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{ %s.wide = sext <16 x i8> %u to <16 x i32> %u.wide = zext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %u.wide, %s.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -473,7 +473,7 @@ vector.body: %load2 = load <16 x i8>, ptr %gep2, align 16 %load2.wide = sext <16 x 
i8> %load2 to <16 x i32> %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) %index.next = add nuw i64 %index, 16 %cmp = icmp eq i64 %index.next, 16 br i1 %cmp, label %end, label %vector.body @@ -524,7 +524,7 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ %u.wide = sext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide - %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult) + %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult) ret <2 x i32> %partial.reduce } @@ -566,7 +566,7 @@ entry: %a.wide = zext <16 x i8> %a to <16 x i64> %b.wide = zext <16 x i8> %b to <16 x i64> %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64( + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v16i64( <4 x i64> %acc, <16 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -609,7 +609,7 @@ entry: %a.wide = sext <16 x i8> %a to <16 x i64> %b.wide = sext <16 x i8> %b to <16 x i64> %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64( + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v16i64( <4 x i64> %acc, <16 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -674,7 +674,7 @@ entry: %a.wide = zext <16 x i8> %a to <16 x i64> %b.wide = sext <16 x i8> %b to <16 x i64> %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide - %partial.reduce = tail call <4 x i64> 
@llvm.experimental.vector.partial.reduce.add.v4i64.v16i64( + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v16i64( <4 x i64> %acc, <16 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -739,7 +739,7 @@ entry: %a.wide = sext <16 x i8> %a to <16 x i64> %b.wide = zext <16 x i8> %b to <16 x i64> %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64( + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v16i64( <4 x i64> %acc, <16 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -767,7 +767,7 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ ; CHECK-DOT-I8MM-NEXT: udot v0.4s, v1.16b, v2.16b ; CHECK-DOT-I8MM-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i32> - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) ret <4 x i32> %partial.reduce } @@ -832,7 +832,7 @@ vector.body: %gep = getelementptr i8, ptr %p, i64 %index %load = load <16 x i8>, ptr %gep, align 16 %load.wide = zext <16 x i8> %load to <16 x i32> - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %load.wide) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %load.wide) %index.next = add nuw i64 %index, 16 %cmp = icmp eq i64 %index.next, 16 br i1 %cmp, label %end, label %vector.body @@ -864,7 +864,7 @@ define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ ; CHECK-DOT-I8MM-NEXT: sdot v0.4s, v1.16b, v2.16b ; CHECK-DOT-I8MM-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i32> - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) + 
%partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) ret <4 x i32> %partial.reduce } @@ -896,7 +896,7 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ ; CHECK-DOT-I8MM-NEXT: udot v0.2s, v1.8b, v2.8b ; CHECK-DOT-I8MM-NEXT: ret %a.wide = zext <8 x i8> %a to <8 x i32> - %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) + %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) ret <2 x i32> %partial.reduce } @@ -928,7 +928,7 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ ; CHECK-DOT-I8MM-NEXT: sdot v0.2s, v1.8b, v2.8b ; CHECK-DOT-I8MM-NEXT: ret %a.wide = sext <8 x i8> %a to <8 x i32> - %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) + %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) ret <2 x i32> %partial.reduce } @@ -969,7 +969,7 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-DOT-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s ; CHECK-DOT-I8MM-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i64> - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) ret <4 x i64> %partial.reduce } @@ -1010,7 +1010,7 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s ; CHECK-DOT-I8MM-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i64> - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) + %partial.reduce = tail call 
<4 x i64> @llvm.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) ret <4 x i64> %partial.reduce } @@ -1024,7 +1024,7 @@ define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <8 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <8 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -1042,7 +1042,7 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) { %u.wide = zext <4 x i8> %u to <4 x i32> %s.wide = zext <4 x i8> %s to <4 x i32> %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide - %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <4 x i32> %mult) + %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <4 x i32> %mult) ret <2 x i32> %partial.reduce } @@ -1063,7 +1063,7 @@ entry: %a.wide = zext <8 x i16> %a to <8 x i64> %b.wide = zext <8 x i8> %b to <8 x i64> %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) ret <2 x i64> %partial.reduce } @@ -1084,7 +1084,7 @@ entry: %a.wide = sext <8 x i16> %a to <8 x i64> %b.wide = sext <8 x i8> %b to <8 x i64> %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) ret <2 x i64> 
%partial.reduce } @@ -1105,7 +1105,7 @@ entry: %a.wide = zext <8 x i16> %a to <8 x i64> %b.wide = sext <8 x i8> %b to <8 x i64> %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) ret <2 x i64> %partial.reduce } @@ -1126,7 +1126,7 @@ entry: %a.wide = sext <8 x i16> %a to <8 x i64> %b.wide = zext <8 x i8> %b to <8 x i64> %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) ret <2 x i64> %partial.reduce } @@ -1227,10 +1227,10 @@ vector.body: %sext1 = sext <16 x i8> %load1 to <16 x i32> %zext = zext <16 x i8> %load3 to <16 x i32> %mul1 = mul <16 x i32> %sext1, %zext - %psum1 = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc1, <16 x i32> %mul1) + %psum1 = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc1, <16 x i32> %mul1) %sext2 = sext <16 x i8> %load2 to <16 x i32> %mul2 = mul <16 x i32> %sext2, %zext - %psum2 = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc2, <16 x i32> %mul2) + %psum2 = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc2, <16 x i32> %mul2) %iv.next = add i64 %iv, 16 %1 = icmp eq i64 %iv.next, 1024 br i1 %1, label %end, label %vector.body @@ -1252,6 +1252,6 @@ define <2 x i64> @udot_16to64(<2 x i64> %acc, <8 x i16> %input){ ; CHECK-COMMON-NEXT: ret entry: %input.wide = zext <8 x i16> %input to <8 x i64> - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide) + 
%partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide) ret <2 x i64> %partial.reduce } diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll index c3828c3d695c4..139adb295ac7c 100644 --- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll @@ -10,7 +10,7 @@ define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret entry: - %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0) + %partial.reduce = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0) ret <4 x i32> %partial.reduce } @@ -21,7 +21,7 @@ define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32 ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret entry: - %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0) + %partial.reduce = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0) ret <4 x i32> %partial.reduce } @@ -31,7 +31,7 @@ define @partial_reduce_add( %accumulator, < ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32( %accumulator, %0) + %partial.reduce = call @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32( %accumulator, %0) ret %partial.reduce } @@ -42,7 +42,7 @@ define @partial_reduce_add_half( %accumulat ; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %accumulator, %0) + %partial.reduce = call @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( 
%accumulator, %0) ret %partial.reduce } @@ -55,7 +55,7 @@ define @partial_reduce_add_quart( %accumula ; CHECK-NEXT: add z0.s, z0.s, z4.s ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %accumulator, %0) + %partial.reduce = call @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %accumulator, %0) ret %partial.reduce } @@ -68,14 +68,14 @@ define @partial_reduce_add_half_8( %accumul ; CHECK-NEXT: add z1.s, z1.s, z5.s ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32( %accumulator, %0) + %partial.reduce = call @llvm.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32( %accumulator, %0) ret %partial.reduce } -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(, ) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(, ) -declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(, ) -declare @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(, ) +declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(, ) +declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(, ) +declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(, ) +declare @llvm.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(, ) declare i32 @llvm.vector.reduce.add.nxv4i32() declare i32 @llvm.vector.reduce.add.nxv8i32() diff --git a/llvm/test/CodeGen/AArch64/pr157252.mir b/llvm/test/CodeGen/AArch64/pr157252.mir new file mode 100644 index 0000000000000..319e54f0fa7e9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr157252.mir @@ -0,0 +1,25 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass prologepilog -frame-pointer=none -o - %s | FileCheck %s +--- +name: test_addsxri_scalable_offset +stack: + - { id: 0, type: default, size: 4, alignment: 4, 
stack-id: default } + - { id: 1, type: default, size: 16, alignment: 16, stack-id: scalable-vector } +body: | + bb.0: + ; CHECK-LABEL: name: test_addsxri_scalable_offset + ; CHECK: liveins: $fp + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2) + ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 + ; CHECK-NEXT: $x8 = ADDXri $sp, 12, 0 + ; CHECK-NEXT: $x8 = ADDVL_XXI $x8, 1, implicit $vg + ; CHECK-NEXT: $x8 = ADDSXri $x8, 0, 0, implicit-def $nzcv + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg + ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2) + ; CHECK-NEXT: RET_ReallyLR implicit $x8 + $x8 = ADDSXri %stack.0, 0, 0, implicit-def $nzcv + RET_ReallyLR implicit $x8 +... 
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 599fa510d4aea..1cb92e46cbcd1 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -88,7 +88,7 @@ define i8 @ui8_7(i8 %a, i8 %b) { ; CHECK-GI-NEXT: sub w9, w0, w8 ; CHECK-GI-NEXT: ubfx w9, w9, #1, #7 ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: ubfx w8, w8, #2, #6 +; CHECK-GI-NEXT: lsr w8, w8, #2 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -207,7 +207,7 @@ define i16 @ui16_7(i16 %a, i16 %b) { ; CHECK-GI-NEXT: sub w9, w0, w8 ; CHECK-GI-NEXT: ubfx w9, w9, #1, #15 ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: ubfx w8, w8, #2, #14 +; CHECK-GI-NEXT: lsr w8, w8, #2 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll index cf42db7aa65bd..b58a857f3a3cb 100644 --- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll @@ -47,12 +47,18 @@ define void @fbyte( %v) #0{ ; NOPAIR-NEXT: // %bb.1: ; NOPAIR-NEXT: smstop sm ; NOPAIR-NEXT: .LBB0_2: +; NOPAIR-NEXT: rdvl x8, #1 +; NOPAIR-NEXT: addsvl x8, x8, #-1 +; NOPAIR-NEXT: cbz x8, .LBB0_4 +; NOPAIR-NEXT: // %bb.3: +; NOPAIR-NEXT: brk #0x1 +; NOPAIR-NEXT: .LBB0_4: ; NOPAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; NOPAIR-NEXT: bl my_func2 -; NOPAIR-NEXT: tbz w19, #0, .LBB0_4 -; NOPAIR-NEXT: // %bb.3: +; NOPAIR-NEXT: tbz w19, #0, .LBB0_6 +; NOPAIR-NEXT: // %bb.5: ; NOPAIR-NEXT: smstart sm -; NOPAIR-NEXT: .LBB0_4: +; NOPAIR-NEXT: .LBB0_6: ; NOPAIR-NEXT: addvl sp, sp, #1 ; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload @@ -127,12 +133,18 @@ define void @fbyte( %v) #0{ ; PAIR-NEXT: // %bb.1: 
; PAIR-NEXT: smstop sm ; PAIR-NEXT: .LBB0_2: +; PAIR-NEXT: rdvl x8, #1 +; PAIR-NEXT: addsvl x8, x8, #-1 +; PAIR-NEXT: cbz x8, .LBB0_4 +; PAIR-NEXT: // %bb.3: +; PAIR-NEXT: brk #0x1 +; PAIR-NEXT: .LBB0_4: ; PAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; PAIR-NEXT: bl my_func2 -; PAIR-NEXT: tbz w19, #0, .LBB0_4 -; PAIR-NEXT: // %bb.3: +; PAIR-NEXT: tbz w19, #0, .LBB0_6 +; PAIR-NEXT: // %bb.5: ; PAIR-NEXT: smstart sm -; PAIR-NEXT: .LBB0_4: +; PAIR-NEXT: .LBB0_6: ; PAIR-NEXT: addvl sp, sp, #1 ; PAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll index 5d10d7e13da14..06c53d8070781 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll @@ -1,46 +1,89 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s -define i64 @sme_cntsb() { -; CHECK-LABEL: sme_cntsb: +define i64 @cntsb() { +; CHECK-LABEL: cntsb: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x0, #1 ; CHECK-NEXT: ret - %v = call i64 @llvm.aarch64.sme.cntsb() - ret i64 %v + %1 = call i64 @llvm.aarch64.sme.cntsd() + %res = shl nuw nsw i64 %1, 3 + ret i64 %res } -define i64 @sme_cntsh() { -; CHECK-LABEL: sme_cntsh: +define i64 @cntsh() { +; CHECK-LABEL: cntsh: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: lsr x0, x8, #1 ; CHECK-NEXT: ret - %v = call i64 @llvm.aarch64.sme.cntsh() - ret i64 %v + %1 = call i64 @llvm.aarch64.sme.cntsd() + %res = shl nuw nsw i64 %1, 2 + ret i64 %res } -define i64 @sme_cntsw() { -; CHECK-LABEL: sme_cntsw: +define i64 @cntsw() { +; CHECK-LABEL: cntsw: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: lsr x0, x8, #2 ; CHECK-NEXT: ret - %v = call i64 @llvm.aarch64.sme.cntsw() - ret i64 %v + %1 = call i64 
@llvm.aarch64.sme.cntsd() + %res = shl nuw nsw i64 %1, 1 + ret i64 %res } -define i64 @sme_cntsd() { -; CHECK-LABEL: sme_cntsd: +define i64 @cntsd() { +; CHECK-LABEL: cntsd: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: lsr x0, x8, #3 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sme.cntsd() + ret i64 %res +} + +define i64 @sme_cntsb_mul() { +; CHECK-LABEL: sme_cntsb_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x0, #4 +; CHECK-NEXT: ret + %v = call i64 @llvm.aarch64.sme.cntsd() + %shl = shl nuw nsw i64 %v, 3 + %res = mul nuw nsw i64 %shl, 4 + ret i64 %res +} + +define i64 @sme_cntsh_mul() { +; CHECK-LABEL: sme_cntsh_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x0, #4 +; CHECK-NEXT: ret + %v = call i64 @llvm.aarch64.sme.cntsd() + %shl = shl nuw nsw i64 %v, 2 + %res = mul nuw nsw i64 %shl, 8 + ret i64 %res +} + +define i64 @sme_cntsw_mul() { +; CHECK-LABEL: sme_cntsw_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x0, #4 +; CHECK-NEXT: ret + %v = call i64 @llvm.aarch64.sme.cntsd() + %shl = shl nuw nsw i64 %v, 1 + %res = mul nuw nsw i64 %shl, 16 + ret i64 %res +} + +define i64 @sme_cntsd_mul() { +; CHECK-LABEL: sme_cntsd_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x0, #4 ; CHECK-NEXT: ret %v = call i64 @llvm.aarch64.sme.cntsd() - ret i64 %v + %res = mul nuw nsw i64 %v, 32 + ret i64 %res } -declare i64 @llvm.aarch64.sme.cntsb() -declare i64 @llvm.aarch64.sme.cntsh() -declare i64 @llvm.aarch64.sme.cntsw() declare i64 @llvm.aarch64.sme.cntsd() diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index 80827c2547780..cab094e638cdf 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -224,22 +224,21 @@ define float @test6(float %f) nounwind "aarch64_pstate_sm_enabled" { define void @test7() nounwind "aarch64_inout_zt0" { ; CHECK-LABEL: test7: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #144 -; CHECK-NEXT: stp x30, x19, [sp, #128] // 
16-byte Folded Spill -; CHECK-NEXT: add x19, sp, #64 +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #128] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #144 +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret call void @callee() call void @callee() @@ -527,14 +526,24 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbnz x8, .LBB14_2 +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: bl callee_farg_fret ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB14_3 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB14_3: ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl callee_farg_fret ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll new file mode 100644 index 0000000000000..0ac46085d683f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -stop-before=finalize-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BEFORE-ISEL +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-AFTER-ISEL + +target triple = "aarch64-unknown-linux-gnu" + +declare void @bar_enabled() #0 +declare void @bar() +declare @bar_retv_enabled() #0 +declare @bar_retv() + +; Non-streaming -> calls streaming callee +define void @foo_non_streaming_pass_arg(ptr %arg) { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_non_streaming_pass_arg + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_non_streaming_pass_arg + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; 
CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-AFTER-ISEL-NEXT: BL @bar_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +; Streaming -> calls non-streaming callee +define void @foo_streaming_pass_arg(ptr %arg) #0 { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_streaming_pass_arg + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; 
CHECK-BEFORE-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-BEFORE-ISEL-NEXT: BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_streaming_pass_arg + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; 
CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-AFTER-ISEL-NEXT: BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = load , ptr %arg, align 16 + tail call void @bar( %v) + ret void +} + +; Non-streaming -> returns SVE value from streaming callee +define void @foo_non_streaming_retval(ptr %ptr) { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_non_streaming_retval + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-BEFORE-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 
:: (store () into %ir.ptr) + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_non_streaming_retval + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY3]] + ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY4]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret 
void +} + +; Streaming -> returns SVE value from non-streaming callee +define void @foo_streaming_retval(ptr %ptr) #0 { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_streaming_retval + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-BEFORE-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_streaming_retval + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; 
CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY3]] + ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY4]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = tail call @bar_retv() + store %v, ptr %ptr, align 16 + ret void +} + +attributes #0 = { "aarch64_pstate_sm_enabled" } +attributes #1 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll new file mode 100644 index 0000000000000..a1eb1ceeaf19b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll @@ -0,0 +1,478 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +declare void @bar_enabled() #0 +declare void @bar() +declare @bar_retv_enabled() #0 +declare @bar_retv() + +; Non-streaming -> 
calls streaming callee +define void @foo_non_streaming_pass_arg(ptr %arg) { +; CHECK-LABEL: foo_non_streaming_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -8 +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB0_2: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: sub x8, x29, #64 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: bl bar_enabled +; CHECK-NEXT: smstop sm +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 96 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 
16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +; Streaming-compatible -> calls streaming callee +define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 { +; CHECK-LABEL: foo_streaming_compatible_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: .cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 
+; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mrs x19, SVCR +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB1_2: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w19, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: // %entry +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: bl bar_enabled +; CHECK-NEXT: tbnz w19, #0, .LBB1_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_6: // %entry +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +; Streaming -> calls non-streaming callee +define void @foo_streaming_pass_arg(ptr %arg) #0 { +; CHECK-LABEL: 
foo_streaming_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1120 +; CHECK-NEXT: .cfi_def_cfa_offset 1120 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w28, -8 +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -1064 +; CHECK-NEXT: .cfi_offset b9, -1072 +; CHECK-NEXT: .cfi_offset b10, -1080 +; CHECK-NEXT: .cfi_offset b11, -1088 +; CHECK-NEXT: .cfi_offset b12, -1096 +; CHECK-NEXT: .cfi_offset b13, -1104 +; CHECK-NEXT: .cfi_offset b14, -1112 +; CHECK-NEXT: .cfi_offset b15, -1120 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 2144 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB2_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB2_2: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: bl bar +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 1120 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1120 +; CHECK-NEXT: 
.cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar( %v) + ret void +} + +; Non-streaming -> returns SVE value from streaming callee +define void @foo_non_streaming_retval(ptr %ptr) { +; CHECK-LABEL: foo_non_streaming_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 112 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x28, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -56 +; CHECK-NEXT: .cfi_offset b9, -64 +; CHECK-NEXT: .cfi_offset b10, -72 +; CHECK-NEXT: .cfi_offset b11, -80 +; CHECK-NEXT: .cfi_offset b12, -88 +; CHECK-NEXT: .cfi_offset b13, -96 +; CHECK-NEXT: .cfi_offset b14, -104 +; CHECK-NEXT: .cfi_offset b15, -112 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB3_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstart sm +; 
CHECK-NEXT: bl bar_retv_enabled +; CHECK-NEXT: sub x8, x29, #64 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 112 +; CHECK-NEXT: ldp x28, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret void +} + +; Streaming-compatible -> returns SVE value from streaming callee +define void @foo_streaming_compatible_retval(ptr %ptr) #1 { +; CHECK-LABEL: foo_streaming_compatible_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; 
CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: .cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mrs x20, SVCR +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB4_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: tbnz w20, #0, .LBB4_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB4_4: // %entry +; CHECK-NEXT: bl bar_retv_enabled +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w20, #0, .LBB4_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB4_6: // %entry +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x20, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, 
#1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w20 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret void +} + +; Streaming -> returns SVE value from non-streaming callee +define void @foo_streaming_retval(ptr %ptr) #0 { +; CHECK-LABEL: foo_streaming_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: 
.cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB5_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB5_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: bl bar_retv +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv() + store %v, ptr %ptr, align 16 + ret void +} + +attributes #0 = { "aarch64_pstate_sm_enabled" } 
+attributes #1 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 9088986ee9b72..f2163ad15bafc 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -209,13 +209,19 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors(, align 16 %Data2 = alloca , align 16 %Data3 = alloca , align 16 - %0 = tail call i64 @llvm.aarch64.sme.cntsb() + %0 = tail call i64 @llvm.aarch64.sme.cntsd() call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0) %1 = load , ptr %Data1, align 16 %vecext = extractelement %1, i64 0 ret i8 %vecext } -declare i64 @llvm.aarch64.sme.cntsb() +declare i64 @llvm.aarch64.sme.cntsd() declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 8c4d57e244e03..505a40c16653b 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -366,9 +366,10 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: rdsvl x3, #1 +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: addvl x0, sp, #2 ; CHECK-NEXT: addvl x1, sp, #1 +; CHECK-NEXT: lsr x3, x8, #3 ; CHECK-NEXT: mov x2, sp ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl foo @@ -386,7 +387,7 @@ entry: %Data1 = alloca , align 16 %Data2 = alloca , align 16 %Data3 = alloca , align 16 - %0 = tail call i64 @llvm.aarch64.sme.cntsb() + %0 = tail call i64 @llvm.aarch64.sme.cntsd() call void @foo(ptr noundef nonnull 
%Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0) %1 = load , ptr %Data1, align 16 %vecext = extractelement %1, i64 0 @@ -421,7 +422,7 @@ entry: ret void } -declare i64 @llvm.aarch64.sme.cntsb() +declare i64 @llvm.aarch64.sme.cntsd() declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef) declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef) diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index c72077bd311b4..125cea7dc469a 100644 --- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -376,6 +376,12 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB3_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: bl scalable_callee ; CHECK-NEXT: smstart sm @@ -472,6 +478,12 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: //APP ; FP-CHECK-NEXT: //NO_APP ; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: rdvl x8, #1 +; FP-CHECK-NEXT: addsvl x8, x8, #-1 +; FP-CHECK-NEXT: cbz x8, .LBB3_2 +; FP-CHECK-NEXT: // %bb.1: +; FP-CHECK-NEXT: brk #0x1 +; FP-CHECK-NEXT: .LBB3_2: ; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload ; FP-CHECK-NEXT: bl scalable_callee ; FP-CHECK-NEXT: smstart sm diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 49eb368662b5d..2583a93e514a2 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -386,3 +386,43 @@ define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0" call void %callee() "aarch64_inout_za" 
"aarch64_in_zt0"; ret void; } + + +define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwind { +; CHECK-COMMON-LABEL: zt0_multiple_private_za_calls: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x20, sp +; CHECK-COMMON-NEXT: mov x19, x0 +; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str zt0, [x20] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x20] +; CHECK-COMMON-NEXT: str zt0, [x20] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x19 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x20] +; CHECK-COMMON-NEXT: str zt0, [x20] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x19 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x20] +; CHECK-COMMON-NEXT: str zt0, [x20] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x19 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x20] +; CHECK-COMMON-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ret + call void %callee() + call void %callee() + call void %callee() + call void %callee() + ret void +} diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll index a23854759d688..33a4ecd56e35b 100644 --- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll +++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll @@ -2,7 +2,7 @@ ; We should have both spill and reload for %arg. 
-; CHECK: remark: :0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost generated in function +; CHECK: remark: :0:0: 2 spills 1.500000e+00 total spills cost 3 reloads 1.500000e+00 total reloads cost generated in function define @streaming_compatible_with_predicate_vectors( %arg) "aarch64_pstate_sm_compatible" nounwind #0 { %res = call @normal_callee_predicate_vec_arg( %arg) %and = and %res, %arg diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll index 55d37d1bda5e4..69fb3d88af02a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll @@ -13,7 +13,6 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -28,7 +27,6 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: shl v0.8h, v0.8h, #15 -; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -43,7 +41,6 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 -; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -58,7 +55,6 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) { ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: shl 
v0.2d, v0.2d, #63 -; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -74,7 +70,6 @@ define <2 x double> @masked_load_passthru_v2f64(ptr %src, <2 x i1> %mask, <2 x d ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: shl v0.2d, v0.2d, #63 -; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll index 1a19b77f53c67..4570b50c96ad3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll @@ -13,7 +13,6 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] @@ -28,7 +27,6 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: shl v0.8h, v0.8h, #15 -; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] @@ -43,7 +41,6 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 -; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] @@ -58,7 +55,6 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) { ; CHECK-NEXT: 
ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: shl v0.2d, v0.2d, #63 -; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll index cc19f6c2cbbc8..e71d983664cd9 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll @@ -35,7 +35,7 @@ define <8 x i16> @two_way_i8_i16_vl128(ptr %accptr, ptr %uptr, ptr %sptr) { %u.wide = zext <16 x i8> %u to <16 x i16> %s.wide = zext <16 x i8> %s to <16 x i16> %mult = mul nuw nsw <16 x i16> %s.wide, %u.wide - %partial.reduce = tail call <8 x i16> @llvm.experimental.vector.partial.reduce.add(<8 x i16> %acc, <16 x i16> %mult) + %partial.reduce = tail call <8 x i16> @llvm.vector.partial.reduce.add(<8 x i16> %acc, <16 x i16> %mult) ret <8 x i16> %partial.reduce } @@ -70,7 +70,7 @@ define <16 x i16> @two_way_i8_i16_vl128_double_width(ptr %accptr, ptr %uptr, ptr %u.wide = zext <32 x i8> %u to <32 x i16> %s.wide = zext <32 x i8> %s to <32 x i16> %mult = mul nuw nsw <32 x i16> %s.wide, %u.wide - %partial.reduce = tail call <16 x i16> @llvm.experimental.vector.partial.reduce.add(<16 x i16> %acc, <32 x i16> %mult) + %partial.reduce = tail call <16 x i16> @llvm.vector.partial.reduce.add(<16 x i16> %acc, <32 x i16> %mult) ret <16 x i16> %partial.reduce } @@ -124,7 +124,7 @@ define <16 x i16> @two_way_i8_i16_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal %u.wide = zext <32 x i8> %u to <32 x i16> %s.wide = zext <32 x i8> %s to <32 x i16> %mult = mul nuw nsw <32 x i16> %s.wide, %u.wide - %partial.reduce = tail call <16 x i16> @llvm.experimental.vector.partial.reduce.add(<16 x i16> %acc, <32 x i16> %mult) + %partial.reduce = tail call <16 x i16> @llvm.vector.partial.reduce.add(<16 x i16> %acc, <32 x i16> %mult) 
ret <16 x i16> %partial.reduce } @@ -158,7 +158,7 @@ define <4 x i32> @two_way_i16_i32_vl128(ptr %accptr, ptr %uptr, ptr %sptr) { %u.wide = zext <8 x i16> %u to <8 x i32> %s.wide = zext <8 x i16> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <8 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <8 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -193,7 +193,7 @@ define <8 x i32> @two_way_i16_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr %u.wide = zext <16 x i16> %u to <16 x i32> %s.wide = zext <16 x i16> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide - %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <8 x i32> @llvm.vector.partial.reduce.add(<8 x i32> %acc, <16 x i32> %mult) ret <8 x i32> %partial.reduce } @@ -247,7 +247,7 @@ define <8 x i32> @two_way_i16_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal %u.wide = zext <16 x i16> %u to <16 x i32> %s.wide = zext <16 x i16> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide - %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <8 x i32> @llvm.vector.partial.reduce.add(<8 x i32> %acc, <16 x i32> %mult) ret <8 x i32> %partial.reduce } @@ -281,7 +281,7 @@ define <2 x i64> @two_way_i32_i64_vl128(ptr %accptr, ptr %uptr, ptr %sptr) { %u.wide = zext <4 x i32> %u to <4 x i64> %s.wide = zext <4 x i32> %s to <4 x i64> %mult = mul nuw nsw <4 x i64> %s.wide, %u.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <4 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add(<2 x i64> %acc, <4 x i64> %mult) ret <2 x i64> %partial.reduce } @@ -316,7 +316,7 @@ 
define <4 x i64> @two_way_i32_i64_vl128_double_width(ptr %accptr, ptr %uptr, ptr %u.wide = zext <8 x i32> %u to <8 x i64> %s.wide = zext <8 x i32> %s to <8 x i64> %mult = mul nuw nsw <8 x i64> %s.wide, %u.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <8 x i64> %mult) + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add(<4 x i64> %acc, <8 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -370,7 +370,7 @@ define <4 x i64> @two_way_i32_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal %u.wide = zext <8 x i32> %u to <8 x i64> %s.wide = zext <8 x i32> %s to <8 x i64> %mult = mul nuw nsw <8 x i64> %s.wide, %u.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <8 x i64> %mult) + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add(<4 x i64> %acc, <8 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -403,7 +403,7 @@ define <4 x i32> @four_way_i8_i32_vl128(ptr %accptr, ptr %uptr, ptr %sptr) { %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = zext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -430,7 +430,7 @@ define <4 x i32> @four_way_i8_i32_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = sext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -457,7 +457,7 @@ define <4 x i32> @four_way_i8_i32_vl128_sudot(ptr 
%accptr, ptr %uptr, ptr %sptr) %u.wide = sext <16 x i8> %u to <16 x i32> %s.wide = zext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide - %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult) + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult) ret <4 x i32> %partial.reduce } @@ -504,7 +504,7 @@ define <2 x i64> @four_way_i8_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) %u.wide = zext <16 x i8> %u to <16 x i64> %s.wide = sext <16 x i8> %s to <16 x i64> %mult = mul nuw nsw <16 x i64> %s.wide, %u.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult) ret <2 x i64> %partial.reduce } @@ -551,7 +551,7 @@ define <2 x i64> @four_way_i16_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr %u.wide = zext <8 x i16> %u to <8 x i64> %s.wide = sext <8 x i16> %s to <8 x i64> %mult = mul nuw nsw <8 x i64> %s.wide, %u.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult) ret <2 x i64> %partial.reduce } @@ -582,7 +582,7 @@ define <8 x i32> @four_way_i8_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr %u.wide = zext <32 x i8> %u to <32 x i32> %s.wide = zext <32 x i8> %s to <32 x i32> %mult = mul nuw nsw <32 x i32> %s.wide, %u.wide - %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult) + %partial.reduce = tail call <8 x i32> @llvm.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult) ret <8 x i32> %partial.reduce } @@ -613,7 +613,7 @@ define <8 x i32> @four_way_i8_i32_vl128_double_width_usdot(ptr %accptr, ptr %upt %u.wide = 
zext <32 x i8> %u to <32 x i32> %s.wide = sext <32 x i8> %s to <32 x i32> %mult = mul nuw nsw <32 x i32> %s.wide, %u.wide - %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult) + %partial.reduce = tail call <8 x i32> @llvm.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult) ret <8 x i32> %partial.reduce } @@ -658,7 +658,7 @@ define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal %u.wide = zext <32 x i8> %u to <32 x i32> %s.wide = zext <32 x i8> %s to <32 x i32> %mult = mul nuw nsw <32 x i32> %s.wide, %u.wide - %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult) + %partial.reduce = tail call <8 x i32> @llvm.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult) ret <8 x i32> %partial.reduce } @@ -703,7 +703,7 @@ define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr) %u.wide = zext <32 x i8> %u to <32 x i32> %s.wide = sext <32 x i8> %s to <32 x i32> %mult = mul nuw nsw <32 x i32> %s.wide, %u.wide - %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult) + %partial.reduce = tail call <8 x i32> @llvm.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult) ret <8 x i32> %partial.reduce } @@ -740,7 +740,7 @@ define <2 x i64> @four_way_i16_i64_vl128(ptr %accptr, ptr %uptr, ptr %sptr) { %u.wide = zext <8 x i16> %u to <8 x i64> %s.wide = zext <8 x i16> %s to <8 x i64> %mult = mul nuw nsw <8 x i64> %s.wide, %u.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult) ret <2 x i64> %partial.reduce } @@ -781,7 +781,7 @@ define <4 x i64> @four_way_i16_i64_vl128_double_width(ptr %accptr, ptr %uptr, pt %u.wide = zext <16 x i16> %u to <16 x i64> %s.wide = zext 
<16 x i16> %s to <16 x i64> %mult = mul nuw nsw <16 x i64> %s.wide, %u.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <16 x i64> %mult) + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add(<4 x i64> %acc, <16 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -836,7 +836,7 @@ define <4 x i64> @four_way_i16_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vsca %u.wide = zext <16 x i16> %u to <16 x i64> %s.wide = zext <16 x i16> %s to <16 x i64> %mult = mul nuw nsw <16 x i64> %s.wide, %u.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <16 x i64> %mult) + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add(<4 x i64> %acc, <16 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -889,7 +889,7 @@ define <2 x i64> @eight_way_i8_i64_vl128(ptr %accptr, ptr %uptr, ptr %sptr) { %u.wide = zext <16 x i8> %u to <16 x i64> %s.wide = zext <16 x i8> %s to <16 x i64> %mult = mul nuw nsw <16 x i64> %s.wide, %u.wide - %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult) + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult) ret <2 x i64> %partial.reduce } @@ -953,7 +953,7 @@ define <4 x i64> @four_way_i8_i64_vl128_double_width(ptr %accptr, ptr %uptr, ptr %u.wide = zext <32 x i8> %u to <32 x i64> %s.wide = zext <32 x i8> %s to <32 x i64> %mult = mul nuw nsw <32 x i64> %s.wide, %u.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <32 x i64> %mult) + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add(<4 x i64> %acc, <32 x i64> %mult) ret <4 x i64> %partial.reduce } @@ -1010,6 +1010,6 @@ define <4 x i64> @four_way_i8_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal %u.wide = zext <32 x i8> %u to <32 x i64> %s.wide = zext <32 x i8> %s to <32 x i64> %mult = 
mul nuw nsw <32 x i64> %s.wide, %u.wide - %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <32 x i64> %mult) + %partial.reduce = tail call <4 x i64> @llvm.vector.partial.reduce.add(<4 x i64> %acc, <32 x i64> %mult) ret <4 x i64> %partial.reduce } diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll index 390f5c21f7b44..36df5e5deadfc 100644 --- a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll +++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll @@ -7,7 +7,6 @@ define <4 x i32> @masked_load_v4i32(ptr %a, <4 x i1> %mask) nounwind { ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 -; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -23,7 +22,6 @@ define void @masked_store_v4i32(<4 x i32> %x, ptr %a, <4 x i1> %mask) nounwind { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: shl v1.4s, v1.4s, #31 -; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: stnt1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index b2cde51e99619..da0c01f13b960 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -22,7 +22,7 @@ entry: %a.wide = zext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) ret %partial.reduce } @@ -45,7 +45,7 @@ entry: %a.wide = zext %a to %b.wide = 
zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -68,7 +68,7 @@ entry: %a.wide = sext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %accc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %accc, %mult) ret %partial.reduce } @@ -91,7 +91,7 @@ entry: %a.wide = sext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -130,7 +130,7 @@ entry: %a.wide = zext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) ret %partial.reduce } @@ -169,7 +169,7 @@ entry: %a.wide = sext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) ret %partial.reduce } @@ -201,7 +201,7 @@ entry: %a.wide = zext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %mult) ret %partial.reduce } @@ -234,7 +234,7 @@ entry: %a.wide = sext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - 
%partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %mult) ret %partial.reduce } @@ -300,7 +300,7 @@ entry: %a.wide = zext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %mult) ret %partial.reduce } @@ -366,7 +366,7 @@ entry: %a.wide = sext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %mult) ret %partial.reduce } @@ -390,7 +390,7 @@ define @udot_no_bin_op( %acc, %a to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) ret %partial.reduce } @@ -413,7 +413,7 @@ define @sdot_no_bin_op( %acc, %a to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) ret %partial.reduce } @@ -437,7 +437,7 @@ define @udot_no_bin_op_wide( %acc, %a to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) ret %partial.reduce } @@ -461,7 +461,7 @@ define @sdot_no_bin_op_wide( %acc, %a to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) ret %partial.reduce } @@ -493,7 +493,7 @@ define 
@udot_no_bin_op_8to64( %acc, %a to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) ret %partial.reduce } @@ -525,7 +525,7 @@ define @sdot_no_bin_op_8to64( %acc, %a to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) ret %partial.reduce } @@ -557,7 +557,7 @@ entry: %a.wide = zext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) ret %partial.reduce } @@ -589,7 +589,7 @@ entry: %a.wide = zext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %mult) ret %partial.reduce } @@ -660,7 +660,7 @@ entry: %a.wide = zext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -731,7 +731,7 @@ entry: %a.wide = sext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -805,7 +805,7 @@ entry: %a.wide = zext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call 
@llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -882,7 +882,7 @@ entry: %a.wide = sext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -959,7 +959,7 @@ entry: %a.wide = zext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -1033,7 +1033,7 @@ entry: %a.wide = sext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -1062,7 +1062,7 @@ entry: %a.wide = zext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) ret %partial.reduce } @@ -1094,7 +1094,7 @@ entry: %a.wide = sext %a to %b.wide = sext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) ret %partial.reduce } @@ -1123,7 +1123,7 @@ entry: %a.wide = zext %a to %b.wide = zext %b to %mult = mul nuw nsw %a.wide, %b.wide - %partial.reduce = tail call 
@llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %mult) ret %partial.reduce } @@ -1149,7 +1149,7 @@ define @sdot_imm( %acc, entry: %a.wide = sext %a to %mult = mul nuw nsw %a.wide, splat(i32 -1) - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) ret %partial.reduce } @@ -1210,7 +1210,7 @@ define @sdot_imm_does_not_fit( %acc, %a to %mult = mul nuw nsw %a.wide, splat(i32 256) - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) ret %partial.reduce } @@ -1235,7 +1235,7 @@ define @udot_imm( %acc, entry: %a.wide = zext %a to %mult = mul nuw nsw %a.wide, splat(i32 255) - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) ret %partial.reduce } @@ -1296,6 +1296,6 @@ define @udot_imm_does_not_fit( %acc, %a to %mult = mul nuw nsw %a.wide, splat(i32 256) - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %mult) ret %partial.reduce } diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index e62979d077fd2..cf738b61a01ee 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -18,7 +18,7 @@ define @signed_wide_add_nxv4i32( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) + 
%partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) ret %partial.reduce } @@ -38,7 +38,7 @@ define @unsigned_wide_add_nxv4i32( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) ret %partial.reduce } @@ -58,7 +58,7 @@ define @signed_wide_add_nxv8i16( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) ret %partial.reduce } @@ -78,7 +78,7 @@ define @unsigned_wide_add_nxv8i16( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) ret %partial.reduce } @@ -98,7 +98,7 @@ define @signed_wide_add_nxv16i8( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) ret %partial.reduce } @@ -118,7 +118,7 @@ define @unsigned_wide_add_nxv16i8( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) ret %partial.reduce } @@ -142,7 +142,7 @@ define @signed_wide_add_nxv4i16( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i32.nxv4i32( %acc, %input.wide) ret %partial.reduce } @@ -164,7 +164,7 @@ define @unsigned_wide_add_nxv4i16( %acc, %input to - 
%partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv2i32.nxv4i32( %acc, %input.wide) ret %partial.reduce } @@ -190,7 +190,7 @@ define @signed_wide_add_nxv8i32( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) ret %partial.reduce } @@ -216,6 +216,6 @@ define @unsigned_wide_add_nxv8i32( %acc, %input to - %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) + %partial.reduce = tail call @llvm.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) ret %partial.reduce } diff --git a/llvm/test/CodeGen/AArch64/sve2p1-dots-partial-reduction.ll b/llvm/test/CodeGen/AArch64/sve2p1-dots-partial-reduction.ll new file mode 100644 index 0000000000000..51673282bd8ff --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-dots-partial-reduction.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming < %s | FileCheck %s + +define @udot_vl128( %acc, %a, %b) { +; CHECK-LABEL: udot_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: udot z0.s, z1.h, z2.h +; CHECK-NEXT: ret +entry: + %a.wide = zext %a to + %b.wide = zext %b to + %mult = mul nuw nsw %a.wide, %b.wide + %partial.reduce = tail call @llvm.vector.partial.reduce.add( %acc, %mult) + ret %partial.reduce +} + +define @sdot_vl128( %acc, %a, %b) { +; CHECK-LABEL: sdot_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret +entry: + %a.wide = sext %a to + %b.wide = sext %b to + %mult = mul nuw nsw %a.wide, %b.wide + %partial.reduce = tail 
call @llvm.vector.partial.reduce.add( %acc, %mult) + ret %partial.reduce +} + +define void @udot_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { +; CHECK-LABEL: udot_vl256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x1] +; CHECK-NEXT: ldr z2, [x2] +; CHECK-NEXT: udot z0.s, z1.h, z2.h +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret +entry: + %acc = load <8 x i32>, ptr %accptr + %a = load <16 x i16>, ptr %aptr + %b = load <16 x i16>, ptr %bptr + %a.wide = zext <16 x i16> %a to <16 x i32> + %b.wide = zext <16 x i16> %b to <16 x i32> + %mult = mul nuw nsw <16 x i32> %a.wide, %b.wide + %partial.reduce = tail call <8 x i32> @llvm.vector.partial.reduce.add(<8 x i32> %acc, <16 x i32> %mult) + store <8 x i32> %partial.reduce, ptr %accptr + ret void +} + +define void @sdot_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { +; CHECK-LABEL: sdot_vl256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x1] +; CHECK-NEXT: ldr z2, [x2] +; CHECK-NEXT: sdot z0.s, z1.h, z2.h +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret +entry: + %acc = load <8 x i32>, ptr %accptr + %a = load <16 x i16>, ptr %aptr + %b = load <16 x i16>, ptr %bptr + %a.wide = sext <16 x i16> %a to <16 x i32> + %b.wide = sext <16 x i16> %b to <16 x i32> + %mult = mul nuw nsw <16 x i32> %a.wide, %b.wide + %partial.reduce = tail call <8 x i32> @llvm.vector.partial.reduce.add(<8 x i32> %acc, <16 x i32> %mult) + store <8 x i32> %partial.reduce, ptr %accptr + ret void +} + +define <4 x i32> @fixed_udot_s_h(<4 x i32> %acc, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: fixed_udot_s_h: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udot z0.s, z1.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret +entry: + %a.wide = zext <8 x i16> %a to <8 x i32> + %b.wide 
= zext <8 x i16> %b to <8 x i32> + %mult = mul nuw nsw <8 x i32> %a.wide, %b.wide + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <8 x i32> %mult) + ret <4 x i32> %partial.reduce +} + +define <4 x i32> @fixed_sdot_s_h(<4 x i32> %acc, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: fixed_sdot_s_h: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdot z0.s, z1.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret +entry: + %a.wide = sext <8 x i16> %a to <8 x i32> + %b.wide = sext <8 x i16> %b to <8 x i32> + %mult = mul nuw nsw <8 x i32> %a.wide, %b.wide + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <8 x i32> %mult) + ret <4 x i32> %partial.reduce +} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll index 1d295a30a994b..2368b0288ccb7 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll @@ -6,15 +6,9 @@ ; CHECK-NOFP-GI: warning: Instruction selection used fallback path for test_v11f16 ; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v11f16_ninf -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32_ninf -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v2f128 ; ; CHECK-FP-GI: warning: Instruction selection used fallback path for test_v11f16 ; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v11f16_ninf -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-FP-GI-NEXT: warning: Instruction selection 
used fallback path for test_v3f32_ninf -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v2f128 declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) @@ -557,33 +551,99 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind { } define float @test_v3f32(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-4194304 // =0xffc00000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fmaxnmv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #-4194304 // =0xffc00000 +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #-4194304 // =0xffc00000 +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-FP-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ret float %b } define float @test_v3f32_ninf(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32_ninf: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-4194304 // =0xffc00000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fmaxnmv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32_ninf: 
+; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #-4194304 // =0xffc00000 +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32_ninf: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #-4194304 // =0xffc00000 +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32_ninf: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32_ninf: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-FP-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call ninf float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ret float %b } define fp128 @test_v2f128(<2 x fp128> %a) nounwind { -; CHECK-LABEL: test_v2f128: -; CHECK: // %bb.0: -; CHECK-NEXT: b fmaxl +; CHECK-NOFP-SD-LABEL: test_v2f128: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: b fmaxl +; +; CHECK-FP-SD-LABEL: test_v2f128: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: b fmaxl +; +; CHECK-NOFP-GI-LABEL: test_v2f128: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NOFP-GI-NEXT: bl fmaxl +; CHECK-NOFP-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v2f128: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-FP-GI-NEXT: bl fmaxl +; CHECK-FP-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-FP-GI-NEXT: ret %b = call fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll index ee2af110c84cd..a2f4ccd369fb4 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -6,15 +6,9 @@ ; CHECK-NOFP-GI: warning: Instruction selection used fallback path for test_v11f16 ; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v11f16_ninf -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32_ninf -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v2f128 ; ; CHECK-FP-GI: warning: Instruction selection used fallback path for test_v11f16 ; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v11f16_ninf -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32_ninf -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v2f128 declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) @@ -557,45 +551,123 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind { } define float @test_v3f32(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-8388608 // =0xff800000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fmaxnmv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #-8388608 // 
=0xff800000 +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #-8388608 // =0xff800000 +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-FP-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call nnan float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ret float %b } define float @test_v3f32_ninf(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32_ninf: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-8388609 // =0xff7fffff -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fmaxnmv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32_ninf: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #-8388609 // =0xff7fffff +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32_ninf: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #-8388609 // =0xff7fffff +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32_ninf: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fmaxnm s0, s0, s1 +; 
CHECK-NOFP-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32_ninf: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-FP-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call nnan ninf float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ret float %b } define fp128 @test_v2f128(<2 x fp128> %a) nounwind { -; CHECK-LABEL: test_v2f128: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill -; CHECK-NEXT: bl __gttf2 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.le .LBB18_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: .LBB18_2: -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v2f128: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: sub sp, sp, #48 +; CHECK-NOFP-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NOFP-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-NOFP-SD-NEXT: bl __gttf2 +; CHECK-NOFP-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NOFP-SD-NEXT: cmp w0, #0 +; CHECK-NOFP-SD-NEXT: b.le .LBB18_2 +; CHECK-NOFP-SD-NEXT: // %bb.1: +; CHECK-NOFP-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NOFP-SD-NEXT: .LBB18_2: +; CHECK-NOFP-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NOFP-SD-NEXT: add sp, sp, #48 +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v2f128: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: sub sp, sp, #48 +; CHECK-FP-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-FP-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-FP-SD-NEXT: bl __gttf2 +; CHECK-FP-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-FP-SD-NEXT: 
cmp w0, #0 +; CHECK-FP-SD-NEXT: b.le .LBB18_2 +; CHECK-FP-SD-NEXT: // %bb.1: +; CHECK-FP-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-FP-SD-NEXT: .LBB18_2: +; CHECK-FP-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-FP-SD-NEXT: add sp, sp, #48 +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v2f128: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NOFP-GI-NEXT: bl fmaxl +; CHECK-NOFP-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v2f128: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-FP-GI-NEXT: bl fmaxl +; CHECK-FP-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-FP-GI-NEXT: ret %b = call nnan fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll index be61f9b521795..1d5b70796bdb1 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll @@ -5,12 +5,8 @@ ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP --check-prefix=CHECK-FP-GI ; CHECK-NOFP-GI: warning: Instruction selection used fallback path for test_v11f16 -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32_ninf ; ; CHECK-FP-GI: warning: Instruction selection used fallback path for test_v11f16 -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32_ninf declare half @llvm.vector.reduce.fmaximum.v1f16(<1 x half> %a) declare float @llvm.vector.reduce.fmaximum.v1f32(<1 
x float> %a) @@ -440,26 +436,74 @@ define half @test_v11f16(<11 x half> %a) nounwind { ; Neutral element is negative infinity which is chosen for padding the widened ; vector. define float @test_v3f32(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-8388608 // =0xff800000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fmaxv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #-8388608 // =0xff800000 +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fmaxv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #-8388608 // =0xff800000 +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fmaxv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fmax s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fmax s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fmax s0, s0, s1 +; CHECK-FP-GI-NEXT: fmax s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call float @llvm.vector.reduce.fmaximum.v3f32(<3 x float> %a) ret float %b } ; Neutral element chosen for padding the widened vector is not negative infinity. 
define float @test_v3f32_ninf(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32_ninf: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-8388609 // =0xff7fffff -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fmaxv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32_ninf: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #-8388609 // =0xff7fffff +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fmaxv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32_ninf: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #-8388609 // =0xff7fffff +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fmaxv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32_ninf: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fmax s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fmax s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32_ninf: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fmax s0, s0, s1 +; CHECK-FP-GI-NEXT: fmax s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call ninf float @llvm.vector.reduce.fmaximum.v3f32(<3 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll index 79a8fc35e833d..c5109c8e63497 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll @@ -6,15 +6,9 @@ ; CHECK-NOFP-GI: warning: Instruction selection used fallback path for test_v11f16 ; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v11f16_ninf -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-NOFP-GI-NEXT: warning: 
Instruction selection used fallback path for test_v3f32_ninf -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v2f128 ; ; CHECK-FP-GI: warning: Instruction selection used fallback path for test_v11f16 ; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v11f16_ninf -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32_ninf -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v2f128 declare half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a) declare float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a) @@ -557,45 +551,123 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind { } define float @test_v3f32(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fminnmv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fminnmv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fminnmv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fminnm s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fminnm s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: 
fminnm s0, s0, s1 +; CHECK-FP-GI-NEXT: fminnm s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call nnan float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a) ret float %b } define float @test_v3f32_ninf(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32_ninf: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fminnmv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32_ninf: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #2139095039 // =0x7f7fffff +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fminnmv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32_ninf: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #2139095039 // =0x7f7fffff +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fminnmv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32_ninf: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fminnm s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fminnm s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32_ninf: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fminnm s0, s0, s1 +; CHECK-FP-GI-NEXT: fminnm s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call nnan ninf float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a) ret float %b } define fp128 @test_v2f128(<2 x fp128> %a) nounwind { -; CHECK-LABEL: test_v2f128: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.pl .LBB18_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: ldr 
q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: .LBB18_2: -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v2f128: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: sub sp, sp, #48 +; CHECK-NOFP-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NOFP-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-NOFP-SD-NEXT: bl __lttf2 +; CHECK-NOFP-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NOFP-SD-NEXT: cmp w0, #0 +; CHECK-NOFP-SD-NEXT: b.pl .LBB18_2 +; CHECK-NOFP-SD-NEXT: // %bb.1: +; CHECK-NOFP-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NOFP-SD-NEXT: .LBB18_2: +; CHECK-NOFP-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NOFP-SD-NEXT: add sp, sp, #48 +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v2f128: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: sub sp, sp, #48 +; CHECK-FP-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-FP-SD-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-FP-SD-NEXT: bl __lttf2 +; CHECK-FP-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-FP-SD-NEXT: cmp w0, #0 +; CHECK-FP-SD-NEXT: b.pl .LBB18_2 +; CHECK-FP-SD-NEXT: // %bb.1: +; CHECK-FP-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-FP-SD-NEXT: .LBB18_2: +; CHECK-FP-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-FP-SD-NEXT: add sp, sp, #48 +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v2f128: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NOFP-GI-NEXT: bl fminl +; CHECK-NOFP-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v2f128: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-FP-GI-NEXT: bl fminl +; CHECK-FP-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-FP-GI-NEXT: ret %b = call nnan fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll index e735f670ced0c..56ff68ed0eddc 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll @@ -5,12 +5,8 @@ ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP --check-prefix=CHECK-FP-GI ; CHECK-NOFP-GI: warning: Instruction selection used fallback path for test_v11f16 -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-NOFP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32_ninf ; ; CHECK-FP-GI: warning: Instruction selection used fallback path for test_v11f16 -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32 -; CHECK-FP-GI-NEXT: warning: Instruction selection used fallback path for test_v3f32_ninf declare half @llvm.vector.reduce.fminimum.v1f16(<1 x half> %a) declare float @llvm.vector.reduce.fminimum.v1f32(<1 x float> %a) @@ -440,26 +436,74 @@ define half @test_v11f16(<11 x half> %a) nounwind { ; Neutral element is negative infinity which is chosen for padding the widened ; vector. 
define float @test_v3f32(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fminv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fminv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fminv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fmin s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fmin s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fmin s0, s0, s1 +; CHECK-FP-GI-NEXT: fmin s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call float @llvm.vector.reduce.fminimum.v3f32(<3 x float> %a) ret float %b } ; Neutral element chosen for padding the widened vector is not negative infinity. 
define float @test_v3f32_ninf(<3 x float> %a) nounwind { -; CHECK-LABEL: test_v3f32_ninf: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: fminv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v3f32_ninf: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: mov w8, #2139095039 // =0x7f7fffff +; CHECK-NOFP-SD-NEXT: fmov s1, w8 +; CHECK-NOFP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NOFP-SD-NEXT: fminv s0, v0.4s +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v3f32_ninf: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: mov w8, #2139095039 // =0x7f7fffff +; CHECK-FP-SD-NEXT: fmov s1, w8 +; CHECK-FP-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-FP-SD-NEXT: fminv s0, v0.4s +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v3f32_ninf: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: mov s1, v0.s[1] +; CHECK-NOFP-GI-NEXT: mov s2, v0.s[2] +; CHECK-NOFP-GI-NEXT: fmin s0, s0, s1 +; CHECK-NOFP-GI-NEXT: fmin s0, s0, s2 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v3f32_ninf: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: mov s1, v0.s[1] +; CHECK-FP-GI-NEXT: mov s2, v0.s[2] +; CHECK-FP-GI-NEXT: fmin s0, s0, s1 +; CHECK-FP-GI-NEXT: fmin s0, s0, s2 +; CHECK-FP-GI-NEXT: ret %b = call ninf float @llvm.vector.reduce.fminimum.v3f32(<3 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/xor-min-max.ll b/llvm/test/CodeGen/AArch64/xor-min-max.ll new file mode 100644 index 0000000000000..2d6696e1c556e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/xor-min-max.ll @@ -0,0 +1,196 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s + +; Test for DAGCombiner optimization: fold (xor (smin(x, C), C)) -> select (x < C), xor (x, C), 0 + +define i64 @test_smin_neg_one(i64 %a) { +; CHECK-LABEL: test_smin_neg_one: +; CHECK: // %bb.0: +; CHECK-NEXT: cmn 
x0, #1 +; CHECK-NEXT: csinv x0, xzr, x0, ge +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 -1) + %retval.0 = xor i64 %1, -1 + ret i64 %retval.0 +} + +define i64 @test_smin_constant(i64 %a) { +; CHECK-LABEL: test_smin_constant: +; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x0, #0x8 +; CHECK-NEXT: cmp x0, #8 +; CHECK-NEXT: csel x0, x8, xzr, lt +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 8) + %retval.0 = xor i64 %1, 8 + ret i64 %retval.0 +} + +; Test for DAGCombiner optimization: fold (xor (smax(x, C), C)) -> select (x > C), xor (x, C), 0 +define i64 @test_smax_neg_one(i64 %a) { +; CHECK-LABEL: test_smax_neg_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mvn x8, x0 +; CHECK-NEXT: bic x0, x8, x0, asr #63 +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 -1) + %retval.0 = xor i64 %1, -1 + ret i64 %retval.0 +} + +define i64 @test_smax_constant(i64 %a) { +; CHECK-LABEL: test_smax_constant: +; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x0, #0x8 +; CHECK-NEXT: cmp x0, #8 +; CHECK-NEXT: csel x0, x8, xzr, gt +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 8) + %retval.0 = xor i64 %1, 8 + ret i64 %retval.0 +} + +define i64 @test_umin_neg_one(i64 %a) { +; CHECK-LABEL: test_umin_neg_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mvn x0, x0 +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 -1) + %retval.0 = xor i64 %1, -1 + ret i64 %retval.0 +} + +define i64 @test_umin_constant(i64 %a) { +; CHECK-LABEL: test_umin_constant: +; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x0, #0x8 +; CHECK-NEXT: cmp x0, #8 +; CHECK-NEXT: csel x0, x8, xzr, lo +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 8) + %retval.0 = xor i64 %1, 8 + ret i64 %retval.0 +} + +define i64 @test_umax_neg_one(i64 %a) { +; CHECK-LABEL: test_umax_neg_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 -1) + %retval.0 = xor i64 %1, -1 + ret i64 %retval.0 +} + 
+define i64 @test_umax_constant(i64 %a) { +; CHECK-LABEL: test_umax_constant: +; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x0, #0x8 +; CHECK-NEXT: cmp x0, #8 +; CHECK-NEXT: csel x0, x8, xzr, hi +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 8) + %retval.0 = xor i64 %1, 8 + ret i64 %retval.0 +} + +; Test vector cases +define <4 x i32> @test_smin_vector_neg_one(<4 x i32> %a) { +; CHECK-LABEL: test_smin_vector_neg_one: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: cmgt v1.4s, v1.4s, v0.4s +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: ret + %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> ) + %retval.0 = xor <4 x i32> %1, + ret <4 x i32> %retval.0 +} + +define <4 x i32> @test_smin_vector_constant(<4 x i32> %a) { +; CHECK-LABEL: test_smin_vector_constant: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.4s, #8 +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> ) + %retval.0 = xor <4 x i32> %1, + ret <4 x i32> %retval.0 +} + +define <4 x i32> @test_smax_vector_neg_one(<4 x i32> %a) { +; CHECK-LABEL: test_smax_vector_neg_one: +; CHECK: // %bb.0: +; CHECK-NEXT: cmge v1.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: ret + %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> ) + %retval.0 = xor <4 x i32> %1, + ret <4 x i32> %retval.0 +} + +define <4 x i32> @test_smax_vector_constant(<4 x i32> %a) { +; CHECK-LABEL: test_smax_vector_constant: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.4s, #8 +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> ) + %retval.0 = xor <4 x i32> %1, + ret <4 x i32> %retval.0 +} + +define <4 x i32> @test_umin_vector_neg_one(<4 x i32> %a) { +; CHECK-LABEL: test_umin_vector_neg_one: +; CHECK: // %bb.0: +; 
CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: ret + %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> ) + %retval.0 = xor <4 x i32> %1, + ret <4 x i32> %retval.0 +} + +define <4 x i32> @test_umin_vector_constant(<4 x i32> %a) { +; CHECK-LABEL: test_umin_vector_constant: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.4s, #8 +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> ) + %retval.0 = xor <4 x i32> %1, + ret <4 x i32> %retval.0 +} + +define <4 x i32> @test_umax_vector_neg_one(<4 x i32> %a) { +; CHECK-LABEL: test_umax_vector_neg_one: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret + %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> ) + %retval.0 = xor <4 x i32> %1, + ret <4 x i32> %retval.0 +} + +define <4 x i32> @test_umax_vector_constant(<4 x i32> %a) { +; CHECK-LABEL: test_umax_vector_constant: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.4s, #8 +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> ) + %retval.0 = xor <4 x i32> %1, + ret <4 x i32> %retval.0 +} + +declare i64 @llvm.smin.i64(i64, i64) +declare i64 @llvm.smax.i64(i64, i64) +declare i64 @llvm.umin.i64(i64, i64) +declare i64 @llvm.umax.i64(i64, i64) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 679d4a26d26b2..c16c8e2128c72 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -430,5 
+430,5 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #1 -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { argmemonly nofree nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll index 13828c2d8a6a0..c92e5c5f8bfff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll @@ -232,4 +232,4 @@ define i64 @load_deref_or_null_maxmimum_callsite_declaration_only() { ret i64 %add } -attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll index b520ce1826ec9..3d224f2f6bf05 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll @@ -9,12 +9,11 @@ define amdgpu_ps void @amdgpu_ps() { ; MESA-LABEL: amdgpu_ps: ; MESA: ; %bb.0: -; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4 -; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; MESA-NEXT: s_mov_b64 s[0:1], src_private_base ; MESA-NEXT: s_mov_b32 s0, 0 -; MESA-NEXT: s_mov_b64 s[2:3], src_private_base -; MESA-NEXT: s_mov_b32 s1, s3 +; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4 ; MESA-NEXT: v_mov_b32_e32 v0, s0 +; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; MESA-NEXT: v_mov_b32_e32 v2, 0 ; MESA-NEXT: v_mov_b32_e32 v1, s1 ; MESA-NEXT: flat_store_dword v[0:1], v2 @@ -30,11 +29,10 @@ define amdgpu_ps void @amdgpu_ps() { ; PAL-NEXT: s_waitcnt lgkmcnt(0) ; PAL-NEXT: s_and_b32 s3, s3, 0xffff ; PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 -; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; PAL-NEXT: s_mov_b64 s[0:1], src_private_base ; PAL-NEXT: s_mov_b32 s0, 0 -; PAL-NEXT: s_mov_b64 s[2:3], src_private_base -; PAL-NEXT: s_mov_b32 s1, s3 ; PAL-NEXT: v_mov_b32_e32 v0, s0 +; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; PAL-NEXT: v_mov_b32_e32 v1, s1 ; PAL-NEXT: flat_store_dword v[0:1], v2 ; PAL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 86766e2904619..9539ec465e02f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -65,52 +65,52 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_mov_b32 s2, s0 -; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_mov_b32 s0, s4 +; GFX9V4-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9V4-NEXT: s_mov_b32 s2, s5 +; GFX9V4-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V4-NEXT: s_mov_b32 s4, s1 -; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 -; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 2 -; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: s_endpgm ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9V5-NEXT: 
s_mov_b64 s[2:3], src_private_base -; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_mov_b32 s2, s0 -; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_mov_b32 s0, s4 +; GFX9V5-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9V5-NEXT: s_mov_b32 s2, s5 +; GFX9V5-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V5-NEXT: s_mov_b32 s4, s1 -; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 2 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm @@ -150,10 +150,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX9V4-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V4-NEXT: s_cmp_eq_u32 s3, s1 ; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -162,10 +162,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], 
src_shared_base +; GFX9V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V5-NEXT: s_cmp_eq_u32 s3, s1 ; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off @@ -206,10 +206,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V4-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V4-NEXT: s_cmp_eq_u32 s3, s1 ; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -218,10 +218,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V5-NEXT: s_cmp_eq_u32 s3, s1 ; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll index bbbce9a0719ab..1bf2a589cb597 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll @@ -88,7 +88,7 @@ define amdgpu_kernel void @kernel_call_no_workgroup_ids() { ; CHECK-NEXT: 
$sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 - call void @extern() "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" + call void @extern() "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" ret void } @@ -124,7 +124,7 @@ define amdgpu_kernel void @kernel_call_no_other_sgprs() { ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr8_sgpr9, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 - call void @extern() "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" + call void @extern() "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" ret void } @@ -198,7 +198,7 @@ define void @func_call_no_workgroup_ids() { ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN - call void @extern() "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" + call void @extern() "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" ret void } @@ -223,7 +223,7 @@ define void @func_call_no_other_sgprs() { ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr8_sgpr9, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN - call void @extern() "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" + call void @extern() "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index f9d11cb23fa4e..2cde060529bec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %8, 3670026 /* regdef:VReg_64 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %8, 3735562 /* regdef:VReg_64 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll index 97c3e903c9aec..7b2e3bf13c368 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -1486,5 +1486,5 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir index d69a3e1a15bbd..4471980c1ba1c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -158,8 +158,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5) - ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_private_base + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 @@ -227,8 +227,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3) - ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 @@ -380,16 +380,16 @@ body: | ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) - ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV3]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]] ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) - ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 
$src_shared_base - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_1]](s64) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base + ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[UV5]](s32) ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]] ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]] @@ -517,8 +517,8 @@ body: | ; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0 ; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5) - ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s64) = COPY $src_private_base + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0) %0:_(p5) = G_FRAME_INDEX %stack.0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index 393a462954003..5720b882f4e73 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -58,8 +58,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], 
s[34:35] offset:16 @@ -109,8 +108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 9 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -185,8 +183,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -220,8 +217,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 9 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm bb: @@ -277,8 +273,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 @@ -302,8 +297,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) % ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 
a[0:7] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -336,8 +330,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -369,8 +362,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-copy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-copy.mir index bf8f2d633c1dc..fce3805712794 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-copy.mir @@ -16,6 +16,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; + ; WAVE32-LABEL: name: copy_s32_vgpr_to_vgpr + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: $vgpr0 = COPY [[COPY]](s32) %0:_(s32) = COPY $vgpr0 $vgpr0 = COPY %0 @@ -33,6 +39,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: $sgpr0 = COPY [[COPY]](s32) + ; + ; WAVE32-LABEL: name: copy_s32_sgpr_to_sgpr + ; WAVE32: 
liveins: $sgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32-NEXT: $sgpr0 = COPY [[COPY]](s32) %0:_(s32) = COPY $sgpr0 $sgpr0 = COPY %0 @@ -50,6 +62,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; + ; WAVE32-LABEL: name: copy_s32_sgpr_to_vgpr + ; WAVE32: liveins: $sgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32-NEXT: $vgpr0 = COPY [[COPY]](s32) %0:_(s32) = COPY $sgpr0 $vgpr0 = COPY %0 @@ -67,6 +85,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: $agpr0 = COPY [[COPY]](s32) + ; + ; WAVE32-LABEL: name: copy_s32_vgpr_to_agpr + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: $agpr0 = COPY [[COPY]](s32) %0:_(s32) = COPY $vgpr0 $agpr0 = COPY %0 @@ -84,6 +108,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: $agpr0 = COPY [[COPY]](s32) + ; + ; WAVE32-LABEL: name: copy_s32_sgpr_to_agpr + ; WAVE32: liveins: $sgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32-NEXT: $agpr0 = COPY [[COPY]](s32) %0:_(s32) = COPY $sgpr0 $agpr0 = COPY %0 @@ -101,6 +131,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; + ; WAVE32-LABEL: name: copy_s32_agpr_to_vgpr + ; WAVE32: liveins: $agpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0 + ; WAVE32-NEXT: $vgpr0 = COPY [[COPY]](s32) %0:_(s32) = COPY $agpr0 $vgpr0 = COPY %0 @@ -118,6 +154,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0 ; CHECK-NEXT: $agpr0 = COPY [[COPY]](s32) + ; + ; WAVE32-LABEL: name: copy_s32_agpr_to_agpr + ; WAVE32: liveins: $agpr0 + ; WAVE32-NEXT: {{ $}} 
+ ; WAVE32-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0 + ; WAVE32-NEXT: $agpr0 = COPY [[COPY]](s32) %0:_(s32) = COPY $agpr0 $agpr0 = COPY %0 @@ -137,6 +179,14 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]](s1) + ; + ; WAVE32-LABEL: name: copy_s1_sgpr_to_vcc_preassigned + ; WAVE32: liveins: $sgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](s1) %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s1) = G_TRUNC %0 %2:vcc(s1) = COPY %1 @@ -157,6 +207,14 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]](s1) + ; + ; WAVE32-LABEL: name: copy_s1_vgpr_to_vcc_preassigned + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](s1) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s1) = G_TRUNC %0 %2:vcc(s1) = COPY %1 @@ -177,6 +235,14 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]](s1) + ; + ; WAVE32-LABEL: name: copy_s1_sgpr_to_vcc + ; WAVE32: liveins: $sgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](s1) %0:_(s32) = COPY $sgpr0 %1:_(s1) = G_TRUNC %0 
%2:vcc(s1) = COPY %1 @@ -198,6 +264,14 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]](s1) + ; + ; WAVE32-LABEL: name: copy_s1_vgpr_to_vcc + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s1) = G_TRUNC %0 %2:vcc(s1) = COPY %1 @@ -215,9 +289,17 @@ body: | ; CHECK: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[CONST1]], [[CONST2]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] + ; + ; WAVE32-LABEL: name: wave64_copy_sgpr_64_to_s1 + ; WAVE32: liveins: $sgpr4_sgpr5 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] %0:_(s1) = COPY $sgpr4_sgpr5 %1:_(s32) = G_ZEXT %0:_(s1) ... 
@@ -229,13 +311,21 @@ legalized: true body: | bb.0: liveins: $sgpr0 + ; CHECK-LABEL: name: wave32_copy_sgpr_32_to_s1 + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] + ; ; WAVE32-LABEL: name: wave32_copy_sgpr_32_to_s1 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr0 - ; WAVE32-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; WAVE32-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[CONST1]], [[CONST2]] + ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] %0:_(s1) = COPY $sgpr0 %1:_(s32) = G_ZEXT %0:_(s1) ... 
@@ -250,14 +340,26 @@ body: | ; CHECK-LABEL: name: wave64_copy2_sgpr_64_to_s1 ; CHECK: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY1]](s1), [[CONST1]], [[CONST2]] - ; CHECK-NEXT: [[CONST3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[CONST4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY2]](s1), [[CONST3]], [[CONST4]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY1]](s1), [[C2]], [[C3]] + ; + ; WAVE32-LABEL: name: wave64_copy2_sgpr_64_to_s1 + ; WAVE32: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY $sgpr6_sgpr7 + ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] + ; WAVE32-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY1]](s1), [[C2]], [[C3]] %0:_(s1) = COPY $sgpr4_sgpr5 %1:_(s1) = COPY $sgpr6_sgpr7 %2:_(s32) = G_ZEXT 
%0:_(s1) @@ -271,17 +373,29 @@ legalized: true body: | bb.0: liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: wave32_copy2_sgpr_32_to_s1 + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY1]](s1), [[C2]], [[C3]] + ; ; WAVE32-LABEL: name: wave32_copy2_sgpr_32_to_s1 ; WAVE32: liveins: $sgpr0, $sgpr1 ; WAVE32-NEXT: {{ $}} - ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY $sgpr0 - ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY $sgpr1 - ; WAVE32-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; WAVE32-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; WAVE32-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY1]](s1), [[CONST1]], [[CONST2]] - ; WAVE32-NEXT: [[CONST3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; WAVE32-NEXT: [[CONST4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; WAVE32-NEXT: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY2]](s1), [[CONST3]], [[CONST4]] + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr0 + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY $sgpr1 + ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] + ; WAVE32-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY1]](s1), [[C2]], [[C3]] %0:_(s1) = COPY $sgpr0 %1:_(s1) = COPY $sgpr1 %2:_(s32) = G_ZEXT %0:_(s1) @@ 
-343,9 +457,17 @@ body: | ; CHECK: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[CONST1]], [[CONST2]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] + ; + ; WAVE32-LABEL: name: wave64_copy_sgpr_64_to_s1_vcc + ; WAVE32: liveins: $sgpr4_sgpr5 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] %0:vcc(s1) = COPY $sgpr4_sgpr5 %1:_(s32) = G_ZEXT %0:vcc(s1) ... 
@@ -357,13 +479,21 @@ legalized: true body: | bb.0: liveins: $sgpr0 + ; CHECK-LABEL: name: wave32_copy_sgpr_32_to_s1_vcc + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] + ; ; WAVE32-LABEL: name: wave32_copy_sgpr_32_to_s1_vcc ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr0 - ; WAVE32-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; WAVE32-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[CONST1]], [[CONST2]] + ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[C]], [[C1]] %0:vcc(s1) = COPY $sgpr0 %1:_(s32) = G_ZEXT %0:vcc(s1) ... 
@@ -380,14 +510,14 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) ; ; WAVE32-LABEL: name: copy_virt_reg_to_s1 ; WAVE32: liveins: $vgpr0 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) - ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s1) = G_TRUNC %0 %2:_(s1) = COPY %1 @@ -405,16 +535,16 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s1) = COPY [[COPY2]](s1) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[COPY1]](s1) ; ; WAVE32-LABEL: name: copy_virt_reg_to_s1_vgpr ; WAVE32: liveins: $vgpr0 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) - ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) - ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s1) = COPY [[COPY2]](s1) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[COPY1]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s1) = G_TRUNC %0 %2:vgpr(s1) = COPY %1 @@ -434,16 +564,16 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[COPY2]](s1) + ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[COPY1]](s1) ; ; WAVE32-LABEL: name: copy_virt_reg_to_s1_vcc ; WAVE32: liveins: $vgpr0 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) - ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) - ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[COPY2]](s1) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[COPY1]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s1) = G_TRUNC %0 %2:vcc(s1) = COPY %1 @@ -499,3 +629,121 @@ body: | %1:_(s1) = G_TRUNC %0 $sgpr0 = COPY %1 ... + +--- +name: copy_sgpr_physreg_to_vcc_s1_wave64 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr4_sgpr5 + + ; CHECK-LABEL: name: copy_sgpr_physreg_to_vcc_s1_wave64 + ; CHECK: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]](s1) + ; + ; WAVE32-LABEL: name: copy_sgpr_physreg_to_vcc_s1_wave64 + ; WAVE32: liveins: $sgpr4_sgpr5 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY]](s1) + %0:_(s1) = COPY $sgpr4_sgpr5 + S_ENDPGM 0, implicit %0 + +... 
+ +--- +name: copy_vcc_s1_to_sgpr_physreg_wave64 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: copy_vcc_s1_to_sgpr_physreg_wave64 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[ICMP]](s1) + ; CHECK-NEXT: S_ENDPGM 0, implicit $sgpr4_sgpr5 + ; + ; WAVE32-LABEL: name: copy_vcc_s1_to_sgpr_physreg_wave64 + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; WAVE32-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; WAVE32-NEXT: $sgpr4_sgpr5 = COPY [[ICMP]](s1) + ; WAVE32-NEXT: S_ENDPGM 0, implicit $sgpr4_sgpr5 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + $sgpr4_sgpr5 = COPY %2 + S_ENDPGM 0, implicit $sgpr4_sgpr5 + +... + +--- +name: copy_sgpr_physreg_to_vcc_s1_wave32 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr4 + + ; CHECK-LABEL: name: copy_sgpr_physreg_to_vcc_s1_wave32 + ; CHECK: liveins: $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]](s1) + ; + ; WAVE32-LABEL: name: copy_sgpr_physreg_to_vcc_s1_wave32 + ; WAVE32: liveins: $sgpr4 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4 + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY]](s1) + %0:_(s1) = COPY $sgpr4 + S_ENDPGM 0, implicit %0 + +... 
+ +--- +name: copy_vcc_s1_to_sgpr_physreg_wave32 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: copy_vcc_s1_to_sgpr_physreg_wave32 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: $sgpr4 = COPY [[ICMP]](s1) + ; CHECK-NEXT: S_ENDPGM 0, implicit $sgpr4 + ; + ; WAVE32-LABEL: name: copy_vcc_s1_to_sgpr_physreg_wave32 + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; WAVE32-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; WAVE32-NEXT: $sgpr4 = COPY [[ICMP]](s1) + ; WAVE32-NEXT: S_ENDPGM 0, implicit $sgpr4 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + $sgpr4 = COPY %2 + S_ENDPGM 0, implicit $sgpr4 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index f57fc005b994b..9ffc565d9d47a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1186,77 +1186,77 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v8, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v9, v5 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 
v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 
v17, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: 
v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 @@ -1271,144 +1271,147 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 ; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 ; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 ; GISEL-NEXT: v_addc_u32_e32 
v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: 
v_mul_lo_u32 v2, v8, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: 
v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: 
v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; 
GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 19dc20c510041..82279e641ed63 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1112,67 +1112,67 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v8, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v9, v5 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 
s6, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; 
GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 ; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 ; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 @@ -1195,13 +1195,14 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 +; GISEL-NEXT: v_add_i32_e32 
v15, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] ; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 ; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc @@ -1217,94 +1218,96 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; 
GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: 
v_mul_lo_u32 v3, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: 
v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 @@ -1327,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1705,67 +1708,67 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; 
GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v8, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v9, v5 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; 
GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 ; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 ; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 
v0, vcc, v0, v9 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 @@ -1788,13 +1791,14 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] ; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 ; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc @@ -1810,94 +1814,96 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; 
GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] +; GISEL-NEXT: 
v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 -; 
GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], 
s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 @@ -1920,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 9c2fabce4bcde..b33b8a7d8cd72 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -7,33 +7,33 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX906-NEXT: v_mov_b32_e32 v4, 8 +; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v5, 16 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v3, v2, s[0:1] +; GFX906-NEXT: global_load_dword v4, v2, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v1, 0xff ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v3 -; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: v_or3_b32 v3, v6, v7, v3 +; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4 +; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: v_or3_b32 v3, v2, v3, v0 +; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0 ; GFX906-NEXT: .LBB0_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 ; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 diff 
--git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll index 58f3ffb0492e0..bc341f2baa804 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll @@ -361,8 +361,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -417,8 +417,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__v(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -473,8 +473,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -538,13 +538,13 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 { ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART ; 
CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 @@ -603,13 +603,13 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 { ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -659,12 +659,12 @@ define void @flat_atomic_cmpxchg_i64_ret_v_a__v(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART @@ -717,12 +717,12 @@ define void @flat_atomic_cmpxchg_i64_ret_a_v__v(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: 
v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART @@ -775,8 +775,8 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -836,8 +836,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_v__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -892,8 +892,8 @@ define void @flat_atomic_cmpxchg_i64_ret_v_av__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -948,12 +948,12 @@ define void @flat_atomic_cmpxchg_i64_ret_av_a__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: 
;;#ASMSTART @@ -1006,12 +1006,12 @@ define void @flat_atomic_cmpxchg_i64_ret_a_av__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 7b33374453010..d053425afbb6d 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -641,12 +641,12 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -686,15 +686,15 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[2:3] ; GFX950-NEXT: ;;#ASMEND -; 
GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -739,12 +739,12 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -782,15 +782,15 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -833,8 +833,8 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -876,9 +876,9 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -927,8 +927,8 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -968,38 +968,39 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; 
GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB14_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB14_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB14_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off ; GFX950-NEXT: .LBB14_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -1016,8 +1017,8 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -1057,38 +1058,39 @@ define void 
@flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB15_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB15_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB15_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off ; GFX950-NEXT: .LBB15_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: 
; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -1105,8 +1107,8 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -1148,12 +1150,12 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -1161,22 +1163,23 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB16_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: 
$vgpr0_vgpr1 ; GFX950-NEXT: .LBB16_2: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB16_4 ; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v2, off +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off ; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(1) @@ -1198,12 +1201,12 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1241,15 +1244,15 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; 
GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1292,8 +1295,8 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -1333,9 +1336,9 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[4:5] @@ -1380,11 +1383,11 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_noret_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2484,11 +2487,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -2546,13 +2549,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2613,11 +2616,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -2671,13 +2674,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; 
GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3218,11 +3221,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -3276,13 +3279,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3455,11 +3458,11 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; 
GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3513,13 +3516,13 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB40_3 @@ -4301,11 +4304,11 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -4347,13 +4350,13 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: 
v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4400,11 +4403,11 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -4442,13 +4445,13 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4855,11 +4858,11 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 
@@ -4897,13 +4900,13 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5032,11 +5035,11 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i64_noret_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5074,13 +5077,13 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i64_noret_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB61_3 @@ -6303,12 +6306,12 @@ define void @flat_atomic_add_i64_ret_a_a(ptr 
%ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -6349,15 +6352,15 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_add_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6401,8 +6404,8 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -6440,9 +6443,9 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: 
flat_atomic_add_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[4:5] @@ -6486,12 +6489,12 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -6532,15 +6535,15 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6586,8 +6589,8 @@ define void 
@flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -6625,9 +6628,9 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_sub_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -6673,12 +6676,12 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -6719,15 +6722,15 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_and_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 
v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6772,8 +6775,8 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -6811,9 +6814,9 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_and_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -6858,12 +6861,12 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -6922,15 +6925,15 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_nand_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6993,8 +6996,8 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] @@ -7051,9 +7054,9 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_nand_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -7116,12 +7119,12 @@ 
define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7162,15 +7165,15 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_or_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7215,8 +7218,8 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -7254,9 +7257,9 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 
{ ; GFX950-LABEL: flat_atomic_or_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -7301,12 +7304,12 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7348,15 +7351,15 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_max_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7403,8 +7406,8 @@ define void 
@flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -7443,9 +7446,9 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_max_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -7492,12 +7495,12 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7539,15 +7542,15 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_min_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 
v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7594,8 +7597,8 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -7634,9 +7637,9 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_min_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -7683,12 +7686,12 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7730,15 +7733,15 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7785,8 +7788,8 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -7825,9 +7828,9 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_umax_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -7874,12 +7877,12 @@ 
define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7921,15 +7924,15 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7976,8 +7979,8 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -8016,9 +8019,9 @@ define void 
@flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_umin_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -8065,12 +8068,12 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -8114,15 +8117,15 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 
s[0:1], exec, s[0:1] @@ -8169,8 +8172,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -8211,9 +8214,9 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -8261,12 +8264,12 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -8312,15 +8315,15 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 
0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8369,8 +8372,8 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -8413,9 +8416,9 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -8464,12 +8467,12 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: 
v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -8530,15 +8533,15 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8606,8 +8609,8 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] @@ -8666,9 +8669,9 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 
v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -8737,12 +8740,12 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -8803,15 +8806,15 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8879,8 +8882,8 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -8939,9 +8942,9 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -9014,8 +9017,8 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9097,8 +9100,8 @@ define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v3 @@ -9815,12 +9818,12 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 
s[4:5], src_shared_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -9881,15 +9884,15 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -9955,8 +9958,8 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -10012,9 +10015,9 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fadd_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: 
;;#ASMSTART ; GFX950-NEXT: ; def v[4:5] @@ -10078,12 +10081,12 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -10136,15 +10139,15 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fsub_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10201,8 +10204,8 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def 
v[4:5] @@ -10252,9 +10255,9 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fsub_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -10311,12 +10314,12 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -10358,15 +10361,15 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], 
vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10412,8 +10415,8 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -10452,9 +10455,9 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmax_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -10500,12 +10503,12 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -10547,15 +10550,15 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 
s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10601,8 +10604,8 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -10641,9 +10644,9 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmin_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -10689,12 +10692,12 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 
v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -10755,15 +10758,15 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10829,8 +10832,8 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] @@ -10888,9 +10891,9 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 
s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -10957,12 +10960,12 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -11023,15 +11026,15 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fminimum_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11097,8 +11100,8 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: 
v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] @@ -11156,9 +11159,9 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fminimum_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -14337,8 +14340,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -14380,8 +14383,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -14427,8 +14430,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -14464,8 +14467,8 @@ define 
void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -14505,8 +14508,8 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -14552,8 +14555,8 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -14601,8 +14604,8 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -14640,8 +14643,8 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], 
src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -14682,8 +14685,8 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -14729,8 +14732,8 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -14780,8 +14783,8 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -14819,8 +14822,8 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -14863,8 +14866,8 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr 
inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -14910,8 +14913,8 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -14960,8 +14963,8 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -14999,8 +15002,8 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15042,8 +15045,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, 
s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15106,8 +15109,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15173,8 +15176,8 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15231,8 +15234,8 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15292,8 +15295,8 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15339,8 +15342,8 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15389,8 +15392,8 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15428,8 +15431,8 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15471,8 +15474,8 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15518,8 +15521,8 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; 
GFX950-NEXT: ; def a[0:1] @@ -15568,8 +15571,8 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15607,8 +15610,8 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15650,8 +15653,8 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15698,8 +15701,8 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15750,8 +15753,8 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: 
s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15790,8 +15793,8 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15835,8 +15838,8 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15883,8 +15886,8 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15935,8 +15938,8 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15975,8 
+15978,8 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16020,8 +16023,8 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16068,8 +16071,8 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16120,8 +16123,8 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16160,8 +16163,8 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: 
s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16205,8 +16208,8 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16253,8 +16256,8 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16305,8 +16308,8 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16345,8 +16348,8 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16390,8 +16393,8 @@ define void 
@flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16440,8 +16443,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16492,8 +16495,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16534,8 +16537,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16580,8 +16583,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: 
s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16632,8 +16635,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16686,8 +16689,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16730,8 +16733,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16777,8 +16780,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16843,8 +16846,8 @@ define void 
@flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16915,8 +16918,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16975,8 +16978,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -17042,8 +17045,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -17108,8 +17111,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: 
s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -17180,8 +17183,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -17240,8 +17243,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -17311,8 +17314,8 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 40 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -17393,8 +17396,8 @@ define void @flat_atomic_fadd_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 40 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18138,8 +18141,8 @@ define 
void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -18202,8 +18205,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -18269,8 +18272,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18324,8 +18327,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -18383,8 +18386,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 
src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -18441,8 +18444,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -18502,8 +18505,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18553,8 +18556,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -18608,8 +18611,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -18656,8 +18659,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: 
; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -18707,8 +18710,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18747,8 +18750,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -18791,8 +18794,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -18839,8 +18842,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: 
s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -18890,8 +18893,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18930,8 +18933,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -18974,8 +18977,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -19040,8 +19043,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -19110,8 +19113,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -19169,8 +19172,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -19234,8 +19237,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -19300,8 +19303,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -19370,8 +19373,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: 
s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -19429,8 +19432,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll index 002ccd6060681..635d2a2d16a76 100644 --- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll +++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -9,8 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-NOT: v_accvgpr_write ; GCN: v_mfma_f32_32x32x1f32 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NOT: v_accvgpr_read ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] @@ -28,8 +27,7 @@ bb: ; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-NOT: v_accvgpr_read ; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]: -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NOT: v_accvgpr_read ; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}] @@ -80,8 +78,7 @@ bb: ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-COUNT-32: v_accvgpr_write ; GCN: v_mfma_f32_32x32x1f32 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NOT: v_accvgpr_read ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index 42c7b90da63d3..2d7ef2c262157 100644 --- 
a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -169,6 +169,6 @@ attributes #1 = { nounwind } ;. ; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes 
#[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll index 153898560fc31..aac499f2fc602 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll @@ -27,20 +27,20 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_and_b32 s1, 1, s1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo ; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS @@ -69,14 +69,13 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa ; ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index a6a0a9a3c9015..4df82946343b5 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -8,8 +8,8 @@ target triple = "amdgcn-amd-amdhsa" ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} ; 
CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 -; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 +; CI-DAG: s_cselect_b32 s[[SHI:[0-9]+]], [[APERTURE]], 0 +; CI-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0 ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base @@ -17,10 +17,13 @@ target triple = "amdgcn-amd-amdhsa" ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 -; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 +; GFX9-DAG: s_cselect_b32 s[[SHI:[0-9]+]], s[[HIBASE]], 0 +; GFX9-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0 -; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; HSA-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; HSA-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] + +; HSA: flat_store_dword v[[[VLO]]:[[VHI]]], [[K]] ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0 @@ -68,18 +71,21 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 { ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 -; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 +; CI-DAG: s_cselect_b32 s[[SHI:[0-9]+]], [[APERTURE]], 0 +; CI-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 -; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 +; GFX9: s_cselect_b32 s[[SHI:[0-9]+]], s[[HIBASE]], 0 +; GFX9: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0 -; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; HSA-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; HSA-DAG: v_mov_b32_e32 
v[[VLO:[0-9]+]], s[[SLO]] + +; HSA: flat_store_dword v[[[VLO]]:[[VHI]]], [[K]] ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index c226dae3d64a9..9e240238c1066 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -63,8 +63,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a16, v39 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse @@ -181,8 +180,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -487,8 +485,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 { ; GFX90A-NEXT: ; copy ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a3 v[0:31] @@ -965,8 +962,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a16, v39 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] -; 
GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse @@ -1084,8 +1080,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 46b82d3a3d651..1ce7179774349 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -15503,59 +15503,37 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, 
v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v83.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l @@ -52226,59 +52204,37 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l @@ -87002,59 +86958,37 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l @@ -121707,59 +121641,37 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l @@ -147524,6 +147436,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l @@ -147555,7 +147468,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, 
i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h @@ -147572,69 +147484,37 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l @@ -147648,7 +147528,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
s_cbranch_execnz .LBB88_4 ; GFX11-TRUE16-NEXT: .LBB88_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h @@ -147667,7 +147546,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l @@ -147988,10 +147866,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -148008,10 +147884,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l @@ -148019,7 +147893,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) 
{ ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h @@ -148031,10 +147904,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -148051,10 +147922,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -148068,17 +147937,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 
0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -148096,10 +147962,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -173957,6 +173821,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l @@ -173988,7 +173853,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v147.h, 8, v147.h @@ -174005,69 +173869,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h 
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l @@ -174081,7 +173913,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_4 ; GFX11-TRUE16-NEXT: .LBB92_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h @@ -174100,7 +173931,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l @@ -174421,10 +174251,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -174441,10 +174269,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l @@ -174452,7 +174278,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h @@ -174464,10 +174289,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -174484,10 +174307,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -174501,17 +174322,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ 
-174529,10 +174347,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -196529,6 +196345,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l @@ -196560,7 +196377,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h @@ -196577,69 +196393,37 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l @@ -196653,7 +196437,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_4 ; GFX11-TRUE16-NEXT: .LBB96_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h @@ -196672,7 +196455,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l @@ -196993,10 +196775,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v23.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -197013,10 +196793,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l @@ -197024,7 +196802,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h @@ -197036,10 +196813,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 
0xff, v0.l @@ -197056,10 +196831,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -197073,17 +196846,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -197101,10 +196871,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 
3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 436b1a038b274..2abb2f3b9de52 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l @@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l @@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l @@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l @@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3 @@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l @@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v35.l, 8, v36.l @@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3 @@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l @@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -32871,6 +32851,7 
@@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l @@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index ede44e738fe00..352b2cb7123b1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -12492,6 +12492,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l 
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l @@ -12523,39 +12524,22 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, 
v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -27377,6 +27361,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l @@ -27408,39 +27393,22 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -41534,6 +41502,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l @@ -41565,39 +41534,22 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -54837,6 +54789,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l @@ -54868,39 +54821,22 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -68501,6 +68437,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -68533,37 +68470,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: 
s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l @@ -68710,6 +68634,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -68717,7 +68642,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h @@ -68732,11 +68656,10 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -68756,7 +68679,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h @@ -80726,6 +80648,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -80758,37 +80681,24 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l @@ -80935,6 +80845,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -80942,7 +80853,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h @@ -80957,11 +80867,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -80981,7 +80890,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 
0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h @@ -91233,6 +91141,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -91265,37 +91174,24 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l @@ -91442,6 +91338,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -91449,7 +91346,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h @@ -91464,11 +91360,10 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 ; 
GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -91488,7 +91383,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll index 7e9cb7adf4fc2..664dfa21759cf 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll @@ -254,9 +254,9 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { attributes #0 = { "amdgpu-agpr-alloc"="0" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir index 22c913496b734..b5c3e3214f125 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir @@ -63,52 +63,41 @@ body: | ; GCN16-NEXT: successors: %bb.1(0x80000000) ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN16-NEXT: {{ $}} - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_BRANCH %bb.1 ; GCN16-NEXT: {{ $}} ; GCN16-NEXT: bb.1: ; GCN16-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN16-NEXT: {{ $}} - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; GCN16-NEXT: {{ $}} ; GCN16-NEXT: bb.2: ; GCN16-NEXT: 
successors: %bb.3(0x80000000) ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN16-NEXT: {{ $}} - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_NOP 0 - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_BRANCH %bb.3 ; GCN16-NEXT: {{ $}} ; GCN16-NEXT: bb.3: ; GCN16-NEXT: liveins: $sgpr10_sgpr11 ; GCN16-NEXT: {{ $}} - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0 - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: SI_RETURN bb.0: liveins: $sgpr6, $sgpr10_sgpr11 diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll index dd760c2a215ca..a160cdc950eb5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -70,4 +70,4 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r define amdgpu_kernel void @amdhsa_kernarg_preload_0_implicit_2(i32) #0 { ret void } -attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 7f7bbb2a95902..a688b6fc6399f 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -691,29 +691,29 @@ attributes #6 = { "enqueued-block" } ;. 
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" 
"uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "uniform-work-group-size"="false" } 
-; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" 
"uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" 
"amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR15:[0-9]+]] = { nounwind "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR19:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { 
"amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind } ; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "enqueued-block" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 26c04a35edf16..484ff77fd7abd 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -474,19 +474,19 @@ attributes #1 = { nounwind } ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: 
attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 10} ; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 10} diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index 81ccf16c4e4bc..2efe02458a6c7 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -294,13 +294,13 @@ attributes #1 = { nounwind } ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: 
attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index e4323999d19c3..3c316f4acedb7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -456,12 +456,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: s_mov_b32 s6, 32 -; GFX908-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX908-NEXT: s_getpc_b64 s[6:7] ; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX908-NEXT: s_cmp_eq_u32 s7, s4 +; GFX908-NEXT: s_cmp_eq_u32 s7, s5 ; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX908-NEXT: s_mov_b64 s[4:5], -1 @@ -507,12 +505,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: s_mov_b32 s6, 32 -; 
GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 ; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 @@ -558,12 +554,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: s_mov_b32 s2, 32 -; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cmp_eq_u32 s3, s1 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], -1 @@ -607,12 +601,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1100-NEXT: s_mov_b32 s2, 32 -; GFX1100-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX1100-NEXT: s_getpc_b64 s[2:3] ; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX1100-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1100-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-NEXT: s_mov_b32 s0, -1 @@ -660,9 +652,6 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1200-NEXT: s_mov_b32 s2, 32 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX1200-NEXT: s_getpc_b64 s[2:3] ; GFX1200-NEXT: 
s_wait_alu 0xfffe ; GFX1200-NEXT: s_sext_i32_i16 s3, s3 @@ -670,7 +659,7 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1200-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -731,12 +720,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: s_mov_b32 s6, 32 -; GFX908-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX908-NEXT: s_getpc_b64 s[6:7] ; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX908-NEXT: s_cmp_eq_u32 s7, s4 +; GFX908-NEXT: s_cmp_eq_u32 s7, s5 ; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX908-NEXT: s_mov_b64 s[4:5], -1 @@ -800,12 +787,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX90A-NEXT: s_mov_b32 s6, 32 -; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 ; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 @@ -825,12 +810,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_branch .LBB5_10 ; GFX90A-NEXT: .LBB5_3: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: s_mov_b32 s6, 32 -; GFX90A-NEXT: 
s_lshr_b64 s[4:5], s[4:5], s6 ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 ; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 @@ -896,12 +879,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX942-NEXT: s_mov_b32 s2, 32 -; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cmp_eq_u32 s3, s1 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], -1 @@ -921,12 +902,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_branch .LBB5_10 ; GFX942-NEXT: .LBB5_3: ; %atomicrmw.check.private ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: s_mov_b32 s2, 32 -; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cmp_eq_u32 s3, s1 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], -1 @@ -990,12 +969,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1100-NEXT: s_mov_b32 s2, 32 -; GFX1100-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX1100-NEXT: s_getpc_b64 s[2:3] ; GFX1100-NEXT: s_add_u32 
s2, s2, global@rel32@lo+4 ; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX1100-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1100-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-NEXT: s_mov_b32 s0, -1 @@ -1060,9 +1037,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1200-NEXT: s_mov_b32 s2, 32 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX1200-NEXT: s_getpc_b64 s[2:3] ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_sext_i32_i16 s3, s3 @@ -1070,7 +1044,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1200-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll index fcca3d705490d..f63dd6ea7ed56 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll @@ -147,10 +147,10 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) % attributes #0 = { "amdgpu-no-flat-scratch-init" } ;. 
-; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; GFX9: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx900" } ;. 
-; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ; GFX10: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } ;. 
; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 10} diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll index 51caa84450ff3..583b6fe0a81ca 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll @@ -134,57 +134,57 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr ; ; GFX9-LABEL: with_private_to_flat_addrspacecast_cc_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s0, s1, 0 -; GFX9-NEXT: s_cselect_b32 s1, s2, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: ; GFX9-ARCH-FLAT: ; %bb.0: -; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1 -; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0 -; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0 -; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-ARCH-FLAT-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s0, s0, 0 +; 
GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX9-ARCH-FLAT-NEXT: s_endpgm ; ; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: ; GFX942-ARCH-FLAT: ; %bb.0: -; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1 -; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0 -; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0 -; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-ARCH-FLAT-NEXT: s_cmp_lg_u32 s0, -1 +; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s1, s1, 0 +; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s0, s0, 0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX942-ARCH-FLAT-NEXT: s_endpgm ; ; GFX10-LABEL: with_private_to_flat_addrspacecast_cc_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s0, s2, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -533,49 +533,49 @@ define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) noca ; ; GFX9-LABEL: private_constant_expression_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_mov_b64 s[2:3], 
src_private_base +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX9-ARCH-FLAT-LABEL: private_constant_expression_use: ; GFX9-ARCH-FLAT: ; %bb.0: -; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX9-ARCH-FLAT-NEXT: s_endpgm ; ; GFX942-ARCH-FLAT-LABEL: private_constant_expression_use: ; GFX942-ARCH-FLAT: ; %bb.0: -; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX942-ARCH-FLAT-NEXT: s_endpgm ; ; 
GFX10-LABEL: private_constant_expression_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 @@ -611,48 +611,48 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) ; ; GFX9-LABEL: calls_intrin_ascast_cc_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: ; GFX9-ARCH-FLAT: ; %bb.0: -; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX9-ARCH-FLAT-NEXT: s_endpgm ; ; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: ; GFX942-ARCH-FLAT: ; %bb.0: -; 
GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX942-ARCH-FLAT-NEXT: s_endpgm ; ; GFX10-LABEL: calls_intrin_ascast_cc_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX10-NEXT: v_mov_b32_e32 v2, 7 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, 7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index 009dec8d195e9..60cd25203311a 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -865,19 +865,19 @@ define amdgpu_kernel void @with_inline_asm() { } ;. 
-; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; GFX9: attributes #[[ATTR2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR3]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; GFX9: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx900" } -; GFX9: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ;. -; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: 
attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ; GFX10: attributes #[[ATTR2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR3]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ; GFX10: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } -; GFX10: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ;. 
; GFX9: [[META0]] = !{i32 2, i32 10} ; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 10} diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 37040123ee20c..d301f16512a60 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 -; CHECK-NEXT: v_mov_b32_e32 v20, 0 +; CHECK-NEXT: v_mov_b32_e32 v30, 0x9037ab78 +; CHECK-NEXT: v_mov_b32_e32 v31, 0x3e21eeb6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 @@ -16,12 +16,9 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 ; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6 ; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f ; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90 @@ -37,14 +34,15 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883 ; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136 +; CHECK-NEXT: v_mov_b32_e32 v20, 0x57b87036 +; CHECK-NEXT: v_mov_b32_e32 v21, 0x3fb3b136 ; CHECK-NEXT: s_and_b64 s[4:5], 
exec, s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523 ; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555 ; CHECK-NEXT: s_and_b64 s[6:7], exec, s[18:19] -; CHECK-NEXT: v_mov_b32_e32 v21, v20 -; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b64_e32 v[16:17], 0 +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: ; implicit-def: $vgpr22_vgpr23 ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_1: ; %Flow9 @@ -64,12 +62,11 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25] -; CHECK-NEXT: v_accvgpr_read_b32 v27, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a2 +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[30:31] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1] -; CHECK-NEXT: v_accvgpr_write_b32 a0, 0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, 0 +; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[20:21] +; CHECK-NEXT: v_accvgpr_write_b32 a2, 0 +; CHECK-NEXT: v_accvgpr_write_b32 a3, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[24:25] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] @@ -96,30 +93,32 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 ; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a2 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_mov_b64 vcc, s[2:3] -; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 +; CHECK-NEXT: ; implicit-def: $agpr2_agpr3 ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291 ; CHECK-NEXT: ; in 
Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 +; CHECK-NEXT: v_accvgpr_mov_b32 a3, a1 +; CHECK-NEXT: v_accvgpr_mov_b32 a2, a0 ; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] ; CHECK-NEXT: s_mov_b64 vcc, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v26 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v27 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[10:11] +; CHECK-NEXT: v_accvgpr_write_b32 a0, v24 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] +; CHECK-NEXT: v_accvgpr_write_b32 a1, v25 ; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21] ; CHECK-NEXT: s_branch .LBB0_15 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1 @@ -136,19 +135,21 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] ; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17] ; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v17, v16 ; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] -; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13] +; CHECK-NEXT: v_mov_b32_e32 v17, v16 ; CHECK-NEXT: s_cselect_b32 s23, s23, 0 ; CHECK-NEXT: s_cselect_b32 s22, s22, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: global_store_dwordx2 v0, v[16:17], s[12:13] ; CHECK-NEXT: s_branch .LBB0_14 ; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 -; CHECK-NEXT: .LBB0_14: ; %Flow6 +; CHECK-NEXT: .LBB0_14: ; %Flow8 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: 
v_mov_b64_e32 v[30:31], v[24:25] +; CHECK-NEXT: v_accvgpr_write_b32 a0, v24 +; CHECK-NEXT: v_mov_b64_e32 v[16:17], 0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v25 ; CHECK-NEXT: .LBB0_15: ; %Flow6 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 @@ -157,7 +158,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13] +; CHECK-NEXT: global_store_dwordx2 v0, v[16:17], s[12:13] ; CHECK-NEXT: s_branch .LBB0_1 ; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir index 778e3fbb81126..035354c1f1369 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir @@ -68,7 +68,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, 
$sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:VRegOrLds_32_and_VS_32_Lo256 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2621450 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -149,7 +149,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2621450 /* regdef:SReg_32 
*/, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir index 0426dd52153f6..584d6a530fbec 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir @@ -69,7 +69,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:VRegOrLds_32_and_VS_32_Lo256 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2621450 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, 
implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -151,7 +151,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2621450 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll index dd389375b0d77..6bebc8f5d0d18 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll @@ -23,9 +23,9 @@ define amdgpu_kernel void @long_forward_branch_gfx11plus(ptr addrspace(1) %in, p ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_d16_b16 v0, v1, 
s[0:1] ; GFX11-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: global_store_b16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v1, v0, s[2:3] ; GFX11-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3] offset:2 ; GFX11-NEXT: .LBB0_2: ; %bb3 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index b8dd377377dab..306fe33bfb7ac 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -7283,7 +7283,7 @@ declare hidden void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0 -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll index 0c4974f347a8f..ffe536d347c53 100644 --- a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll +++ b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll @@ -54,4 +54,4 @@ define float @asm_changes_mode(float %x, float %y) #0 { declare float @llvm.experimental.constrained.fadd.f32(float, float, 
metadata, metadata) -attributes #0 = { strictfp "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { strictfp "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index 69ad8e96c7c5d..61a195f9c314f 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -747,4 +747,4 @@ define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 { attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } -attributes #3 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #3 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index 093ca55698fe3..33eb8c1e8f4f5 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -208,7 +208,7 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s } ; CHECK: .amdhsa_system_vgpr_workitem_id 0 -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" } !0 = !{i32 1, i32 64, i32 64} !1 = !{i32 64, i32 1, i32 64} diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 9abb50651146a..675acd0eedfc5 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -154,5 +154,5 @@ declare hidden void @func(i32) #0 declare hidden i32 @func.return(i32) #0 declare void @got.func(i32) #0 -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index fc3c476d0ab2e..dae77d19c1235 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -1110,6 +1110,7 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x7b ; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v2, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2 ; GFX11-TRUE16-NEXT: ds_load_u16_d16 v2, v0 offset:2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index ccdc0b1bf43c4..a84872d8eac0f 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -1561,8 +1561,8 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp diff --git a/llvm/test/CodeGen/AMDGPU/cluster-dims.ll b/llvm/test/CodeGen/AMDGPU/cluster-dims.ll new file mode 100644 index 0000000000000..62e8d9dc61293 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cluster-dims.ll @@ -0,0 +1,47 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -global-isel %s -o - | 
FileCheck %s + +; CHECK: .cluster_dims: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 2 +define dso_local amdgpu_kernel void @_Z15test_literal_3dv() #0 { +entry: + ret void +} + +; CHECK: .cluster_dims: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 1 +define dso_local amdgpu_kernel void @_Z15test_literal_2dv() #1 { +entry: + ret void +} + +; CHECK: .cluster_dims: +; CHECK-NEXT: - 4 +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 1 +define dso_local amdgpu_kernel void @_Z15test_literal_1dv() #2 { +entry: + ret void +} + +; CHECK: .cluster_dims: +; CHECK-NEXT: - 4 +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 1 +define dso_local amdgpu_kernel void @_Z13test_constantv() #3 { +entry: + ret void +} + +attributes #0 = { convergent mustprogress noinline norecurse nounwind "amdgpu-cluster-dims"="2,2,2" } +attributes #1 = { convergent mustprogress noinline norecurse nounwind "amdgpu-cluster-dims"="2,2,1" } +attributes #2 = { convergent mustprogress noinline norecurse nounwind "amdgpu-cluster-dims"="4,1,1" } +attributes #3 = { convergent mustprogress noinline norecurse nounwind "amdgpu-cluster-dims"="4,2,1" } + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"amdhsa_code_object_version", i32 600} diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 9345e92789327..029aa3957d32b 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -20,13 +20,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 
%1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -45,13 +45,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -72,7 +72,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -80,7 +80,7 @@ body: | undef %3.sub0:areg_96 = COPY %0 %3.sub1:areg_96 = COPY %1 %3.sub2:areg_96 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %3 SI_RETURN ... 
@@ -101,7 +101,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -109,7 +109,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0 %3.sub1:areg_96_align2 = COPY %1 %3.sub2:areg_96_align2 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -128,13 +128,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0 %2.sub2_sub3:areg_128 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -153,13 +153,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0 %2.sub2_sub3:areg_128_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -178,13 +178,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:sgpr_32 = COPY $sgpr8 %1:sgpr_32 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -203,13 +203,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0 %2.sub1_sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -228,13 +228,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0 %2.sub1_sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -253,13 +253,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0 %2.sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -278,13 +278,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0 %2.sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -302,12 +302,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -326,13 +326,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -350,12 +350,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0 %1.sub1:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %1 SI_RETURN ... @@ -373,12 +373,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0 %1.sub1:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... 
@@ -398,14 +398,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 %1.sub1:areg_128 = COPY %0 %1.sub2:areg_128 = COPY %0 %1.sub3:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -425,14 +425,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 %1.sub1:areg_128_align2 = COPY %0 %1.sub2:areg_128_align2 = COPY %0 %1.sub3:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... 
@@ -451,14 +451,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY2]] ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %2 INLINEASM &"; use $0", 0 /* attdialect */, 2031625 /* reguse:VGPR_32 */, killed %0 SI_RETURN @@ -477,13 +477,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 %1.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %1 INLINEASM &"; use $0", 0 /* attdialect */, 2031625 /* reguse:VGPR_32 */, killed %0 SI_RETURN @@ -503,15 +503,15 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY 
[[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3670025 /* reguse:VReg_64 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3670025 /* reguse:VS_64_with_sub1 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 %1.sub1:areg_64 = COPY %0 undef %2.sub0:vreg_64 = COPY %0 %2.sub1:vreg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %1 INLINEASM &"; use $0", 0 /* attdialect */, 3670025 /* reguse:VReg_64 */, killed %2 SI_RETURN @@ -533,13 +533,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %2 SI_RETURN ... 
@@ -558,13 +558,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -585,7 +585,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -593,7 +593,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %3 SI_RETURN ... 
@@ -614,7 +614,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -622,7 +622,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -641,13 +641,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -668,13 +668,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1 %0.sub1:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0 %2.sub2_sub3:areg_128_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -693,13 +693,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:sreg_64 = COPY $sgpr8 %0.sub1:sreg_64 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -718,13 +718,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -743,13 +743,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0.sub0 %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -768,13 +768,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -793,13 +793,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -817,12 +817,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -841,13 +841,13 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0.sub0 %1.sub1:areg_96 = COPY %0.sub0 %1.sub2:areg_96 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %1 SI_RETURN ... 
@@ -865,12 +865,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0.sub0 %1.sub1:areg_96_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -890,14 +890,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0.sub0 %1.sub1:areg_128 = COPY %0.sub0 %1.sub2:areg_128 = COPY %0.sub0 %1.sub3:areg_128 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1 SI_RETURN ... 
@@ -917,14 +917,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0.sub0 %1.sub1:areg_128_align2 = COPY %0.sub0 %1.sub2:areg_128_align2 = COPY %0.sub0 %1.sub3:areg_128_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -943,13 +943,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %2 SI_RETURN ... 
@@ -968,13 +968,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64_align2 = COPY $vgpr0 %0.sub1:vreg_64_align2 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -995,7 +995,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -1003,7 +1003,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %3 SI_RETURN ... 
@@ -1024,7 +1024,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1:vreg_96_align2 = COPY $vgpr1 @@ -1032,7 +1032,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -1051,13 +1051,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -1076,13 +1076,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1101,13 +1101,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:sreg_64 = COPY $sgpr8 %0.sub1:sreg_64 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -1126,13 +1126,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1150,13 +1150,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub2 %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -1176,13 +1176,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0.sub0 %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1201,13 +1201,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -1226,13 +1226,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96_align2 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1251,13 +1251,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -1274,11 +1274,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %2:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4128777 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -1295,11 +1295,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %2:areg_64_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1316,11 +1316,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96 */, %3 SI_RETURN ... 
@@ -1337,11 +1337,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6750217 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -1358,11 +1358,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -1379,11 +1379,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1400,11 +1400,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:sreg_64 = COPY $sgpr8_sgpr9 %2:areg_64_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -1421,11 +1421,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %2:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir index 5b6e1f285cf3d..c07c9efcf901d 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir @@ -20,10 +20,10 @@ body: | ; CHECK-LABEL: name: foo1 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VS_32_Lo128 */, def undef %2.sub0, 2228235 /* regdef-ec:VS_32_Lo128 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def undef %2.sub0, 2031627 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %0:vgpr_32, 2031627 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit 
$flat_scr :: (store (s64)) @@ -41,10 +41,10 @@ body: | ; CHECK-LABEL: name: foo2 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VS_32_Lo128 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VS_32_Lo128 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2031627 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2031626 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2031627 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 2031626 /* regdef:VGPR_32 */, def %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -62,10 +62,10 @@ body: | ; CHECK-LABEL: name: foo3 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VS_32_Lo128 */, def undef %2.sub0, 2228235 /* regdef-ec:VS_32_Lo128 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def undef %2.sub0, 2031627 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %1:vgpr_32, 2031627 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 
= COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -83,10 +83,10 @@ body: | ; CHECK-LABEL: name: foo4 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VS_32_Lo128 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VS_32_Lo128 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2031627 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2031626 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2031627 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 2031626 /* regdef:VGPR_32 */, def %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 31c23b94a8de8..66d99b14e282d 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -226,9 +226,8 @@ define void @private_alloca_to_flat(ptr %ptr) { ; GISEL-ASM-LABEL: private_alloca_to_flat: ; GISEL-ASM: ; %bb.0: ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base ; GISEL-ASM-NEXT: s_lshr_b32 s4, s32, 6 -; GISEL-ASM-NEXT: s_mov_b64 s[6:7], src_private_base -; GISEL-ASM-NEXT: s_mov_b32 s5, s7 ; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; GISEL-ASM-NEXT: v_mov_b32_e32 v1, 
s5 @@ -330,21 +329,21 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split ; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] -; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1 -; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0 -; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base +; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base +; DAGISEL-ASM-NEXT: s_xor_b64 s[8:9], vcc, -1 +; DAGISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 ; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; DAGISEL-ASM-NEXT: .LBB11_3: ; %finally ; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7] -; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[8:9] +; DAGISEL-ASM-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5 ; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) -; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5] +; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7] ; DAGISEL-ASM-NEXT: s_cbranch_execnz .LBB11_3 ; DAGISEL-ASM-NEXT: ; %bb.4: ; %end -; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] +; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] ; DAGISEL-ASM-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index 96a2d02e50105..f706f53b542d3 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() { ret void } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index 8038ea71dc1bb..1635b09313102 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -370,7 +370,7 @@ body: | ; HAZARD-LABEL: name: inline_sdwa_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_ENDPGM 0 @@ -378,10 +378,10 @@ body: | ; NOHAZARD-LABEL: name: inline_sdwa_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) S_ENDPGM 0 ... 
@@ -397,17 +397,17 @@ body: | ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: sdwa_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... 
@@ -421,19 +421,19 @@ body: | ; HAZARD-LABEL: name: inline_inline_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: inline_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 - INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 + 
INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0, 2031625 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 99368628bf7dc..8da204b499626 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -28,6 +28,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 { attributes #0 = { "amdgpu-no-dispatch-id" } ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll index 22d699a8f4809..d0dec1f1fe7e4 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll @@ -85,7 +85,7 @@ define amdgpu_kernel void @elf_notes() #0 { ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index ee87c65c00def..1a4a54b81c78f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -254,8 +254,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -447,8 +447,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -846,8 +846,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -900,8 +900,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1070,8 +1070,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1124,8 +1124,8 
@@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1288,8 +1288,8 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1479,8 +1479,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1535,8 +1535,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2016,8 +2016,8 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2070,8 +2070,8 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2579,8 +2579,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2772,8 +2772,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: 
v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3171,8 +3171,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3225,8 +3225,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3395,8 +3395,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3449,8 +3449,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: 
v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3613,8 +3613,8 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3804,8 +3804,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3860,8 +3860,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4024,8 +4024,8 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4215,8 +4215,8 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4271,8 +4271,8 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6201,9 +6201,9 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6369,8 +6369,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6427,8 +6427,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6662,10 +6662,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6831,8 +6831,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6889,8 +6889,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7552,9 +7552,9 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7715,8 +7715,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; 
GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7771,8 +7771,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8001,10 +8001,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8165,8 +8165,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8221,8 +8221,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index b8a2476dc19b4..59b0537b817d2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -3255,9 +3255,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3397,8 +3397,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3438,8 +3438,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; 
GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3667,10 +3667,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3810,8 +3810,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3851,8 +3851,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, 
vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4465,9 +4465,9 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4605,8 +4605,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4645,8 +4645,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4870,10 +4870,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: 
flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5011,8 +5011,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5051,8 +5051,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 9830e48a86f06..c9c9f332fe391 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -3255,9 +3255,9 @@ define double 
@flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3397,8 +3397,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3438,8 +3438,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3667,10 +3667,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; 
GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3810,8 +3810,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3851,8 +3851,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4465,9 +4465,9 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] 
; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4605,8 +4605,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4645,8 +4645,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4870,10 +4870,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5011,8 +5011,8 @@ define void 
@flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5051,8 +5051,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 2c1970220c374..587c2ea885077 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -3711,9 +3711,9 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3871,8 +3871,8 @@ define double 
@flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3922,8 +3922,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4157,10 +4157,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -4318,8 +4318,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 
s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4369,8 +4369,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5019,9 +5019,9 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5175,8 +5175,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5225,8 +5225,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX908: ; 
%bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5455,10 +5455,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5612,8 +5612,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5662,8 +5662,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, 
vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index f8c2ddf0d7d3c..605026614c614 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -586,14 +586,14 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -618,10 +618,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; 
GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -637,8 +637,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -770,9 +770,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -780,7 +780,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -805,10 +805,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -824,11 +824,11 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -953,15 +953,15 @@ define amdgpu_ps void 
@flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 @@ -982,10 +982,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, 
v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -995,8 +995,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1028,9 +1028,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1105,18 +1104,18 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 @@ -1137,10 +1136,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1150,11 +1149,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 
+; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1446,14 +1445,14 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1478,10 +1477,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1497,8 +1496,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -1634,9 +1633,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -1644,7 +1643,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 
bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1669,10 +1668,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1688,11 +1687,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -1824,15 +1823,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: 
flat_add_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 @@ -1853,10 +1852,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1869,8 +1868,8 @@ define amdgpu_ps 
void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1905,9 +1904,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1990,18 +1988,18 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 @@ -2022,10 +2020,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2038,11 +2036,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: 
s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2342,14 +2340,14 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2374,10 +2372,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2393,8 +2391,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -2532,9 +2530,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -2542,7 +2540,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, 
v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2567,10 +2565,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2586,11 +2584,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -2724,15 +2722,15 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; 
GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 @@ -2753,10 +2751,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2769,8 +2767,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 
%voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2807,9 +2805,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2892,18 +2889,18 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; 
GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 @@ -2924,10 +2921,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2940,11 +2937,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: 
v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3247,14 +3244,14 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3279,10 +3276,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: 
v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3299,8 +3296,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3437,9 +3434,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3447,7 +3444,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, 
v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3472,10 +3469,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3492,11 +3489,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3629,15 +3626,15 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; 
GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 @@ -3658,10 +3655,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3675,8 +3672,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3712,9 +3709,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3797,18 +3793,18 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 @@ -3829,10 +3825,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3846,11 +3842,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 
vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4151,14 +4147,14 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4183,10 +4179,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, 
vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4203,8 +4199,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -4341,9 +4337,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4351,7 +4347,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: 
s_and_saveexec_b32 s0, vcc_lo @@ -4376,10 +4372,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4396,11 +4392,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -4533,15 +4529,15 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, 
v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 @@ -4562,10 +4558,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4579,8 +4575,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: 
v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4616,9 +4612,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4701,18 +4696,18 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 @@ -4733,10 +4728,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4750,11 +4745,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; 
GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5055,14 +5050,14 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5087,10 +5082,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; 
GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5107,8 +5102,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -5245,9 +5240,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -5255,7 +5250,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5280,10 +5275,10 @@ define 
amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5300,11 +5295,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -5437,15 +5432,15 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; 
GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 @@ -5466,10 +5461,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5483,8 +5478,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; 
GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5520,9 +5515,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5605,18 +5599,18 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, 
s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 @@ -5637,10 +5631,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5654,11 +5648,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5924,14 +5918,14 @@ 
define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5952,10 +5946,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5971,8 +5965,8 @@ define amdgpu_ps 
<2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6106,9 +6100,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -6116,7 +6110,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6137,10 +6131,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; 
GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6156,11 +6150,11 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6291,15 +6285,15 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, 
src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 @@ -6317,10 +6311,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6333,8 +6327,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], 
s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6370,9 +6364,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6451,18 +6444,18 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 @@ -6480,10 +6473,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6496,11 +6489,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6766,14 +6759,14 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; 
GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6794,10 +6787,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6813,8 +6806,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6948,9 +6941,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -6958,7 +6951,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6979,10 +6972,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz 
.LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6998,11 +6991,11 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -7133,15 +7126,15 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 @@ -7159,10 +7152,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7175,8 +7168,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; 
GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7212,9 +7205,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7293,18 +7285,18 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 @@ -7322,10 +7314,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7338,11 +7330,11 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7608,14 +7600,14 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-GISEL: 
; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7636,10 +7628,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7655,8 +7647,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -7790,9 +7782,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -7800,7 +7792,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7821,10 +7813,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private -; 
GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7840,11 +7832,11 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -7975,15 +7967,15 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 @@ -8001,10 +7993,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8017,8 +8009,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: 
s_xor_b64 s[0:1], exec, s[0:1] @@ -8054,9 +8046,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8135,18 +8126,18 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 
0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 @@ -8164,10 +8155,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8180,11 +8171,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8450,14 +8441,14 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; 
GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8478,10 +8469,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8497,8 +8488,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], 
s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -8632,9 +8623,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -8642,7 +8633,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8663,10 +8654,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: 
v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8682,11 +8673,11 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -8817,15 +8808,15 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; 
GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 @@ -8843,10 +8834,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8859,8 +8850,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8896,9 +8887,8 @@ define amdgpu_ps void 
@flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8977,18 +8967,18 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 @@ -9006,10 +8996,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9022,11 +9012,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -9338,15 +9328,15 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, 
src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9371,10 +9361,10 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9393,8 +9383,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -9539,10 +9529,10 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -9550,7 +9540,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9575,10 +9565,10 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; 
GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9594,14 +9584,14 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -9742,16 +9732,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 
:: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 @@ -9772,10 +9761,10 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9788,8 +9777,8 @@ define amdgpu_ps 
void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 @@ -9831,9 +9820,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -9920,19 +9908,18 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: 
v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 @@ -9953,10 +9940,10 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9969,11 +9956,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; 
GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 @@ -10246,14 +10233,14 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10275,18 +10262,18 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10299,8 +10286,8 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -10442,9 +10429,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -10452,7 +10439,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10474,18 +10461,18 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; 
GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10498,11 +10485,11 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -10638,15 +10625,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 @@ -10663,17 +10650,17 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -10683,8 +10670,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10720,9 +10707,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10804,18 +10790,18 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 @@ -10832,17 +10818,17 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -10852,11 +10838,11 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11126,14 +11112,14 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11155,11 +11141,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-GISEL-NEXT: .LBB106_4: ; 
%atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11180,8 +11166,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -11328,9 +11314,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, 
null, 0, v1, vcc_lo @@ -11338,7 +11324,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11360,11 +11346,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11385,11 +11371,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; 
GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -11530,15 +11516,15 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 @@ -11555,10 +11541,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: 
v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11576,8 +11562,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11614,9 +11600,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11702,18 +11687,18 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, 
src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 @@ -11730,10 +11715,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; 
GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11751,11 +11736,11 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11841,8 +11826,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 @@ -11899,8 +11884,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 -; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -11958,8 +11943,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double 
%data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12006,8 +11991,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -12061,8 +12046,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 @@ -12121,8 +12106,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 -; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 @@ 
-12177,8 +12162,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12225,8 +12210,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 @@ -12355,8 +12340,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12388,8 +12373,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: ; implicit-def: 
$vgpr2_vgpr3 @@ -12506,8 +12491,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12540,8 +12525,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 @@ -12656,8 +12641,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12689,8 +12674,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: ; 
implicit-def: $vgpr2_vgpr3 @@ -12807,8 +12792,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12841,8 +12826,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index fc8883924dfbc..4eaa1965c66f1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4152,8 +4152,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX942-LABEL: store_load_i64_aligned: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 15 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15 ; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 @@ -4263,8 +4262,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX942-LABEL: store_load_i64_unaligned: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 15 
-; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15 ; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index d9a596283db1e..1f105e8dd8ba5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -106,11 +106,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -401,17 +401,17 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: 
s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB2_3 @@ -555,15 +555,15 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB3_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -696,9 +696,9 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -979,15 +979,16 @@ define amdgpu_kernel void 
@atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB6_3 @@ -1267,11 +1268,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -1556,17 +1557,17 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], 
src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB10_3 @@ -1707,15 +1708,15 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB11_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1845,9 +1846,9 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX12-LABEL: 
atomic_and_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -2122,15 +2123,16 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB14_3 @@ -2408,11 +2410,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: 
s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -2703,17 +2705,17 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB18_3 @@ -2857,15 +2859,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; 
GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB19_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -2998,9 +3000,9 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -3281,15 +3283,16 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB22_3 @@ -3571,11 +3574,11 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 
%in) { ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -3868,17 +3871,17 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB26_3 @@ -4024,15 +4027,15 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], 
s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB27_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -4165,9 +4168,9 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -4450,15 +4453,16 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; 
GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB30_3 @@ -4742,11 +4746,11 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -5039,17 +5043,17 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB34_3 @@ -5195,15 +5199,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB35_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -5336,9 +5340,9 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -5621,15 +5625,16 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umax_i64_addr64: ; 
GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB38_3 @@ -5913,11 +5918,11 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -6210,17 +6215,17 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB42_3 @@ -6366,15 +6371,15 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB43_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -6507,9 +6512,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; 
GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -6792,15 +6797,16 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB46_3 @@ -7084,11 +7090,11 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 
s4, -1 @@ -7381,17 +7387,17 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB50_3 @@ -7537,15 +7543,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB51_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -7678,9 +7684,9 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -7963,15 +7969,16 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB54_3 @@ -8253,11 +8260,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry ; 
GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -8542,17 +8549,17 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB58_3 @@ -8693,15 +8700,15 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB59_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -8831,9 +8838,9 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -9108,15 +9115,16 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB62_3 @@ -9384,11 +9392,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -9510,11 +9518,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -9636,11 +9644,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -9911,17 +9919,17 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB68_3 @@ -10057,15 +10065,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB69_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -10186,9 +10194,9 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -10449,15 +10457,16 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB72_3 @@ -10727,11 +10736,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -11016,17 +11025,17 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: 
s_cbranch_vccnz .LBB76_3 @@ -11167,15 +11176,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB77_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -11305,9 +11314,9 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -11582,15 +11591,16 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB80_3 @@ -12745,15 +12755,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB93_3 @@ -12908,14 +12918,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], 
src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s5 +; GFX12-NEXT: s_cmp_eq_u32 s3, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccz .LBB94_2 @@ -13510,12 +13520,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] -; GFX12-NEXT: s_cmp_eq_u32 s3, s5 +; GFX12-NEXT: s_cmp_eq_u32 s3, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -14071,11 +14081,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -14380,17 +14390,17 
@@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB109_3 @@ -14541,15 +14551,15 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 
+; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB110_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14689,9 +14699,9 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -14986,15 +14996,16 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_inc_i64_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB113_3 @@ -15296,11 +15307,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 
0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -15625,17 +15636,17 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB117_3 @@ -15797,15 +15808,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB118_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15954,9 +15965,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -16271,15 +16282,16 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_dec_i64_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB121_3 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 524100c5b7a25..9e27f6badfdac 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -187,8 +187,8 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -422,8 +422,8 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -660,8 +660,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -897,8 +897,8 @@ 
define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -1010,8 +1010,8 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1126,8 +1126,8 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1347,8 +1347,8 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1582,8 +1582,8 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr 
%out, double %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1820,8 +1820,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -2057,8 +2057,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -2170,8 +2170,8 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2286,8 +2286,8 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2537,8 +2537,8 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2797,8 +2797,8 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3063,8 +3063,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -3320,8 +3320,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; 
GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -3448,8 +3448,8 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3578,8 +3578,8 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3924,8 +3924,8 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4284,8 +4284,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; 
GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4656,8 +4656,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -5017,8 +5017,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -5159,8 +5159,8 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5289,8 +5289,8 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; 
GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5635,8 +5635,8 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5995,8 +5995,8 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6357,8 +6357,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -6706,8 +6706,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -6846,8 +6846,8 @@ define void 
@flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6976,8 +6976,8 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7342,8 +7342,8 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7726,8 +7726,8 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -8112,8 +8112,8 @@ define amdgpu_gfx void 
@flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -8485,8 +8485,8 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -8675,8 +8675,8 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8871,8 +8871,8 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9237,8 +9237,8 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -9597,8 +9597,8 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9959,8 +9959,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -10308,8 +10308,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -10448,8 +10448,8 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 
0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10578,8 +10578,8 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10924,8 +10924,8 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11284,8 +11284,8 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11646,8 +11646,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], 
src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -11995,8 +11995,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -12135,8 +12135,8 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12265,8 +12265,8 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -12611,8 +12611,8 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; 
GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12971,8 +12971,8 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -13358,8 +13358,8 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -13737,8 +13737,8 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -13939,20 +13939,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_mov_b32 s14, -1 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, 
s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base ; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -14368,18 +14368,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_mov_b32 s14, -1 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base -; GCN3-NEXT: s_addc_u32 s1, s1, s7 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -14734,8 +14734,8 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; 
GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -14864,8 +14864,8 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15210,8 +15210,8 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15570,8 +15570,8 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15957,8 +15957,8 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base 
+; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -16336,8 +16336,8 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -16538,20 +16538,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_mov_b32 s14, -1 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base ; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -17124,8 +17124,8 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 
v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17254,8 +17254,8 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -17600,8 +17600,8 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17960,8 +17960,8 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18347,8 +18347,8 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 
s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -18726,8 +18726,8 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -18871,8 +18871,8 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19001,8 +19001,8 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -19347,8 +19347,8 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, 
vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19707,8 +19707,8 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -20094,8 +20094,8 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -20473,8 +20473,8 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -20675,20 +20675,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_mov_b32 s14, -1 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 
s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base ; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -21101,9 +21101,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -21457,8 +21457,8 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -21587,8 +21587,8 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], 
src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -21958,8 +21958,8 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22348,8 +22348,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -22745,8 +22745,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -23130,8 +23130,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, 
s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -23282,8 +23282,8 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -23421,8 +23421,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -23810,8 +23810,8 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] @@ -24223,8 +24223,8 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; 
GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24659,8 +24659,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s38, s4, 32 -; GCN3-NEXT: s_addc_u32 s39, s5, 0 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_addc_u32 s39, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s39, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] @@ -25091,8 +25091,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s38, s4, 32 -; GCN3-NEXT: s_addc_u32 s39, s5, 0 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_addc_u32 s39, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s39, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] @@ -25253,8 +25253,8 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25397,8 +25397,8 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc 
diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir index 103c3e3eb8bc6..e1295d4a09563 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir @@ -17,9 +17,8 @@ body: | ... # GCN-LABEL: name: fold_sgpr_to_sgpr_copy_subreg -# GCN: %0:sreg_64 = IMPLICIT_DEF -# GCN-NEXT: %2:sgpr_32 = COPY %0.sub0 -# GCN-NEXT: S_STORE_DWORD_IMM %2, undef $sgpr10_sgpr11, 0, 0 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: S_STORE_DWORD_IMM %0.sub0, undef $sgpr10_sgpr11, 0, 0 name: fold_sgpr_to_sgpr_copy_subreg body: | @@ -32,9 +31,8 @@ body: | ... # GCN-LABEL: name: fold_sgpr_to_sgpr_copy_subreg2 -# GCN: %0:sreg_64 = IMPLICIT_DEF -# GCN-NEXT: %3:sreg_32_xm0_xexec = COPY %0.sub0 -# GCN-NEXT: S_STORE_DWORD_IMM %3, undef $sgpr10_sgpr11, 0, 0 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: S_STORE_DWORD_IMM %0.sub0, undef $sgpr10_sgpr11, 0, 0 name: fold_sgpr_to_sgpr_copy_subreg2 body: | diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index 9db760077a853..872c2cf569dcc 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -6,21 +6,21 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; CHECK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 +; CHECK-NEXT: s_load_dword s8, s[4:5], 0x24 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: s_mov_b32 s14, -1 ; CHECK-NEXT: s_mov_b32 s15, 0xe00000 ; CHECK-NEXT: s_add_u32 s12, s12, s11 ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s7, s6, 31 -; CHECK-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s0, s6 -; CHECK-NEXT: s_addc_u32 s1, s1, s7 +; CHECK-NEXT: s_ashr_i32 s9, s8, 31 
+; CHECK-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: s_add_u32 s0, s0, -8 -; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base +; CHECK-NEXT: s_mov_b64 s[6:7], src_shared_base ; CHECK-NEXT: s_addc_u32 s1, s1, -1 -; CHECK-NEXT: s_cmp_eq_u32 s1, s5 +; CHECK-NEXT: s_cmp_eq_u32 s1, s7 ; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_mov_b64 s[4:5], -1 diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll index 63376def3d7e1..fa8fdbaeacf41 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -200,8 +200,8 @@ declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #3 declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #3 declare i64 @llvm.amdgcn.dispatch.id() #3 -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #1 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #2 = { "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #2 = { "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index e506cebd60c77..97f8fd61e4298 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -1112,11 +1112,11 @@ body: | ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, killed renamable $vgpr2 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, killed renamable $vgpr2 ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, killed renamable $vgpr2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, killed renamable $vgpr2 
S_SETPC_B64_return undef $sgpr30_sgpr31 ... diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 024593c49dba1..e3c383c697030 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -43,7 +43,7 @@ ; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269 ; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978 ; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63 -; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C +; ELF: 0110: 6F756E74 10B12E73 6770725F 7370696C ; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C ; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072 ; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370 @@ -59,7 +59,7 @@ ; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172 ; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D ; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB -; ELF: 0210: 2E736770 725F636F 756E740C B12E7367 +; ELF: 0210: 2E736770 725F636F 756E740E B12E7367 ; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7 ; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E ; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 305461ed6b208..049663a1e1bb4 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1685,19 +1685,18 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v5 ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8 +; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l -; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0 ; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_endpgm @@ -1977,13 +1976,12 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v4, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.h ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l @@ -2726,10 +2724,10 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l -; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index 21390003ee565..b764ee50c3978 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -1969,10 +1969,9 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x ; GFX942-LABEL: add_inline_imm_neg_1_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, -1 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], -1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2009,8 +2008,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, -2 -; GFX942-NEXT: v_mov_b32_e32 v1, -1 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], -2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2047,8 +2045,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, -16 -; GFX942-NEXT: v_mov_b32_e32 v1, -1 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], -16 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2163,10 +2160,9 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; GFX942-LABEL: store_inline_imm_0.0_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; 
GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2239,8 +2235,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0x3fe00000 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0.5 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2276,8 +2271,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0xbfe00000 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], -0.5 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2313,8 +2307,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2350,8 +2343,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0xbff00000 +; GFX942-NEXT: 
v_mov_b64_e32 v[0:1], -1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2387,8 +2379,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2424,8 +2415,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, -2.0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], -2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2461,8 +2451,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2498,8 +2487,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0xc0100000 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], -4.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2535,8 +2523,7 
@@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 -; GFX942-NEXT: v_mov_b32_e32 v1, 0x3fc45f30 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0.15915494309189532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index ec80efc5f0362..2daed9b69384f 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -56,19 +56,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V4-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 -; GFX9V4-NEXT: s_cselect_b32 s2, s3, 0 -; GFX9V4-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V4-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V4-NEXT: s_cselect_b32 s0, s5, 0 -; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9V4-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V4-NEXT: s_cselect_b32 s0, s1, 0 +; GFX9V4-NEXT: s_cselect_b32 s1, s4, 0 +; GFX9V4-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s1 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s0 +; GFX9V4-NEXT: s_cselect_b32 s0, s3, 0 +; GFX9V4-NEXT: s_cselect_b32 s1, s5, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V4-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V4-NEXT: 
flat_store_dword v[0:1], v4 @@ -80,19 +80,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V5-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 -; GFX9V5-NEXT: s_cselect_b32 s2, s3, 0 -; GFX9V5-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V5-NEXT: s_cselect_b32 s0, s5, 0 -; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9V5-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V5-NEXT: s_cselect_b32 s0, s1, 0 +; GFX9V5-NEXT: s_cselect_b32 s1, s4, 0 +; GFX9V5-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s1 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s0 +; GFX9V5-NEXT: s_cselect_b32 s0, s3, 0 +; GFX9V5-NEXT: s_cselect_b32 s1, s5, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V5-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V5-NEXT: flat_store_dword v[0:1], v4 @@ -136,10 +136,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9V4-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -148,10 +148,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, 
s[8:9], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9V5-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off @@ -190,10 +190,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V4-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -202,10 +202,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V5-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll index 2a693e1001cd9..30890541df23c 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll @@ -276,23 +276,23 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ;. 
; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V4: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes 
#[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V5: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" 
"amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. 
; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V6: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR4]] = { 
"amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll index d09b4fda1b697..d3ef1b7f7036f 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll @@ -68,6 +68,6 @@ if.end: ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index ccfb8f1d1fe9f..92836d8417ed3 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -486,7 +486,7 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_NOP 0, implicit-def $agpr0 @@ -516,7 +516,7 @@ body: | S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 + INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 S_ENDPGM 0 ... 
@@ -1368,7 +1368,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1408,7 +1408,7 @@ body: | undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 @@ -1726,7 +1726,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1763,7 +1763,7 @@ body: | undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38600713 /* 
reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index aa75dd1386396..9cbdc3867e374 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9043978 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9043977 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9043978 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9043977 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: 
INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7798793 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7995401 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7733258 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8323082 /* regdef:AReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: 
INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8060938 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll index 8e87256c24ce5..71a330efc74c6 100644 --- a/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll @@ -55,8 +55,8 @@ define amdgpu_kernel void @issue120256_private(ptr addrspace(1) %out) { ; FIXME: Inference of amdgpu-no-queue-ptr should not depend on code object version. !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll new file mode 100644 index 0000000000000..90fcb5191c353 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll @@ -0,0 +1,554 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o %t.bc +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 %t.bc -o - | FileCheck --check-prefixes=CHECK-UNKNOWN %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %t.bc -o - | FileCheck -check-prefixes=CHECK-MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 %t.bc -o - | FileCheck --check-prefixes=CHECK-G-UNKNOWN %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %t.bc -o - | FileCheck -check-prefixes=CHECK-G-MESA3D %s + +declare i32 @llvm.amdgcn.cluster.id.x() #0 +declare i32 @llvm.amdgcn.cluster.id.y() #0 +declare i32 @llvm.amdgcn.cluster.id.z() #0 + +define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { +; CHECK-UNKNOWN-LABEL: test_cluster_id_x: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; +; CHECK-MESA3D-LABEL: test_cluster_id_x: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: 
amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 2 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 0 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; 
CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 8 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 2 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_cluster_id_x: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; CHECK-G-UNKNOWN: 
COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; +; CHECK-G-MESA3D-LABEL: test_cluster_id_x: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 2 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 0 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 0 +; CHECK-G-MESA3D-NEXT: 
enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 8 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 2 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 
@llvm.amdgcn.cluster.id.x() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_cluster_id_y: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; +; CHECK-MESA3D-LABEL: test_cluster_id_y: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 2 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: 
enable_vgpr_workitem_id = 0 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 8 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 2 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: 
runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_cluster_id_y: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; +; CHECK-G-MESA3D-LABEL: test_cluster_id_y: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 2 +; 
CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 0 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 8 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 2 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: 
debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.id.y() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_cluster_id_z: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: s_lshr_b32 s2, ttmp7, 16 +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; +; CHECK-MESA3D-LABEL: test_cluster_id_z: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; 
CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 2 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 0 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: 
workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 8 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 3 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; CHECK-MESA3D-NEXT: s_lshr_b32 s2, ttmp7, 16 +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_cluster_id_z: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_lshr_b32 s2, ttmp7, 16 +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; +; CHECK-G-MESA3D-LABEL: test_cluster_id_z: +; CHECK-G-MESA3D: .amd_kernel_code_t 
+; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 2 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 0 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: 
enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 8 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 3 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; CHECK-G-MESA3D-NEXT: s_lshr_b32 s2, ttmp7, 16 +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll new file mode 100644 index 0000000000000..aa3b7b3606fd8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll @@ -0,0 +1,1258 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-UNKNOWN %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-G-UNKNOWN %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-G-MESA3D %s + +declare i32 @llvm.amdgcn.cluster.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.cluster.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.cluster.workgroup.id.z() #0 + +define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_workgroup_id_x: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: s_and_b32 s2, ttmp6, 15 +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_id_x: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; 
CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: 
workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: s_and_b32 s2, ttmp6, 15 +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_x: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_and_b32 s2, ttmp6, 15 +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_id_x: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; 
CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 
+; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: s_and_b32 s2, ttmp6, 15 +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.id.x() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1,2,2" { +; CHECK-UNKNOWN-LABEL: 
test_workgroup_id_x_optimized: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_id_x_optimized: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; 
CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 1 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; 
CHECK-G-UNKNOWN-LABEL: test_workgroup_id_x_optimized: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_id_x_optimized: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; 
CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 1 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
CHECK-G-MESA3D-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.id.x() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_workgroup_id_y: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_id_y: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: 
enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; 
CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_y: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_id_y: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; 
CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: 
reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.id.y() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" { +; CHECK-UNKNOWN-LABEL: test_workgroup_id_y_optimized: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_id_y_optimized: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: 
kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: 
is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 1 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_y_optimized: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_id_y_optimized: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: 
amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: 
is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 1 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.id.y() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_workgroup_id_z: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 
0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_id_z: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: 
enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_z: +; 
CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_id_z: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; 
CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; 
%bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { +; CHECK-UNKNOWN-LABEL: test_workgroup_flat_id: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4) +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_flat_id: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; 
CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; 
CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4) +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_flat_id: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4) +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_flat_id: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; 
CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 
+; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4) +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.flat.id() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,2,1" { +; CHECK-UNKNOWN-LABEL: test_workgroup_id_z_optimized: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_id_z_optimized: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: 
amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; 
CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 1 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_z_optimized: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_id_z_optimized: +; CHECK-G-MESA3D: 
.amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; 
CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 1 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v0, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, 
!"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll new file mode 100644 index 0000000000000..afe37e371fbc3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-UNKNOWN %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-G-UNKNOWN %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-G-MESA3D %s + +declare i32 @llvm.amdgcn.cluster.workgroup.max.flat.id() #0 + +define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_workgroup_max_flat_id: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40018 +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_max_flat_id: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; 
CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: 
workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40018 +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_flat_id: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40018 +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_max_flat_id: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: 
amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: 
enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40018 +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.max.flat.id() + store i32 %id, ptr addrspace(1) %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, 
!"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll new file mode 100644 index 0000000000000..7ea4fa5373e57 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll @@ -0,0 +1,1077 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-UNKNOWN %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-G-UNKNOWN %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-G-MESA3D %s + +declare i32 @llvm.amdgcn.cluster.workgroup.max.id.x() #0 +declare i32 @llvm.amdgcn.cluster.workgroup.max.id.y() #0 +declare i32 @llvm.amdgcn.cluster.workgroup.max.id.z() #0 + +define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_x: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_max_id_x: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: 
kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: 
is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_x: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_x: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: 
amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: 
enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.x() + store i32 %id, ptr addrspace(1) %out + ret void 
+} + +define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" { +; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_x_optimized: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_max_id_x_optimized: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: 
enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_x_optimized: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_x_optimized: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: 
enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; 
CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.x() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_y: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_max_id_y: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; 
CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; 
CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_y: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_y: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; 
CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 
+; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.y() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" { +; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_y_optimized: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_max_id_y_optimized: +; CHECK-MESA3D: .amd_kernel_code_t +; 
CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: 
enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_y_optimized: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: 
s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_y_optimized: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: 
enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.y() + store i32 %id, ptr addrspace(1) %out 
+ ret void +} + +define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { +; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_z: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40014 +; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_max_id_z: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; 
CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; 
CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40014 +; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_z: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40014 +; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_z: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: 
enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 
+; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40014 +; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" { +; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_z_optimized: +; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 6 +; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: s_endpgm +; +; CHECK-MESA3D-LABEL: test_workgroup_max_id_z_optimized: +; CHECK-MESA3D: .amd_kernel_code_t +; CHECK-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; CHECK-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-MESA3D-NEXT: 
granulated_wavefront_sgpr_count = 0 +; CHECK-MESA3D-NEXT: priority = 0 +; CHECK-MESA3D-NEXT: float_mode = 240 +; CHECK-MESA3D-NEXT: priv = 0 +; CHECK-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-MESA3D-NEXT: debug_mode = 0 +; CHECK-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-MESA3D-NEXT: enable_exception = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-MESA3D-NEXT: private_element_size = 1 +; CHECK-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-MESA3D-NEXT: is_debug_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: 
gds_segment_byte_size = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-MESA3D-NEXT: wavefront_size = 5 +; CHECK-MESA3D-NEXT: call_convention = -1 +; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 6 +; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-MESA3D-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-MESA3D-NEXT: s_endpgm +; +; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_z_optimized: +; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, 6 :: v_dual_mov_b32 v1, 0 +; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: s_endpgm +; +; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_z_optimized: +; CHECK-G-MESA3D: .amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: amd_code_version_major = 1 +; CHECK-G-MESA3D-NEXT: amd_code_version_minor = 2 +; CHECK-G-MESA3D-NEXT: amd_machine_kind = 1 +; CHECK-G-MESA3D-NEXT: amd_machine_version_major = 12 +; CHECK-G-MESA3D-NEXT: amd_machine_version_minor = 5 +; CHECK-G-MESA3D-NEXT: amd_machine_version_stepping = 0 +; CHECK-G-MESA3D-NEXT: kernel_code_entry_byte_offset = 256 +; 
CHECK-G-MESA3D-NEXT: kernel_code_prefetch_byte_size = 0 +; CHECK-G-MESA3D-NEXT: granulated_workitem_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: granulated_wavefront_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: priority = 0 +; CHECK-G-MESA3D-NEXT: float_mode = 240 +; CHECK-G-MESA3D-NEXT: priv = 0 +; CHECK-G-MESA3D-NEXT: enable_dx10_clamp = 0 +; CHECK-G-MESA3D-NEXT: debug_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_ieee_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_wgp_mode = 0 +; CHECK-G-MESA3D-NEXT: enable_mem_ordered = 1 +; CHECK-G-MESA3D-NEXT: enable_fwd_progress = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; CHECK-G-MESA3D-NEXT: user_sgpr_count = 8 +; CHECK-G-MESA3D-NEXT: enable_trap_handler = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_x = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_y = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_id_z = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_workgroup_info = 0 +; CHECK-G-MESA3D-NEXT: enable_vgpr_workitem_id = 2 +; CHECK-G-MESA3D-NEXT: enable_exception_msb = 0 +; CHECK-G-MESA3D-NEXT: granulated_lds_size = 0 +; CHECK-G-MESA3D-NEXT: enable_exception = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_buffer = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_queue_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_dispatch_id = 1 +; CHECK-G-MESA3D-NEXT: enable_sgpr_flat_scratch_init = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_private_segment_size = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; CHECK-G-MESA3D-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; CHECK-G-MESA3D-NEXT: enable_wavefront_size32 = 1 +; CHECK-G-MESA3D-NEXT: enable_ordered_append_gds = 0 +; CHECK-G-MESA3D-NEXT: private_element_size = 1 +; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 +; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 +; CHECK-G-MESA3D-NEXT: 
is_debug_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_byte_size = 24 +; CHECK-G-MESA3D-NEXT: workgroup_fbarrier_count = 0 +; CHECK-G-MESA3D-NEXT: wavefront_sgpr_count = 6 +; CHECK-G-MESA3D-NEXT: workitem_vgpr_count = 2 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_vgpr_count = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_first = 0 +; CHECK-G-MESA3D-NEXT: reserved_sgpr_count = 0 +; CHECK-G-MESA3D-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; CHECK-G-MESA3D-NEXT: debug_private_segment_buffer_sgpr = 0 +; CHECK-G-MESA3D-NEXT: kernarg_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: group_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: private_segment_alignment = 4 +; CHECK-G-MESA3D-NEXT: wavefront_size = 5 +; CHECK-G-MESA3D-NEXT: call_convention = -1 +; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 +; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t +; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, 6 :: v_dual_mov_b32 v1, 0 +; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 +; CHECK-G-MESA3D-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-MESA3D-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll index c29c52cc58aa2..5c439f631a426 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll @@ -106,7 +106,7 @@ define amdgpu_ps 
void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale, ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp8_vv: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3 -; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:7 +; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:8 ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[4:7], off @@ -115,12 +115,12 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale, ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp8_vv: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4 -; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:7 +; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:8 ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; GFX1250-GISEL-NEXT: s_endpgm - %cvt = tail call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 7) + %cvt = tail call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 8) store <8 x float> %cvt, ptr addrspace(1) %out, align 16 ret void } @@ -313,12 +313,12 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_sl(<3 x i32> inreg %src, ptr ; GFX1250-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 ; GFX1250-NEXT: v_mov_b32_e32 v12, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_scale_pk16_bf16_bf6 v[2:9], v[10:12], 0x64 scale_sel:7 +; GFX1250-NEXT: v_cvt_scale_pk16_bf16_bf6 v[2:9], v[10:12], 0x64 scale_sel:8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX1250-NEXT: s_endpgm 
- %cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %src, i32 100, i32 7) + %cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %src, i32 100, i32 8) store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll index f504f2caa8632..3e96dfe40f745 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll @@ -158,3 +158,69 @@ entry: store <4 x bfloat> %val, ptr addrspace(1) %use ret void } + +; This is a special case that does not require aligned VGPRs. Make +; sure no copies are required for the unaligned ABI return value. +define { i32, <3 x i32> } @ds_read_b96_tr_b6_no_align2_requirement(ptr addrspace(3) %ptr) { +; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v4 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v4 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep) + %insert0 = insertvalue { i32, <3 
x i32> } poison, i32 0, 0 + %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1 + ret { i32, <3 x i32> } %insert1 +} + +define void @ds_read_b96_tr_b6_no_align2_requirement_agpr(ptr addrspace(3) %ptr) { +; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, v1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX950-SDAG-NEXT: ;;#ASMSTART +; GFX950-SDAG-NEXT: ; use a1 a2 a3 +; GFX950-SDAG-NEXT: ;;#ASMEND +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, v1 +; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX950-GISEL-NEXT: ;;#ASMSTART +; GFX950-GISEL-NEXT: ; use a1 a2 a3 +; GFX950-GISEL-NEXT: ;;#ASMEND +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep) + %val0 = extractelement <3 x i32> %val, i32 0 + %val1 = extractelement <3 x i32> %val, i32 1 + %val2 = extractelement <3 x i32> %val, i32 2 + call void asm sideeffect "; use $0 $1 $2", "{a1},{a2},{a3}"(i32 %val0, i32 %val1, i32 %val2) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll new file mode 100644 index 0000000000000..49607e320bd0a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll @@ 
-0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s + +define amdgpu_ps void @atomic_swap_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) { +; GFX90A-LABEL: atomic_swap_1d_agpr: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: s_endpgm + %data = call i32 asm "; def $0", "=a"() + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + call void asm "; use $0", "a"(i32 %v) + ret void +} + +define amdgpu_ps void @atomic_add_2d_agpr(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +; GFX90A-LABEL: atomic_add_2d_agpr: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: s_endpgm + %data = call i32 asm "; def $0", "=a"() + %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + call void asm "; use $0", "a"(i32 %v) + ret void +} + +; FIXME: This should directly use the AGPRs +define amdgpu_ps void @atomic_cmpswap_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) { +; GFX90A-LABEL: atomic_cmpswap_1d_agpr: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: s_endpgm + %cmp = call i32 asm "; def $0", "=a"() + %swap = call i32 asm "; def $0", "=a"() + %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + call void asm "; use 
$0", "a"(i32 %v) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_i64_agpr(<8 x i32> inreg %rsrc, i32 %s) { +; GFX90A-LABEL: atomic_swap_1d_i64_agpr: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: s_endpgm + %data = call i64 asm "; def $0", "=a"() + %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + call void asm "; use $0", "a"(i64 %v) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d_64_agpr(<8 x i32> inreg %rsrc, i32 %s) { +; GFX90A-LABEL: atomic_cmpswap_1d_64_agpr: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: s_endpgm + %cmp = call i64 asm "; def $0", "=a"() + %swap = call i64 asm "; def $0", "=a"() + %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + call void asm "; use $0", "a"(i64 %v) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) { +; GFX90A-LABEL: atomic_swap_1d_agpr_noret: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: s_endpgm + %data = call i32 asm "; def $0", "=a"() + %unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> 
inreg %rsrc, i32 %s, i32 %t) { +; GFX90A-LABEL: atomic_add_2d_agpr_noret: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: s_endpgm + %data = call i32 asm "; def $0", "=a"() + %unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) { +; GFX90A-LABEL: atomic_cmpswap_1d_agpr_noret: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: s_endpgm + %cmp = call i32 asm "; def $0", "=a"() + %swap = call i32 asm "; def $0", "=a"() + %unused = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) { +; GFX90A-LABEL: atomic_swap_1d_i64_agpr_noret: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: s_endpgm + %data = call i64 asm "; def $0", "=a"() + %unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) { +; GFX90A-LABEL: atomic_cmpswap_1d_64_agpr_noret: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: s_endpgm + %cmp = call i64 asm "; def $0", "=a"() + %swap = call i64 asm "; def $0", "=a"() + %unused = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll index dcac419f8591d..bb4a607fc62d0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll @@ -418,6 +418,114 @@ main_body: ret <4 x float> %v } +define amdgpu_ps void @load_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) { +; GCN-LABEL: load_1d_agpr: +; GCN: ; %bb.0: +; GCN-NEXT: image_load a[0:3], v0, s[0:7] dmask:0xf unorm +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use a[0:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + call void asm sideeffect "; use $0", "a"(<4 x float> %v) + ret void +} + +define amdgpu_ps void @load_2d_agpr(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +; GCN-LABEL: load_2d_agpr: +; GCN: ; %bb.0: +; GCN-NEXT: image_load a[0:3], v[0:1], s[0:7] dmask:0xf unorm +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use a[0:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm + %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + call void asm sideeffect "; use $0", "a"(<4 x float> %v) + ret void +} + +define amdgpu_ps void 
@load_3d_agpr(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { +; GCN-LABEL: load_3d_agpr: +; GCN: ; %bb.0: +; GCN-NEXT: image_load a[0:3], v[0:2], s[0:7] dmask:0xf unorm +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use a[0:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm + %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + call void asm sideeffect "; use $0", "a"(<4 x float> %v) + ret void +} + +define amdgpu_ps void @load_cube_agpr(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { +; GCN-LABEL: load_cube_agpr: +; GCN: ; %bb.0: +; GCN-NEXT: image_load a[0:3], v[0:2], s[0:7] dmask:0xf unorm da +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use a[0:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm + %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + call void asm sideeffect "; use $0", "a"(<4 x float> %v) + ret void +} + +define amdgpu_ps void @store_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) { +; GCN-LABEL: store_1d_agpr: +; GCN: ; %bb.0: +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def a[0:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: image_store a[0:3], v0, s[0:7] dmask:0xf unorm +; GCN-NEXT: s_endpgm + %vdata = call <4 x float> asm "; def $0", "=a"() + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @store_2d_agpr(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +; GCN-LABEL: store_2d_agpr: +; GCN: ; %bb.0: +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def a[0:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: image_store a[0:3], v[0:1], s[0:7] dmask:0xf unorm +; GCN-NEXT: s_endpgm + %vdata = call <4 x float> asm "; def $0", "=a"() + call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void 
@store_3d_agpr(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { +; GCN-LABEL: store_3d_agpr: +; GCN: ; %bb.0: +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def a[0:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: image_store a[0:3], v[0:2], s[0:7] dmask:0xf unorm +; GCN-NEXT: s_endpgm + %vdata = call <4 x float> asm "; def $0", "=a"() + call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @store_cube_agpr(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { +; GCN-LABEL: store_cube_agpr: +; GCN: ; %bb.0: +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def a[0:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: image_store a[0:3], v[0:2], s[0:7] dmask:0xf unorm da +; GCN-NEXT: s_endpgm + %vdata = call <4 x float> asm "; def $0", "=a"() + call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1 declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 9e1815b48abfd..56215ca20651a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -2,10 +2,12 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck 
-check-prefixes=GFX1250,GFX1250-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; SI-LABEL: is_private_vgpr: @@ -46,17 +48,33 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; +; GFX1250-SDAG-LABEL: is_private_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v1 +; GFX1250-SDAG-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 +; GFX1250-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-SDAG-NEXT: s_endpgm +; ; CI-GISEL-LABEL: is_private_vgpr: ; CI-GISEL: ; %bb.0: ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -79,13 +97,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_private_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off @@ -93,18 +110,34 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_private_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, 
off ; GFX11-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: is_private_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v1, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id %ptr = load volatile ptr, ptr addrspace(1) %gep @@ -156,10 +189,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-SDAG-LABEL: is_private_sgpr: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-SDAG-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 @@ -170,6 +203,24 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1 ; GFX9-SDAG-NEXT: s_endpgm ; +; GFX1250-SDAG-LABEL: is_private_sgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; 
GFX1250-SDAG-NEXT: s_cmp_lt_u32 s0, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %bb0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: .LBB1_2: ; %bb1 +; GFX1250-SDAG-NEXT: s_endpgm +; ; CI-GISEL-LABEL: is_private_sgpr: ; CI-GISEL: ; %bb.0: ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -190,10 +241,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-GISEL-LABEL: is_private_sgpr: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, s1 ; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -204,10 +255,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, s3 +; GFX10-NEXT: s_cmp_lg_u32 s3, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -218,10 +269,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_private_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-NEXT: s_mov_b64 
s[0:1], src_private_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, s3 +; GFX11-NEXT: s_cmp_lg_u32 s3, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -229,6 +280,22 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 ; GFX11-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: is_private_sgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, s0 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s0, 0x4000000 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %bb0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: .LBB1_2: ; %bb1 +; GFX1250-GISEL-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) br i1 %val, label %bb0, label %bb1 @@ -244,4 +311,5 @@ bb1: ; CI: {{.*}} ; GFX10-GISEL: {{.*}} ; GFX11-GISEL: {{.*}} +; GFX1250: {{.*}} ; SI-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index f1dcc93172fb1..63333ed165a32 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -81,12 +81,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 
v[0:1], v0, s[0:1] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off @@ -94,15 +94,14 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX1250-LABEL: is_local_vgpr: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX1250-NEXT: global_store_b32 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm @@ -129,13 +128,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_local_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off @@ -143,14 +141,14 @@ define 
amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_local_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -240,10 +238,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX9-SDAG-LABEL: is_local_sgpr: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-SDAG-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 @@ -256,10 +254,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX1250-SDAG-LABEL: is_local_sgpr: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s2, s1 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 @@ -291,10 +289,10 @@ define amdgpu_kernel 
void @is_local_sgpr(ptr %ptr) { ; ; GFX9-GISEL-LABEL: is_local_sgpr: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, s1 ; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -305,10 +303,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, s3 +; GFX10-NEXT: s_cmp_lg_u32 s3, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -319,10 +317,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_local_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, s3 +; GFX11-NEXT: s_cmp_lg_u32 s3, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -333,10 +331,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX1250-GISEL-LABEL: is_local_sgpr: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 
-; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s3, s1 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %bb0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll index d91b03ca4461d..d9f2fc55709a6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll @@ -320,3 +320,57 @@ entry: store <8 x bfloat> %val, ptr addrspace(1) %use ret void } + +; This is a special case that does not require aligned VGPRs. Make +; sure no copies are required for the unaligned ABI return value. +define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr6_b96_vaddr_no_align2_requirement: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v[0:1], off offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2 +; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep) + %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 + %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1 + ret { i32, <3 x i32> } %insert1 +} + +define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr6_b96_saddr_no_align2_requirement: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: global_load_tr6_b96 v[2:4], 
v0, s[0:1] offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2 +; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep) + %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 + %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1 + ret { i32, <3 x i32> } %insert1 +} + +define { i32, <3 x i32> } @ds_load_tr6_b96_no_align2_requirement(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: ds_load_tr6_b96_no_align2_requirement: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ds_load_tr6_b96 v[2:4], v0 offset:32 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2 +; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep) + %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 + %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1 + ret { i32, <3 x i32> } %insert1 +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll index 303ea50dc16cc..12a998ad82cd2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll @@ -87,8 +87,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v0, 2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: 
v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -191,8 +190,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -256,8 +254,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v1, 2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 @@ -308,8 +305,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -424,8 +420,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v1, 2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 @@ -476,8 +471,7 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -513,8 +507,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 @@ -538,8 +531,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index ff77d5ccbe312..5ab8706f28f5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -59,8 +59,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: 
global_store_dwordx4 v1, a[28:31], s[34:35] offset:112 @@ -117,8 +116,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112 @@ -175,8 +173,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 2 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112 @@ -233,8 +230,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112 @@ -283,8 +279,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: 
global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 @@ -319,8 +314,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: s_nop 10 ; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 @@ -347,8 +341,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 2 +; GFX90A-VGPR-NEXT: s_nop 10 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 @@ -375,8 +368,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 2 +; GFX942-VGPR-NEXT: s_nop 10 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 @@ -505,8 +497,7 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 @@ -542,8 +533,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: s_nop 10 ; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 @@ -570,8 +560,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 2 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 @@ -599,8 +588,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 2 +; GFX942-VGPR-NEXT: s_nop 10 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] 
offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 @@ -632,8 +620,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; @@ -671,8 +658,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 2 +; GFX90A-VGPR-NEXT: s_nop 10 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX90A-VGPR-NEXT: s_endpgm ; @@ -795,8 +781,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] @@ -823,8 +808,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] @@ -847,8 +831,7 @@ 
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] @@ -871,8 +854,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] @@ -896,8 +878,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -914,8 +895,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: 
global_store_dwordx4 v0, a[0:3], s[0:1] @@ -932,8 +912,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -950,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -975,8 +953,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -993,8 +970,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 
7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1011,8 +987,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1029,8 +1004,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1054,8 +1028,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1072,8 +1045,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0 ; GFX942-NEXT: 
v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1090,8 +1062,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1108,8 +1079,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1133,8 +1103,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1151,8 +1120,7 @@ define 
amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1169,8 +1137,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1187,8 +1154,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1212,8 +1178,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; 
GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1230,8 +1195,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1248,8 +1212,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1266,8 +1229,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1299,8 +1261,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 
abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1325,8 +1286,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1354,8 +1314,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1383,8 +1342,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1416,8 +1374,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; 
GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1442,8 +1399,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1468,8 +1424,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1494,8 +1449,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; 
GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1527,8 +1481,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1553,8 +1506,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1579,8 +1531,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1605,8 +1556,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; 
GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1639,8 +1589,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1666,8 +1615,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1695,8 +1643,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1724,8 +1671,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 
1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1757,8 +1703,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1784,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1813,8 +1757,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1842,8 +1785,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; 
GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index beda16c17a5c9..dc4c929124fec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -31,26 +31,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32) define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; 
GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -73,47 +73,26 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: 
test_mfma_i32_16x16x32i8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8: ; 
GFX950-GISEL: ; %bb.0: ; %bb @@ -135,7 +114,26 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -193,8 +191,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: 
global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -230,8 +227,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -267,8 +263,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -304,8 +299,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -319,26 +313,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; 
GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -361,47 
+355,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; 
GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -423,7 +396,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -452,26 +444,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: 
s_endpgm +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -494,47 +486,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 
a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; 
GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -556,7 +527,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -585,26 +575,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: 
test_mfma_f32_16x16x32_fp8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: 
test_mfma_f32_16x16x32_fp8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -627,47 +617,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 
a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -689,7 +658,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -718,26 +706,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: 
global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -760,47 +748,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: 
v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: 
v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -822,7 +789,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -880,8 +866,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr 
addrspace(1) %arg) ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -917,8 +902,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -954,8 +938,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -991,8 +974,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; 
GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1035,8 +1017,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1072,8 +1053,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1109,8 +1089,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1146,8 +1125,7 @@ 
define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1190,8 +1168,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1227,8 +1204,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1264,8 +1240,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1301,8 +1276,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1345,8 +1319,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1382,8 +1355,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: 
global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1419,8 +1391,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1456,8 +1427,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1471,46 +1441,85 @@ bb: } define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX942-VGPRCD-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 
v[8:9], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX942-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] 
+; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16: ; GFX942-AGPRCD: ; %bb.0: ; %bb ; GFX942-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -1533,47 +1542,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-AGPRCD-NEXT: s_nop 5 ; GFX942-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] ; GFX942-AGPRCD-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; 
GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16: ; GFX950-AGPRCD: ; %bb.0: ; %bb ; GFX950-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -1604,66 +1572,121 @@ bb: } define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { -; 
GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; 
GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; 
GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], 
v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: +; GFX950-GISEL: ; %bb.0: ; %bb +; 
GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -1701,7 +1724,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx8 
s[24:31], s[4:5], 0x24 @@ -1739,67 +1761,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; 
GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -1837,7 +1798,6 @@ define amdgpu_kernel void 
@test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -1883,46 +1843,85 @@ bb: } define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; 
GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] 
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX942-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], 
s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16: ; GFX942-AGPRCD: ; %bb.0: ; %bb ; GFX942-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -1945,47 +1944,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-AGPRCD-NEXT: s_nop 5 ; GFX942-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] ; GFX942-AGPRCD-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; 
GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16: ; GFX950-AGPRCD: ; %bb.0: ; %bb ; GFX950-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -2016,66 +1974,121 @@ bb: } define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; 
GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX950-SDAG-NEXT: 
v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: 
v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -2113,7 +2126,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -2151,67 +2163,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; 
GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; 
GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -2249,7 +2200,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -2295,53 +2245,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; 
GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: 
test_smfmac_i32_16x16x64_i8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 
v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 
v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2367,7 +2363,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -2398,54 +2393,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; 
GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2471,7 
+2418,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -2510,73 +2456,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; 
GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: 
test_smfmac_i32_32x32x32_i8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; 
GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 
0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2618,7 +2626,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2661,74 +2668,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], 
s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; 
GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2770,7 +2709,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; 
GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2821,53 +2759,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; 
GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2893,7 +2877,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -2924,54 +2907,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: 
global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], 
s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2997,7 +2932,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3036,53 +2970,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; 
GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: 
test_smfmac_i32_16x16x64_bf8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3108,7 +3088,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3139,54 +3118,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 
abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3212,7 +3143,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3251,53 +3181,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: 
v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: 
v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: 
s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3323,7 +3299,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3354,54 +3329,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], 
s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; 
GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3427,7 +3354,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3466,53 +3392,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; 
GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3538,7 +3510,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3569,54 +3540,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: -; GFX950-VGPRCD-GISEL: ; %bb.0: 
; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3642,7 +3565,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3681,73 +3603,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: -; 
GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; 
GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 
v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: 
v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3789,7 +3773,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3832,74 +3815,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], 
s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], 
s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3941,7 +3856,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3992,73 +3906,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; 
GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, 
s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; 
GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], 
s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4100,7 +4076,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4143,74 +4118,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; 
GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; 
GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4252,7 +4159,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4303,73 +4209,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; 
GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, 
s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; 
GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], 
s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4411,7 +4379,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4454,74 +4421,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; 
GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; 
GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4563,7 +4462,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4614,73 +4512,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; 
GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, 
s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; 
GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], 
s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4722,7 +4682,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4765,74 +4724,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; 
GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; 
GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4874,7 +4765,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4928,5 +4818,9 @@ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX942: {{.*}} ; GFX942-VGPRCD: {{.*}} +; GFX942-VGPRCD-GISEL: {{.*}} +; GFX942-VGPRCD-SDAG: {{.*}} ; GFX950: {{.*}} ; GFX950-VGPRCD: {{.*}} +; GFX950-VGPRCD-GISEL: {{.*}} +; GFX950-VGPRCD-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 284ced1727b7e..033a35f69a0bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -178,8 +178,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -223,8 +222,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -394,8 +392,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] ; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 10 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -428,8 +425,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: s_nop 1 ; GCN-NEXT: 
v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 10 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 078a043b94604..753206206180a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -479,8 +479,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 8 ; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 @@ -598,8 +597,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 ; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: s_nop 8 ; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 @@ -864,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 8 ; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; 
GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 @@ -983,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 ; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: s_nop 8 ; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 @@ -1169,8 +1165,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1210,8 +1205,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: s_nop 11 ; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 ; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 ; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1234,8 +1228,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 @@ -1342,8 +1335,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: 
v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1383,8 +1375,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: s_nop 11 ; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 ; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 ; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1407,8 +1398,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 @@ -2199,8 +2189,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2228,8 +2217,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: 
global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2257,8 +2245,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] ; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: s_nop 10 ; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2286,8 +2273,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] ; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: s_nop 10 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2384,8 +2370,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2413,8 +2398,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; 
GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2442,8 +2426,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: s_nop 10 ; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2471,8 +2454,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: s_nop 10 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -3083,8 +3065,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 8 ; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 @@ -3205,8 +3186,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], 
v[40:43], v[16:31] -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 @@ -3497,8 +3477,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 8 ; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 @@ -3619,8 +3598,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 @@ -3827,8 +3805,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3868,8 +3845,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: s_nop 11 ; HEURRC-NEXT: 
v_accvgpr_read_b32 v0, a0 ; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 ; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3892,8 +3868,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 @@ -4000,8 +3975,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4041,8 +4015,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: s_nop 11 ; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 ; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 ; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4065,8 +4038,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 @@ -4932,8 +4904,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: s_nop 
1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -4961,8 +4932,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -4995,8 +4965,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] ; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: s_nop 10 ; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5029,8 +4998,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] ; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: s_nop 10 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5142,8 +5110,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: 
s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5171,8 +5138,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -5205,8 +5171,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: s_nop 10 ; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5239,8 +5204,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: s_nop 10 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll index 856185b17e5fd..d24f1f0b526c3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll @@ -50,8 +50,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v1, 2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 @@ -103,8 +102,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -138,8 +136,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 @@ -163,8 +160,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm bb: diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index c1508c1675fe0..7e30af96bb8b9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -97,8 +97,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -233,8 +232,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -337,8 +335,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -394,8 +391,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: 
global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -451,8 +447,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 @@ -514,8 +509,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -582,8 +576,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -634,8 +627,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: 
global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -669,8 +661,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -696,8 +687,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 0 +; GFX942-VGPR-NEXT: s_nop 8 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -872,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -940,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 
; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -992,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1028,8 +1015,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1056,8 +1042,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 @@ -1091,8 +1076,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1120,8 +1104,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1145,8 +1128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; @@ -1165,8 +1147,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 9 ; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; @@ -1183,8 +1164,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: @@ -1275,8 +1255,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 
a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -1415,8 +1394,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -1523,8 +1501,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 @@ -1584,8 +1561,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: v_mov_b32_e32 v5, s3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 @@ -1645,8 +1621,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-VGPR-NEXT: v_mov_b32_e32 v37, s3 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; 
GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112 @@ -1714,8 +1689,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -1785,8 +1759,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -1840,8 +1813,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1878,8 +1850,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 
7 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 9 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1908,8 +1879,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -2108,8 +2078,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -2179,8 +2148,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -2234,8 +2202,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -2273,8 +2240,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 9 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -2303,8 +2269,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -2343,8 +2308,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2375,8 +2339,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; 
LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2403,8 +2366,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; @@ -2536,8 +2498,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 @@ -2658,8 +2619,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 @@ -2748,8 +2708,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: 
global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -2805,8 +2764,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -2862,8 +2820,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 @@ -2925,8 +2882,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 @@ -2993,8 +2949,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 @@ -3045,8 +3000,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -3080,8 +3034,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 9 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -3107,8 +3060,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -3145,8 +3097,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; 
NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 @@ -3177,8 +3128,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 @@ -3211,8 +3161,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: s_nop 8 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -3228,8 +3177,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -3244,8 +3192,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; 
GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -3645,8 +3592,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -3782,8 +3728,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -3887,8 +3832,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -3945,8 +3889,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; 
GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -4003,8 +3946,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 @@ -4068,8 +4010,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -4136,8 +4077,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -4188,8 +4128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 
a[0:15] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -4224,8 +4163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -4252,8 +4190,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 0 +; GFX942-VGPR-NEXT: s_nop 8 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -4502,8 +4439,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -4541,8 +4477,7 @@ define amdgpu_kernel void 
@test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -4578,8 +4513,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: s_nop 8 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -4610,8 +4544,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 0 +; GFX942-VGPR-NEXT: s_nop 8 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -4649,8 +4582,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -4691,8 +4623,7 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -4730,8 +4661,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -4750,8 +4680,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -4768,8 +4697,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -4821,8 +4749,7 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 @@ -4889,8 +4816,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 @@ -4948,8 +4874,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 @@ -4970,8 +4895,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -4990,8 +4914,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-VGPR-NEXT: 
v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 @@ -5131,8 +5054,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -5186,8 +5108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -5242,8 +5163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -5274,8 +5194,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, 
v2, a[0:15] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -5304,8 +5223,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 0 +; GFX942-VGPR-NEXT: s_nop 8 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5357,8 +5275,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 @@ -5457,8 +5374,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 @@ -5558,8 +5474,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 @@ -5611,8 +5526,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 @@ -5679,8 +5593,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 @@ -5965,8 +5878,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -6061,8 +5973,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: 
s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -6125,8 +6036,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 @@ -6156,8 +6066,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 @@ -6187,8 +6096,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0) ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index f78ea92b4840b..aae14c8cc87b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll 
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -23,8 +23,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -47,8 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -71,8 +69,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -95,8 +92,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -119,8 +115,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GCN-NEXT: 
v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -143,8 +138,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -167,8 +161,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -191,8 +184,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -216,8 +208,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 
+; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -241,8 +232,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -266,8 +256,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -291,8 +280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -316,8 +304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -341,8 +328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: 
v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -366,8 +352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -391,8 +376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -416,8 +400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -441,8 +424,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, 
a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -466,8 +448,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -491,8 +472,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -517,8 +497,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -542,8 +521,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -566,8 +544,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -591,8 +568,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -616,8 +592,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -641,8 +616,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -666,8 +640,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; 
GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -691,8 +664,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -716,8 +688,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -741,8 +712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -766,8 +736,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -888,8 +857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 
a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -913,8 +881,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -938,8 +905,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -963,8 +929,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1180,8 +1145,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 
@@ -1205,8 +1169,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1230,8 +1193,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1255,8 +1217,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1425,11 +1386,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1447,10 +1408,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1468,10 +1429,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1509,8 +1470,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1540,8 +1500,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_write_b32 
a3, v1 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1567,10 +1526,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1592,10 +1551,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1621,10 +1580,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; 
SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1646,10 +1605,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1675,10 +1634,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1700,10 +1659,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: 
v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1721,10 +1680,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b32_e32 v17, s16 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1750,10 +1709,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 +; SDAG-NEXT: v_mov_b32_e32 v9, s24 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1775,10 +1734,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 ; 
GISEL-NEXT: v_accvgpr_write_b32 a2, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 +; GISEL-NEXT: v_mov_b32_e32 v9, s24 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1789,22 +1748,41 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v20, -2 +; SDAG-NEXT: v_mov_b32_e32 v21, 33 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], 
v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 33 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <4 x float> %result } @@ -1813,15 +1791,15 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v20, -2 +; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1836,10 +1814,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1853,16 +1831,15 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1880,8 +1857,7 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1919,11 +1895,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-NEXT: v_mov_b32_e32 v17, s9 ; SDAG-NEXT: v_mov_b32_e32 v18, s10 ; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v21, s12 +; SDAG-NEXT: v_mov_b32_e32 v22, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; @@ -1942,12 +1918,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b32_e32 v20, s29 +; GISEL-NEXT: v_mov_b32_e32 v20, s28 +; GISEL-NEXT: v_mov_b32_e32 v21, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] ; GISEL-NEXT: 
s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) @@ -1960,8 +1936,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: s_movk_i32 s6, 0x41 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v21, -2 +; SDAG-NEXT: v_mov_b32_e32 v22, 0x41 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 @@ -1983,18 +1960,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 ; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v21, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -2007,11 +1984,10 @@ define amdgpu_kernel void 
@test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2023,8 +1999,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: s_movk_i32 s6, 0x41 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v21, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0x41 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 @@ -2046,18 +2023,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: 
s_nop 11 +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 ; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v21, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -2070,11 +2047,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2086,8 +2062,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v21, -2 +; SDAG-NEXT: v_mov_b32_e32 v22, 1.0 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: 
s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -2108,16 +2086,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v20, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v21, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -2129,14 +2109,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; 
GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2148,8 +2125,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v21, 0.15915494 +; SDAG-NEXT: v_mov_b32_e32 v22, 1.0 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -2170,16 +2149,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v20, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v21, 0.15915494 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: 
v_mov_b64_e32 v[2:3], s[10:11] @@ -2191,14 +2172,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2216,8 +2194,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2238,8 +2215,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 
@@ -2250,43 +2226,81 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v20, 1 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: 
v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <4 x float> %result } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: 
v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 1 +; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 11 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <4 x float> %result } @@ -2305,8 +2319,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2329,8 +2342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2399,8 +2411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2423,8 +2434,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2447,8 +2457,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2471,8 +2480,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 24af3fa5ff9b7..f0205a3a788ed 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -38,8 +38,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -83,8 +82,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -135,8 +133,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -180,8 +177,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: 
v_accvgpr_read_b32 v1, a1 @@ -232,8 +228,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -277,8 +272,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -329,8 +323,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -374,8 +367,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -426,8 +418,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], 
v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -471,8 +462,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -523,8 +513,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -568,8 +557,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -620,8 +608,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: 
v_accvgpr_read_b32 v1, a1 @@ -665,8 +652,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -717,8 +703,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -762,8 +747,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -813,8 +797,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -866,8 +849,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 
op_sel_hi:[0,0,0] blgp:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -911,8 +893,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -961,8 +942,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1013,8 +993,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1062,8 +1041,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1114,8 +1092,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; 
GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1163,8 +1140,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1213,8 +1189,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1262,8 +1237,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1315,8 +1289,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1360,8 +1333,7 @@ 
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1410,8 +1382,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1463,8 +1434,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1508,8 +1478,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1559,8 +1528,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; 
GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1611,8 +1579,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1660,8 +1627,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1712,8 +1678,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1761,8 +1726,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1811,8 +1775,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1860,8 +1823,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1912,8 +1874,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1961,8 +1922,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2013,8 +1973,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2062,8 +2021,7 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2112,8 +2070,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2160,8 +2117,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2209,8 +2165,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2257,8 +2212,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; 
GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2309,8 +2263,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2358,8 +2311,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2410,8 +2362,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2459,8 +2410,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2509,8 +2459,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; 
GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2557,8 +2506,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2606,8 +2554,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2654,8 +2601,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2703,8 +2649,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, 
a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2751,8 +2696,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2800,8 +2744,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2848,8 +2791,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2897,8 +2839,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2946,8 +2887,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; 
GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2996,8 +2936,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3045,8 +2984,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3095,8 +3033,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3143,8 +3080,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3192,8 
+3128,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3240,8 +3175,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3289,8 +3223,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3337,8 +3270,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3387,12 +3319,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; 
GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3436,11 +3368,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3484,11 +3416,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3551,8 +3483,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x 
i32> inr ; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3607,8 +3538,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3659,10 +3589,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3709,10 +3639,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], 
a[0:15], v8, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3763,10 +3693,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3813,10 +3743,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3867,10 +3797,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3917,10 +3847,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3963,10 +3893,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_write_b32 a13, s25 ; GCN-NEXT: v_accvgpr_write_b32 a14, s26 ; GCN-NEXT: v_accvgpr_write_b32 a15, s27 +; GCN-NEXT: v_mov_b32_e32 v17, s28 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4029,8 +3959,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 
v1, a1 @@ -4089,8 +4018,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4114,58 +4042,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: 
v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 33 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4183,9 +4065,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4205,11 +4086,12 @@ define <16 x 
float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v31, 33 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4227,9 +4109,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4248,16 +4129,17 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> 
%arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4275,9 +4157,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, 1.0 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4297,11 +4178,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 ; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; 
GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4319,9 +4201,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, 1.0 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4340,111 +4221,17 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 
a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1.0, -2 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: 
v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0.15915494, 1.0 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: s_movk_i32 s0, 0x41 -; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4462,9 +4249,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4484,12 +4270,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 ; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4508,8 +4294,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, 
v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4528,54 +4313,330 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216) ret <16 x float> %result } -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, s18 -; 
SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; SDAG-NEXT: v_mov_b32_e32 v32, s1 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; SDAG-NEXT: s_endpgm -; +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 1.0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], 
v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: 
v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: 
s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494 +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 
op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; 
SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + ret <16 x float> %result +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; 
SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; SDAG-NEXT: s_endpgm +; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 @@ -4598,12 +4659,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_mov_b32_e32 v32, s1 +; GISEL-NEXT: v_mov_b32_e32 v32, s0 +; GISEL-NEXT: v_mov_b32_e32 v33, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, 
v[0:3], s[2:3] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 @@ -4620,7 +4681,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: s_movk_i32 s2, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v32, -2 +; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v16, s8 @@ -4648,10 +4710,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -4664,6 +4725,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v33, -2 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] @@ -4683,10 +4745,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 
op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -4738,9 +4799,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: v_mov_b32_e32 v1, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0] ; SDAG-NEXT: v_mov_b32_e32 v2, s20 ; SDAG-NEXT: v_mov_b32_e32 v3, s21 ; SDAG-NEXT: v_mov_b32_e32 v4, s22 @@ -4811,10 +4873,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b32_e32 v20, s1 +; GISEL-NEXT: v_mov_b32_e32 v20, s0 +; GISEL-NEXT: v_mov_b32_e32 v21, s1 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] @@ -4852,24 +4915,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v0, 42 +; SDAG-NEXT: v_mov_b32_e32 v1, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v14, s24 +; SDAG-NEXT: v_mov_b32_e32 v15, s25 +; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 @@ -4888,7 +4953,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2 ; 
SDAG-NEXT: v_mov_b32_e32 v2, s20 ; SDAG-NEXT: v_mov_b32_e32 v3, s21 ; SDAG-NEXT: v_mov_b32_e32 v4, s22 @@ -4931,9 +4996,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b32_e32 v20, 25 +; GISEL-NEXT: v_mov_b32_e32 v21, 42 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] @@ -4959,14 +5024,15 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] @@ -4978,7 +5044,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt 
vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 @@ -5027,8 +5093,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 14 ; SDAG-NEXT: v_mov_b32_e32 v16, s20 ; SDAG-NEXT: v_mov_b32_e32 v17, s21 ; SDAG-NEXT: v_mov_b32_e32 v18, s22 @@ -5123,6 +5188,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v32, 42 +; SDAG-NEXT: v_mov_b32_e32 v33, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v16, s12 ; SDAG-NEXT: v_mov_b32_e32 v17, s13 @@ -5151,7 +5218,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v16, s20 ; SDAG-NEXT: v_mov_b32_e32 v17, s21 ; SDAG-NEXT: v_mov_b32_e32 v18, s22 @@ -5195,9 +5262,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b32_e32 v32, 25 +; GISEL-NEXT: v_mov_b32_e32 v33, 42 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] ; 
GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] @@ -5215,10 +5282,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] @@ -5234,7 +5302,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 @@ -5273,8 +5341,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5320,8 +5387,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: 
s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5345,95 +5411,185 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: 
v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 1 +; SDAG-NEXT: v_mov_b32_e32 v32, 0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0 +; GISEL-NEXT: v_mov_b32_e32 v32, 1 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <16 x float> %result } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> 
%arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 
0 +; SDAG-NEXT: v_mov_b32_e32 v32, 1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 15 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 1 +; GISEL-NEXT: v_mov_b32_e32 v32, 0 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: 
v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 15 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <16 x float> %result } @@ -5467,8 +5623,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; 
SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5512,8 +5667,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5564,8 +5718,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5609,8 +5762,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5661,8 +5813,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5705,8 +5856,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; GISEL-NEXT: s_waitcnt 
vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5754,8 +5904,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5805,8 +5954,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5850,8 +5998,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5902,8 +6049,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: 
s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5947,8 +6093,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5998,8 +6143,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6049,8 +6193,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6101,8 +6244,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6145,8 +6287,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: 
s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6194,8 +6335,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index 198cac5834d1f..5475fa2ae5c6e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -133,8 +133,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -172,8 +171,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: 
global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -239,8 +236,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index af26e7adae713..bc72687e260e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -28,8 +28,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; 
GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 ; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96 ; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80 @@ -51,8 +50,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304 @@ -75,8 +73,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496 @@ -99,8 +96,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688 @@ -123,8 +119,7 @@ define amdgpu_kernel void 
@test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880 @@ -159,8 +154,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96 @@ -184,8 +178,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304 @@ -208,8 +201,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: 
s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496 @@ -233,8 +225,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688 @@ -257,8 +248,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880 @@ -293,8 +283,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96 @@ -315,8 +304,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GCN-ILP-NEXT: v_mov_b32_e32 
v0, s1 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304 @@ -336,8 +324,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 ; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400 ; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 @@ -358,8 +345,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592 ; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 @@ -383,8 +369,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 ; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880 @@ -488,8 +473,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; 
GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96 @@ -513,8 +497,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304 @@ -539,8 +522,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480 @@ -563,8 +545,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672 @@ -587,8 +568,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) 
size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864 @@ -623,8 +603,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96 @@ -648,8 +627,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304 @@ -673,8 +651,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496 ; GCN-MAXOCC-NEXT: 
ds_write_b128 v3, a[24:27] offset:16480 @@ -698,8 +675,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672 @@ -722,8 +698,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864 @@ -758,8 +733,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] ; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 @@ -783,8 +757,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; 
GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288 ; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304 @@ -808,8 +781,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 ; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496 ; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480 @@ -830,8 +802,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576 ; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 @@ -855,8 +826,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 ; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880 ; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 5b877f5a2bbb7..aa099b60ef16d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll 
@@ -678,8 +678,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 4 +; GCN-NEXT: s_nop 12 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -785,8 +784,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 4 +; EXACTCUTOFF-NEXT: s_nop 12 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -890,8 +888,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 @@ -915,8 +912,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 @@ -939,8 +935,7 @@ define 
amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 @@ -964,8 +959,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 @@ -988,8 +982,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 @@ -1024,8 +1017,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 1 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 @@ 
-1049,8 +1041,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 1 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 @@ -1073,8 +1064,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 2 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496 @@ -1098,8 +1088,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 1 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688 @@ -1122,8 +1111,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; 
sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 2 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index b25fe8392a60e..6eb9449069a52 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -199,8 +199,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -232,8 +231,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -253,8 +251,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -316,8 +313,7 
@@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -379,8 +375,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -471,8 +466,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -685,8 +679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 10 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -706,8 +699,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 
3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NEXT: v_mov_b32_e32 v2, v14 @@ -734,8 +726,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NEXT: v_mov_b32_e32 v2, v14 @@ -762,8 +753,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NEXT: v_mov_b32_e32 v2, v14 @@ -819,8 +809,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-NEXT: v_mov_b32_e32 v27, v9 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NEXT: v_mov_b32_e32 v2, v14 @@ -1049,8 +1038,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -1082,8 +1070,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: 
v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -1103,8 +1090,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -1166,8 +1152,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -1229,8 +1214,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -1321,8 +1305,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, 
v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2098,8 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -2131,8 +2113,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2152,8 +2133,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2215,8 +2195,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2278,8 +2257,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, 
< ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2370,8 +2348,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2471,8 +2448,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -2504,8 +2480,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2525,8 +2500,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 -; 
SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2588,8 +2562,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2651,8 +2624,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2743,8 +2715,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2844,8 +2815,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -2877,8 +2847,7 @@ define amdgpu_kernel void 
@test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2898,8 +2867,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2961,8 +2929,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3024,8 +2991,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3116,8 +3082,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 
7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3217,8 +3182,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -3250,8 +3214,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -3271,8 +3234,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3334,8 +3296,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: 
v_mov_b32_e32 v2, v14 @@ -3397,8 +3358,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3489,8 +3449,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir index b27bb98985e73..8acf32e60cc9e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir @@ -18,21 +18,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; 
use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -53,27 +53,27 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 
*/, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 %2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %2 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir index 16deb29d19801..84dd92bf710e9 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir @@ -21,9 +21,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -31,9 +31,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: 
[[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -41,10 +41,10 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -52,9 +52,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -62,15 +62,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -88,42 +88,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 
= V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 
1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1, implicit $vcc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1, implicit $vcc SI_RETURN ... 
@@ -144,9 +144,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -154,9 +154,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -164,10 +164,10 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -175,9 +175,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: 
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -185,15 +185,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 8, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 16, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -214,9 +214,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_CO_U32_e64_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -224,9 +224,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -234,9 +234,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -244,9 +244,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, 
[[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -254,14 +254,14 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -279,42 +279,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX942-LABEL: name: 
local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] %0:vgpr_32, %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN implicit %2 ... 
@@ -385,42 +385,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = 
S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:SREG_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 2031625 /* reguse:SREG_32 */, %0 %1:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:SREG_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:SREG_32 */, %1 SI_RETURN ... @@ -443,9 +443,9 @@ body: | ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -454,9 +454,9 @@ body: | ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* 
reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -465,9 +465,9 @@ body: | ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -476,9 +476,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX10-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -487,17 +487,17 @@ body: | ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = 
COPY $sgpr4 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = COPY $sgpr4 %1:sreg_32 = COPY $sgpr5 %2:sreg_32 = S_ADD_I32 %0, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:SREG_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:SREG_32 */, %2 %3:sreg_32 = S_ADD_I32 %1, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:SREG_32 */, %3 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:SREG_32 */, %3 SI_RETURN ... 
@@ -520,9 +520,9 @@ body: | ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute @@ -531,9 +531,9 @@ body: | ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute @@ -542,9 +542,9 @@ body: | ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX942-NEXT: 
[[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute @@ -553,9 +553,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX10-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute @@ -564,17 +564,17 @@ body: | ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* 
reguse:VS_32_Lo128 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = COPY $sgpr4 %1:sreg_32 = COPY $sgpr5 %2:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:SREG_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:SREG_32 */, %2 %3:sreg_32 = S_ADD_I32 %stack.0, %1, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:SREG_32 */, %3 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:SREG_32 */, %3 SI_RETURN ... 
@@ -656,9 +656,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets @@ -667,9 +667,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets @@ -678,9 +678,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets @@ -689,9 +689,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; 
GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets @@ -700,15 +700,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -731,9 +731,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute @@ -742,9 +742,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* 
reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute @@ -753,9 +753,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute @@ -764,9 +764,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 
*/, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute @@ -775,15 +775,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -805,9 +805,9 @@ body: | ; GFX803-NEXT: {{ $}} ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets @@ -815,9 +815,9 @@ body: | ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets @@ -825,9 +825,9 @@ body: | ; GFX942-NEXT: {{ $}} ; 
GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets @@ -836,9 +836,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets @@ -848,16 +848,16 @@ body: | 
; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -880,9 +880,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets @@ -891,9 +891,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, 
[[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets @@ -903,10 +903,10 @@ body: | ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY1]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets @@ -915,9 +915,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: 
[[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets @@ -926,15 +926,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* 
reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -957,9 +957,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute @@ -968,9 +968,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 
%sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute @@ -980,10 +980,10 @@ body: | ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY1]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute @@ -992,9 +992,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, 
[[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute @@ -1003,15 +1003,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, 
%0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir index 264aac047495f..9183fe5d55a9c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir @@ -20,16 +20,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = 
V_ADD_U32_e32 512, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets @@ -37,21 +37,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* 
reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -72,16 +72,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use 
$0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets @@ -89,21 +89,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = 
V_ADD_U32_e32 8, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -124,16 +124,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 
*/, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets @@ -141,21 +141,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM 
&"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -178,9 +178,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets @@ -188,9 +188,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 
/* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets @@ -199,9 +199,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets @@ -209,15 +209,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -240,9 +240,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -250,9 +250,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 
1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -261,9 +261,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -271,15 +271,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: 
[[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -301,9 +301,9 @@ body: | ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -311,9 +311,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - 
; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -322,9 +322,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -332,15 +332,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_]] + ; 
GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -363,9 +363,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -373,9 +373,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -384,9 +384,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, 
implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -394,15 +394,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = 
V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -425,9 +425,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -435,9 +435,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -446,9 +446,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -456,15 +456,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN 
%sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -486,16 +486,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; 
GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier @@ -503,21 +503,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VS_32_Lo128 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* 
reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %1 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll index 29e727b393309..4fa7c29bfde02 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll @@ -455,3 +455,29 @@ loop: exit: ret float %sum } + +define void @dominance_not_in_program_order(ptr addrspace(7) inreg %arg) { +; CHECK-LABEL: define void @dominance_not_in_program_order +; CHECK-SAME: ({ ptr addrspace(8), i32 } inreg [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: .preheader15: +; CHECK-NEXT: [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0 +; CHECK-NEXT: [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1 +; CHECK-NEXT: br label [[DOTLR_PH18:%.*]] +; CHECK: .loopexit: +; CHECK-NEXT: [[SCEVGEP12:%.*]] = add i32 [[LSR_IV11_OFF:%.*]], 16 +; CHECK-NEXT: br label [[DOTLR_PH18]] +; CHECK: .lr.ph18: +; CHECK-NEXT: [[LSR_IV11_OFF]] = phi i32 [ [[ARG_OFF]], [[DOTLOOPEXIT:%.*]] ], [ [[ARG_OFF]], [[DOTPREHEADER15:%.*]] ] +; CHECK-NEXT: br label [[DOTLOOPEXIT]] +; +.preheader15: + br label %.lr.ph18 + +.loopexit: ; preds = %.lr.ph18 + %scevgep12 = getelementptr i8, ptr addrspace(7) %lsr.iv11, i32 16 + br label %.lr.ph18 + +.lr.ph18: ; preds = %.loopexit, %.preheader15 + %lsr.iv11 = phi ptr addrspace(7) [ %arg, %.loopexit ], [ %arg, 
%.preheader15 ] + br label %.loopexit +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll new file mode 100644 index 0000000000000..b66011be8defd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=0 | FileCheck --check-prefixes=CHECK,NOOPT %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=CHECK,OPT-WAVE32 %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=CHECK,OPT-WAVE64 %s + +declare void @foo(i1) + +; Verify that the explicit use of a split cluster barrier isn't optimized away. 
+define amdgpu_kernel void @split_barriers() "amdgpu-flat-work-group-size"="32,32" { +; CHECK-LABEL: define amdgpu_kernel void @split_barriers( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3) +; CHECK-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3) +; CHECK-NEXT: call void @foo(i1 [[ISFIRST]]) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal(i32 -3) + call void @llvm.amdgcn.s.barrier.wait(i16 -3) + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3) + call void @foo(i1 %isfirst) + ret void +} + +define amdgpu_kernel void @s_cluster_barrier() { +; CHECK-LABEL: define amdgpu_kernel void @s_cluster_barrier( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3) +; CHECK-NEXT: br label %[[BB3]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.cluster.barrier() + ret void +} + +define amdgpu_kernel void @s_cluster_barrier_wg32() "amdgpu-flat-work-group-size"="32,32" { +; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32( +; NOOPT-SAME: ) #[[ATTR1]] { +; NOOPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; NOOPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; NOOPT: [[BB2]]: +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3) +; NOOPT-NEXT: br label %[[BB3]] +; NOOPT: [[BB3]]: +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3) +; NOOPT-NEXT: ret void +; +; OPT-WAVE32-LABEL: define amdgpu_kernel void 
@s_cluster_barrier_wg32( +; OPT-WAVE32-SAME: ) #[[ATTR1]] { +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3) +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3) +; OPT-WAVE32-NEXT: ret void +; +; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32( +; OPT-WAVE64-SAME: ) #[[ATTR1]] { +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3) +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3) +; OPT-WAVE64-NEXT: ret void +; + call void @llvm.amdgcn.s.cluster.barrier() + ret void +} + +define amdgpu_kernel void @s_cluster_barrier_wg64() "amdgpu-flat-work-group-size"="64,64" { +; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64( +; NOOPT-SAME: ) #[[ATTR2:[0-9]+]] { +; NOOPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; NOOPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; NOOPT: [[BB2]]: +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3) +; NOOPT-NEXT: br label %[[BB3]] +; NOOPT: [[BB3]]: +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3) +; NOOPT-NEXT: ret void +; +; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64( +; OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] { +; OPT-WAVE32-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; OPT-WAVE32-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; OPT-WAVE32: [[BB2]]: +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3) +; OPT-WAVE32-NEXT: br label %[[BB3]] +; OPT-WAVE32: [[BB3]]: +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3) +; OPT-WAVE32-NEXT: ret void +; +; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64( +; OPT-WAVE64-SAME: ) 
#[[ATTR2:[0-9]+]] { +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3) +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3) +; OPT-WAVE64-NEXT: ret void +; + call void @llvm.amdgcn.s.cluster.barrier() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 2554d99def57f..169a84ff1f86b 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -297,6 +297,6 @@ declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) -attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" } +attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX9ARCH: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll new file mode 100644 index 0000000000000..69439d49e588f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll @@ -0,0 +1,390 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GFX1250-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -global-isel %s -o - | FileCheck -check-prefix=GFX1250-GISEL %s + +define void @test_workgroup_id_x_non_kernel(ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_workgroup_id_x_non_kernel: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-SDAG-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-SDAG-NEXT: s_mul_i32 s0, ttmp9, s0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, ttmp9, s1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_x_non_kernel: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0 +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, s0 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, ttmp9, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.x() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_x_non_kernel_optimized_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1024,1024,1024" { +; GFX1250-SDAG-LABEL: test_workgroup_id_x_non_kernel_optimized_used: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_mul_i32 s0, ttmp9, s0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_x_non_kernel_optimized_used: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0 +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s1, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, 
off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.x() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_x_non_kernel_optimized_not_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="0,0,0" { +; GFX1250-SDAG-LABEL: test_workgroup_id_x_non_kernel_optimized_not_used: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, ttmp9 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_x_non_kernel_optimized_not_used: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, ttmp9 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.x() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_x_non_kernel_optimized_fixed(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" { +; GFX1250-SDAG-LABEL: test_workgroup_id_x_non_kernel_optimized_fixed: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 +; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_x_non_kernel_optimized_fixed: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_and_b32 s0, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.x() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_y_non_kernel(ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_workgroup_id_y_non_kernel: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40010 +; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-SDAG-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; GFX1250-SDAG-NEXT: s_mul_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-SDAG-NEXT: s_add_co_i32 s2, s2, s0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s1, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_y_non_kernel: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x40010 +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; GFX1250-GISEL-NEXT: s_mul_i32 s0, s1, s0 +; GFX1250-GISEL-NEXT: s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_add_co_i32 s2, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s1, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: 
global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.y() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_y_non_kernel_optimized_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1024,1024,1024" { +; GFX1250-SDAG-LABEL: test_workgroup_id_y_non_kernel_optimized_used: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40010 +; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_mul_i32 s1, s1, s0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40004 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_y_non_kernel_optimized_used: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x40010 +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; GFX1250-GISEL-NEXT: s_mul_i32 s1, s1, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s2, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.y() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_y_non_kernel_optimized_not_used(ptr addrspace(1) %out) 
"amdgpu-cluster-dims"="0,0,0" { +; GFX1250-SDAG-LABEL: test_workgroup_id_y_non_kernel_optimized_not_used: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_y_non_kernel_optimized_not_used: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.y() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_y_non_kernel_optimized_fixed(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" { +; GFX1250-SDAG-LABEL: test_workgroup_id_y_non_kernel_optimized_fixed: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_y_non_kernel_optimized_fixed: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 
s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.y() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_z_non_kernel(ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_workgroup_id_z_non_kernel: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40014 +; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-SDAG-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 +; GFX1250-SDAG-NEXT: s_mul_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-SDAG-NEXT: s_add_co_i32 s2, s2, s0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s1, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_z_non_kernel: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x40014 +; GFX1250-GISEL-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 +; GFX1250-GISEL-NEXT: s_mul_i32 s0, s1, s0 +; GFX1250-GISEL-NEXT: s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_add_co_i32 s2, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s1, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_z_non_kernel_optimized_used(ptr addrspace(1) %out) 
"amdgpu-cluster-dims"="1024,1024,1024" { +; GFX1250-SDAG-LABEL: test_workgroup_id_z_non_kernel_optimized_used: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40014 +; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_mul_i32 s1, s1, s0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40008 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_z_non_kernel_optimized_used: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x40014 +; GFX1250-GISEL-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 +; GFX1250-GISEL-NEXT: s_mul_i32 s1, s1, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s2, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_z_non_kernel_optimized_not_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="0,0,0" { +; GFX1250-SDAG-LABEL: test_workgroup_id_z_non_kernel_optimized_not_used: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX1250-SDAG-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_z_non_kernel_optimized_not_used: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} + +define void @test_workgroup_id_z_non_kernel_optimized_fixed(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" { +; GFX1250-SDAG-LABEL: test_workgroup_id_z_non_kernel_optimized_fixed: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 15 +; GFX1250-SDAG-NEXT: s_bfe_u32 s1, ttmp6, 0x40008 +; GFX1250-SDAG-NEXT: s_and_b32 s0, s0, 0x1fffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_workgroup_id_z_non_kernel_optimized_fixed: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX1250-GISEL-NEXT: s_bfe_u32 s1, ttmp6, 0x40008 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_lshl1_add_u32 s0, s0, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; 
GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %id = call i32 @llvm.amdgcn.workgroup.id.z() + store i32 %id, ptr addrspace(1) %out + ret void +} + + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll new file mode 100644 index 0000000000000..497241cff392d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -0,0 +1,376 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs < %s | FileCheck -check-prefix=GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel < %s | FileCheck -check-prefix=GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel < %s | FileCheck -check-prefix=GFX12-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -global-isel < %s | FileCheck -check-prefix=GFX1250-GISEL %s + +define amdgpu_cs void @_amdgpu_cs_main() { +; GFX9-SDAG-LABEL: _amdgpu_cs_main: +; GFX9-SDAG: ; %bb.0: ; %.entry +; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: _amdgpu_cs_main: +; GFX9-GISEL: ; %bb.0: ; %.entry +; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: _amdgpu_cs_main: +; GFX1250-SDAG: ; %bb.0: ; %.entry +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-SDAG-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_mul_i32 s0, ttmp9, s0 +; GFX1250-SDAG-NEXT: s_and_b32 s3, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_add_co_i32 s2, s2, 1 +; GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40014 +; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 +; GFX1250-SDAG-NEXT: s_mul_i32 s0, s3, s2 +; GFX1250-SDAG-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; GFX1250-SDAG-NEXT: s_lshr_b32 s5, ttmp7, 16 +; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, 1 +; GFX1250-SDAG-NEXT: s_add_co_i32 s2, s2, s0 +; GFX1250-SDAG-NEXT: s_mul_i32 s0, s5, s4 +; GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40008 +; GFX1250-SDAG-NEXT: s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s5, s4 +; 
GFX1250-SDAG-NEXT: s_cselect_b32 s1, ttmp9, s1 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s3, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: _amdgpu_cs_main: +; GFX1250-GISEL: ; %bb.0: ; %.entry +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, s0 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, ttmp9, s1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s1, ttmp6, 0x40010 +; GFX1250-GISEL-NEXT: s_and_b32 s3, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s4, ttmp6, 0x40004 +; GFX1250-GISEL-NEXT: s_mul_i32 s1, s3, s1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_add_co_i32 s4, s4, s1 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s1, s3, s4 +; GFX1250-GISEL-NEXT: s_bfe_u32 s3, ttmp6, 0x40014 +; GFX1250-GISEL-NEXT: s_lshr_b32 s4, ttmp7, 16 +; GFX1250-GISEL-NEXT: s_add_co_i32 s3, s3, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s5, ttmp6, 0x40008 +; GFX1250-GISEL-NEXT: s_mul_i32 s3, s4, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_add_co_i32 s5, s5, s3 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s4, s5 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX1250-GISEL-NEXT: s_endpgm +.entry: + %idx 
= call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_cs void @workgroup_id_no_clusters() "amdgpu-cluster-dims"="0,0,0" { +; GFX9-SDAG-LABEL: workgroup_id_no_clusters: +; GFX9-SDAG: ; %bb.0: ; %.entry +; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: workgroup_id_no_clusters: +; GFX9-GISEL: ; %bb.0: ; %.entry +; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: workgroup_id_no_clusters: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_id_no_clusters: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: workgroup_id_no_clusters: +; GFX1250-SDAG: ; %bb.0: ; %.entry +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: workgroup_id_no_clusters: +; GFX1250-GISEL: ; %bb.0: ; %.entry +; GFX1250-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX1250-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" { +; GFX9-SDAG-LABEL: workgroup_id_optimized: +; GFX9-SDAG: ; %bb.0: ; %.entry +; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: workgroup_id_optimized: +; GFX9-GISEL: ; %bb.0: 
; %.entry +; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: workgroup_id_optimized: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_id_optimized: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: workgroup_id_optimized: +; GFX1250-SDAG: ; %bb.0: ; %.entry +; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 +; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 14 +; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 +; GFX1250-SDAG-NEXT: s_and_b32 s0, s2, 0x3fffc +; GFX1250-SDAG-NEXT: s_and_b32 s2, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_bfe_u32 s3, ttmp6, 0x40008 +; GFX1250-SDAG-NEXT: s_mul_i32 s2, s2, 3 +; GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40004 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s4 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null 
+; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: workgroup_id_optimized: +; GFX1250-GISEL: ; %bb.0: ; %.entry +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_and_b32 s0, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 +; GFX1250-GISEL-NEXT: s_mul_i32 s1, s1, 3 +; GFX1250-GISEL-NEXT: s_lshr_b32 s3, ttmp7, 16 +; GFX1250-GISEL-NEXT: s_bfe_u32 s4, ttmp6, 0x40008 +; GFX1250-GISEL-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 +; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s2, s1 +; GFX1250-GISEL-NEXT: s_lshl2_add_u32 s2, s3, s4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX1250-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_cs void @caller() { +; GFX9-SDAG-LABEL: caller: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] +; GFX9-SDAG-NEXT: s_mov_b32 s8, s0 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 +; GFX9-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi +; GFX9-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_add_u32 s8, s8, s0 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: caller: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: 
s_getpc_b64 s[8:9] +; GFX9-GISEL-NEXT: s_mov_b32 s8, s0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 +; GFX9-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo +; GFX9-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_add_u32 s8, s8, s0 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: caller: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-SDAG-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-SDAG-NEXT: s_mul_i32 s0, ttmp9, s0 +; GFX1250-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, ttmp9, s1 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], callee@abs64 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-SDAG-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; 
GFX1250-GISEL-LABEL: caller: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0 +; GFX1250-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, s0 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s2, ttmp9, s1 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], callee@abs64 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-GISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call amdgpu_gfx void @callee(i32 %idx) + ret void +} + +declare amdgpu_gfx void @callee(i32) + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index 06d8474b9054b..8f228b75cabfa 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -6429,7 +6429,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def %22, 327689 /* 
reguse:SReg_1_with_sub0 */, [[V_CVT_I32_F64_e32_4]] + ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %22, 2031625 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -6478,7 +6478,7 @@ body: | %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - INLINEASM &"v_or_b32 $0, 0, $1", 32, 327690, def %22:vgpr_32, 327689, %4 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %22:vgpr_32, 2031625 /* reguse:VGPR_32 */, %4 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir index 3feccff715bc1..ddd8a4784ea86 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir @@ -125,8 +125,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_agpr_mfma_read_overlap body: | @@ -136,8 +135,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap body: | @@ -147,8 +145,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap body: | @@ -196,8 +193,7 @@ body: | ... 
# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: V_MFMA name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap body: | @@ -207,8 +203,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap @@ -249,8 +244,7 @@ body: | ... # GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -260,8 +254,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap @@ -312,8 +305,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap body: | @@ -333,8 +325,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap body: | @@ -384,8 +375,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap body: | @@ -435,8 +425,7 @@ body: | ... 
# GCN-LABEL: name: smfma16x16_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: FLAT_STORE_DWORD name: smfma16x16_write_vgpr_flat_read body: | @@ -446,8 +435,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: FLAT_STORE_DWORD name: smfma32x32_write_vgpr_flat_read @@ -458,8 +446,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_vgpr_flat_read_overlap body: | @@ -469,8 +456,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_vgpr_flat_read_full body: | @@ -480,8 +466,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma16x16_write_vgpr_flat_read @@ -502,8 +487,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: smfma16x16_write_vgpr_valu_read body: | @@ -513,8 +497,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_MOV_B32 name: smfma32x32_write_vgpr_valu_read @@ -535,8 +518,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: dmfma16x16_write_vgpr_valu_read body: | @@ -556,8 +538,7 @@ body: | ... 
# GCN-LABEL: name: smfma16x16_write_vgpr_accv_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: smfma16x16_write_vgpr_accv_read body: | @@ -567,8 +548,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_accv_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: smfma32x32_write_vgpr_accv_read @@ -599,8 +579,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_DOT name: dmfma16x16_write_vgpr_dot_read body: | @@ -620,8 +599,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: smfma16x16_write_vgpr_valu_write body: | @@ -631,8 +609,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_MOV_B32 name: smfma32x32_write_vgpr_valu_write @@ -653,8 +630,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_valu_f16_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_FMA_F16_e64 name: smfma16x16_write_vgpr_valu_f16_write body: | @@ -664,8 +640,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_valu_f16_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_FMA_F16_e64 name: smfma32x32_write_vgpr_valu_f16_write @@ -686,8 +661,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_valu_sdwa_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32_sdwa name: smfma16x16_write_vgpr_valu_sdwa_write body: | @@ -697,8 +671,7 @@ body: | ... 
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_sdwa_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_MOV_B32_sdwa name: smfma32x32_write_vgpr_valu_sdwa_write @@ -719,8 +692,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: dmfma16x16_write_vgpr_valu_write body: | @@ -770,8 +742,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_read_srcc_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 6 +# GCN-NEXT: S_NOP 14 # GCN-NEXT: V_MOV_B32 name: smfma32x32_read_srcc_vgpr_valu_write body: | @@ -1040,8 +1011,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_read_overlap body: | @@ -1080,8 +1050,7 @@ body: | ... # GCN-LABEL: name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: V_MFMA name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap body: | @@ -1091,8 +1060,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap @@ -1133,8 +1101,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -1154,8 +1121,7 @@ body: | ... 
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap body: | @@ -1185,8 +1151,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap body: | @@ -1196,8 +1161,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_agpr_flat_read_overlap body: | @@ -1207,8 +1171,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_agpr_flat_read_full body: | @@ -1218,8 +1181,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma16x16_write_agpr_flat_read @@ -1240,8 +1202,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_ACCVGPR_READ_B32_e64 name: dmfma16x16_write_agpr_valu_read body: | @@ -1261,8 +1222,7 @@ body: | ... 
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: dmfma16x16_write_agpr_valu_write body: | diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir index 8f4f57a5d37c5..1ef6b4c844c93 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir @@ -178,11 +178,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap body: | @@ -192,11 +189,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap body: | @@ -225,11 +219,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_SMFMAC name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap body: | @@ -239,8 +230,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -252,8 +242,7 @@ body: | ... 
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -274,8 +263,7 @@ body: | ... # GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: V_MFMA name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap body: | @@ -285,8 +273,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_smfmac_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC @@ -298,11 +285,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap @@ -323,11 +307,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap @@ -358,9 +339,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap body: | @@ -370,8 +350,7 @@ body: | ... 
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -383,9 +362,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial body: | @@ -395,9 +373,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial body: | @@ -417,9 +394,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -429,8 +405,7 @@ body: | ... # GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -440,9 +415,8 @@ body: | ... # GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: smfmac32x32_write_agpr_mfma_srca_read_overlap body: | @@ -452,9 +426,8 @@ body: | ... 
# GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_SMFMAC name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap body: | @@ -464,8 +437,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -477,8 +449,7 @@ body: | ... # GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap @@ -539,11 +510,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap @@ -564,11 +532,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap @@ -639,11 +604,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap @@ -654,11 +616,8 @@ body: | ... 
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_SMFMAC name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap @@ -669,11 +628,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcc_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_SMFMAC @@ -746,9 +702,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: FLAT_STORE_DWORD name: xdl_smfma16x16_write_vgpr_flat_read body: | @@ -758,9 +713,8 @@ body: | ... # GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: FLAT_STORE_DWORD name: smfmac32x32_write_vgpr_flat_read body: | @@ -770,8 +724,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: FLAT_STORE_DWORD @@ -783,8 +736,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_vgpr_flat_read_overlap body: | @@ -794,8 +746,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_vgpr_flat_read_full body: | @@ -805,8 +756,7 @@ body: | ... 
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma16x16_write_vgpr_flat_read @@ -827,9 +777,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MOV_B32 name: xdl_smfma16x16_write_vgpr_valu_read body: | @@ -839,8 +788,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 @@ -862,11 +810,8 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MOV_B32 name: dmfma16x16_write_vgpr_valu_read @@ -887,9 +832,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: xdl_smfma16x16_write_vgpr_accv_read body: | @@ -899,8 +843,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_accv_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 @@ -932,11 +875,8 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_DOT @@ -958,9 +898,8 @@ body: | ... 
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MOV_B32 name: xdl_smfma16x16_write_vgpr_valu_write body: | @@ -970,8 +909,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 @@ -993,9 +931,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_FMA_F16_e64 name: xdl_smfma16x16_write_vgpr_valu_f16_write body: | @@ -1005,8 +942,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_f16_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_FMA_F16_e64 @@ -1028,9 +964,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MOV_B32_sdwa name: xdl_smfma16x16_write_vgpr_valu_sdwa_write body: | @@ -1040,8 +975,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_sdwa_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32_sdwa @@ -1063,8 +997,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: dmfma16x16_write_vgpr_valu_write body: | @@ -1379,11 +1312,8 @@ body: | ... 
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_read_overlap @@ -1404,11 +1334,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA @@ -1430,9 +1357,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap body: | @@ -1442,8 +1368,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -1485,11 +1410,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap @@ -1510,11 +1432,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA @@ -1546,11 +1465,8 @@ body: | ... 
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap @@ -1561,8 +1477,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_agpr_flat_read_overlap body: | @@ -1572,8 +1487,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_agpr_flat_read_full body: | @@ -1583,8 +1497,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma16x16_write_agpr_flat_read @@ -1605,11 +1518,8 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_valu_read # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_ACCVGPR_READ_B32_e64 name: dmfma16x16_write_agpr_valu_read @@ -1630,8 +1540,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: dmfma16x16_write_agpr_valu_write body: | @@ -1840,9 +1749,8 @@ body: | ... 
# GCN-LABEL: name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_SMFMAC name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap body: | @@ -1959,8 +1867,7 @@ body: | ... # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: BUFFER_STORE_DWORD name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read body: | @@ -1970,8 +1877,7 @@ body: | ... # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MOV_B32 name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read body: | @@ -1981,8 +1887,7 @@ body: | ... # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MOV_B32 name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write body: | @@ -1992,8 +1897,7 @@ body: | ... # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_vm_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: BUFFER_STORE_DWORD name: nonxdl_smfma32x32_write_vgpr_vm_read @@ -2004,8 +1908,7 @@ body: | ... # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: V_MOV_B32 name: nonxdl_smfma32x32_write_vgpr_valu_read @@ -2016,8 +1919,7 @@ body: | ... # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: V_MOV_B32 name: nonxdl_smfma32x32_write_vgpr_valu_write @@ -2109,9 +2011,8 @@ body: | ... 
# GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MOV_B32 name: smfmac32x32_read_vgpr_srcc_valu_write body: | @@ -2121,8 +2022,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 6 +# GCN-NEXT: S_NOP 14 # GCN-NEXT: V_MOV_B32 name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write body: | @@ -2337,9 +2237,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc body: | @@ -2353,9 +2252,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca body: | @@ -2369,9 +2267,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb body: | @@ -2385,8 +2282,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -2403,8 +2299,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca # GCN: V_MFMA -# 
GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -2420,8 +2315,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -2450,8 +2344,7 @@ body: | # 8 pass source # GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca body: | @@ -2464,8 +2357,7 @@ body: | # 8 pass source # GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb body: | @@ -2477,9 +2369,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc body: | @@ -2492,9 +2383,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca body: | @@ -2507,9 +2397,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb body: | @@ 
-2522,8 +2411,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -2539,8 +2427,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -2557,8 +2444,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -2603,9 +2489,8 @@ body: | ... # GCN-LABEL: name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_SMFMAC_ name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc body: | @@ -2617,8 +2502,7 @@ body: | ... 
# GCN-LABEL: name: xdl_16pass_mfma_write_agpr_smfmac_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC_ diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index 4585eca8fe894..7708c8fc00609 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -15,8 +15,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec @@ -37,8 +36,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, 
$vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec @@ -59,8 +57,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit 
$exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec @@ -81,8 +78,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec @@ -157,19 +153,18 @@ name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__c tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, 
$vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 15 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept 
V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -180,18 +175,17 @@ name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__c tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: 
S_NOP 7 - ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 9 + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -202,19 +196,18 @@ name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_v tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: 
S_NOP 7 - ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 15 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -225,18 +218,17 @@ name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_v tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: 
S_NOP 7 - ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 9 + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, 
$sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -247,18 +239,17 @@ name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_ tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 3 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 11 + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept 
V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -269,17 +260,17 @@ name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_ tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept 
V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, 
implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index ce67a2eec93bc..61f2629dded83 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -157,8 +157,7 @@ body: | # GCN-LABEL: name: mfma_16x16_write_agpr_accvgpr_read # GCN: V_MFMA_F32_16X16X1F32 -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_ACCVGPR_READ_B32_e64 name: mfma_16x16_write_agpr_accvgpr_read body: | @@ -170,8 +169,7 @@ body: | # GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_read # GCN: V_MFMA_F32_32X32X2F32 -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: V_ACCVGPR_READ_B32_e64 name: mfma_32x32_write_agpr_accvgpr_read @@ -208,8 +206,7 @@ body: | # GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_write # GCN: V_MFMA_F32_32X32X2F32 -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 6 +# GCN-NEXT: S_NOP 14 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: mfma_32x32_write_agpr_accvgpr_write body: | @@ -244,8 +241,7 @@ body: | # GCN-LABEL: name: mfma_32x32_read_srcc_accvgpr_write # GCN: V_MFMA_F32_32X32X2F32 -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 4 +# GCN-NEXT: S_NOP 12 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: 
mfma_32x32_read_srcc_accvgpr_write body: | diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll index 37a261cab7563..e8bd640aa5409 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -7,23 +7,25 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) { ; MUBUF-LABEL: memcpy_fixed_align: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 ; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off ; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 +; MUBUF-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24 ; MUBUF-NEXT: s_lshr_b32 s4, s32, 6 ; MUBUF-NEXT: s_waitcnt vmcnt(2) -; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 -; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 -; MUBUF-NEXT: s_waitcnt vmcnt(3) ; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12 ; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8 ; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 -; MUBUF-NEXT: s_waitcnt vmcnt(6) +; MUBUF-NEXT: s_waitcnt vmcnt(5) ; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16 +; MUBUF-NEXT: s_waitcnt vmcnt(8) +; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:36 +; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:32 +; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:28 +; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; use s4 ; MUBUF-NEXT: ;;#ASMEND @@ -35,14 +37,14 @@ define void @memcpy_fixed_align(ptr 
addrspace(5) %dst, ptr addrspace(1) %src) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off ; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 -; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 +; FLATSCR-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24 ; FLATSCR-NEXT: s_mov_b32 s0, s32 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32 +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s32 offset:24 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s0 ; FLATSCR-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0003366f3a3ea..5b7c36559a366 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v12, s3 -; CHECK-NEXT: v_mov_b32_e32 v11, s2 -; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 -; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 -; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] -; CHECK-NEXT: v_mov_b32_e32 v12, s1 -; CHECK-NEXT: v_mov_b32_e32 v11, s0 +; CHECK-NEXT: v_mov_b32_e32 v9, s3 +; CHECK-NEXT: v_mov_b32_e32 v8, s2 +; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32 +; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9] +; CHECK-NEXT: 
flat_load_dwordx4 v[4:7], v[8:9] offset:16 +; CHECK-NEXT: v_mov_b32_e32 v9, s1 +; CHECK-NEXT: v_mov_b32_e32 v8, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 -; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 -; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32 +; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: v_mov_b32_e32 v26, s0 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 ; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 ; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v12, v26, 
s[20:23], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: 
v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 -; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 @@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] @@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 ; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 -; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 -; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96 +; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112 ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 @@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; 
CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v12, s3 -; CHECK-NEXT: v_mov_b32_e32 v11, s2 -; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 -; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 -; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] -; CHECK-NEXT: v_mov_b32_e32 v12, s1 -; CHECK-NEXT: v_mov_b32_e32 v11, s0 +; CHECK-NEXT: v_mov_b32_e32 v9, s3 +; CHECK-NEXT: v_mov_b32_e32 v8, s2 +; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32 +; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9] +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16 +; CHECK-NEXT: v_mov_b32_e32 v9, s1 +; CHECK-NEXT: v_mov_b32_e32 v8, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 -; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 -; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32 +; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: v_mov_b32_e32 v26, s0 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 ; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: 
buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 ; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 
offen offset:72 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 -; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 @@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] @@ -571,8 +567,8 @@ define 
amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 ; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 -; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 -; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96 +; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112 ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll index b43ccc551ca95..048610184368d 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll @@ -27,19 +27,16 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) -; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(2) 
lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -83,19 +80,16 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) -; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -239,19 +233,16 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 -; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -295,19 +286,16 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 -; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: flat_store_dwordx2 
v[0:1], v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -437,7 +425,7 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[2:5], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -451,19 +439,15 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 -; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 -; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 +; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: 
flat_store_dwordx2 v[0:1], v[9:10] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -475,8 +459,8 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 -; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) @@ -492,7 +476,7 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[2:5], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -506,19 +490,15 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 -; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 -; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23 +; CHECK-NEXT: ds_read_b128 
v[3:6], v2 +; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -530,8 +510,8 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 -; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) @@ -643,12 +623,9 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] -; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -660,24 +637,16 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
flat_store_dwordx2 v[0:1], v[4:5] -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 -; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -689,18 +658,13 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 -; CHECK-NEXT: global_load_dwordx2 v[2:3], 
v[2:3], off offset:24 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -712,12 +676,9 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] -; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -729,24 +690,16 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 -; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dword v[0:1], v4 
offset:24 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -758,18 +711,13 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 -; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ 
-895,22 +843,20 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 -; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -964,22 +910,20 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen 
offset:30 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 -; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1161,15 +1105,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x2 -; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 -; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: global_store_dwordx2 
v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -1211,15 +1155,15 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x2 -; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 -; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -1929,18 +1873,18 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1994,18 +1938,18 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen 
offset:16 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -3267,19 +3211,16 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 ; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: 
buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 @@ -3334,19 +3275,16 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 ; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 @@ -3525,24 +3463,21 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 -; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16 +; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; 
CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) @@ -3592,24 +3527,21 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 -; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16 +; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v3, 
v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -3783,25 +3715,20 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 ; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 ; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: 
s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -3850,25 +3777,20 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 ; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 ; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4037,24 +3959,21 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: 
memcpy_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 -; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16 +; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) 
noundef nonnull align 1 %src, i64 31, i1 false) @@ -4104,24 +4023,21 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 -; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16 +; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4302,34 +4218,31 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -4398,34 +4311,31 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v2, v0, 
s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll index f08ea27040fb5..01b7f40f6256f 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll @@ -471,7 +471,7 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr 
add ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[2:5], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -489,7 +489,7 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 -; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[2:5], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) @@ -509,8 +509,8 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 -; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) @@ -526,7 +526,7 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[2:5], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -544,7 +544,7 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[2:5], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) @@ -564,8 +564,8 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 -; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) @@ -2077,21 +2077,23 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v5, v2, 
s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 -; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -2143,21 +2145,23 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 -; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index f7aaa3ec4d0ed..9585c486aeb9e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -84,8 +84,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -227,8 +226,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: 
v_accvgpr_read_b32 v2, a26 @@ -347,8 +345,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -454,8 +451,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -561,8 +557,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -690,8 +685,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -835,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: 
v_accvgpr_read_b32 v6, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 @@ -977,8 +970,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 @@ -1079,8 +1071,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index d39daaade677f..3b8efafba06f4 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -54,8 +54,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -148,8 +147,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] 
offset:80 @@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -288,8 +285,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -383,8 +379,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -444,8 +439,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -518,8 +512,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr 
addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -612,8 +605,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -672,8 +664,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -783,8 +774,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -909,8 +899,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1001,8 +990,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1075,8 +1063,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -1170,8 +1157,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1231,8 +1217,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: 
global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1344,8 +1329,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -1441,8 +1425,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1504,8 +1487,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1614,8 +1596,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; 
GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -1712,8 +1693,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1776,8 +1756,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1856,8 +1835,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -1919,8 +1897,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1948,8 +1925,7 @@ define 
amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2019,8 +1995,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX908-NEXT: s_nop 1 @@ -2065,8 +2040,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -2118,8 +2092,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 @@ -2163,8 +2136,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, 
a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2182,8 +2154,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 @@ -2227,8 +2198,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2349,8 +2319,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.4: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 2 +; GFX908-NEXT: s_nop 10 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -2453,8 +2422,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2523,8 +2491,7 @@ 
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index e6d7b14381d7a..51cd564bdece3 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -93,8 +93,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31] -; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 ; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 @@ -158,8 +157,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -263,8 +261,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] -; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 15 ; 
GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33 @@ -298,8 +295,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 15 ; GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 @@ -356,8 +352,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31] -; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 15 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33 @@ -391,8 +386,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 15 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 @@ -448,8 +442,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] -; GREEDY90A-GISEL-NEXT: s_nop 7 -; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 15 ; GREEDY90A-GISEL-NEXT: s_nop 2 ; GREEDY90A-GISEL-NEXT: 
v_accvgpr_mov_b32 a2, a32 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a33 @@ -484,8 +477,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY90A-GISEL-NEXT: s_nop 7 -; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 15 ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 @@ -542,8 +534,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] -; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 ; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 ; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 @@ -609,8 +600,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 ; FAST90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; FAST90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 @@ -676,8 +666,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] -; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: s_nop 8 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 ; GREEDY908-NEXT: s_nop 0 @@ -685,8 +674,7 @@ define 
amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: s_nop 9 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 @@ -744,14 +732,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] -; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: s_nop 9 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 2 +; GREEDY90A-NEXT: s_nop 10 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 @@ -786,14 +772,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] -; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 0 +; GREEDY942-NEXT: s_nop 8 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] -; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: s_nop 9 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 ; 
GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 @@ -827,8 +811,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] -; GREEDY90A-GISEL-NEXT: s_nop 7 -; GREEDY90A-GISEL-NEXT: s_nop 2 +; GREEDY90A-GISEL-NEXT: s_nop 10 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a16 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a17 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a18 @@ -846,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY90A-GISEL-NEXT: s_nop 7 -; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: s_nop 9 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -882,8 +864,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] -; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: s_nop 10 ; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 ; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 ; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18 @@ -900,8 +881,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] -; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: s_nop 10 ; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; 
FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index f4e5c276b8b75..33cd598aae9b5 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -265,6 +265,6 @@ declare float @llvm.fmuladd.f32(float, float, float) #1 attributes #0 = { nounwind willreturn "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } -attributes #2 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } !0 = !{float 2.500000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir index df3dd7292b7f8..4d1a663aace42 100644 --- a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir +++ b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir @@ -372,14 +372,12 @@ body: | ; ; gfx908-PAD75-LABEL: name: mfma_padding_16_pass ; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, 
$vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; gfx908-PAD75-NEXT: S_NOP 7 - ; gfx908-PAD75-NEXT: S_NOP 3 + ; gfx908-PAD75-NEXT: S_NOP 11 ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; gfx908-PAD100-NEXT: S_NOP 7 - ; gfx908-PAD100-NEXT: S_NOP 7 + ; gfx908-PAD100-NEXT: S_NOP 15 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass @@ -393,8 +391,7 @@ body: | ; ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; gfx90a-PAD100-NEXT: S_NOP 7 - ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: S_NOP 15 ; gfx90a-PAD100-NEXT: early-clobber 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass @@ -408,8 +405,7 @@ body: | ; ; gfx942-PAD100-LABEL: name: mfma_padding_16_pass ; gfx942-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; gfx942-PAD100-NEXT: S_NOP 7 - ; gfx942-PAD100-NEXT: S_NOP 7 + ; gfx942-PAD100-NEXT: S_NOP 15 ; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec @@ -459,8 +455,7 @@ body: | ; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; gfx908-PAD100-NEXT: S_NOP 
7 - ; gfx908-PAD100-NEXT: S_NOP 3 + ; gfx908-PAD100-NEXT: S_NOP 11 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu @@ -486,8 +481,7 @@ body: | ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; gfx90a-PAD100-NEXT: S_NOP 7 - ; gfx90a-PAD100-NEXT: S_NOP 3 + ; gfx90a-PAD100-NEXT: S_NOP 11 ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu @@ -513,8 +507,7 @@ body: | ; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx942-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec ; gfx942-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; gfx942-PAD100-NEXT: S_NOP 7 - ; gfx942-PAD100-NEXT: S_NOP 3 + ; gfx942-PAD100-NEXT: S_NOP 11 ; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -887,8 +880,7 @@ body: | ; gfx908-PAD75-NEXT: {{ $}} ; gfx908-PAD75-NEXT: bb.2: ; gfx908-PAD75-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; gfx908-PAD75-NEXT: S_NOP 7 - ; gfx908-PAD75-NEXT: S_NOP 1 + ; gfx908-PAD75-NEXT: S_NOP 9 ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_2_preds @@ -905,8 +897,7 @@ body: | ; gfx908-PAD100-NEXT: {{ $}} ; gfx908-PAD100-NEXT: bb.2: ; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; gfx908-PAD100-NEXT: S_NOP 7 - ; gfx908-PAD100-NEXT: S_NOP 5 + ; gfx908-PAD100-NEXT: S_NOP 13 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds @@ -956,8 +947,7 @@ body: | ; gfx90a-PAD100-NEXT: {{ $}} ; gfx90a-PAD100-NEXT: bb.2: ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; gfx90a-PAD100-NEXT: S_NOP 7 - ; gfx90a-PAD100-NEXT: S_NOP 5 + ; gfx90a-PAD100-NEXT: S_NOP 13 ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, 
implicit $exec ; ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds @@ -1007,8 +997,7 @@ body: | ; gfx942-PAD100-NEXT: {{ $}} ; gfx942-PAD100-NEXT: bb.2: ; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; gfx942-PAD100-NEXT: S_NOP 7 - ; gfx942-PAD100-NEXT: S_NOP 5 + ; gfx942-PAD100-NEXT: S_NOP 13 ; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec bb.0: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index cfe7315e20ff7..627f4ada95dba 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -406,7 +406,7 @@ bb.1: declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone speculatable } -attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="false" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index ecf1d3bcdc86d..6509d8010dd95 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -12,9 +12,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %25 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64 */, def %27 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def %27 ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -37,9 +37,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def 
$sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -61,9 +61,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %21 + ; 
REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def %21 ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21 ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -80,9 +80,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; PEI-GFX90A-NEXT: {{ $}} ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY 
killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec @@ -104,4 +104,4 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) -attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b1e05158b6212..83c521043025c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -2688,4 +2688,4 @@ end: } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } diff 
--git a/llvm/test/CodeGen/AMDGPU/propagate-amdgpu-cluster-dims.ll b/llvm/test/CodeGen/AMDGPU/propagate-amdgpu-cluster-dims.ll new file mode 100644 index 0000000000000..e8271220fa66f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-amdgpu-cluster-dims.ll @@ -0,0 +1,182 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=amdgpu-attributor %s -o - | FileCheck %s + +declare void @unknown() + +; fixed_cluster_dims_0 is only used by fixed_cluster_dims_1 that is expected to +; have fixed dims. +define internal void @fixed_cluster_dims_before_0() { +; CHECK-LABEL: define internal void @fixed_cluster_dims_before_0( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +; fixed_cluster_dims_1 is used by two kernels with the same cluster dims. +define internal void @fixed_cluster_dims_1() { +; CHECK-LABEL: define internal void @fixed_cluster_dims_1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @fixed_cluster_dims_before_0() +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @fixed_cluster_dims_before_0() + call void @unknown() + ret void +} + +; no_cluster_dims_0 is only used by no_cluster_dims_1 that is expected to not use +; cluster. +define internal void @no_cluster_dims_0() { +; CHECK-LABEL: define internal void @no_cluster_dims_0( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +; no_cluster_dims_1 is used by two kernels that don't use cluster. 
+define internal void @no_cluster_dims_1() { +; CHECK-LABEL: define internal void @no_cluster_dims_1( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: call void @no_cluster_dims_0() +; CHECK-NEXT: ret void +; + call void @unknown() + call void @no_cluster_dims_0() + ret void +} + +; non_fixed_cluster_dims is used by two kernels with different cluster dims, so +; it will use cluster but cluster dims is unknown. +define internal void @non_fixed_cluster_dims() { +; CHECK-LABEL: define internal void @non_fixed_cluster_dims( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +; unknown_cluster_dims is used by a kernel that uses cluster and another one that +; doesn't use cluster, so it is unknown that whether cluster is used or not. +define internal void @unknown_cluster_dims() { +; CHECK-LABEL: define internal void @unknown_cluster_dims( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +; unknown_cluster_use is used by a kernel that we don't know whether cluster will +; be used or not (because it doesn't have any attribute), so it is unknown that +; whether cluster is used or not. +define internal void @unknown_cluster_use() { +; CHECK-LABEL: define internal void @unknown_cluster_use( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +; unknown_call_site has unknown call site, so it is unknown that whether cluster +; is used or not. 
+define void @unknown_call_site() { +; CHECK-LABEL: define void @unknown_call_site( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +define amdgpu_kernel void @cluster_kernel_0() #0 { +; CHECK-LABEL: define amdgpu_kernel void @cluster_kernel_0( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @fixed_cluster_dims_1() +; CHECK-NEXT: call void @non_fixed_cluster_dims() +; CHECK-NEXT: call void @unknown_cluster_dims() +; CHECK-NEXT: call void @unknown_call_site() +; CHECK-NEXT: ret void +; + call void @fixed_cluster_dims_1() + call void @non_fixed_cluster_dims() + call void @unknown_cluster_dims() + call void @unknown_call_site() + ret void +} + +define amdgpu_kernel void @cluster_kernel_0_1() #0 { +; CHECK-LABEL: define amdgpu_kernel void @cluster_kernel_0_1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @fixed_cluster_dims_1() +; CHECK-NEXT: ret void +; + call void @fixed_cluster_dims_1() + ret void +} + +define amdgpu_kernel void @cluster_kernel_1() #1 { +; CHECK-LABEL: define amdgpu_kernel void @cluster_kernel_1( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: call void @non_fixed_cluster_dims() +; CHECK-NEXT: ret void +; + call void @non_fixed_cluster_dims() + ret void +} + +define amdgpu_kernel void @no_cluster_kernel() #2 { +; CHECK-LABEL: define amdgpu_kernel void @no_cluster_kernel( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: call void @no_cluster_dims_1() +; CHECK-NEXT: call void @unknown_cluster_dims() +; CHECK-NEXT: ret void +; + call void @no_cluster_dims_1() + call void @unknown_cluster_dims() + ret void +} + +define amdgpu_kernel void @no_cluster_kernel_1() #2 { +; CHECK-LABEL: define amdgpu_kernel void @no_cluster_kernel_1( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: call void @no_cluster_dims_1() +; CHECK-NEXT: ret void +; + call void @no_cluster_dims_1() + ret void +} + +define amdgpu_kernel void @unknown_cluster_use_kernel() { +; CHECK-LABEL: 
define amdgpu_kernel void @unknown_cluster_use_kernel( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @unknown_cluster_use() +; CHECK-NEXT: ret void +; + call void @unknown_cluster_use() + ret void +} + +attributes #0 = { "amdgpu-cluster-dims"="2,2,2" } +attributes #1 = { "amdgpu-cluster-dims"="1,2,1" } +attributes #2 = { "amdgpu-cluster-dims"="0,0,0" } +;. +; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx1250" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-cluster-dims"="2,2,2" "target-cpu"="gfx1250" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-cluster-dims"="0,0,0" "target-cpu"="gfx1250" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-cluster-dims"="1024,1024,1024" "target-cpu"="gfx1250" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-cluster-dims"="1,2,1" "target-cpu"="gfx1250" "uniform-work-group-size"="false" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll index 606cd653084f6..42469c8682150 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll @@ -202,13 +202,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" } attributes #6 = { "amdgpu-flat-work-group-size"="512,512" } attributes #7 = { "amdgpu-flat-work-group-size"="64,256" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-cluster-id-x" 
"amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" 
"amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 02c76473591de..06533b40b73d3 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -399,25 +399,25 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" } attributes #18 = { "amdgpu-waves-per-eu"="9,10" } attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="0" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" 
"amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" 
"amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } +; CHECK: 
attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { 
"amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; 
CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" 
"amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" 
"amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index 25609e881254e..b2bcb74e4184f 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -4089,32 +4089,44 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; GFX1250-NEXT: s_add_co_i32 s0, s10, 1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_add_nc_u32 v2, s0, v1 :: v_dual_add_nc_u32 v1, 1, v1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_b32 s4, ttmp6, 15 +; GFX1250-NEXT: s_getreg_b32 s5, hwreg(HW_REG_IB_STS2, 6, 4) ; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_bfe_u32 s3, ttmp6, 0x4000c ; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s3, 1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_mul_i32 s3, ttmp9, s3 ; GFX1250-NEXT: v_add_nc_u32_e32 v1, v3, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_add_co_i32 s4, s4, s3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX1250-NEXT: v_add_nc_u32_e32 v2, 1, v3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff +; GFX1250-NEXT: s_cmp_eq_u32 s5, 0 ; GFX1250-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX1250-NEXT: v_mad_u32 v0, ttmp9, s2, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_cselect_b32 s3, ttmp9, s4 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_u32 v0, s3, s2, v0 ; GFX1250-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mad_u32 v3, v2, v3, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[8:9] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-NEXT: v_mad_u32 v2, v3, v2, v3 ; GFX1250-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll index 2b07fc716e8b9..8930626ae73e8 100644 --- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll @@ -19,5 +19,5 @@ define 
void @hoge() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir index 038e195742305..7475c15f6357a 100644 --- a/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir +++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir @@ -1,55 +1,66 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=register-coalescer -show-mc-encoding -o - %s | FileCheck %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s -# FIXME: These SRC_*_HI registers do not exist, although defined in the register file -# and happily used by the coalescer. 
The resulting encoding is in fact belong -# to the 64-bit register and corresponding *_LO 32-bit part of it. +# These SRC_*_HI registers do not exist, make sure coalescer does not use it. -# CHECK-LABEL: src_private_base: -# CHECK: s_subb_u32 s0, SRC_PRIVATE_BASE_HI, s1 ; encoding: [0xed,0x01,0x80,0x82] --- name: src_private_base tracksRegLiveness: true body: | bb.0: + ; CHECK-LABEL: name: src_private_base + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $src_private_base + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $scc = IMPLICIT_DEF + ; CHECK-NEXT: dead [[S_SUBB_U32_:%[0-9]+]]:sreg_32 = S_SUBB_U32 [[COPY]].sub1, [[DEF]].sub1, implicit-def dead $scc, implicit killed $scc %0:sreg_64 = COPY $src_private_base %1:sreg_64 = IMPLICIT_DEF $scc = IMPLICIT_DEF %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc ... -# CHECK-LABEL: src_private_limit: -# CHECK: s_subb_u32 s0, SRC_PRIVATE_LIMIT_HI, s1 ; encoding: [0xee,0x01,0x80,0x82] --- name: src_private_limit tracksRegLiveness: true body: | bb.0: + ; CHECK-LABEL: name: src_private_limit + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $src_private_limit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $scc = IMPLICIT_DEF + ; CHECK-NEXT: dead [[S_SUBB_U32_:%[0-9]+]]:sreg_32 = S_SUBB_U32 [[COPY]].sub1, [[DEF]].sub1, implicit-def dead $scc, implicit killed $scc %0:sreg_64 = COPY $src_private_limit %1:sreg_64 = IMPLICIT_DEF $scc = IMPLICIT_DEF %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc ... 
-# CHECK-LABEL: src_shared_base: -# CHECK: s_subb_u32 s0, SRC_SHARED_BASE_HI, s1 ; encoding: [0xeb,0x01,0x80,0x82] --- name: src_shared_base tracksRegLiveness: true body: | bb.0: + ; CHECK-LABEL: name: src_shared_base + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $src_shared_base + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $scc = IMPLICIT_DEF + ; CHECK-NEXT: dead [[S_SUBB_U32_:%[0-9]+]]:sreg_32 = S_SUBB_U32 [[COPY]].sub1, [[DEF]].sub1, implicit-def dead $scc, implicit killed $scc %0:sreg_64 = COPY $src_shared_base %1:sreg_64 = IMPLICIT_DEF $scc = IMPLICIT_DEF %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc ... -# CHECK-LABEL: src_shared_limit: -# CHECK: s_subb_u32 s0, SRC_SHARED_LIMIT_HI, s1 ; encoding: [0xec,0x01,0x80,0x82] --- name: src_shared_limit tracksRegLiveness: true body: | bb.0: + ; CHECK-LABEL: name: src_shared_limit + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $src_shared_limit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $scc = IMPLICIT_DEF + ; CHECK-NEXT: dead [[S_SUBB_U32_:%[0-9]+]]:sreg_32 = S_SUBB_U32 [[COPY]].sub1, [[DEF]].sub1, implicit-def dead $scc, implicit killed $scc %0:sreg_64 = COPY $src_shared_limit %1:sreg_64 = IMPLICIT_DEF $scc = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index fe643ff00b7b1..3dfb0e1a0389a 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -191,12 +191,12 @@ define amdgpu_kernel void @kernel_lds_recursion() { !1 = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { 
"amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="4" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="4" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir index 75a2be755185f..e1148c437e91d 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir @@ -73,7 +73,7 @@ body: | # (1) %0.sub0 + %0.sub0 and (2) %0.sub1 + %0.sub1 # Check that renaming (2) does not inadvertently rename (1). # CHECK-LABEL: name: test2 -# CHECK: INLINEASM &"", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def undef %0.sub0, 327690 /* regdef:SReg_1_with_sub0 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) +# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def undef %0.sub0, 2031626 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) name: test2 body: | bb.0: @@ -81,7 +81,7 @@ body: | bb.1: undef %0.sub1:vreg_64 = V_ALIGNBIT_B32_e64 %0.sub0:vreg_64, %0.sub0:vreg_64, 16, implicit $exec - INLINEASM &"", 32, 327690, def undef %0.sub0:vreg_64, 327690, def %0.sub1:vreg_64, 2147483657, undef %0.sub0:vreg_64(tied-def 3), 2147549193, %0.sub1:vreg_64(tied-def 5) + INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 2031626 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) S_BRANCH %bb.1 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir index bcd0e027b209e..1b09f5d6ab9c3 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir @@ -41,9 +41,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:AV_64_Align2 */, [[V_MFMA_F64_4X4X4F64_e64_]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -51,7 +51,7 @@ body: | %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_128_align2 = COPY %3 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:VReg_64_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:VReg_64_Align2 */, %5 
SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll index cefcd7e0d2651..fc154604b8700 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll @@ -33,8 +33,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) % ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; CHECK-NEXT: .LBB0_3: ; %if -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[0:1] offset:96 ; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80 @@ -98,8 +97,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: v_mov_b64_e32 v[62:63], v[30:31] ; CHECK-NEXT: v_mov_b64_e32 v[60:61], v[28:29] ; CHECK-NEXT: v_mov_b64_e32 v[58:59], v[26:27] diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir index e9c9170caeac4..d7b713aa53b86 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir @@ -19,7 +19,7 @@ body: | ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; 
CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -30,7 +30,7 @@ body: | %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -172,7 +172,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; 
CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -183,7 +183,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -208,7 +208,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -219,7 +219,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub1:areg_128_align2 = COPY %4.sub2 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index b51aad748bc28..57f611b4a033e 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -17,7 +17,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -26,7 +26,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit 
$exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -47,7 +47,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -56,7 +56,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... 
@@ -79,7 +79,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VS_64_with_sub0_in_VS_32_Lo128 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -114,7 +114,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VS_64_with_sub0_in_VS_32_Lo128 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -151,7 +151,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], 
[[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -163,7 +163,7 @@ body: | %other_use:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %8.sub0_sub1:areg_128_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -231,7 +231,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -245,7 +245,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_128_align2 = COPY %4 - INLINEASM &"; use 
$0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -273,7 +273,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -287,7 +287,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4128777 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4325385 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
@@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4128777 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -327,7 +327,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4128777 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4325385 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 5f42abbeae253..b9e9893ede4e2 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -60,8 +60,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: v_mov_b32_e32 v2, v32 ; CHECK-NEXT: v_mov_b32_e32 v3, v33 @@ -96,8 +95,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 @@ -143,8 +141,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle( ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mov_b32_e32 v32, 0 -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 @@ -178,8 +175,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2( ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: 
global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 @@ -212,8 +208,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2( ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 @@ -351,8 +346,7 @@ define void @test_rewrite_mfma_subreg_extract2(float %arg0, float %arg1, ptr add ; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1 ; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2 @@ -717,8 +711,7 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_chain(p ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 @@ -777,8 +770,7 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3] ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 8 ; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: 
s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 25020673bce22..0a1d15bf945f9 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -376,9 +376,8 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, v2.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v0.h, v0.l diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 3e7b8f438efdb..403a556688091 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -461,9 +461,8 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l ; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, v2.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index e0bdd77bd18e2..307ff046d48c2 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ 
b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -380,8 +380,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v6, src_flat_scratch_base_hi ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 @@ -390,7 +389,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0 ; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_xor_b32_e32 v0, s2, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v5, v6 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -412,11 +411,11 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 ; GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_sub_nc_u32_e32 v0, v4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GISEL-NEXT: s_wait_loadcnt 0x0 diff --git 
a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index c90975959c3f4..3c7dd6463813f 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -37,7 +37,7 @@ body: | ; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def dead [[COPY1]], 2031626 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 2031625 /* reguse:VGPR_32 */, [[COPY1]], 2031625 /* reguse:VGPR_32 */, [[COPY]].sub1 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] @@ -63,7 +63,7 @@ body: | undef %11.sub0:vreg_512 = COPY %4.sub0 %12:vgpr_32 = COPY %4.sub0 %11.sub1:vreg_512 = COPY %4.sub1 - INLINEASM &"", 1, 851978, def dead %12, 851978, def dead %4.sub1, 2147483657, %12, 2147549193, %4.sub1 + INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 2031626 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 2031625 /* reguse:VGPR_32 */, %12:vgpr_32, 2031625 /* reguse:VGPR_32 */, %4.sub1:vreg_512 %11.sub2:vreg_512 = COPY undef %1 %11.sub3:vreg_512 = COPY %4.sub3 %11.sub5:vreg_512 = COPY undef %1 diff --git 
a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 3ca61d26e8e42..24df4b8947672 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -40,18 +40,18 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead %11 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) - ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %15, 851978 /* regdef:VGPR_16 */, def %16 + ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %15, 2031626 /* regdef:VGPR_32 */, def %16 ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %21, 851978 /* regdef:VGPR_16 */, def %22 + ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 2031626 /* 
regdef:VGPR_32 */, def %21, 2031626 /* regdef:VGPR_32 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_4]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 2031626 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 2031625 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 2031625 /* reguse:VGPR_32 */, %15, 2031625 /* reguse:VGPR_32 */, %16, 2031625 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 2031625 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 2031625 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 2031625 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) @@ -94,21 +94,21 @@ body: | %10:vgpr_32 = IMPLICIT_DEF bb.1: - INLINEASM &"", 1, 851978, def %11:vgpr_32 + INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %11:vgpr_32 
GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, implicit $exec :: (store (s32), addrspace 1) %13:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) - INLINEASM &"def $0 $1", 1, 851978, def %15:vgpr_32, 851978, def %16:vgpr_32 + INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %15:vgpr_32, 2031626 /* regdef:VGPR_32 */, def %16:vgpr_32 %17:vgpr_32 = DS_READ_B32_gfx9 %6, 0, 0, implicit $exec %18:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec %19:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec - INLINEASM &"def $0 $1", 1, 851978, def %21:vgpr_32, 851978, def %22:vgpr_32 + INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %21:vgpr_32, 2031626 /* regdef:VGPR_32 */, def %22:vgpr_32 %23:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %5.sub1:vreg_64 = COPY %6 %25:vgpr_32 = V_ADD_U32_e32 1, %10, implicit $exec %26:sreg_64_xexec = V_CMP_GT_U32_e64 64, %25, implicit $exec %27:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - INLINEASM &"", 1, 851978, def dead %24, 851978, def dead %27, 851977, %13.sub0, 2147483657, %24(tied-def 3), 2147549193, %27(tied-def 5), 851977, %15, 851977, %16, 851977, %18, 851977, %17, 851977, %23, 851977, %19 + INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 2031626 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 2031625 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 2031625 /* reguse:VGPR_32 */, %15, 2031625 /* reguse:VGPR_32 */, %16, 2031625 /* reguse:VGPR_32 */, %18, 2031625 /* reguse:VGPR_32 */, %17, 2031625 /* reguse:VGPR_32 */, %23, 2031625 /* reguse:VGPR_32 */, %19 DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 
0, 0, implicit $exec :: (store (s32), addrspace 3) DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir index d86e5e6ec7bac..9553fcc1c51c8 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir @@ -49,6 +49,15 @@ body: | $sgpr2_sgpr3 = COPY killed $sgpr0_sgpr1 ... +--- +name: src_shared_base_to_sgpr64 +body: | + bb.0: + ; GFX9-LABEL: name: src_shared_base_to_sgpr64 + ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $src_shared_base + $sgpr0_sgpr1 = COPY $src_shared_base +... + --- name: sgpr96_aligned_src_dst body: | diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 91c88ec5e718c..b538d6066d551 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1528,8 +1528,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64 ; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-TRUE16-NEXT: s_endpgm @@ -1559,8 +1559,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 
0xffc0, v0.l ; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 308d87ba79052..00214ef36e1f0 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1072,4 +1072,4 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index 2895031365f92..f1cadeacc29c1 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -101,14 +101,14 @@ entry: } ;. 
-; NO: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; NO: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; NO: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -; OW: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; OW: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. 
-; CW: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CW: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CW: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; NO: [[META0]] = !{ptr @bar1, ptr @bar2} ;. diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 3290bdbeb550d..775d2f954e130 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -58,7 +58,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. 
; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10} diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 2040e2b26cb15..da48af100d27b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -258,8 +258,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v4, a[0:31] ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v5 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND @@ -339,8 +338,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v2, a[0:31] -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART @@ -365,11 +363,6 @@ use: define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX908-LABEL: max_6regs_used_8a: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX908-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX908-NEXT: s_mov_b32 s6, -1 -; GFX908-NEXT: s_mov_b32 s7, 0xe00000 -; GFX908-NEXT: s_add_u32 s4, s4, s3 ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v1 @@ -378,22 +371,27 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:3] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: s_addc_u32 s5, s5, 0 +; GFX908-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX908-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 +; GFX908-NEXT: s_mov_b32 s10, -1 +; GFX908-NEXT: s_mov_b32 s11, 0xe00000 +; GFX908-NEXT: s_add_u32 s8, s8, s5 +; GFX908-NEXT: s_addc_u32 s9, s9, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX908-NEXT: v_accvgpr_read_b32 v5, a1 ; Reload Reuse ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill ; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 ; Reload Reuse ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill ; GFX908-NEXT: v_accvgpr_read_b32 v5, a3 ; Reload Reuse ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_waitcnt vmcnt(4) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 @@ -409,11 +407,11 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX908-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded 
Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -425,24 +423,24 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; ; GFX90A-LABEL: max_6regs_used_8a: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s6, -1 -; GFX90A-NEXT: s_mov_b32 s7, 0xe00000 -; GFX90A-NEXT: s_add_u32 s4, s4, s3 +; GFX90A-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s10, -1 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_addc_u32 s5, s5, 0 +; GFX90A-NEXT: s_mov_b32 s11, 0xe00000 +; GFX90A-NEXT: s_add_u32 s8, s8, s5 +; GFX90A-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v1 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword a0, off, s[4:7], 0 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a0, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword a1, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a2, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a3, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3] @@ -452,10 +450,10 
@@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3] ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] -; GFX90A-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -481,6 +479,12 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float> declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) + +attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #2 = { nounwind "amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } + attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #2 = { nounwind "amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index 2a18594335e96..4893bff18570a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def undef %14.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64 */, %14 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3735561 /* reguse:VReg_64 */, %14 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> poison, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 3d21860e2af40..0e45df223465d 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -61,8 +61,8 @@ define void @spill_i16_alu_two_vals() { ; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc ; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l ; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l ; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc ; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc diff --git a/llvm/test/CodeGen/AMDGPU/stack-passed-subdword-arg-crash-issue157997.ll b/llvm/test/CodeGen/AMDGPU/stack-passed-subdword-arg-crash-issue157997.ll new file mode 100644 index 0000000000000..4791f603fc7ae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/stack-passed-subdword-arg-crash-issue157997.ll @@ -0,0 +1,283 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s + +; Make sure that sub-dword arguments passed on the stack do not assert + +define i32 @stack_arg_i1(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i1 %badarg) #0 { +; GFX9-LABEL: stack_arg_i1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u8 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i1 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i1_zeroext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i1 zeroext %badarg) #0 { +; GFX9-LABEL: stack_arg_i1_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i1_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u8 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i1 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i1_signext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i1 signext %badarg) #0 { +; GFX9-LABEL: stack_arg_i1_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i1_signext: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u8 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = sext i1 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i8(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i8 %badarg) #0 { +; GFX9-LABEL: stack_arg_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u8 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i8 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i8_zeroext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i8 zeroext %badarg) #0 { +; GFX9-LABEL: stack_arg_i8_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i8_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u16 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i8 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i8_signext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i8 signext %badarg) #0 { +; GFX9-LABEL: stack_arg_i8_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i8_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_i16 v0, off, s32 +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = sext i8 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i16(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i16 %badarg) #0 { +; GFX9-LABEL: stack_arg_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u16 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i16 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i16_zeroext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i16 zeroext %badarg) #0 { +; GFX9-LABEL: stack_arg_i16_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i16_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u16 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i16 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i16_signext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i16 signext %badarg) #0 { +; GFX9-LABEL: stack_arg_i16_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i16_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_i16 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = sext i16 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i7(<8 
x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i7 %badarg) #0 { +; GFX9-LABEL: stack_arg_i7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u16 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i7 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i7_zeroext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i7 zeroext %badarg) #0 { +; GFX9-LABEL: stack_arg_i7_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i7_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u16 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i7 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i7_signext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i7 signext %badarg) #0 { +; GFX9-LABEL: stack_arg_i7_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i7_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_i16 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = sext i7 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i17(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i17 %badarg) 
#0 { +; GFX9-LABEL: stack_arg_i17: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x1ffff, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i17: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_b32 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x1ffff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i17 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i17_zeroext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i17 zeroext %badarg) #0 { +; GFX9-LABEL: stack_arg_i17_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i17_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_b32 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = zext i17 %badarg to i32 + ret i32 %ext +} + +define i32 @stack_arg_i17_signext(<8 x i32>, <8 x i32>, <8 x i32>, <4 x i32>, <3 x i32>, i17 signext %badarg) #0 { +; GFX9-LABEL: stack_arg_i17_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_arg_i17_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_b32 v0, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ext = sext i17 %badarg to i32 + ret i32 %ext +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll 
b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index 477297ba2e7d5..c84c49ee1a41d 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -157,4 +157,4 @@ shader_eval_surface.exit: ; preds = %entry declare hidden i32 @svm_eval_nodes(ptr addrspace(5), ptr addrspace(5), ptr addrspace(5), i32, i32) local_unnamed_addr #0 -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir index b428e859a6d32..b6d630eda8b81 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir @@ -28,9 +28,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def [[V_MOV_B32_e32_]], 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_]](tied-def 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM 
&"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub0, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 2031626 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 @@ -41,9 +41,9 @@ body: | bb.1: %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3) - INLINEASM &"", 1, 851978, def %0, 2147549193, %0(tied-def 3) - INLINEASM &"", 1, 851977, %2 - INLINEASM &"", 1, 851978, def undef %0.sub0, 851978, def %0.sub1 + INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %2 + INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def undef %0.sub0, 2031626 /* regdef:VGPR_32 */, def %0.sub1 S_NOP 0, implicit %0.sub1 $sgpr10 = S_MOV_B32 -1 S_BRANCH %bb.1 @@ -69,9 +69,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def [[V_MOV_B32_e32_]], 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_]](tied-def 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]] - ; 
CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub1, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 2031626 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0 ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 @@ -82,9 +82,9 @@ body: | bb.1: %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3) - INLINEASM &"", 1, 851978, def %0, 2147549193, %0(tied-def 3) - INLINEASM &"", 1, 851977, %2 - INLINEASM &"", 1, 851978, def %0.sub1, 851978, def undef %0.sub0 + INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, %2 + INLINEASM &"", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %0.sub1, 2031626 /* regdef:VGPR_32 */, def undef %0.sub0 S_NOP 0, implicit %0.sub1 $sgpr10 = S_MOV_B32 -1 S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 30accc846d2b6..c561e32d2db72 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -23,7 +23,7 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index 4f84b31f1877b..2922424704edc 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -23,7 +23,7 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 644f434923368..aedb5f9106ec8 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -23,7 +23,7 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index 8867e6102406b..d2008be4fd32a 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -10,17 +10,17 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; CHECK-NEXT: flat_load_dword v42, v[46:47] -; CHECK-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x8 -; CHECK-NEXT: s_load_dword s68, s[8:9], 0x0 +; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] +; CHECK-NEXT: s_load_dwordx4 s[64:67], s[34:35], 0x8 +; CHECK-NEXT: s_load_dword s68, s[34:35], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s68, -1 -; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s5, s5, 0 +; CHECK-NEXT: s_cselect_b32 s5, s9, 0 ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_cselect_b32 s6, s68, 0 ; CHECK-NEXT: v_mov_b32_e32 v57, s5 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll index 
473d996bf721d..a1557418f5789 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -31,5 +31,5 @@ define amdgpu_kernel void @kernel1() #1 { attributes #0 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll index c9ee40cec878a..fb225a97b2441 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll @@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 { attributes #0 = { "uniform-work-group-size"="true" } ;. 
; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll index 308f8b595eb06..cfede0ce13241 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 { attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll index 7e2b085f5a879..854b7245bbfec 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel2() #2 { attributes #1 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll index 3d6454cc9f99b..c4e0a60d43baa 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -52,8 +52,8 @@ attributes #0 = { nounwind } attributes #1 = { "uniform-work-group-size"="false" } attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-agpr-alloc"="0" 
"amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { nounwind "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll index 3032d8ddf0a53..05af74df23c2b 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 { attributes #0 = { nounwind readnone } attributes #1 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll index e315e0454f424..cdbca7f5feb62 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -61,5 +61,5 @@ define amdgpu_kernel void @kernel3() #0 { attributes #0 = { "uniform-work-group-size"="false" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll index 122d46b39ff32..8878e9b65a088 100644 --- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -101,13 +101,8 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[0:3] +; CHECK-NEXT: ; def v[10:13] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] @@ -147,12 +142,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte 
Folded Reload @@ -311,26 +301,16 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 ; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 ; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[0:3] +; CHECK-NEXT: ; def v[10:13] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[0:3] +; CHECK-NEXT: ; def v[14:17] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND @@ -369,19 +349,9 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; CHECK-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index d8264b5a091e1..b045c761436de 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -958,8 +958,7 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll index 0bd8667d17e52..a00aca34252b1 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll @@ -26,7 +26,6 @@ define amdgpu_kernel void 
@test_waitcnt(ptr addrspace(1) %global_buffer, ptr add ; CHECK-NEXT: ds_write_b32 v1, v3 ; CHECK-NEXT: ds_write_b32 v2, v3 ; CHECK-NEXT: ; sched_barrier mask(0x00000000) -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_read_b32 v1, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dword v0, v1, s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index 7a64e55abb8d3..a392692e618cd 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1200 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1200 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -global-isel=1 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; @@ -15,6 +17,50 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: 
s_endpgm ; +; GFX1200-LABEL: workgroup_id_x: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1200-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1200-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: workgroup_id_x: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c +; GFX1250-SDAG-NEXT: s_and_b32 s3, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_add_co_i32 s2, s2, 1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_getreg_b32 s4, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-SDAG-NEXT: s_mul_i32 s2, ttmp9, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s2 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, ttmp9, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: workgroup_id_x: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c +; GFX1250-GISEL-NEXT: s_and_b32 s3, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_add_co_i32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_getreg_b32 s4, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_mul_i32 s2, ttmp9, s2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: s_add_co_i32 s3, s3, s2 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s2, ttmp9, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm ; GFX12-LABEL: workgroup_id_x: ; 
GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -41,6 +87,74 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; GFX9-NEXT: global_store_dword v1, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; +; GFX1200-LABEL: workgroup_id_xy: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1200-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX1200-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX1200-NEXT: v_mov_b32_e32 v2, s4 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1200-NEXT: global_store_b32 v1, v2, s[2:3] +; GFX1200-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: workgroup_id_xy: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-SDAG-NEXT: s_bfe_u32 s6, ttmp6, 0x40010 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_add_co_i32 s6, s6, 1 +; GFX1250-SDAG-NEXT: s_bfe_u32 s7, ttmp6, 0x4000c +; GFX1250-SDAG-NEXT: s_mul_i32 s5, s4, s6 +; GFX1250-SDAG-NEXT: s_bfe_u32 s6, ttmp6, 0x40004 +; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s7, 1 +; GFX1250-SDAG-NEXT: s_add_co_i32 s6, s6, s5 +; GFX1250-SDAG-NEXT: s_and_b32 s5, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_mul_i32 s7, ttmp9, s7 +; GFX1250-SDAG-NEXT: s_getreg_b32 s8, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-SDAG-NEXT: s_add_co_i32 s5, s5, s7 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s8, 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s5, ttmp9, s5 +; GFX1250-SDAG-NEXT: s_cselect_b32 s4, s4, s6 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: workgroup_id_xy: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_bfe_u32 s6, ttmp6, 
0x4000c +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_add_co_i32 s6, s6, 1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_and_b32 s4, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_mul_i32 s5, ttmp9, s6 +; GFX1250-GISEL-NEXT: s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s4, ttmp9, s4 +; GFX1250-GISEL-NEXT: s_bfe_u32 s5, ttmp6, 0x40010 +; GFX1250-GISEL-NEXT: s_and_b32 s7, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_add_co_i32 s5, s5, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s8, ttmp6, 0x40004 +; GFX1250-GISEL-NEXT: s_mul_i32 s5, s7, s5 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-GISEL-NEXT: s_add_co_i32 s8, s8, s5 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s4, s7, s8 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-GISEL-NEXT: global_store_b32 v1, v2, s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 @@ -77,6 +191,99 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; +; GFX1200-LABEL: workgroup_id_xyz: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1200-NEXT: s_and_b32 s6, ttmp7, 0xffff +; GFX1200-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX1200-NEXT: s_lshr_b32 s7, ttmp7, 16 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; 
GFX1200-NEXT: s_clause 0x2 +; GFX1200-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1200-NEXT: global_store_b32 v1, v2, s[2:3] +; GFX1200-NEXT: global_store_b32 v1, v3, s[4:5] +; GFX1200-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: workgroup_id_xyz: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40014 +; GFX1250-SDAG-NEXT: s_lshr_b32 s6, ttmp7, 16 +; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s0, 1 +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-SDAG-NEXT: s_bfe_u32 s9, ttmp6, 0x40010 +; GFX1250-SDAG-NEXT: s_mul_i32 s7, s6, s7 +; GFX1250-SDAG-NEXT: s_bfe_u32 s8, ttmp6, 0x40008 +; GFX1250-SDAG-NEXT: s_and_b32 s10, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, 1 +; GFX1250-SDAG-NEXT: s_bfe_u32 s11, ttmp6, 0x4000c +; GFX1250-SDAG-NEXT: s_add_co_i32 s8, s8, s7 +; GFX1250-SDAG-NEXT: s_mul_i32 s7, s10, s9 +; GFX1250-SDAG-NEXT: s_bfe_u32 s9, ttmp6, 0x40004 +; GFX1250-SDAG-NEXT: s_add_co_i32 s11, s11, 1 +; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, s7 +; GFX1250-SDAG-NEXT: s_and_b32 s7, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_mul_i32 s11, ttmp9, s11 +; GFX1250-SDAG-NEXT: s_getreg_b32 s12, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s7, s11 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s12, 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s7, ttmp9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s7 +; GFX1250-SDAG-NEXT: s_cselect_b32 s7, s10, s9 +; GFX1250-SDAG-NEXT: s_cselect_b32 s6, s6, s8 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s6 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_clause 0x2 +; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX1250-SDAG-NEXT: global_store_b32 v0, v3, s[4:5] +; GFX1250-SDAG-NEXT: s_endpgm +; +; 
GFX1250-GISEL-LABEL: workgroup_id_xyz: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c +; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, s0 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s7, ttmp9, s1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x40010 +; GFX1250-GISEL-NEXT: s_and_b32 s8, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s9, ttmp6, 0x40004 +; GFX1250-GISEL-NEXT: s_mul_i32 s10, s8, s0 +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-GISEL-NEXT: s_add_co_i32 s9, s9, s10 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s7 +; GFX1250-GISEL-NEXT: s_cselect_b32 s8, s8, s9 +; GFX1250-GISEL-NEXT: s_bfe_u32 s9, ttmp6, 0x40014 +; GFX1250-GISEL-NEXT: s_lshr_b32 s10, ttmp7, 16 +; GFX1250-GISEL-NEXT: s_add_co_i32 s9, s9, 1 +; GFX1250-GISEL-NEXT: s_bfe_u32 s11, ttmp6, 0x40008 +; GFX1250-GISEL-NEXT: s_mul_i32 s9, s10, s9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s9 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s6, s10, s11 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, s6 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_clause 0x2 +; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-GISEL-NEXT: global_store_b32 v1, v2, s[2:3] +; GFX1250-GISEL-NEXT: global_store_b32 v1, v3, s[4:5] +; GFX1250-GISEL-NEXT: s_endpgm ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 @@ 
-107,7 +314,6 @@ declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12-GISEL: {{.*}} -; GFX12-SDAG: {{.*}} +; GFX1250: {{.*}} ; GFX9-GISEL: {{.*}} ; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll b/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll index 3059b5b445958..f76c68919d8e1 100644 --- a/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll @@ -5,10 +5,7 @@ define amdgpu_ps float @xor3_i1_const(float inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: xor3_i1_const: ; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: v_mov_b32_e32 v0, 0x42640000 -; GCN-NEXT: v_cmp_lt_f32_e64 s[2:3], s0, 0 -; GCN-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], vcc +; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] ; GCN-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll new file mode 100644 index 0000000000000..d5a1d63b644a8 --- /dev/null +++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll @@ -0,0 +1,91 @@ +; Checks generated using command: +; llvm/utils/update_test_body.py llvm/test/CodeGen/BPF/jump_table_blockaddr.ll + +; RUN: rm -rf %t && split-file %s %t && cd %t +; RUN: llc -march=bpf -mcpu=v4 < test.ll | FileCheck %s +; +; Source code: +; int bar(int a) { +; __label__ l1, l2; +; void * volatile tgt; +; int ret = 0; +; if (a) +; tgt = &&l1; // synthetic jump table generated here +; else +; tgt = &&l2; // another synthetic jump table +; goto *tgt; +; l1: ret += 1; +; l2: ret += 2; +; return ret; +; } +; +; Compilation Flags: +; clang --target=bpf -mcpu=v4 -O2 -emit-llvm -S test.c + +.ifdef GEN +;--- test.ll +define dso_local range(i32 2, 4) i32 @bar(i32 noundef %a) local_unnamed_addr{ 
+entry: + %tgt = alloca ptr, align 8 + %tobool.not = icmp eq i32 %a, 0 + %. = select i1 %tobool.not, ptr blockaddress(@bar, %l2), ptr blockaddress(@bar, %l1) + store volatile ptr %., ptr %tgt, align 8 + %tgt.0.tgt.0.tgt.0.tgt.0. = load volatile ptr, ptr %tgt, align 8 + indirectbr ptr %tgt.0.tgt.0.tgt.0.tgt.0., [label %l1, label %l2] + +l1: ; preds = %entry + br label %l2 + +l2: ; preds = %l1, %entry + %ret.0 = phi i32 [ 3, %l1 ], [ 2, %entry ] + ret i32 %ret.0 +} + +;--- gen +echo "" +echo "; Generated checks follow" +echo ";" +llc -march=bpf -mcpu=v4 < test.ll \ + | awk '/# -- End function/ {p=0} /@function/ {p=1} p {print "; CHECK" ": " $0}' + +.endif + +; Generated checks follow +; +; CHECK: .type bar,@function +; CHECK: bar: # @bar +; CHECK: .Lbar$local: +; CHECK: .type .Lbar$local,@function +; CHECK: .cfi_startproc +; CHECK: # %bb.0: # %entry +; CHECK: r2 = BPF.JT.0.0 ll +; CHECK: r2 = *(u64 *)(r2 + 0) +; CHECK: r3 = BPF.JT.0.1 ll +; CHECK: r3 = *(u64 *)(r3 + 0) +; CHECK: if w1 == 0 goto LBB0_2 +; CHECK: # %bb.1: # %entry +; CHECK: r3 = r2 +; CHECK: LBB0_2: # %entry +; CHECK: *(u64 *)(r10 - 8) = r3 +; CHECK: r1 = *(u64 *)(r10 - 8) +; CHECK: gotox r1 +; CHECK: .Ltmp0: # Block address taken +; CHECK: LBB0_3: # %l1 +; CHECK: w0 = 3 +; CHECK: goto LBB0_5 +; CHECK: .Ltmp1: # Block address taken +; CHECK: LBB0_4: # %l2 +; CHECK: w0 = 2 +; CHECK: LBB0_5: # %.split +; CHECK: exit +; CHECK: .Lfunc_end0: +; CHECK: .size bar, .Lfunc_end0-bar +; CHECK: .size .Lbar$local, .Lfunc_end0-bar +; CHECK: .cfi_endproc +; CHECK: .section .jumptables,"",@progbits +; CHECK: BPF.JT.0.0: +; CHECK: .quad LBB0_3 +; CHECK: .size BPF.JT.0.0, 8 +; CHECK: BPF.JT.0.1: +; CHECK: .quad LBB0_4 +; CHECK: .size BPF.JT.0.1, 8 diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll new file mode 100644 index 0000000000000..bbca46850843b --- /dev/null +++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll @@ -0,0 +1,83 @@ +; Checks generated using 
command: +; llvm/utils/update_test_body.py llvm/test/CodeGen/BPF/jump_table_global_var.ll + +; RUN: rm -rf %t && split-file %s %t && cd %t +; RUN: llc -march=bpf -mcpu=v4 < test.ll | FileCheck %s +; +; Source code: +; int foo(unsigned a) { +; __label__ l1, l2; +; void *jt1[] = {[0]=&&l1, [1]=&&l2}; +; int ret = 0; +; +; goto *jt1[a % 2]; +; l1: ret += 1; +; l2: ret += 3; +; return ret; +; } +; +; Compilation Flags: +; clang --target=bpf -mcpu=v4 -O2 -emit-llvm -S test.c + +.ifdef GEN +;--- test.ll +@__const.foo.jt1 = private unnamed_addr constant [2 x ptr] [ptr blockaddress(@foo, %l1), ptr blockaddress(@foo, %l2)], align 8 + +define dso_local range(i32 3, 5) i32 @foo(i32 noundef %a) local_unnamed_addr { +entry: + %rem = and i32 %a, 1 + %idxprom = zext nneg i32 %rem to i64 + %arrayidx = getelementptr inbounds nuw [2 x ptr], ptr @__const.foo.jt1, i64 0, i64 %idxprom + %0 = load ptr, ptr %arrayidx, align 8 + indirectbr ptr %0, [label %l1, label %l2] + +l1: ; preds = %entry + br label %l2 + +l2: ; preds = %l1, %entry + %ret.0 = phi i32 [ 4, %l1 ], [ 3, %entry ] + ret i32 %ret.0 +} + +;--- gen +echo "" +echo "; Generated checks follow" +echo ";" +llc -march=bpf -mcpu=v4 < test.ll \ + | awk '/# -- End function/ {p=0} /@function/ {p=1} p {print "; CHECK" ": " $0}' + +.endif + +; Generated checks follow +; +; CHECK: .type foo,@function +; CHECK: foo: # @foo +; CHECK: .Lfoo$local: +; CHECK: .type .Lfoo$local,@function +; CHECK: .cfi_startproc +; CHECK: # %bb.0: # %entry +; CHECK: # kill: def $w1 killed $w1 def $r1 +; CHECK: w1 &= 1 +; CHECK: r1 <<= 3 +; CHECK: r2 = BPF.JT.0.0 ll +; CHECK: r2 += r1 +; CHECK: r1 = *(u64 *)(r2 + 0) +; CHECK: gotox r1 +; CHECK: .Ltmp0: # Block address taken +; CHECK: LBB0_1: # %l1 +; CHECK: w0 = 4 +; CHECK: goto LBB0_3 +; CHECK: .Ltmp1: # Block address taken +; CHECK: LBB0_2: # %l2 +; CHECK: w0 = 3 +; CHECK: LBB0_3: # %.split +; CHECK: exit +; CHECK: .Lfunc_end0: +; CHECK: .size foo, .Lfunc_end0-foo +; CHECK: .size .Lfoo$local, .Lfunc_end0-foo 
+; CHECK: .cfi_endproc +; CHECK: .section .jumptables,"",@progbits +; CHECK: BPF.JT.0.0: +; CHECK: .quad LBB0_1 +; CHECK: .quad LBB0_2 +; CHECK: .size BPF.JT.0.0, 16 diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll new file mode 100644 index 0000000000000..682b025d665d6 --- /dev/null +++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll @@ -0,0 +1,126 @@ +; Checks generated using command: +; llvm/utils/update_test_body.py llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll + +; RUN: rm -rf %t && split-file %s %t && cd %t +; RUN: llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll | FileCheck %s +; +; Source code: +; int ret_user; +; int foo(int a) +; { +; switch (a) { +; case 1: ret_user = 18; break; +; case 20: ret_user = 6; break; +; case 30: ret_user = 2; break; +; default: break; +; } +; return 0; +; } +; +; Compilation Flags: +; clang --target=bpf -mcpu=v4 -O2 -emit-llvm -S test.c + +.ifdef GEN +;--- test.ll +@ret_user = dso_local local_unnamed_addr global i32 0, align 4 + +define dso_local noundef i32 @foo(i32 noundef %a) local_unnamed_addr { +entry: + switch i32 %a, label %sw.epilog [ + i32 1, label %sw.epilog.sink.split + i32 20, label %sw.bb1 + i32 30, label %sw.bb2 + ] + +sw.bb1: ; preds = %entry + br label %sw.epilog.sink.split + +sw.bb2: ; preds = %entry + br label %sw.epilog.sink.split + +sw.epilog.sink.split: ; preds = %entry, %sw.bb1, %sw.bb2 + %.sink = phi i32 [ 2, %sw.bb2 ], [ 6, %sw.bb1 ], [ 18, %entry ] + store i32 %.sink, ptr @ret_user, align 4 + br label %sw.epilog + +sw.epilog: ; preds = %sw.epilog.sink.split, %entry + ret i32 0 +} + +;--- gen +echo "" +echo "; Generated checks follow" +echo ";" +llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \ + | awk '/# -- End function/ {p=0} /@function/ {p=1} p {print "; CHECK" ": " $0}' + +.endif + +; Generated checks follow +; +; CHECK: .type foo,@function +; CHECK: foo: # @foo +; CHECK: .Lfoo$local: +; CHECK: .type 
.Lfoo$local,@function +; CHECK: .cfi_startproc +; CHECK: # %bb.0: # %entry +; CHECK: # kill: def $w1 killed $w1 def $r1 +; CHECK: w1 += -1 +; CHECK: if w1 > 29 goto LBB0_5 +; CHECK: # %bb.1: # %entry +; CHECK: w2 = 18 +; CHECK: r1 <<= 3 +; CHECK: r3 = BPF.JT.0.0 ll +; CHECK: r4 = BPF.JT.0.0 ll +; CHECK: r4 += r1 +; CHECK: r1 = *(u64 *)(r4 + 0) +; CHECK: r3 += r1 +; CHECK: gotox r3 +; CHECK: LBB0_2: # %sw.bb1 +; CHECK: w2 = 6 +; CHECK: goto LBB0_4 +; CHECK: LBB0_3: # %sw.bb2 +; CHECK: w2 = 2 +; CHECK: LBB0_4: # %sw.epilog.sink.split +; CHECK: r1 = ret_user ll +; CHECK: *(u32 *)(r1 + 0) = w2 +; CHECK: LBB0_5: # %sw.epilog +; CHECK: w0 = 0 +; CHECK: exit +; CHECK: .Lfunc_end0: +; CHECK: .size foo, .Lfunc_end0-foo +; CHECK: .size .Lfoo$local, .Lfunc_end0-foo +; CHECK: .cfi_endproc +; CHECK: .section .jumptables,"",@progbits +; CHECK: BPF.JT.0.0: +; CHECK: .quad LBB0_4 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_2 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_5 +; CHECK: .quad LBB0_3 +; CHECK: .size BPF.JT.0.0, 240 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll index 1bc9b85935819..d6cb05b5d0dd9 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll +++ 
b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll @@ -61,94 +61,94 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ;DXC-NEXT: NumStaticSamplers: 0 ;DXC-NEXT: StaticSamplersOffset: 380 ;DXC-NEXT: Parameters: -;DXC-NEXT: - ParameterType: 0 -;DXC-NEXT: ShaderVisibility: 0 +;DXC-NEXT: - ParameterType: DescriptorTable +;DXC-NEXT: ShaderVisibility: All ;DXC-NEXT: Table: ;DXC-NEXT: NumRanges: 14 ;DXC-NEXT: RangesOffset: 44 ;DXC-NEXT: Ranges: -;DXC-NEXT: - RangeType: 3 +;DXC-NEXT: - RangeType: Sampler ;DXC-NEXT: NumDescriptors: 1 ;DXC-NEXT: BaseShaderRegister: 0 ;DXC-NEXT: RegisterSpace: 1 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 -;DXC-NEXT: - RangeType: 3 +;DXC-NEXT: - RangeType: Sampler ;DXC-NEXT: NumDescriptors: 1 ;DXC-NEXT: BaseShaderRegister: 0 ;DXC-NEXT: RegisterSpace: 3 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 ;DXC-NEXT: DESCRIPTORS_VOLATILE: true -;DXC-NEXT: - RangeType: 3 +;DXC-NEXT: - RangeType: Sampler ;DXC-NEXT: NumDescriptors: 1 ;DXC-NEXT: BaseShaderRegister: 0 ;DXC-NEXT: RegisterSpace: 4 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 ;DXC-NEXT: DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true -;DXC-NEXT: - RangeType: 0 +;DXC-NEXT: - RangeType: SRV ;DXC-NEXT: NumDescriptors: 1 ;DXC-NEXT: BaseShaderRegister: 0 ;DXC-NEXT: RegisterSpace: 5 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 ;DXC-NEXT: DESCRIPTORS_VOLATILE: true -;DXC-NEXT: - RangeType: 1 +;DXC-NEXT: - RangeType: UAV ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 6 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DESCRIPTORS_VOLATILE: true -;DXC-NEXT: - RangeType: 2 +;DXC-NEXT: - RangeType: CBuffer ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 7 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DATA_VOLATILE: true -;DXC-NEXT: - RangeType: 0 +;DXC-NEXT: - RangeType: SRV 
;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 8 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DATA_STATIC: true -;DXC-NEXT: - RangeType: 1 +;DXC-NEXT: - RangeType: UAV ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 9 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DATA_STATIC_WHILE_SET_AT_EXECUTE: true -;DXC-NEXT: - RangeType: 2 +;DXC-NEXT: - RangeType: CBuffer ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 10 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DESCRIPTORS_VOLATILE: true ;DXC-NEXT: DATA_VOLATILE: true -;DXC-NEXT: - RangeType: 0 +;DXC-NEXT: - RangeType: SRV ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 11 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DESCRIPTORS_VOLATILE: true ;DXC-NEXT: DATA_STATIC_WHILE_SET_AT_EXECUTE: true -;DXC-NEXT: - RangeType: 1 +;DXC-NEXT: - RangeType: UAV ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 12 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true -;DXC-NEXT: - RangeType: 2 +;DXC-NEXT: - RangeType: CBuffer ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 13 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DATA_VOLATILE: true ;DXC-NEXT: DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true -;DXC-NEXT: - RangeType: 0 +;DXC-NEXT: - RangeType: SRV ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 14 ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DATA_STATIC: true ;DXC-NEXT: DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true -;DXC-NEXT: - RangeType: 1 +;DXC-NEXT: - RangeType: UAV ;DXC-NEXT: NumDescriptors: 5 ;DXC-NEXT: BaseShaderRegister: 1 ;DXC-NEXT: RegisterSpace: 15 diff --git 
a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll index fec6c4c959642..c65eab5f4aa5f 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll @@ -26,18 +26,18 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: NumStaticSamplers: 0 ; DXC-NEXT: StaticSamplersOffset: 84 ; DXC-NEXT: Parameters: -; DXC-NEXT: - ParameterType: 0 -; DXC-NEXT: ShaderVisibility: 0 +; DXC-NEXT: - ParameterType: DescriptorTable +; DXC-NEXT: ShaderVisibility: All ; DXC-NEXT: Table: ; DXC-NEXT: NumRanges: 2 ; DXC-NEXT: RangesOffset: 44 ; DXC-NEXT: Ranges: -; DXC-NEXT: - RangeType: 3 +; DXC-NEXT: - RangeType: Sampler ; DXC-NEXT: NumDescriptors: 1 ; DXC-NEXT: BaseShaderRegister: 1 ; DXC-NEXT: RegisterSpace: 0 ; DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 -; DXC-NEXT: - RangeType: 1 +; DXC-NEXT: - RangeType: UAV ; DXC-NEXT: NumDescriptors: 5 ; DXC-NEXT: BaseShaderRegister: 1 ; DXC-NEXT: RegisterSpace: 10 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable.ll index 4f6f0d0bd6a14..c3985503e3788 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable.ll @@ -23,24 +23,24 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: Size: 92 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 2 -; DXC-NEXT: NumRootParameters: 1 -; DXC-NEXT: RootParametersOffset: 24 +; DXC-NEXT: NumRootParameters: 1 +; DXC-NEXT: RootParametersOffset: 24 ; DXC-NEXT: NumStaticSamplers: 0 ; DXC-NEXT: StaticSamplersOffset: 92 ; DXC-NEXT: 
Parameters: -; DXC-NEXT: - ParameterType: 0 -; DXC-NEXT: ShaderVisibility: 0 +; DXC-NEXT: - ParameterType: DescriptorTable +; DXC-NEXT: ShaderVisibility: All ; DXC-NEXT: Table: ; DXC-NEXT: NumRanges: 2 ; DXC-NEXT: RangesOffset: 44 ; DXC-NEXT: Ranges: -; DXC-NEXT: - RangeType: 0 +; DXC-NEXT: - RangeType: SRV ; DXC-NEXT: NumDescriptors: 1 ; DXC-NEXT: BaseShaderRegister: 1 ; DXC-NEXT: RegisterSpace: 0 ; DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 ; DXC-NEXT: DATA_STATIC_WHILE_SET_AT_EXECUTE: true -; DXC-NEXT: - RangeType: 1 +; DXC-NEXT: - RangeType: UAV ; DXC-NEXT: NumDescriptors: 5 ; DXC-NEXT: BaseShaderRegister: 1 ; DXC-NEXT: RegisterSpace: 10 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants.ll index d217f396722bc..4dec4e51abcd8 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants.ll @@ -21,13 +21,13 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: Size: 48 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 2 -; DXC-NEXT: NumRootParameters: 1 -; DXC-NEXT: RootParametersOffset: 24 +; DXC-NEXT: NumRootParameters: 1 +; DXC-NEXT: RootParametersOffset: 24 ; DXC-NEXT: NumStaticSamplers: 0 ; DXC-NEXT: StaticSamplersOffset: 48 ; DXC-NEXT: Parameters: -; DXC-NEXT: - ParameterType: 1 -; DXC-NEXT: ShaderVisibility: 0 +; DXC-NEXT: - ParameterType: Constants32Bit +; DXC-NEXT: ShaderVisibility: All ; DXC-NEXT: Constants: ; DXC-NEXT: Num32BitValues: 3 ; DXC-NEXT: RegisterSpace: 2 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll index 54292bb651532..6f3acdae2b81f 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll 
@@ -21,13 +21,13 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: Size: 48 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 2 -; DXC-NEXT: NumRootParameters: 1 -; DXC-NEXT: RootParametersOffset: 24 +; DXC-NEXT: NumRootParameters: 1 +; DXC-NEXT: RootParametersOffset: 24 ; DXC-NEXT: NumStaticSamplers: 0 ; DXC-NEXT: StaticSamplersOffset: 48 ; DXC-NEXT: Parameters: -; DXC-NEXT: - ParameterType: 2 -; DXC-NEXT: ShaderVisibility: 0 +; DXC-NEXT: - ParameterType: CBV +; DXC-NEXT: ShaderVisibility: All ; DXC-NEXT: Descriptor: ; DXC-NEXT: RegisterSpace: 2 ; DXC-NEXT: ShaderRegister: 1 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor_V1.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor_V1.ll index 891a03b688a82..3509360e313e3 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor_V1.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor_V1.ll @@ -21,13 +21,13 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: Size: 44 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 1 -; DXC-NEXT: NumRootParameters: 1 -; DXC-NEXT: RootParametersOffset: 24 +; DXC-NEXT: NumRootParameters: 1 +; DXC-NEXT: RootParametersOffset: 24 ; DXC-NEXT: NumStaticSamplers: 0 ; DXC-NEXT: StaticSamplersOffset: 44 ; DXC-NEXT: Parameters: -; DXC-NEXT: - ParameterType: 2 -; DXC-NEXT: ShaderVisibility: 0 +; DXC-NEXT: - ParameterType: CBV +; DXC-NEXT: ShaderVisibility: All ; DXC-NEXT: Descriptor: ; DXC-NEXT: RegisterSpace: 2 ; DXC-NEXT: ShaderRegister: 1 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll index d9ee39dbb7287..1dd470d7fb822 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll @@ -27,16 +27,16 @@ attributes #0 = { 
"hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: StaticSamplersOffset: 24 ; DXC-NEXT: Parameters: [] ; DXC-NEXT: Samplers: -; DXC-NEXT: - Filter: 4 -; DXC-NEXT: AddressU: 2 -; DXC-NEXT: AddressV: 3 -; DXC-NEXT: AddressW: 5 +; DXC-NEXT: - Filter: MinPointMagLinearMipPoint +; DXC-NEXT: AddressU: Mirror +; DXC-NEXT: AddressV: Clamp +; DXC-NEXT: AddressW: MirrorOnce ; DXC-NEXT: MipLODBias: 1.425 ; DXC-NEXT: MaxAnisotropy: 9 -; DXC-NEXT: ComparisonFunc: 3 -; DXC-NEXT: BorderColor: 2 +; DXC-NEXT: ComparisonFunc: Equal +; DXC-NEXT: BorderColor: OpaqueWhite ; DXC-NEXT: MinLOD: -128 ; DXC-NEXT: MaxLOD: 128 ; DXC-NEXT: ShaderRegister: 42 ; DXC-NEXT: RegisterSpace: 0 -; DXC-NEXT: ShaderVisibility: 0 +; DXC-NEXT: ShaderVisibility: All diff --git a/llvm/test/CodeGen/Hexagon/isel-uinttofp-v32i1tov32f32.ll b/llvm/test/CodeGen/Hexagon/isel-uinttofp-v32i1tov32f32.ll new file mode 100644 index 0000000000000..dfb2bc83537dc --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel-uinttofp-v32i1tov32f32.ll @@ -0,0 +1,25 @@ +; Tests lowering of v32i1 to v32f32 + +; RUN: llc -march=hexagon -mattr=+hvxv79,+hvx-length128b,+hvx-ieee-fp \ +; RUN: -stop-after=hexagon-isel %s -o - | FileCheck %s + +; CHECK: [[R0:%[0-9]+]]:hvxvr = V6_lvsplatw killed %{{[0-9]+}} +; CHECK-NEXT: [[R1:%[0-9]+]]:intregs = A2_tfrsi 1 +; CHECK-NEXT: [[R2:%[0-9]+]]:hvxvr = V6_lvsplatw [[R1]] +; CHECK-NEXT: [[R3:%[0-9]+]]:hvxqr = V6_vandvrt [[R2]], [[R1]] +; CHECK-NEXT: [[R4:%[0-9]+]]:hvxvr = V6_vprefixqw killed [[R3]] +; CHECK-NEXT: [[R5:%[0-9]+]]:hvxvr = V6_vsubw killed [[R4]], [[R2]] +; CHECK-NEXT: [[R6:%[0-9]+]]:hvxvr = V6_vlsrwv killed [[R0]], killed [[R5]] +; CHECK-NEXT: [[R7:%[0-9]+]]:hvxvr = V6_vand killed [[R6]], [[R2]] +; CHECK-NEXT: [[R8:%[0-9]+]]:hvxvr = V6_vconv_sf_w killed [[R7]] +; CHECK-NEXT: hvxvr = V6_vadd_sf_sf [[R8]], [[R8]] + +define <32 x float> @uitofp_i1(<32 x i16> %in0, <32 x i16> %in1) #0 +{ + %q1 = icmp eq <32 x i16> %in0, %in1 + %fp0 = uitofp <32 x i1> %q1 to <32 x float> + %out = 
fadd <32 x float> %fp0, %fp0 + ret <32 x float> %out +} + +attributes #0 = { nounwind readnone "target-cpu"="hexagonv79" "target-features"="+hvxv79,+hvx-length128b" } diff --git a/llvm/test/CodeGen/Hexagon/isel-uinttofp-v64i1tov64f16.ll b/llvm/test/CodeGen/Hexagon/isel-uinttofp-v64i1tov64f16.ll new file mode 100644 index 0000000000000..8769e345655e9 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel-uinttofp-v64i1tov64f16.ll @@ -0,0 +1,27 @@ +; Tests the conversion pattern for v64i1 to v64f16 +; r0, r3 and r9 registers are i32 types converted from +; v32i1 via a bitcasting sequence. + +; RUN: llc -march=hexagon -mattr=+hvxv79,+hvx-length128b \ +; RUN: %s -verify-machineinstrs -o - | FileCheck %s + +; CHECK: [[V3:v[0-9]+]] = vsplat([[R0:r[0-9]+]]) +; CHECK: [[Q0:q[0-9]+]] = vand([[V3]],[[R0]]) +; CHECK: [[V4:v[0-9]+]].w = prefixsum([[Q0]]) +; CHECK: [[V5:v[0-9]+]].w = vsub([[V4]].w,[[V3]].w) +; CHECK: [[V1:v[0-9]+]] = vsplat(r +; CHECK: [[V2:v[0-9]+]] = vsplat(r +; CHECK: [[V6:v[0-9]+]].w = vlsr([[V1]].w,[[V5]].w) +; CHECK: [[V7:v[0-9]+]].w = vlsr([[V2]].w,[[V5]].w) +; CHECK: [[V8:v[0-9]+]] = vand([[V6]],[[V3]]) +; CHECK: [[V9:v[0-9]+]] = vand([[V7]],[[V3]]) +; CHECK: [[V10:v[0-9]+]].h = vpacke([[V9]].w,[[V8]].w) +; CHECK: .hf = [[V10]].h + +define <64 x half> @uitofp_i1(<64 x i16> %in0, <64 x i16> %in1) +{ + %in = icmp eq <64 x i16> %in0, %in1 + %fp0 = uitofp <64 x i1> %in to <64 x half> + %out = fadd <64 x half> %fp0, %fp0 + ret <64 x half> %out +} diff --git a/llvm/test/CodeGen/Hexagon/vararg-musl.ll b/llvm/test/CodeGen/Hexagon/vararg-musl.ll new file mode 100644 index 0000000000000..b902dded32153 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vararg-musl.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=hexagon-unknown-linux-musl < %s | FileCheck %s -check-prefix=MUSL +; RUN: llc -mtriple=hexagon-unknown-none-elf < %s | FileCheck %s -check-prefix=NONMUSL + +; MUSL-NOT: memw +; NONMUSL: memw + +declare i32 @f0(i32 %a0, ...) 
+ +define i32 @f1(i32 %a0, i32 %a1) #0 { +b1: + %v7 = call i32 (i32, ...) @f0(i32 %a0, i32 %a1) + ret i32 %v7 +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/Hexagon/vsubsat.ll b/llvm/test/CodeGen/Hexagon/vsubsat.ll new file mode 100644 index 0000000000000..bb65aff166e23 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vsubsat.ll @@ -0,0 +1,99 @@ +; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s + +;; Saturating subtraction. + +; CHECK-LABEL: vsububsat +; CHECK: v[[#]].ub = vsub(v[[#]].ub,v[[#]].ub):sat +define dso_local <128 x i8> @vsububsat(<128 x i8> %x, <128 x i8> %y) { +entry: + %0 = tail call <128 x i8> @llvm.usub.sat.v128i8(<128 x i8> %x, <128 x i8> %y) + ret <128 x i8> %0 +} + +; CHECK-LABEL: vsubuhsat +; CHECK: v[[#]].uh = vsub(v[[#]].uh,v[[#]].uh):sat +define dso_local <64 x i16> @vsubuhsat(<64 x i16> %x, <64 x i16> %y) { +entry: + %0 = tail call <64 x i16> @llvm.usub.sat.v64i16(<64 x i16> %x, <64 x i16> %y) + ret <64 x i16> %0 +} + +; CHECK-LABEL: vsubuwsat +; CHECK: v[[#]].uw = vsub(v[[#]].uw,v[[#]].uw):sat +define dso_local <32 x i32> @vsubuwsat(<32 x i32> %x, <32 x i32> %y) { +entry: + %0 = tail call <32 x i32> @llvm.usub.sat.v32i32(<32 x i32> %x, <32 x i32> %y) + ret <32 x i32> %0 +} + +; CHECK-LABEL: vsubbsat +; CHECK: v[[#]].b = vsub(v[[#]].b,v[[#]].b):sat +define dso_local <128 x i8> @vsubbsat(<128 x i8> %x, <128 x i8> %y) { +entry: + %0 = tail call <128 x i8> @llvm.ssub.sat.v128i8(<128 x i8> %x, <128 x i8> %y) + ret <128 x i8> %0 +} + +; CHECK-LABEL: vsubhsat +; CHECK: v[[#]].h = vsub(v[[#]].h,v[[#]].h):sat +define dso_local <64 x i16> @vsubhsat(<64 x i16> %x, <64 x i16> %y) { +entry: + %0 = tail call <64 x i16> @llvm.ssub.sat.v64i16(<64 x i16> %x, <64 x i16> %y) + ret <64 x i16> %0 +} + +; CHECK-LABEL: vsubwsat +; CHECK: v[[#]].w = vsub(v[[#]].w,v[[#]].w):sat +define dso_local <32 x i32> @vsubwsat(<32 x i32> %x, <32 x i32> %y) { +entry: + %0 = tail call <32 x i32> @llvm.ssub.sat.v32i32(<32 x i32> %x, 
<32 x i32> %y) + ret <32 x i32> %0 +} + +; CHECK-LABEL: vsububsat_dv +; CHECK: v[[#]]:[[#]].ub = vsub(v[[#]]:[[#]].ub,v[[#]]:[[#]].ub):sat +define dso_local <256 x i8> @vsububsat_dv(<256 x i8> %x, <256 x i8> %y) { +entry: + %0 = tail call <256 x i8> @llvm.usub.sat.v256i8(<256 x i8> %x, <256 x i8> %y) + ret <256 x i8> %0 +} + +; CHECK-LABEL: vsubuhsat_dv +; CHECK: v[[#]]:[[#]].uh = vsub(v[[#]]:[[#]].uh,v[[#]]:[[#]].uh):sat +define dso_local <128 x i16> @vsubuhsat_dv(<128 x i16> %x, <128 x i16> %y) { +entry: + %0 = tail call <128 x i16> @llvm.usub.sat.v128i16(<128 x i16> %x, <128 x i16> %y) + ret <128 x i16> %0 +} + +; CHECK-LABEL: vsubuwsat_dv +; CHECK: v[[#]]:[[#]].uw = vsub(v[[#]]:[[#]].uw,v[[#]]:[[#]].uw):sat +define dso_local <64 x i32> @vsubuwsat_dv(<64 x i32> %x, <64 x i32> %y) { +entry: + %0 = tail call <64 x i32> @llvm.usub.sat.v64i32(<64 x i32> %x, <64 x i32> %y) + ret <64 x i32> %0 +} + +; CHECK-LABEL: vsubbsat_dv +; CHECK: v[[#]]:[[#]].b = vsub(v[[#]]:[[#]].b,v[[#]]:[[#]].b):sat +define dso_local <256 x i8> @vsubbsat_dv(<256 x i8> %x, <256 x i8> %y) { +entry: + %0 = tail call <256 x i8> @llvm.ssub.sat.v256i8(<256 x i8> %x, <256 x i8> %y) + ret <256 x i8> %0 +} + +; CHECK-LABEL: vsubhsat_dv +; CHECK: v[[#]]:[[#]].h = vsub(v[[#]]:[[#]].h,v[[#]]:[[#]].h):sat +define dso_local <128 x i16> @vsubhsat_dv(<128 x i16> %x, <128 x i16> %y) { +entry: + %0 = tail call <128 x i16> @llvm.ssub.sat.v128i16(<128 x i16> %x, <128 x i16> %y) + ret <128 x i16> %0 +} + +; CHECK-LABEL: vsubwsat_dv +; CHECK: v[[#]]:[[#]].w = vsub(v[[#]]:[[#]].w,v[[#]]:[[#]].w):sat +define dso_local <64 x i32> @vsubwsat_dv(<64 x i32> %x, <64 x i32> %y) { +entry: + %0 = tail call <64 x i32> @llvm.ssub.sat.v64i32(<64 x i32> %x, <64 x i32> %y) + ret <64 x i32> %0 +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll index 23245726c8968..d6756b9395237 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +++ 
b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll @@ -106,6 +106,26 @@ entry: ret void } +define void @buildvector_v32i8_const_splat_v4i64(ptr %dst) nounwind { +; LA32-LABEL: buildvector_v32i8_const_splat_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI7_0) +; LA32-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI7_0) +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: buildvector_v32i8_const_splat_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a1, 7 +; LA64-NEXT: ori $a1, $a1, 3453 +; LA64-NEXT: xvreplgr2vr.d $xr0, $a1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + store <32 x i8> , ptr %dst + ret void +} + define void @buildvector_v16i16_const_splat(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v16i16_const_splat: ; CHECK: # %bb.0: # %entry @@ -117,6 +137,25 @@ entry: ret void } +define void @buildvector_v16i16_const_splat_v4i64(ptr %dst) nounwind { +; LA32-LABEL: buildvector_v16i16_const_splat_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) +; LA32-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI9_0) +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: buildvector_v16i16_const_splat_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ori $a1, $zero, 512 +; LA64-NEXT: xvreplgr2vr.d $xr0, $a1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + store <16 x i16> , ptr %dst + ret void +} + define void @buildvector_v8i32_const_splat(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v8i32_const_splat: ; CHECK: # %bb.0: # %entry @@ -128,6 +167,25 @@ entry: ret void } +define void @buildvector_v8i32_const_splat_v4i64(ptr %dst) nounwind { +; LA32-LABEL: buildvector_v8i32_const_splat_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI11_0) +; LA32-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI11_0) +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: buildvector_v8i32_const_splat_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ori $a1, $zero, 512 
+; LA64-NEXT: xvreplgr2vr.d $xr0, $a1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + store <8 x i32> , ptr %dst + ret void +} + define void @buildvector_v4i64_const_splat(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v4i64_const_splat: ; CHECK: # %bb.0: # %entry @@ -154,8 +212,8 @@ entry: define void @buildvector_v4f64_const_splat(ptr %dst) nounwind { ; LA32-LABEL: buildvector_v4f64_const_splat: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI11_0) -; LA32-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI11_0) +; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI14_0) +; LA32-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI14_0) ; LA32-NEXT: xvst $xr0, $a0, 0 ; LA32-NEXT: ret ; @@ -173,8 +231,8 @@ entry: define void @buildvector_v32i8_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v32i8_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI12_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI12_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI15_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -185,8 +243,8 @@ entry: define void @buildvector_v16i16_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v16i16_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI13_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI13_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI16_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -197,8 +255,8 @@ entry: define void @buildvector_v8i32_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v8i32_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI14_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI14_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI17_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -209,8 +267,8 @@ entry: define void @buildvector_v4i64_const(ptr %dst) 
nounwind { ; CHECK-LABEL: buildvector_v4i64_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI15_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI18_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI18_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -221,8 +279,8 @@ entry: define void @buildvector_v2f32_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v2f32_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI16_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI19_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI19_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -233,8 +291,8 @@ entry: define void @buildvector_v4f64_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v4f64_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI17_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI20_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI20_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr.ll index 9f148e5a447a5..786233018ad7d 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr.ll @@ -47,9 +47,7 @@ define <4 x i64> @lasx_xvbitclr_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ; LA32-NEXT: xvand.v $xr1, $xr1, $xr2 ; LA32-NEXT: xvrepli.d $xr2, 1 ; LA32-NEXT: xvsll.d $xr1, $xr2, $xr1 -; LA32-NEXT: xvrepli.b $xr2, -1 -; LA32-NEXT: xvxor.v $xr1, $xr1, $xr2 -; LA32-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvandn.v $xr0, $xr1, $xr0 ; LA32-NEXT: ret ; ; LA64-LABEL: lasx_xvbitclr_d: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/andn.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/andn.ll new file mode 100644 index 
0000000000000..a61d016b8e21e --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/andn.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @andn_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: andn_v32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <32 x i8>, ptr %a0 + %v1 = load <32 x i8>, ptr %a1 + %v2 = xor <32 x i8> %v0, + %v3 = and <32 x i8> %v2, %v1 + store <32 x i8> %v3, ptr %res + ret void +} + +define void @andn_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: andn_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i16>, ptr %a0 + %v1 = load <16 x i16>, ptr %a1 + %v2 = xor <16 x i16> %v0, + %v3 = and <16 x i16> %v2, %v1 + store <16 x i16> %v3, ptr %res + ret void +} + +define void @andn_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: andn_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i32>, ptr %a0 + %v1 = load <8 x i32>, ptr %a1 + %v2 = xor <8 x i32> %v0, + %v3 = and <8 x i32> %v2, %v1 + store <8 x i32> %v3, ptr %res + ret void +} + +define void @andn_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: andn_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, 
$a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i64>, ptr %a0 + %v1 = load <4 x i64>, ptr %a1 + %v2 = xor <4 x i64> %v0, + %v3 = and <4 x i64> %v2, %v1 + store <4 x i64> %v3, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/nor.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/nor.ll new file mode 100644 index 0000000000000..62ca303475241 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/nor.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @nor_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: nor_v32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <32 x i8>, ptr %a0 + %v1 = load <32 x i8>, ptr %a1 + %v2 = or <32 x i8> %v0, %v1 + %v3 = xor <32 x i8> %v2, + store <32 x i8> %v3, ptr %res + ret void +} + +define void @nor_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: nor_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i16>, ptr %a0 + %v1 = load <16 x i16>, ptr %a1 + %v2 = or <16 x i16> %v0, %v1 + %v3 = xor <16 x i16> %v2, + store <16 x i16> %v3, ptr %res + ret void +} + +define void @nor_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: nor_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i32>, ptr %a0 + %v1 = load <8 x i32>, ptr %a1 + %v2 = or <8 x 
i32> %v0, %v1 + %v3 = xor <8 x i32> %v2, + store <8 x i32> %v3, ptr %res + ret void +} + +define void @nor_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: nor_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i64>, ptr %a0 + %v1 = load <4 x i64>, ptr %a1 + %v2 = or <4 x i64> %v0, %v1 + %v3 = xor <4 x i64> %v2, + store <4 x i64> %v3, ptr %res + ret void +} + +define void @nor_u_v32i8(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: nor_u_v32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.b $xr1, 31 +; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <32 x i8>, ptr %a0 + %v1 = or <32 x i8> %v0, + %v2 = xor <32 x i8> %v1, + store <32 x i8> %v2, ptr %res + ret void +} + +define void @nor_u_v16i16(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: nor_u_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.h $xr1, 31 +; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i16>, ptr %a0 + %v1 = or <16 x i16> %v0, + %v2 = xor <16 x i16> %v1, + store <16 x i16> %v2, ptr %res + ret void +} + +define void @nor_u_v8i32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: nor_u_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.w $xr1, 31 +; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i32>, ptr %a0 + %v1 = or <8 x i32> %v0, + %v2 = xor <8 x i32> %v1, + store <8 x i32> %v2, ptr %res + ret void +} + +define void @nor_u_v4i64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: nor_u_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.d $xr1, 31 +; CHECK-NEXT: xvnor.v $xr0, 
$xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i64>, ptr %a0 + %v1 = or <4 x i64> %v0, + %v2 = xor <4 x i64> %v1, + store <4 x i64> %v2, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/orn.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/orn.ll new file mode 100644 index 0000000000000..e0f72ebc62445 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/orn.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @orn_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: orn_v32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <32 x i8>, ptr %a0 + %v1 = load <32 x i8>, ptr %a1 + %v2 = xor <32 x i8> %v1, + %v3 = or <32 x i8> %v0, %v2 + store <32 x i8> %v3, ptr %res + ret void +} + +define void @orn_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: orn_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i16>, ptr %a0 + %v1 = load <16 x i16>, ptr %a1 + %v2 = xor <16 x i16> %v1, + %v3 = or <16 x i16> %v0, %v2 + store <16 x i16> %v3, ptr %res + ret void +} + +define void @orn_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: orn_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i32>, ptr %a0 + %v1 = load <8 x i32>, ptr %a1 + %v2 = xor <8 x 
i32> %v1, + %v3 = or <8 x i32> %v0, %v2 + store <8 x i32> %v3, ptr %res + ret void +} + +define void @orn_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: orn_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i64>, ptr %a0 + %v1 = load <4 x i64>, ptr %a1 + %v2 = xor <4 x i64> %v1, + %v3 = or <4 x i64> %v0, %v2 + store <4 x i64> %v3, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sadd-sat.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sadd-sat.ll new file mode 100644 index 0000000000000..1802838305ed5 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sadd-sat.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc -mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s + +define <32 x i8> @xvsadd_b(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: xvsadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsadd.b $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + ret <32 x i8> %ret +} + +define <16 x i16> @xvsadd_h(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: xvsadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsadd.h $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %ret +} + +define <8 x i32> @xvsadd_w(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: xvsadd_w: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsadd.w $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %ret +} + +define <4 x i64> @xvsadd_d(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: xvsadd_d: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: 
ret + %ret = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %ret +} + +declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64>, <4 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/ssub-sat.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/ssub-sat.ll new file mode 100644 index 0000000000000..5497c4cb913bc --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/ssub-sat.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc -mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s + +define <32 x i8> @xvssub_b(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: xvssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: xvssub.b $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + ret <32 x i8> %ret +} + +define <16 x i16> @xvssub_h(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: xvssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: xvssub.h $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %ret +} + +define <8 x i32> @xvssub_w(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: xvssub_w: +; CHECK: # %bb.0: +; CHECK-NEXT: xvssub.w $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %ret +} + +define <4 x i64> @xvssub_d(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: xvssub_d: +; CHECK: # %bb.0: +; CHECK-NEXT: xvssub.d $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %ret +} + +declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) +declare <16 x 
i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64>, <4 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uadd-sat.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uadd-sat.ll new file mode 100644 index 0000000000000..6943c9188ada9 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uadd-sat.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc -mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s + +define <32 x i8> @xvuadd_b(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: xvuadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsadd.bu $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + ret <32 x i8> %ret +} + +define <16 x i16> @xvuadd_h(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: xvuadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsadd.hu $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %ret +} + +define <8 x i32> @xvuadd_w(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: xvuadd_w: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsadd.wu $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %ret +} + +define <4 x i64> @xvuadd_d(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: xvuadd_d: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsadd.du $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %ret +} + +declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64>, <4 x i64>) diff 
--git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/usub-sat.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/usub-sat.ll new file mode 100644 index 0000000000000..9c0ff46179e5a --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/usub-sat.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc -mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s + +define <32 x i8> @xvusub_b(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: xvusub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: xvssub.bu $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + ret <32 x i8> %ret +} + +define <16 x i16> @xvusub_h(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: xvusub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: xvssub.hu $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %ret +} + +define <8 x i32> @xvusub_w(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: xvusub_w: +; CHECK: # %bb.0: +; CHECK-NEXT: xvssub.wu $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %ret +} + +define <4 x i64> @xvusub_d(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: xvusub_d: +; CHECK: # %bb.0: +; CHECK-NEXT: xvssub.du $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %ret = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %ret +} + +declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.usub.sat.v4i64(<4 x i64>, <4 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll index 24df71c2ad71b..ac28151dc2f54 100644 --- 
a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll @@ -102,6 +102,26 @@ entry: ret void } +define void @buildvector_v16i8_const_splat_v2i64(ptr %dst) nounwind { +; LA32-LABEL: buildvector_v16i8_const_splat_v2i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI7_0) +; LA32-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI7_0) +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: buildvector_v16i8_const_splat_v2i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a1, 7 +; LA64-NEXT: ori $a1, $a1, 3453 +; LA64-NEXT: vreplgr2vr.d $vr0, $a1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret +entry: + store <16 x i8> , ptr %dst + ret void +} + define void @buildvector_v8i16_const_splat(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v8i16_const_splat: ; CHECK: # %bb.0: # %entry @@ -113,6 +133,25 @@ entry: ret void } +define void @buildvector_v8i16_const_splat_v2i64(ptr %dst) nounwind { +; LA32-LABEL: buildvector_v8i16_const_splat_v2i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) +; LA32-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI9_0) +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: buildvector_v8i16_const_splat_v2i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ori $a1, $zero, 512 +; LA64-NEXT: vreplgr2vr.d $vr0, $a1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret +entry: + store <8 x i16> , ptr %dst + ret void +} + define void @buildvector_v4i32_const_splat(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v4i32_const_splat: ; CHECK: # %bb.0: # %entry @@ -124,6 +163,25 @@ entry: ret void } +define void @buildvector_v4i32_const_splat_v2i64(ptr %dst) nounwind { +; LA32-LABEL: buildvector_v4i32_const_splat_v2i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI11_0) +; LA32-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI11_0) +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: buildvector_v4i32_const_splat_v2i64: +; LA64: # %bb.0: # 
%entry +; LA64-NEXT: ori $a1, $zero, 512 +; LA64-NEXT: vreplgr2vr.d $vr0, $a1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret +entry: + store <4 x i32> , ptr %dst + ret void +} + define void @buildvector_v2i64_const_splat(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v2i64_const_splat: ; CHECK: # %bb.0: # %entry @@ -150,8 +208,8 @@ entry: define void @buildvector_v2f64_const_splat(ptr %dst) nounwind { ; LA32-LABEL: buildvector_v2f64_const_splat: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI11_0) -; LA32-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI11_0) +; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI14_0) +; LA32-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI14_0) ; LA32-NEXT: vst $vr0, $a0, 0 ; LA32-NEXT: ret ; @@ -169,8 +227,8 @@ entry: define void @buildvector_v16i8_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v16i8_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI12_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI12_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI15_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -181,8 +239,8 @@ entry: define void @buildvector_v8i16_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v8i16_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI13_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI13_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI16_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -193,8 +251,8 @@ entry: define void @buildvector_v4i32_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v4i32_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI14_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI14_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI17_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -205,8 +263,8 @@ entry: define void 
@buildvector_v2i64_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v2i64_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI15_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI18_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI18_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -217,8 +275,8 @@ entry: define void @buildvector_v2f32_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v2f32_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI16_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI19_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI19_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -229,8 +287,8 @@ entry: define void @buildvector_v2f64_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v2f64_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI17_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI20_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI20_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr.ll index ac0eca2fc33ea..438004d2d52db 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr.ll @@ -47,9 +47,7 @@ define <2 x i64> @lsx_vbitclr_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ; LA32-NEXT: vand.v $vr1, $vr1, $vr2 ; LA32-NEXT: vrepli.d $vr2, 1 ; LA32-NEXT: vsll.d $vr1, $vr2, $vr1 -; LA32-NEXT: vrepli.b $vr2, -1 -; LA32-NEXT: vxor.v $vr1, $vr1, $vr2 -; LA32-NEXT: vand.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandn.v $vr0, $vr1, $vr0 ; LA32-NEXT: ret ; ; LA64-LABEL: lsx_vbitclr_d: diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/andn.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/andn.ll new file mode 100644 
index 0000000000000..c1ba98c5dd146 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/andn.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @andn_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: andn_v16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i8>, ptr %a0 + %v1 = load <16 x i8>, ptr %a1 + %v2 = xor <16 x i8> %v0, + %v3 = and <16 x i8> %v2, %v1 + store <16 x i8> %v3, ptr %res + ret void +} + +define void @andn_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: andn_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i16>, ptr %a0 + %v1 = load <8 x i16>, ptr %a1 + %v2 = xor <8 x i16> %v0, + %v3 = and <8 x i16> %v2, %v1 + store <8 x i16> %v3, ptr %res + ret void +} + +define void @andn_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: andn_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i32>, ptr %a0 + %v1 = load <4 x i32>, ptr %a1 + %v2 = xor <4 x i32> %v0, + %v3 = and <4 x i32> %v2, %v1 + store <4 x i32> %v3, ptr %res + ret void +} + +define void @andn_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: andn_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: 
ret +entry: + %v0 = load <2 x i64>, ptr %a0 + %v1 = load <2 x i64>, ptr %a1 + %v2 = xor <2 x i64> %v0, + %v3 = and <2 x i64> %v2, %v1 + store <2 x i64> %v3, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/nor.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/nor.ll new file mode 100644 index 0000000000000..a2c15b28f8827 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/nor.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @nor_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: nor_v16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i8>, ptr %a0 + %v1 = load <16 x i8>, ptr %a1 + %v2 = or <16 x i8> %v0, %v1 + %v3 = xor <16 x i8> %v2, + store <16 x i8> %v3, ptr %res + ret void +} + +define void @nor_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: nor_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i16>, ptr %a0 + %v1 = load <8 x i16>, ptr %a1 + %v2 = or <8 x i16> %v0, %v1 + %v3 = xor <8 x i16> %v2, + store <8 x i16> %v3, ptr %res + ret void +} + +define void @nor_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: nor_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i32>, ptr %a0 + %v1 = load <4 x i32>, ptr %a1 + %v2 = or <4 x i32> %v0, %v1 + %v3 = xor <4 x i32> %v2, + store 
<4 x i32> %v3, ptr %res + ret void +} + +define void @nor_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: nor_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x i64>, ptr %a0 + %v1 = load <2 x i64>, ptr %a1 + %v2 = or <2 x i64> %v0, %v1 + %v3 = xor <2 x i64> %v2, + store <2 x i64> %v3, ptr %res + ret void +} + +define void @nor_u_v16i8(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: nor_u_v16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.b $vr1, 31 +; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i8>, ptr %a0 + %v1 = or <16 x i8> %v0, + %v2 = xor <16 x i8> %v1, + store <16 x i8> %v2, ptr %res + ret void +} + +define void @nor_u_v8i16(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: nor_u_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.h $vr1, 31 +; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i16>, ptr %a0 + %v1 = or <8 x i16> %v0, + %v2 = xor <8 x i16> %v1, + store <8 x i16> %v2, ptr %res + ret void +} + +define void @nor_u_v4i32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: nor_u_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.w $vr1, 31 +; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i32>, ptr %a0 + %v1 = or <4 x i32> %v0, + %v2 = xor <4 x i32> %v1, + store <4 x i32> %v2, ptr %res + ret void +} + +define void @nor_u_v2i64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: nor_u_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.d $vr1, 31 +; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = 
load <2 x i64>, ptr %a0 + %v1 = or <2 x i64> %v0, + %v2 = xor <2 x i64> %v1, + store <2 x i64> %v2, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/orn.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/orn.ll new file mode 100644 index 0000000000000..bb1be34aebcab --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/orn.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @orn_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: orn_v16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i8>, ptr %a0 + %v1 = load <16 x i8>, ptr %a1 + %v2 = xor <16 x i8> %v1, + %v3 = or <16 x i8> %v0, %v2 + store <16 x i8> %v3, ptr %res + ret void +} + +define void @orn_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: orn_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i16>, ptr %a0 + %v1 = load <8 x i16>, ptr %a1 + %v2 = xor <8 x i16> %v1, + %v3 = or <8 x i16> %v0, %v2 + store <8 x i16> %v3, ptr %res + ret void +} + +define void @orn_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: orn_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i32>, ptr %a0 + %v1 = load <4 x i32>, ptr %a1 + %v2 = xor <4 x i32> %v1, + %v3 = or <4 x i32> %v0, %v2 + store <4 x i32> %v3, ptr %res + ret void +} + +define void 
@orn_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { +; CHECK-LABEL: orn_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x i64>, ptr %a0 + %v1 = load <2 x i64>, ptr %a1 + %v2 = xor <2 x i64> %v1, + %v3 = or <2 x i64> %v0, %v2 + store <2 x i64> %v3, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sadd-sat.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sadd-sat.ll new file mode 100644 index 0000000000000..5871b4c497e50 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sadd-sat.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc -mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s + +define <16 x i8> @vsadd_b(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vsadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: vsadd.b $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %ret +} + +define <8 x i16> @vsadd_h(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vsadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: vsadd.h $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %ret +} + +define <4 x i32> @vsadd_w(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vsadd_w: +; CHECK: # %bb.0: +; CHECK-NEXT: vsadd.w $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %ret +} + +define <2 x i64> @vsadd_d(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vsadd_d: +; CHECK: # %bb.0: +; CHECK-NEXT: vsadd.d $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %ret +} + +declare <16 x i8> 
@llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/ssub-sat.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/ssub-sat.ll new file mode 100644 index 0000000000000..4ae52f30bb7cd --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/ssub-sat.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc -mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s + +define <16 x i8> @vssub_b(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: vssub.b $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %ret +} + +define <8 x i16> @vssub_h(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: vssub.h $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %ret +} + +define <4 x i32> @vssub_w(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vssub_w: +; CHECK: # %bb.0: +; CHECK-NEXT: vssub.w $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %ret +} + +define <2 x i64> @vssub_d(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vssub_d: +; CHECK: # %bb.0: +; CHECK-NEXT: vssub.d $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %ret +} + +declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x 
i64>, <2 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uadd-sat.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uadd-sat.ll new file mode 100644 index 0000000000000..faf1383257804 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uadd-sat.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc -mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s + +define <16 x i8> @vuadd_b(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vuadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: vsadd.bu $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %ret +} + +define <8 x i16> @vuadd_h(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vuadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: vsadd.hu $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %ret +} + +define <4 x i32> @vuadd_w(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vuadd_w: +; CHECK: # %bb.0: +; CHECK-NEXT: vsadd.wu $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %ret +} + +define <2 x i64> @vuadd_d(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vuadd_d: +; CHECK: # %bb.0: +; CHECK-NEXT: vsadd.du $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %ret +} + +declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/usub-sat.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/usub-sat.ll new file mode 100644 index 0000000000000..59de967fd288a 
--- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/usub-sat.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc -mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s + +define <16 x i8> @vusub_b(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vusub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: vssub.bu $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %ret +} + +define <8 x i16> @vusub_h(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vusub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: vssub.hu $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %ret +} + +define <4 x i32> @vusub_w(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vusub_w: +; CHECK: # %bb.0: +; CHECK-NEXT: vssub.wu $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %ret +} + +define <2 x i64> @vusub_d(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vusub_d: +; CHECK: # %bb.0: +; CHECK-NEXT: vssub.du $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %ret = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %ret +} + +declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) diff --git a/llvm/test/CodeGen/MIR/AArch64/hasstackframe.mir b/llvm/test/CodeGen/MIR/AArch64/hasstackframe.mir new file mode 100644 index 0000000000000..bf3d8ec478d18 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AArch64/hasstackframe.mir @@ -0,0 +1,41 @@ +# RUN: llc -run-pass=prologepilog -mtriple arm64-apple-ios -o - -simplify-mir \ +# RUN: -verify-machineinstrs %s | FileCheck %s + +# CHECK: 
hasStackFrame: true + +--- | + + define i32 @f(i32 %a, i32 %b) #0 { + %local_array = alloca [10 x i32], align 4 + %temp = alloca i32, align 4 + store i32 %a, ptr %temp, align 4 + %loaded = load i32, ptr %temp, align 4 + %gep = getelementptr inbounds [10 x i32], ptr %local_array, i64 0, i64 5 + store i32 %loaded, ptr %gep, align 4 + %result = add i32 %loaded, %b + %blah = call i32 @foo(i32 noundef %result) + ret i32 %blah + } + + declare i32 @foo(i32 noundef) + +... +--- +name: f +frameInfo: + adjustsStack: true +stack: + - { id: 0, name: local_array, size: 40, alignment: 4, local-offset: -40 } + - { id: 1, name: temp, size: 4, alignment: 4, local-offset: -44 } +body: | + bb.0: + liveins: $w0, $w1 + + STRWui renamable $w0, %stack.1.temp, 0 + STRWui renamable $w0, %stack.0.local_array, 5 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + $w0 = ADDWrr killed renamable $w0, killed renamable $w1 + BL @foo, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + RET_ReallyLR implicit $w0 +... 
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index 929db4c9be1c7..ed8bc9ca700a8 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -540,7 +540,7 @@ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } !0 = !{} diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index f054bea1f2780..68c3d1b2f2972 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -63,4 +63,4 @@ define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index 924216efcc461..55598ec70d953 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -99,7 +99,7 @@ ; Function Attrs: convergent nocallback nofree nounwind willreturn declare void @llvm.amdgcn.end.cf.i64(i64) #2 - attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } + attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #2 = { convergent nocallback nofree nounwind willreturn } attributes #3 = { convergent nocallback nofree nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 39f1ddd0609d8..2326b2dc09b58 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -73,5 +73,5 @@ bb4: ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/Mips/atomic-min-max.ll b/llvm/test/CodeGen/Mips/atomic-min-max.ll index 85bf6d02c7d8f..8320224680ff8 100644 --- a/llvm/test/CodeGen/Mips/atomic-min-max.ll +++ b/llvm/test/CodeGen/Mips/atomic-min-max.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=mips-elf -O0 -mcpu=mips32r6 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPSR6 ; RUN: llc -mtriple=mips-elf -O0 -mcpu=mips32r2 -mattr=+micromips -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MM ; RUN: llc 
-mtriple=mips-elf -O0 -mcpu=mips32r6 -mattr=+micromips -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MMR6 +; RUN: llc -mtriple=mipsel-elf -O0 -mcpu=mips2 %s -o - | FileCheck %s --check-prefix=MIPS2 ; RUN: llc -mtriple=mipsel-elf -O0 -mcpu=mips32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPS32 ; RUN: llc -mtriple=mipsel-elf -O0 -mcpu=mips32r2 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPSEL ; RUN: llc -mtriple=mipsel-elf -O0 -mcpu=mips32r6 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPSELR6 @@ -31,6 +32,33 @@ define i32 @test_max_32(ptr nocapture %ptr, i32 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_max_32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: $BB0_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($4) +; MIPS2-NEXT: slt $3, $2, $5 +; MIPS2-NEXT: move $1, $5 +; MIPS2-NEXT: beqz $3, $BB0_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB0_1 Depth=1 +; MIPS2-NEXT: b $BB0_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB0_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB0_1 Depth=1 +; MIPS2-NEXT: move $1, $2 +; MIPS2-NEXT: $BB0_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB0_1 Depth=1 +; MIPS2-NEXT: sc $1, 0($4) +; MIPS2-NEXT: beqz $1, $BB0_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_max_32: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: sync @@ -251,6 +279,33 @@ define i32 @test_min_32(ptr nocapture %ptr, i32 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_min_32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: $BB1_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($4) +; MIPS2-NEXT: slt $3, $2, $5 +; MIPS2-NEXT: move $1, $2 +; MIPS2-NEXT: beqz $3, $BB1_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; 
MIPS2-NEXT: # in Loop: Header=BB1_1 Depth=1 +; MIPS2-NEXT: b $BB1_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB1_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB1_1 Depth=1 +; MIPS2-NEXT: move $1, $5 +; MIPS2-NEXT: $BB1_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB1_1 Depth=1 +; MIPS2-NEXT: sc $1, 0($4) +; MIPS2-NEXT: beqz $1, $BB1_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_min_32: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: sync @@ -471,6 +526,33 @@ define i32 @test_umax_32(ptr nocapture %ptr, i32 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umax_32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: $BB2_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($4) +; MIPS2-NEXT: sltu $3, $2, $5 +; MIPS2-NEXT: move $1, $5 +; MIPS2-NEXT: beqz $3, $BB2_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB2_1 Depth=1 +; MIPS2-NEXT: b $BB2_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB2_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB2_1 Depth=1 +; MIPS2-NEXT: move $1, $2 +; MIPS2-NEXT: $BB2_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB2_1 Depth=1 +; MIPS2-NEXT: sc $1, 0($4) +; MIPS2-NEXT: beqz $1, $BB2_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umax_32: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: sync @@ -691,6 +773,33 @@ define i32 @test_umin_32(ptr nocapture %ptr, i32 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umin_32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: $BB3_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($4) +; MIPS2-NEXT: sltu $3, $2, $5 +; MIPS2-NEXT: move $1, $2 +; MIPS2-NEXT: beqz $3, $BB3_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: 
Header=BB3_1 Depth=1 +; MIPS2-NEXT: b $BB3_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB3_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB3_1 Depth=1 +; MIPS2-NEXT: move $1, $5 +; MIPS2-NEXT: $BB3_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB3_1 Depth=1 +; MIPS2-NEXT: sc $1, 0($4) +; MIPS2-NEXT: beqz $1, $BB3_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umin_32: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: sync @@ -936,6 +1045,58 @@ define i16 @test_max_16(ptr nocapture %ptr, i16 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_max_16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 65535 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB4_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: sll $4, $4, 16 +; MIPS2-NEXT: sra $4, $4, 16 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: slt $5, $4, $7 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: beqz $5, $BB4_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB4_1 Depth=1 +; MIPS2-NEXT: b $BB4_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB4_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB4_1 Depth=1 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: $BB4_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB4_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB4_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: 
$BB4_6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_max_16: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -1476,6 +1637,58 @@ define i16 @test_min_16(ptr nocapture %ptr, i16 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_min_16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 65535 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB5_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: sll $4, $4, 16 +; MIPS2-NEXT: sra $4, $4, 16 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: slt $5, $4, $7 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: beqz $5, $BB5_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB5_1 Depth=1 +; MIPS2-NEXT: b $BB5_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB5_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB5_1 Depth=1 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: $BB5_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB5_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB5_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: $BB5_6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; 
MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_min_16: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -2015,6 +2228,57 @@ define i16 @test_umax_16(ptr nocapture %ptr, i16 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umax_16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 65535 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB6_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: andi $4, $4, 65535 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: sltu $5, $4, $7 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: beqz $5, $BB6_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB6_1 Depth=1 +; MIPS2-NEXT: b $BB6_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB6_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB6_1 Depth=1 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: $BB6_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB6_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB6_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: $BB6_6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umax_16: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ 
-2553,6 +2817,57 @@ define i16 @test_umin_16(ptr nocapture %ptr, i16 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umin_16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 65535 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB7_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: andi $4, $4, 65535 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: sltu $5, $4, $7 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: beqz $5, $BB7_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB7_1 Depth=1 +; MIPS2-NEXT: b $BB7_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB7_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB7_1 Depth=1 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: $BB7_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB7_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB7_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: $BB7_6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umin_16: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -3092,6 +3407,58 @@ define i8 @test_max_8(ptr nocapture %ptr, i8 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_max_8: +; MIPS2: # %bb.0: # %entry +; 
MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 255 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB8_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: sll $4, $4, 24 +; MIPS2-NEXT: sra $4, $4, 24 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: slt $5, $4, $7 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: beqz $5, $BB8_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB8_1 Depth=1 +; MIPS2-NEXT: b $BB8_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB8_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB8_1 Depth=1 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: $BB8_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB8_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB8_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: $BB8_6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_max_8: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -3631,6 +3998,58 @@ define i8 @test_min_8(ptr nocapture %ptr, i8 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_min_8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; 
MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 255 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB9_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: sll $4, $4, 24 +; MIPS2-NEXT: sra $4, $4, 24 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: slt $5, $4, $7 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: beqz $5, $BB9_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB9_1 Depth=1 +; MIPS2-NEXT: b $BB9_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB9_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB9_1 Depth=1 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: $BB9_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB9_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB9_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: $BB9_6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_min_8: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -4170,6 +4589,57 @@ define i8 @test_umax_8(ptr nocapture %ptr, i8 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umax_8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 255 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor 
$9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB10_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: andi $4, $4, 255 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: sltu $5, $4, $7 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: beqz $5, $BB10_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB10_1 Depth=1 +; MIPS2-NEXT: b $BB10_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB10_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB10_1 Depth=1 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: $BB10_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB10_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB10_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: $BB10_6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umax_8: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -4708,6 +5178,57 @@ define i8 @test_umin_8(ptr nocapture %ptr, i8 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umin_8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 255 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB11_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, 
$10 +; MIPS2-NEXT: andi $4, $4, 255 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: sltu $5, $4, $7 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: beqz $5, $BB11_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB11_1 Depth=1 +; MIPS2-NEXT: b $BB11_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB11_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB11_1 Depth=1 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: $BB11_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB11_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB11_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: $BB11_6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umin_8: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll index 9dd0fbe4474b1..fb55511162a7e 100644 --- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1036,12 +1036,8 @@ define <4 x i32> @fromDiffMemVarDi(ptr nocapture readonly %arr, i32 signext %ele ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r4, 2 ; P9LE-NEXT: add r3, r3, r4 -; P9LE-NEXT: li r4, -12 -; P9LE-NEXT: lxvx v2, r3, r4 -; P9LE-NEXT: addis r3, r2, .LCPI9_0@toc@ha -; P9LE-NEXT: addi r3, r3, .LCPI9_0@toc@l -; P9LE-NEXT: lxv vs0, 0(r3) -; P9LE-NEXT: xxperm v2, v2, vs0 +; P9LE-NEXT: addi r3, r3, -12 +; P9LE-NEXT: lxvw4x v2, 0, r3 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromDiffMemVarDi: @@ -1058,15 +1054,16 @@ define <4 x i32> @fromDiffMemVarDi(ptr nocapture readonly %arr, i32 signext %ele ; ; P8LE-LABEL: 
fromDiffMemVarDi: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: addis r5, r2, .LCPI9_0@toc@ha ; P8LE-NEXT: sldi r4, r4, 2 -; P8LE-NEXT: addi r5, r5, .LCPI9_0@toc@l ; P8LE-NEXT: add r3, r3, r4 -; P8LE-NEXT: lxvd2x vs0, 0, r5 ; P8LE-NEXT: addi r3, r3, -12 -; P8LE-NEXT: lxvd2x v3, 0, r3 +; P8LE-NEXT: lxvd2x vs0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI9_0@toc@l ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: vperm v2, v3, v3, v2 +; P8LE-NEXT: lxvd2x vs0, 0, r3 +; P8LE-NEXT: xxswapd v3, vs0 +; P8LE-NEXT: vperm v2, v2, v2, v3 ; P8LE-NEXT: blr entry: %idxprom = sext i32 %elem to i64 @@ -2524,12 +2521,8 @@ define <4 x i32> @fromDiffMemVarDui(ptr nocapture readonly %arr, i32 signext %el ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r4, 2 ; P9LE-NEXT: add r3, r3, r4 -; P9LE-NEXT: li r4, -12 -; P9LE-NEXT: lxvx v2, r3, r4 -; P9LE-NEXT: addis r3, r2, .LCPI41_0@toc@ha -; P9LE-NEXT: addi r3, r3, .LCPI41_0@toc@l -; P9LE-NEXT: lxv vs0, 0(r3) -; P9LE-NEXT: xxperm v2, v2, vs0 +; P9LE-NEXT: addi r3, r3, -12 +; P9LE-NEXT: lxvw4x v2, 0, r3 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromDiffMemVarDui: @@ -2546,15 +2539,16 @@ define <4 x i32> @fromDiffMemVarDui(ptr nocapture readonly %arr, i32 signext %el ; ; P8LE-LABEL: fromDiffMemVarDui: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: addis r5, r2, .LCPI41_0@toc@ha ; P8LE-NEXT: sldi r4, r4, 2 -; P8LE-NEXT: addi r5, r5, .LCPI41_0@toc@l ; P8LE-NEXT: add r3, r3, r4 -; P8LE-NEXT: lxvd2x vs0, 0, r5 ; P8LE-NEXT: addi r3, r3, -12 -; P8LE-NEXT: lxvd2x v3, 0, r3 +; P8LE-NEXT: lxvd2x vs0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI41_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI41_0@toc@l ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: vperm v2, v3, v3, v2 +; P8LE-NEXT: lxvd2x vs0, 0, r3 +; P8LE-NEXT: xxswapd v3, vs0 +; P8LE-NEXT: vperm v2, v2, v2, v3 ; P8LE-NEXT: blr entry: %idxprom = sext i32 %elem to i64 diff --git a/llvm/test/CodeGen/PowerPC/i64_fp.ll b/llvm/test/CodeGen/PowerPC/i64_fp.ll index b9456150df7b8..3cec87d6653a6 100644 --- 
a/llvm/test/CodeGen/PowerPC/i64_fp.ll +++ b/llvm/test/CodeGen/PowerPC/i64_fp.ll @@ -1,17 +1,17 @@ ; fcfid and fctid should be generated when the 64bit feature is enabled, but not ; otherwise. -; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mattr=+64bit | \ +; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mattr=+64bit-support | \ ; RUN: grep fcfid -; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mattr=+64bit | \ +; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mattr=+64bit-support | \ ; RUN: grep fctidz ; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mcpu=g5 | \ ; RUN: grep fcfid ; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mcpu=g5 | \ ; RUN: grep fctidz -; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mattr=-64bit | \ +; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mattr=-64bit-support | \ ; RUN: not grep fcfid -; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mattr=-64bit | \ +; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mattr=-64bit-support | \ ; RUN: not grep fctidz ; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- -mcpu=g4 | \ ; RUN: not grep fcfid diff --git a/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll b/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll new file mode 100644 index 0000000000000..f5483ad2a7c3f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX32-P8 + +; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX32-P10 + +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpcle-unknown-linux-gnu < %s 
| \ +; RUN: FileCheck %s --check-prefix=CHECK-LINUX32-P8 + +; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names -mtriple=powerpcle-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LINUX32-P10 + +define dso_local signext range(i32 0, 2) i32 @cmpeq16(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b) { +; CHECK-AIX32-P8-LABEL: cmpeq16: +; CHECK-AIX32-P8: # %bb.0: # %entry +; CHECK-AIX32-P8-NEXT: lwz r5, 4(r3) +; CHECK-AIX32-P8-NEXT: lwz r6, 0(r3) +; CHECK-AIX32-P8-NEXT: lwz r7, 4(r4) +; CHECK-AIX32-P8-NEXT: lwz r8, 0(r4) +; CHECK-AIX32-P8-NEXT: xor r6, r6, r8 +; CHECK-AIX32-P8-NEXT: xor r5, r5, r7 +; CHECK-AIX32-P8-NEXT: or. r5, r5, r6 +; CHECK-AIX32-P8-NEXT: bne cr0, L..BB0_2 +; CHECK-AIX32-P8-NEXT: # %bb.1: # %loadbb1 +; CHECK-AIX32-P8-NEXT: lwz r5, 12(r3) +; CHECK-AIX32-P8-NEXT: lwz r3, 8(r3) +; CHECK-AIX32-P8-NEXT: lwz r6, 12(r4) +; CHECK-AIX32-P8-NEXT: lwz r4, 8(r4) +; CHECK-AIX32-P8-NEXT: xor r3, r3, r4 +; CHECK-AIX32-P8-NEXT: xor r4, r5, r6 +; CHECK-AIX32-P8-NEXT: or. r3, r4, r3 +; CHECK-AIX32-P8-NEXT: li r3, 0 +; CHECK-AIX32-P8-NEXT: beq cr0, L..BB0_3 +; CHECK-AIX32-P8-NEXT: L..BB0_2: # %res_block +; CHECK-AIX32-P8-NEXT: li r3, 1 +; CHECK-AIX32-P8-NEXT: L..BB0_3: # %endblock +; CHECK-AIX32-P8-NEXT: cntlzw r3, r3 +; CHECK-AIX32-P8-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-AIX32-P8-NEXT: blr +; +; CHECK-AIX32-P10-LABEL: cmpeq16: +; CHECK-AIX32-P10: # %bb.0: # %entry +; CHECK-AIX32-P10-NEXT: lwz r5, 4(r3) +; CHECK-AIX32-P10-NEXT: lwz r6, 0(r3) +; CHECK-AIX32-P10-NEXT: lwz r7, 4(r4) +; CHECK-AIX32-P10-NEXT: xor r5, r5, r7 +; CHECK-AIX32-P10-NEXT: lwz r8, 0(r4) +; CHECK-AIX32-P10-NEXT: xor r6, r6, r8 +; CHECK-AIX32-P10-NEXT: or. 
r5, r5, r6 +; CHECK-AIX32-P10-NEXT: bne cr0, L..BB0_2 +; CHECK-AIX32-P10-NEXT: # %bb.1: # %loadbb1 +; CHECK-AIX32-P10-NEXT: lwz r5, 12(r3) +; CHECK-AIX32-P10-NEXT: lwz r3, 8(r3) +; CHECK-AIX32-P10-NEXT: lwz r6, 12(r4) +; CHECK-AIX32-P10-NEXT: lwz r4, 8(r4) +; CHECK-AIX32-P10-NEXT: xor r3, r3, r4 +; CHECK-AIX32-P10-NEXT: xor r4, r5, r6 +; CHECK-AIX32-P10-NEXT: or. r3, r4, r3 +; CHECK-AIX32-P10-NEXT: li r3, 0 +; CHECK-AIX32-P10-NEXT: beq cr0, L..BB0_3 +; CHECK-AIX32-P10-NEXT: L..BB0_2: # %res_block +; CHECK-AIX32-P10-NEXT: li r3, 1 +; CHECK-AIX32-P10-NEXT: L..BB0_3: # %endblock +; CHECK-AIX32-P10-NEXT: cntlzw r3, r3 +; CHECK-AIX32-P10-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-AIX32-P10-NEXT: blr +; +; CHECK-LINUX32-P8-LABEL: cmpeq16: +; CHECK-LINUX32-P8: # %bb.0: # %entry +; CHECK-LINUX32-P8-NEXT: lwz r5, 0(r3) +; CHECK-LINUX32-P8-NEXT: lwz r6, 4(r3) +; CHECK-LINUX32-P8-NEXT: lwz r7, 0(r4) +; CHECK-LINUX32-P8-NEXT: lwz r8, 4(r4) +; CHECK-LINUX32-P8-NEXT: xor r6, r6, r8 +; CHECK-LINUX32-P8-NEXT: xor r5, r5, r7 +; CHECK-LINUX32-P8-NEXT: or. r5, r5, r6 +; CHECK-LINUX32-P8-NEXT: bne cr0, .LBB0_2 +; CHECK-LINUX32-P8-NEXT: # %bb.1: # %loadbb1 +; CHECK-LINUX32-P8-NEXT: lwz r5, 8(r3) +; CHECK-LINUX32-P8-NEXT: lwz r3, 12(r3) +; CHECK-LINUX32-P8-NEXT: lwz r6, 8(r4) +; CHECK-LINUX32-P8-NEXT: lwz r4, 12(r4) +; CHECK-LINUX32-P8-NEXT: xor r3, r3, r4 +; CHECK-LINUX32-P8-NEXT: xor r4, r5, r6 +; CHECK-LINUX32-P8-NEXT: or. 
r3, r4, r3 +; CHECK-LINUX32-P8-NEXT: li r3, 0 +; CHECK-LINUX32-P8-NEXT: beq cr0, .LBB0_3 +; CHECK-LINUX32-P8-NEXT: .LBB0_2: # %res_block +; CHECK-LINUX32-P8-NEXT: li r3, 1 +; CHECK-LINUX32-P8-NEXT: .LBB0_3: # %endblock +; CHECK-LINUX32-P8-NEXT: cntlzw r3, r3 +; CHECK-LINUX32-P8-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-LINUX32-P8-NEXT: blr +; +; CHECK-LINUX32-P10-LABEL: cmpeq16: +; CHECK-LINUX32-P10: # %bb.0: # %entry +; CHECK-LINUX32-P10-NEXT: lwz r5, 0(r3) +; CHECK-LINUX32-P10-NEXT: lwz r6, 4(r3) +; CHECK-LINUX32-P10-NEXT: lwz r7, 0(r4) +; CHECK-LINUX32-P10-NEXT: xor r5, r5, r7 +; CHECK-LINUX32-P10-NEXT: lwz r8, 4(r4) +; CHECK-LINUX32-P10-NEXT: xor r6, r6, r8 +; CHECK-LINUX32-P10-NEXT: or. r5, r5, r6 +; CHECK-LINUX32-P10-NEXT: bne cr0, .LBB0_2 +; CHECK-LINUX32-P10-NEXT: # %bb.1: # %loadbb1 +; CHECK-LINUX32-P10-NEXT: lwz r5, 8(r3) +; CHECK-LINUX32-P10-NEXT: lwz r3, 12(r3) +; CHECK-LINUX32-P10-NEXT: lwz r6, 8(r4) +; CHECK-LINUX32-P10-NEXT: lwz r4, 12(r4) +; CHECK-LINUX32-P10-NEXT: xor r3, r3, r4 +; CHECK-LINUX32-P10-NEXT: xor r4, r5, r6 +; CHECK-LINUX32-P10-NEXT: or. 
r3, r4, r3 +; CHECK-LINUX32-P10-NEXT: li r3, 0 +; CHECK-LINUX32-P10-NEXT: beq cr0, .LBB0_3 +; CHECK-LINUX32-P10-NEXT: .LBB0_2: # %res_block +; CHECK-LINUX32-P10-NEXT: li r3, 1 +; CHECK-LINUX32-P10-NEXT: .LBB0_3: # %endblock +; CHECK-LINUX32-P10-NEXT: cntlzw r3, r3 +; CHECK-LINUX32-P10-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-LINUX32-P10-NEXT: blr +entry: + %bcmp = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(16) %a, ptr noundef nonnull dereferenceable(16) %b, i32 16) + %cmp = icmp eq i32 %bcmp, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +declare signext i32 @bcmp(ptr captures(none), ptr captures(none), i32) + diff --git a/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll b/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll new file mode 100644 index 0000000000000..216b7638642d4 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX64-32-P8 + +; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX64-32-P10 + +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LINUX64-P8 + +; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LINUX64-P10 + +define dso_local signext range(i32 0, 2) i32 @cmpeq16(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b) { +; CHECK-AIX64-32-P8-LABEL: cmpeq16: +; CHECK-AIX64-32-P8: # %bb.0: # %entry +; CHECK-AIX64-32-P8-NEXT: ld r5, 0(r3) +; CHECK-AIX64-32-P8-NEXT: ld r6, 0(r4) +; CHECK-AIX64-32-P8-NEXT: cmpld r5, r6 +; CHECK-AIX64-32-P8-NEXT: bne cr0, L..BB0_2 +; CHECK-AIX64-32-P8-NEXT: # %bb.1: # %loadbb1 +; 
CHECK-AIX64-32-P8-NEXT: ld r5, 8(r3) +; CHECK-AIX64-32-P8-NEXT: ld r4, 8(r4) +; CHECK-AIX64-32-P8-NEXT: li r3, 0 +; CHECK-AIX64-32-P8-NEXT: cmpld r5, r4 +; CHECK-AIX64-32-P8-NEXT: beq cr0, L..BB0_3 +; CHECK-AIX64-32-P8-NEXT: L..BB0_2: # %res_block +; CHECK-AIX64-32-P8-NEXT: li r3, 1 +; CHECK-AIX64-32-P8-NEXT: L..BB0_3: # %endblock +; CHECK-AIX64-32-P8-NEXT: cntlzw r3, r3 +; CHECK-AIX64-32-P8-NEXT: srwi r3, r3, 5 +; CHECK-AIX64-32-P8-NEXT: blr +; +; CHECK-AIX64-32-P10-LABEL: cmpeq16: +; CHECK-AIX64-32-P10: # %bb.0: # %entry +; CHECK-AIX64-32-P10-NEXT: ld r5, 0(r3) +; CHECK-AIX64-32-P10-NEXT: ld r6, 0(r4) +; CHECK-AIX64-32-P10-NEXT: cmpld r5, r6 +; CHECK-AIX64-32-P10-NEXT: bne cr0, L..BB0_2 +; CHECK-AIX64-32-P10-NEXT: # %bb.1: # %loadbb1 +; CHECK-AIX64-32-P10-NEXT: ld r5, 8(r3) +; CHECK-AIX64-32-P10-NEXT: ld r4, 8(r4) +; CHECK-AIX64-32-P10-NEXT: li r3, 0 +; CHECK-AIX64-32-P10-NEXT: cmpld r5, r4 +; CHECK-AIX64-32-P10-NEXT: beq cr0, L..BB0_3 +; CHECK-AIX64-32-P10-NEXT: L..BB0_2: # %res_block +; CHECK-AIX64-32-P10-NEXT: li r3, 1 +; CHECK-AIX64-32-P10-NEXT: L..BB0_3: # %endblock +; CHECK-AIX64-32-P10-NEXT: cntlzw r3, r3 +; CHECK-AIX64-32-P10-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-AIX64-32-P10-NEXT: blr +; +; CHECK-LINUX64-P8-LABEL: cmpeq16: +; CHECK-LINUX64-P8: # %bb.0: # %entry +; CHECK-LINUX64-P8-NEXT: ld r5, 0(r3) +; CHECK-LINUX64-P8-NEXT: ld r6, 0(r4) +; CHECK-LINUX64-P8-NEXT: cmpld r5, r6 +; CHECK-LINUX64-P8-NEXT: bne cr0, .LBB0_2 +; CHECK-LINUX64-P8-NEXT: # %bb.1: # %loadbb1 +; CHECK-LINUX64-P8-NEXT: ld r5, 8(r3) +; CHECK-LINUX64-P8-NEXT: ld r4, 8(r4) +; CHECK-LINUX64-P8-NEXT: li r3, 0 +; CHECK-LINUX64-P8-NEXT: cmpld r5, r4 +; CHECK-LINUX64-P8-NEXT: beq cr0, .LBB0_3 +; CHECK-LINUX64-P8-NEXT: .LBB0_2: # %res_block +; CHECK-LINUX64-P8-NEXT: li r3, 1 +; CHECK-LINUX64-P8-NEXT: .LBB0_3: # %endblock +; CHECK-LINUX64-P8-NEXT: cntlzw r3, r3 +; CHECK-LINUX64-P8-NEXT: srwi r3, r3, 5 +; CHECK-LINUX64-P8-NEXT: blr +; +; CHECK-LINUX64-P10-LABEL: cmpeq16: +; CHECK-LINUX64-P10: 
# %bb.0: # %entry +; CHECK-LINUX64-P10-NEXT: ld r5, 0(r3) +; CHECK-LINUX64-P10-NEXT: ld r6, 0(r4) +; CHECK-LINUX64-P10-NEXT: cmpld r5, r6 +; CHECK-LINUX64-P10-NEXT: bne cr0, .LBB0_2 +; CHECK-LINUX64-P10-NEXT: # %bb.1: # %loadbb1 +; CHECK-LINUX64-P10-NEXT: ld r5, 8(r3) +; CHECK-LINUX64-P10-NEXT: ld r4, 8(r4) +; CHECK-LINUX64-P10-NEXT: li r3, 0 +; CHECK-LINUX64-P10-NEXT: cmpld r5, r4 +; CHECK-LINUX64-P10-NEXT: beq cr0, .LBB0_3 +; CHECK-LINUX64-P10-NEXT: .LBB0_2: # %res_block +; CHECK-LINUX64-P10-NEXT: li r3, 1 +; CHECK-LINUX64-P10-NEXT: .LBB0_3: # %endblock +; CHECK-LINUX64-P10-NEXT: cntlzw r3, r3 +; CHECK-LINUX64-P10-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-LINUX64-P10-NEXT: blr +entry: + %bcmp = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(16) %a, ptr noundef nonnull dereferenceable(16) %b, i64 16) + %cmp = icmp eq i32 %bcmp, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +declare signext i32 @bcmp(ptr captures(none), ptr captures(none), i64) + diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll index 9d5e5b2210c07..8fbc9d785796d 100644 --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -394,18 +394,18 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) { ; CHECK-NEXT: xxsetaccz acc2 ; CHECK-NEXT: xxsetaccz acc1 ; CHECK-NEXT: addi r6, r6, 6 +; CHECK-NEXT: add r8, r5, r7 ; CHECK-NEXT: lxvx vs0, r5, r7 -; CHECK-NEXT: add r7, r5, r7 -; CHECK-NEXT: lxv vs1, 16(r7) +; CHECK-NEXT: rldic r7, r4, 6, 26 +; CHECK-NEXT: addi r4, r4, 3 +; CHECK-NEXT: lxv vs1, 16(r8) ; CHECK-NEXT: xvf32gerpp acc2, vs0, vs1 -; CHECK-NEXT: lxv vs0, 32(r7) -; CHECK-NEXT: lxv vs1, 48(r7) +; CHECK-NEXT: lxv vs0, 32(r8) +; CHECK-NEXT: lxv vs1, 48(r8) ; CHECK-NEXT: xvf32gerpn acc1, vs0, vs1 -; CHECK-NEXT: lxv vs12, 64(r7) -; CHECK-NEXT: lxv vs13, 80(r7) +; CHECK-NEXT: lxv vs12, 64(r8) +; CHECK-NEXT: lxv vs13, 80(r8) ; CHECK-NEXT: xxsetaccz acc0 -; CHECK-NEXT: rldic r7, r4, 6, 26 -; 
CHECK-NEXT: addi r4, r4, 3 ; CHECK-NEXT: add r8, r3, r7 ; CHECK-NEXT: xxmfacc acc2 ; CHECK-NEXT: xvf32gernp acc0, vs12, vs13 @@ -443,18 +443,18 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) { ; CHECK-BE-NEXT: xxsetaccz acc2 ; CHECK-BE-NEXT: xxsetaccz acc1 ; CHECK-BE-NEXT: addi r6, r6, 6 +; CHECK-BE-NEXT: add r8, r5, r7 ; CHECK-BE-NEXT: lxvx vs0, r5, r7 -; CHECK-BE-NEXT: add r7, r5, r7 -; CHECK-BE-NEXT: lxv vs1, 16(r7) +; CHECK-BE-NEXT: rldic r7, r4, 6, 26 +; CHECK-BE-NEXT: addi r4, r4, 3 +; CHECK-BE-NEXT: lxv vs1, 16(r8) ; CHECK-BE-NEXT: xvf32gerpp acc2, vs0, vs1 -; CHECK-BE-NEXT: lxv vs0, 32(r7) -; CHECK-BE-NEXT: lxv vs1, 48(r7) +; CHECK-BE-NEXT: lxv vs0, 32(r8) +; CHECK-BE-NEXT: lxv vs1, 48(r8) ; CHECK-BE-NEXT: xvf32gerpn acc1, vs0, vs1 -; CHECK-BE-NEXT: lxv vs12, 64(r7) -; CHECK-BE-NEXT: lxv vs13, 80(r7) +; CHECK-BE-NEXT: lxv vs12, 64(r8) +; CHECK-BE-NEXT: lxv vs13, 80(r8) ; CHECK-BE-NEXT: xxsetaccz acc0 -; CHECK-BE-NEXT: rldic r7, r4, 6, 26 -; CHECK-BE-NEXT: addi r4, r4, 3 ; CHECK-BE-NEXT: add r8, r3, r7 ; CHECK-BE-NEXT: xxmfacc acc2 ; CHECK-BE-NEXT: xvf32gernp acc0, vs12, vs13 diff --git a/llvm/test/CodeGen/PowerPC/vsx-ldst-with-length.ll b/llvm/test/CodeGen/PowerPC/vsx-ldst-with-length.ll new file mode 100644 index 0000000000000..e7bc8fbca3202 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vsx-ldst-with-length.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr -mcpu=future < %s | \ +; RUN: FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr -mcpu=future < %s | \ +; RUN: FileCheck %s --check-prefix=AIX + +; Test for load/store to/from v4i32. 
+ +define <4 x i32> @testLXVRL(ptr %a, i64 %b) { +; CHECK-LABEL: testLXVRL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvrl v2, r3, r4 +; CHECK-NEXT: blr +; +; AIX-LABEL: testLXVRL: +; AIX: # %bb.0: # %entry +; AIX-NEXT: lxvrl v2, r3, r4 +; AIX-NEXT: blr +entry: + %0 = tail call <4 x i32> @llvm.ppc.vsx.lxvrl(ptr %a, i64 %b) + ret <4 x i32> %0 +} +declare <4 x i32> @llvm.ppc.vsx.lxvrl(ptr, i64) + +define <4 x i32> @testLXVRLL(ptr %a, i64 %b) { +; CHECK-LABEL: testLXVRLL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvrll v2, r3, r4 +; CHECK-NEXT: blr +; +; AIX-LABEL: testLXVRLL: +; AIX: # %bb.0: # %entry +; AIX-NEXT: lxvrll v2, r3, r4 +; AIX-NEXT: blr +entry: + %0 = tail call <4 x i32> @llvm.ppc.vsx.lxvrll(ptr %a, i64 %b) + ret <4 x i32> %0 +} +declare <4 x i32> @llvm.ppc.vsx.lxvrll(ptr, i64) + +define void @testSTXVRL(<4 x i32> %a, ptr %b, i64 %c) { +; CHECK-LABEL: testSTXVRL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stxvrl v2, r5, r6 +; CHECK-NEXT: blr +; +; AIX-LABEL: testSTXVRL: +; AIX: # %bb.0: # %entry +; AIX-NEXT: stxvrl v2, r3, r4 +; AIX-NEXT: blr +entry: + tail call void @llvm.ppc.vsx.stxvrl(<4 x i32> %a, ptr %b, i64 %c) + ret void +} +declare void @llvm.ppc.vsx.stxvrl(<4 x i32>, ptr, i64) + +define void @testSTXVRLL(<4 x i32> %a, ptr %b, i64 %c) { +; CHECK-LABEL: testSTXVRLL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stxvrll v2, r5, r6 +; CHECK-NEXT: blr +; +; AIX-LABEL: testSTXVRLL: +; AIX: # %bb.0: # %entry +; AIX-NEXT: stxvrll v2, r3, r4 +; AIX-NEXT: blr +entry: + tail call void @llvm.ppc.vsx.stxvrll(<4 x i32> %a, ptr %b, i64 %c) + ret void +} +declare void @llvm.ppc.vsx.stxvrll(<4 x i32>, ptr, i64) + +; Test for load/store vectore pair. 
+ +define <256 x i1> @testLXVPRL(ptr %vpp, i64 %b) { +; CHECK-LABEL: testLXVPRL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvprl vsp34, r4, r5 +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: blr +; +; AIX-LABEL: testLXVPRL: +; AIX: # %bb.0: # %entry +; AIX-NEXT: lxvprl vsp34, r4, r5 +; AIX-NEXT: stxv v3, 16(r3) +; AIX-NEXT: stxv v2, 0(r3) +; AIX-NEXT: blr +entry: + %0 = tail call <256 x i1> @llvm.ppc.vsx.lxvprl(ptr %vpp, i64 %b) + ret <256 x i1> %0 +} +declare <256 x i1> @llvm.ppc.vsx.lxvprl(ptr, i64) + +define <256 x i1> @testLXVPRLL(ptr %vpp, i64 %b) { +; CHECK-LABEL: testLXVPRLL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvprll vsp34, r4, r5 +; CHECK-NEXT: stxv v2, 16(r3) +; CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: blr +; +; AIX-LABEL: testLXVPRLL: +; AIX: # %bb.0: # %entry +; AIX-NEXT: lxvprll vsp34, r4, r5 +; AIX-NEXT: stxv v3, 16(r3) +; AIX-NEXT: stxv v2, 0(r3) +; AIX-NEXT: blr +entry: + %0 = tail call <256 x i1> @llvm.ppc.vsx.lxvprll(ptr %vpp, i64 %b) + ret <256 x i1> %0 +} +declare <256 x i1> @llvm.ppc.vsx.lxvprll(ptr, i64) + +define void @testSTXVPRL(ptr %v, ptr %vp, i64 %len) { +; CHECK-LABEL: testSTXVPRL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv v2, 16(r3) +; CHECK-NEXT: lxv v3, 0(r3) +; CHECK-NEXT: stxvprl vsp34, r4, r5 +; CHECK-NEXT: blr +; +; AIX-LABEL: testSTXVPRL: +; AIX: # %bb.0: # %entry +; AIX-NEXT: lxv v2, 0(r3) +; AIX-NEXT: lxv v3, 16(r3) +; AIX-NEXT: stxvprl vsp34, r4, r5 +; AIX-NEXT: blr +entry: + %0 = load <256 x i1>, ptr %v, align 32 + tail call void @llvm.ppc.vsx.stxvprl(<256 x i1> %0, ptr %vp, i64 %len) + ret void +} +declare void @llvm.ppc.vsx.stxvprl(<256 x i1>, ptr, i64) + +define void @testSTXVPRLL(ptr %v, ptr %vp, i64 %len) { +; CHECK-LABEL: testSTXVPRLL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv v2, 16(r3) +; CHECK-NEXT: lxv v3, 0(r3) +; CHECK-NEXT: stxvprll vsp34, r4, r5 +; CHECK-NEXT: blr +; +; AIX-LABEL: testSTXVPRLL: +; AIX: # %bb.0: # %entry +; AIX-NEXT: lxv v2, 0(r3) +; AIX-NEXT: lxv 
v3, 16(r3) +; AIX-NEXT: stxvprll vsp34, r4, r5 +; AIX-NEXT: blr +entry: + %0 = load <256 x i1>, ptr %v, align 32 + tail call void @llvm.ppc.vsx.stxvprll(<256 x i1> %0, ptr %vp, i64 %len) + ret void +} +declare void @llvm.ppc.vsx.stxvprll(<256 x i1>, ptr, i64) diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-eqv.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-eqv.ll new file mode 100644 index 0000000000000..7fa576f599dc4 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-eqv.ll @@ -0,0 +1,327 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test file to verify the emission of Vector Evaluate instructions when ternary operators are used. + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function to test ternary(A, or(B, C), eqv(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_or_BC_eqv_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_or_BC_eqv_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %xor = xor <4 x i32> %B, %C + %eqv = xor <4 x i32> %xor, ; Vector eqv operation + %res = select <4 x i1> %A, <4 x i32> %or, <4 x i32> %eqv + ret <4 x i32> %res +} + +; Function to test ternary(A, or(B, C), eqv(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_or_BC_eqv_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) 
{ +; CHECK-LABEL: ternary_A_or_BC_eqv_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %xor = xor <2 x i64> %B, %C + %eqv = xor <2 x i64> %xor, ; Vector eqv operation + %res = select <2 x i1> %A, <2 x i64> %or, <2 x i64> %eqv + ret <2 x i64> %res +} + +; Function to test ternary(A, or(B, C), eqv(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_or_BC_eqv_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_or_BC_eqv_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <16 x i8> %B, %C + %xor = xor <16 x i8> %B, %C + %eqv = xor <16 x i8> %xor, ; Vector eqv operation + %res = select <16 x i1> %A, <16 x i8> %or, <16 x i8> %eqv + ret <16 x i8> %res +} + +; Function to test ternary(A, or(B, C), eqv(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_or_BC_eqv_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_or_BC_eqv_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <8 x i16> %B, %C + %xor = xor <8 x i16> %B, %C + %eqv = xor <8 x i16> %xor, ; Vector eqv operation + %res = select <8 x i1> %A, <8 x i16> %or, <8 x i16> %eqv + ret <8 x i16> %res +} + +; Function to test ternary(A, nor(B, C), eqv(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_nor_BC_eqv_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { 
+; CHECK-LABEL: ternary_A_nor_BC_eqv_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %nor = xor <4 x i32> %or, ; Vector NOR operation + %xor = xor <4 x i32> %B, %C + %eqv = xor <4 x i32> %xor, ; Vector eqv operation + %res = select <4 x i1> %A, <4 x i32> %nor, <4 x i32> %eqv + ret <4 x i32> %res +} + +; Function to test ternary(A, nor(B, C), eqv(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_nor_BC_eqv_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_nor_BC_eqv_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %nor = xor <2 x i64> %or, ; Vector NOR operation + %xor = xor <2 x i64> %B, %C + %eqv = xor <2 x i64> %xor, ; Vector eqv operation + %res = select <2 x i1> %A, <2 x i64> %nor, <2 x i64> %eqv + ret <2 x i64> %res +} + +; Function to test ternary(A, nor(B, C), eqv(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_nor_BC_eqv_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_nor_BC_eqv_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <16 x i8> %B, %C + %nor = xor <16 x i8> %or, ; Vector NOR operation + %xor = xor <16 x i8> %B, %C + %eqv = xor <16 x i8> %xor, ; Vector eqv operation + %res = select <16 x i1> %A, <16 x i8> %nor, <16 x i8> %eqv + ret <16 x i8> %res +} 
+ +; Function to test ternary(A, nor(B, C), eqv(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_nor_BC_eqv_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_nor_BC_eqv_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <8 x i16> %B, %C + %nor = xor <8 x i16> %or, ; Vector NOR operation + %xor = xor <8 x i16> %B, %C + %eqv = xor <8 x i16> %xor, ; Vector eqv operation + %res = select <8 x i1> %A, <8 x i16> %nor, <8 x i16> %eqv + ret <8 x i16> %res +} + +; Function to test ternary(A, not(C), eqv(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_not_C_eqv_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_not_C_eqv_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxleqv vs1, v4, v3 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <4 x i32> %C, ; Vector not operation + %xor = xor <4 x i32> %B, %C + %eqv = xor <4 x i32> %xor, ; Vector eqv operation + %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %eqv + ret <4 x i32> %res +} + +; Function to test ternary(A, not(C), eqv(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_not_C_eqv_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_not_C_eqv_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxleqv vs1, v4, v3 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <2 x i64> %C, ; Vector not operation + %xor = xor <2 x i64> %B, %C + %eqv = xor <2 x i64> %xor, ; Vector 
eqv operation + %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %eqv + ret <2 x i64> %res +} + +; Function to test ternary(A, not(C), eqv(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_not_C_eqv_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_not_C_eqv_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxleqv vs1, v4, v3 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <16 x i8> %C, ; Vector not operation + %xor = xor <16 x i8> %B, %C + %eqv = xor <16 x i8> %xor, ; Vector eqv operation + %res = select <16 x i1> %A, <16 x i8> %not, <16 x i8> %eqv + ret <16 x i8> %res +} + +; Function to test ternary(A, not(C), eqv(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_not_C_eqv_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_not_C_eqv_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxleqv vs1, v4, v3 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <8 x i16> %C, ; Vector not operation + %xor = xor <8 x i16> %B, %C + %eqv = xor <8 x i16> %xor, ; Vector eqv operation + %res = select <8 x i1> %A, <8 x i16> %not, <8 x i16> %eqv + ret <8 x i16> %res +} + +; Function to test ternary(A, nand(B, C), eqv(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_nand_BC_eqv_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_nand_BC_eqv_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + 
%xor = xor <4 x i32> %B, %C + %eqv = xor <4 x i32> %xor, ; Vector eqv operation + %res = select <4 x i1> %A, <4 x i32> %nand, <4 x i32> %eqv + ret <4 x i32> %res +} + +; Function to test ternary(A, nand(B, C), eqv(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_nand_BC_eqv_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_nand_BC_eqv_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %xor = xor <2 x i64> %B, %C + %eqv = xor <2 x i64> %xor, ; Vector eqv operation + %res = select <2 x i1> %A, <2 x i64> %nand, <2 x i64> %eqv + ret <2 x i64> %res +} + +; Function to test ternary(A, nand(B, C), eqv(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_nand_BC_eqv_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_nand_BC_eqv_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; Vector nand operation + %xor = xor <16 x i8> %B, %C + %eqv = xor <16 x i8> %xor, ; Vector eqv operation + %res = select <16 x i1> %A, <16 x i8> %nand, <16 x i8> %eqv + ret <16 x i8> %res +} + +; Function to test ternary(A, nand(B, C), eqv(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_nand_BC_eqv_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_nand_BC_eqv_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxleqv vs1, v3, v4 +; CHECK-NEXT: vslh 
v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %xor = xor <8 x i16> %B, %C + %eqv = xor <8 x i16> %xor, ; Vector eqv operation + %res = select <8 x i1> %A, <8 x i16> %nand, <8 x i16> %eqv + ret <8 x i16> %res +} diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nand.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nand.ll new file mode 100644 index 0000000000000..7a6733d3b5510 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nand.ll @@ -0,0 +1,384 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test file to verify the emission of Vector Evaluate instructions when ternary operators are used. + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function to test ternary(A, B, nand(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_B_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_B_nand_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + %res = select <4 x i1> %A, <4 x i32> %B, <4 x i32> %nand + ret <4 x i32> %res +} + +; Function to test ternary(A, B, nand(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_B_nand_BC_2x64(<2 x i1> %A, <2 x 
i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_B_nand_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %res = select <2 x i1> %A, <2 x i64> %B, <2 x i64> %nand + ret <2 x i64> %res +} + +; Function to test ternary(A, B, nand(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_B_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_B_nand_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; Vector nand operation + %res = select <16 x i1> %A, <16 x i8> %B, <16 x i8> %nand + ret <16 x i8> %res +} + +; Function to test ternary(A, B, nand(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_B_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_B_nand_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %res = select <8 x i1> %A, <8 x i16> %B, <8 x i16> %nand + ret <8 x i16> %res +} + +; Function to test ternary(A, C, nand(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_C_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_C_nand_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, 
v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + %res = select <4 x i1> %A, <4 x i32> %C, <4 x i32> %nand + ret <4 x i32> %res +} + +; Function to test ternary(A, C, nand(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_C_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_C_nand_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %res = select <2 x i1> %A, <2 x i64> %C, <2 x i64> %nand + ret <2 x i64> %res +} + +; Function to test ternary(A, C, nand(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_C_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_C_nand_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; Vector nand operation + %res = select <16 x i1> %A, <16 x i8> %C, <16 x i8> %nand + ret <16 x i8> %res +} + +; Function to test ternary(A, C, nand(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_C_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_C_nand_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %res = select <8 x i1> %A, <8 x i16> %C, <8 x i16> %nand + 
ret <8 x i16> %res +} + +; Function to test ternary(A, xor(B, C), nand(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_xor_BC_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_xor_BC_nand_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <4 x i32> %B, %C + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + %res = select <4 x i1> %A, <4 x i32> %xor, <4 x i32> %nand + ret <4 x i32> %res +} + +; Function to test ternary(A, xor(B, C), nand(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_xor_BC_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_xor_BC_nand_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <2 x i64> %B, %C + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %res = select <2 x i1> %A, <2 x i64> %xor, <2 x i64> %nand + ret <2 x i64> %res +} + +; Function to test ternary(A, xor(B, C), nand(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_xor_BC_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_xor_BC_nand_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <16 x i8> %B, %C + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; Vector nand operation + %res = select <16 x i1> %A, 
<16 x i8> %xor, <16 x i8> %nand + ret <16 x i8> %res +} + +; Function to test ternary(A, xor(B, C), nand(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_xor_BC_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_xor_BC_nand_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <8 x i16> %B, %C + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %res = select <8 x i1> %A, <8 x i16> %xor, <8 x i16> %nand + ret <8 x i16> %res +} + +; Function to test ternary(A, or(B, C), nand(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_or_BC_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_or_BC_nand_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + %res = select <4 x i1> %A, <4 x i32> %or, <4 x i32> %nand + ret <4 x i32> %res +} + +; Function to test ternary(A, or(B, C), nand(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_or_BC_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_or_BC_nand_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + 
%res = select <2 x i1> %A, <2 x i64> %or, <2 x i64> %nand + ret <2 x i64> %res +} + +; Function to test ternary(A, or(B, C), nand(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_or_BC_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_or_BC_nand_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <16 x i8> %B, %C + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; Vector nand operation + %res = select <16 x i1> %A, <16 x i8> %or, <16 x i8> %nand + ret <16 x i8> %res +} + +; Function to test ternary(A, or(B, C), nand(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_or_BC_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_or_BC_nand_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <8 x i16> %B, %C + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %res = select <8 x i1> %A, <8 x i16> %or, <8 x i16> %nand + ret <8 x i16> %res +} + +; Function to test ternary(A, eqv(B, C), nand(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_eqv_BC_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_eqv_BC_nand_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxleqv vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <4 x i32> %B, %C + %eqv = xor <4 x i32> %xor, ; Vector eqv operation + %and = and <4 x i32> %B, %C + %nand = xor <4 x 
i32> %and, ; Vector nand operation + %res = select <4 x i1> %A, <4 x i32> %eqv, <4 x i32> %nand + ret <4 x i32> %res +} + +; Function to test ternary(A, eqv(B, C), nand(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_eqv_BC_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_eqv_BC_nand_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxleqv vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <2 x i64> %B, %C + %eqv = xor <2 x i64> %xor, ; Vector eqv operation + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %res = select <2 x i1> %A, <2 x i64> %eqv, <2 x i64> %nand + ret <2 x i64> %res +} + +; Function to test ternary(A, eqv(B, C), nand(B, C)) for <16 x i8> +define <16 x i8> @ternary_A_eqv_BC_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_eqv_BC_nand_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxleqv vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <16 x i8> %B, %C + %eqv = xor <16 x i8> %xor, ; Vector eqv operation + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; Vector nand operation + %res = select <16 x i1> %A, <16 x i8> %eqv, <16 x i8> %nand + ret <16 x i8> %res +} + +; Function to test ternary(A, eqv(B, C), nand(B, C)) for <8 x i16> +define <8 x i16> @ternary_A_eqv_BC_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_eqv_BC_nand_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxleqv vs0, v3, v4 +; CHECK-NEXT: xxlnand vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; 
CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <8 x i16> %B, %C + %eqv = xor <8 x i16> %xor, ; Vector eqv operation + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %res = select <8 x i1> %A, <8 x i16> %eqv, <8 x i16> %nand + ret <8 x i16> %res +} diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nor.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nor.ll new file mode 100644 index 0000000000000..d635952e5d8f2 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nor.ll @@ -0,0 +1,538 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test file to verify the emission of Vector selection instructions when ternary operators are used. + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function to test ternary(A, and(B, C), nor(B,C)) for <4 x i32> +define <4 x i32> @ternary_A_and_BC_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_and_BC_nor_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %or = or <4 x i32> %B, %C + %nor = xor <4 x i32> %or, ; Vector NOR operation + %res = select <4 x i1> %A, <4 x i32> %and, <4 x i32> %nor + ret <4 x i32> %res +} + +; Function to test ternary(A, and(B, C), nor(B,C)) for <2 x i64> +define <2 x i64> 
@ternary_A_and_BC_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_and_BC_nor_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %or = or <2 x i64> %B, %C + %nor = xor <2 x i64> %or, ; Vector NOR operation + %res = select <2 x i1> %A, <2 x i64> %and, <2 x i64> %nor + ret <2 x i64> %res +} + +; Function to test ternary(A, and(B, C), nor(B,C)) for <16 x i8> +define <16 x i8> @ternary_A_and_BC_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_and_BC_nor_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %or = or <16 x i8> %B, %C + %nor = xor <16 x i8> %or, ; Vector NOR operation + %res = select <16 x i1> %A, <16 x i8> %and, <16 x i8> %nor + ret <16 x i8> %res +} + +; Function to test ternary(A, and(B, C), nor(B,C)) for <8 x i16> +define <8 x i16> @ternary_A_and_BC_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_and_BC_nor_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %or = or <8 x i16> %B, %C + %nor = xor <8 x i16> %or, ; Vector NOR operation + %res = select <8 x i1> %A, <8 x i16> %and, <8 x i16> %nor + ret <8 x i16> %res +} + +; Function to test ternary(A, B, nor(B,C)) for <4 x i32> +define <4 x i32> 
@ternary_A_B_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_B_nor_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %nor = xor <4 x i32> %or, ; Vector NOR operation + %res = select <4 x i1> %A, <4 x i32> %B, <4 x i32> %nor + ret <4 x i32> %res +} + +; Function to test ternary(A, B, nor(B,C)) for <2 x i64> +define <2 x i64> @ternary_A_B_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_B_nor_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %nor = xor <2 x i64> %or, ; Vector NOR operation + %res = select <2 x i1> %A, <2 x i64> %B, <2 x i64> %nor + ret <2 x i64> %res +} + +; Function to test ternary(A, B, nor(B,C)) for <16 x i8> +define <16 x i8> @ternary_A_B_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_B_nor_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %or = or <16 x i8> %B, %C + %nor = xor <16 x i8> %or, ; Vector NOR operation + %res = select <16 x i1> %A, <16 x i8> %B, <16 x i8> %nor + ret <16 x i8> %res +} + +; Function to test ternary(A, B, nor(B,C)) for <8 x i16> +define <8 x i16> @ternary_A_B_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_B_nor_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: 
vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %or = or <8 x i16> %B, %C + %nor = xor <8 x i16> %or, ; Vector NOR operation + %res = select <8 x i1> %A, <8 x i16> %B, <8 x i16> %nor + ret <8 x i16> %res +} + +; Function to test ternary(A, C, nor(B,C)) for <4 x i32> +define <4 x i32> @ternary_A_C_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_C_nor_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %nor = xor <4 x i32> %or, ; Vector NOR operation + %res = select <4 x i1> %A, <4 x i32> %C, <4 x i32> %nor + ret <4 x i32> %res +} + +; Function to test ternary(A, C, nor(B,C)) for <2 x i64> +define <2 x i64> @ternary_A_C_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_C_nor_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %nor = xor <2 x i64> %or, ; Vector NOR operation + %res = select <2 x i1> %A, <2 x i64> %C, <2 x i64> %nor + ret <2 x i64> %res +} + +; Function to test ternary(A, C, nor(B,C)) for <16 x i8> +define <16 x i8> @ternary_A_C_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_C_nor_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %or = or <16 x i8> %B, %C + %nor = xor <16 x i8> %or, ; Vector NOR operation + %res = select <16 x i1> %A, <16 x i8> %C, <16 x i8> %nor + ret <16 x i8> %res +} + +; 
Function to test ternary(A, C, nor(B,C)) for <8 x i16> +define <8 x i16> @ternary_A_C_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_C_nor_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnor vs0, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %or = or <8 x i16> %B, %C + %nor = xor <8 x i16> %or, ; Vector NOR operation + %res = select <8 x i1> %A, <8 x i16> %C, <8 x i16> %nor + ret <8 x i16> %res +} + +; Function to test ternary(A, xor(B,C), nor(B,C)) for <4 x i32> +define <4 x i32> @ternary_A_xor_BC_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_xor_BC_nor_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <4 x i32> %B, %C + %or = or <4 x i32> %B, %C + %nor = xor <4 x i32> %or, ; Vector NOR operation + %res = select <4 x i1> %A, <4 x i32> %xor, <4 x i32> %nor + ret <4 x i32> %res +} + +; Function to test ternary(A, xor(B,C), nor(B,C)) for <2 x i64> +define <2 x i64> @ternary_A_xor_BC_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_xor_BC_nor_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <2 x i64> %B, %C + %or = or <2 x i64> %B, %C + %nor = xor <2 x i64> %or, ; Vector NOR operation + %res = select <2 x i1> %A, <2 x i64> %xor, <2 x i64> %nor + ret <2 x i64> %res +} + +; Function to test ternary(A, xor(B,C), nor(B,C)) for <16 x i8> +define <16 x i8> 
@ternary_A_xor_BC_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_xor_BC_nor_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <16 x i8> %B, %C + %or = or <16 x i8> %B, %C + %nor = xor <16 x i8> %or, ; Vector NOR operation + %res = select <16 x i1> %A, <16 x i8> %xor, <16 x i8> %nor + ret <16 x i8> %res +} + +; Function to test ternary(A, xor(B,C), nor(B,C)) for <8 x i16> +define <8 x i16> @ternary_A_xor_BC_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_xor_BC_nor_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <8 x i16> %B, %C + %or = or <8 x i16> %B, %C + %nor = xor <8 x i16> %or, ; Vector NOR operation + %res = select <8 x i1> %A, <8 x i16> %xor, <8 x i16> %nor + ret <8 x i16> %res +} + +; Function to test ternary(A, not(C), nor(B,C)) for <4 x i32> +define <4 x i32> @ternary_A_not_C_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_not_C_nor_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <4 x i32> %C, ; Vector not operation + %or = or <4 x i32> %B, %C + %nor = xor <4 x i32> %or, ; Vector NOR operation + %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %nor + ret <4 x i32> %res +} + +; Function to test ternary(A, not(C), nor(B,C)) for <2 x i64> +define <2 x i64> 
@ternary_A_not_C_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_not_C_nor_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <2 x i64> %C, ; Vector not operation + %or = or <2 x i64> %B, %C + %nor = xor <2 x i64> %or, ; Vector NOR operation + %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %nor + ret <2 x i64> %res +} + +; Function to test ternary(A, not(C), nor(B,C)) for <16 x i8> +define <16 x i8> @ternary_A_not_C_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_not_C_nor_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <16 x i8> %C, ; Vector not operation + %or = or <16 x i8> %B, %C + %nor = xor <16 x i8> %or, ; Vector NOR operation + %res = select <16 x i1> %A, <16 x i8> %not, <16 x i8> %nor + ret <16 x i8> %res +} + +; Function to test ternary(A, not(C), nor(B,C)) for <8 x i16> +define <8 x i16> @ternary_A_not_C_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_not_C_nor_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <8 x i16> %C, ; Vector not operation + %or = or <8 x i16> %B, %C + %nor = xor <8 x i16> %or, ; Vector NOR operation + %res = select <8 x i1> %A, <8 x i16> %not, <8 x i16> %nor + ret <8 x i16> %res +} + +; Function to test ternary(A, 
not(B), nor(B,C)) for <4 x i32> +define <4 x i32> @ternary_A_not_B_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_not_B_nor_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <4 x i32> %B, ; Vector not operation + %or = or <4 x i32> %B, %C + %nor = xor <4 x i32> %or, ; Vector NOR operation + %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %nor + ret <4 x i32> %res +} + +; Function to test ternary(A, not(B), nor(B,C)) for <2 x i64> +define <2 x i64> @ternary_A_not_B_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_not_B_nor_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <2 x i64> %B, ; Vector not operation + %or = or <2 x i64> %B, %C + %nor = xor <2 x i64> %or, ; Vector NOR operation + %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %nor + ret <2 x i64> %res +} + +; Function to test ternary(A, not(B), nor(B,C)) for <16 x i8> +define <16 x i8> @ternary_A_not_B_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_not_B_nor_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <16 x i8> %B, ; Vector not operation + %or = or <16 x i8> %B, %C + %nor = xor <16 x i8> %or, ; Vector NOR operation + %res = select <16 x i1> %A, <16 x i8> %not, <16 x i8> %nor + ret <16 
x i8> %res +} + +; Function to test ternary(A, not(B), nor(B,C)) for <8 x i16> +define <8 x i16> @ternary_A_not_B_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_not_B_nor_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <8 x i16> %B, ; Vector not operation + %or = or <8 x i16> %B, %C + %nor = xor <8 x i16> %or, ; Vector NOR operation + %res = select <8 x i1> %A, <8 x i16> %not, <8 x i16> %nor + ret <8 x i16> %res +} + +; Function to test ternary(A, nand(B,C), nor(B,C)) for <4 x i32> +define <4 x i32> @ternary_A_nand_BC_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_nand_BC_nor_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + %or = or <4 x i32> %B, %C + %nor = xor <4 x i32> %or, ; Vector NOR operation + %res = select <4 x i1> %A, <4 x i32> %nand, <4 x i32> %nor + ret <4 x i32> %res +} + +; Function to test ternary(A, nand(B,C), nor(B,C)) for <2 x i64> +define <2 x i64> @ternary_A_nand_BC_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_nand_BC_nor_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %or = or <2 
x i64> %B, %C + %nor = xor <2 x i64> %or, ; Vector NOR operation + %res = select <2 x i1> %A, <2 x i64> %nand, <2 x i64> %nor + ret <2 x i64> %res +} + +; Function to test ternary(A, nand(B,C), nor(B,C)) for <16 x i8> +define <16 x i8> @ternary_A_nand_BC_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_nand_BC_nor_BC_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; Vector nand operation + %or = or <16 x i8> %B, %C + %nor = xor <16 x i8> %or, ; Vector NOR operation + %res = select <16 x i1> %A, <16 x i8> %nand, <16 x i8> %nor + ret <16 x i8> %res +} + +; Function to test ternary(A, nand(B,C), nor(B,C)) for <8 x i16> +define <8 x i16> @ternary_A_nand_BC_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_nand_BC_nor_BC_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %or = or <8 x i16> %B, %C + %nor = xor <8 x i16> %or, ; Vector NOR operation + %res = select <8 x i1> %A, <8 x i16> %nand, <8 x i16> %nor + ret <8 x i16> %res +} diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-b.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-b.ll new file mode 100644 index 0000000000000..6203a96555395 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-b.ll @@ -0,0 +1,307 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test file to verify the emission of Vector Evaluate 
instructions when ternary operators are used. + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function to test ternary(A, and(B, C), not(B)) for <4 x i32> +define <4 x i32> @ternary_A_and_BC_not_B_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_and_BC_not_B_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %not = xor <4 x i32> %B, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %and, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, and(B, C), not(B)) for <2 x i64> +define <2 x i64> @ternary_A_and_BC_not_B_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_and_BC_not_B_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %not = xor <2 x i64> %B, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %and, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, and(B, C), not(B)) for <16 x i8> +define <16 x i8> @ternary_A_and_BC_not_B_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_and_BC_not_B_16x8: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %not = xor <16 x i8> %B, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %and, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, and(B, C), not(B)) for <8 x i16> +define <8 x i16> @ternary_A_and_BC_not_B_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_and_BC_not_B_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %not = xor <8 x i16> %B, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %and, <8 x i16> %not + ret <8 x i16> %res +} + +; Function to test ternary(A, xor(B, C), not(B)) for <4 x i32> +define <4 x i32> @ternary_A_xor_BC_not_B_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_xor_BC_not_B_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <4 x i32> %B, %C + %not = xor <4 x i32> %B, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %xor, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, xor(B, C), not(B)) for <2 x i64> +define <2 x i64> @ternary_A_xor_BC_not_B_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_xor_BC_not_B_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; 
CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <2 x i64> %B, %C + %not = xor <2 x i64> %B, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %xor, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, xor(B, C), not(B)) for <16 x i8> +define <16 x i8> @ternary_A_xor_BC_not_B_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_xor_BC_not_B_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <16 x i8> %B, %C + %not = xor <16 x i8> %B, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %xor, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, xor(B, C), not(B)) for <8 x i16> +define <8 x i16> @ternary_A_xor_BC_not_B_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_xor_BC_not_B_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <8 x i16> %B, %C + %not = xor <8 x i16> %B, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %xor, <8 x i16> %not + ret <8 x i16> %res +} + +; Function to test ternary(A, or(B, C), not(B)) for <4 x i32> +define <4 x i32> @ternary_A_or_BC_not_B_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_or_BC_not_B_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 
+; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %not = xor <4 x i32> %B, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %or, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, or(B, C), not(B)) for <2 x i64> +define <2 x i64> @ternary_A_or_BC_not_B_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_or_BC_not_B_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %not = xor <2 x i64> %B, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %or, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, or(B, C), not(B)) for <16 x i8> +define <16 x i8> @ternary_A_or_BC_not_B_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_or_BC_not_B_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <16 x i8> %B, %C + %not = xor <16 x i8> %B, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %or, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, or(B, C), not(B)) for <8 x i16> +define <8 x i16> @ternary_A_or_BC_not_B_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_or_BC_not_B_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <8 x i16> %B, %C + %not = xor <8 x i16> %B, ; Vector not operation + %res = 
select <8 x i1> %A, <8 x i16> %or, <8 x i16> %not + ret <8 x i16> %res +} + +; Function to test ternary(A, nand(B, C), not(B)) for <4 x i32> +define <4 x i32> @ternary_A_nand_BC_not_B_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_nand_BC_not_B_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + %not = xor <4 x i32> %B, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %nand, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, nand(B, C), not(B)) for <2 x i64> +define <2 x i64> @ternary_A_nand_BC_not_B_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_nand_BC_not_B_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %not = xor <2 x i64> %B, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %nand, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, nand(B, C), not(B)) for <16 x i8> +define <16 x i8> @ternary_A_nand_BC_not_B_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_nand_BC_not_B_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; 
Vector nand operation + %not = xor <16 x i8> %B, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %nand, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, nand(B, C), not(B)) for <8 x i16> +define <8 x i16> @ternary_A_nand_BC_not_B_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_nand_BC_not_B_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %not = xor <8 x i16> %B, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %nand, <8 x i16> %not + ret <8 x i16> %res +} diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-c.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-c.ll new file mode 100644 index 0000000000000..3479d949439be --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-c.ll @@ -0,0 +1,445 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test file to verify the emission of Vector Evaluate instructions when ternary operators are used. 
+ +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function to test ternary(A, and(B, C), not(C)) for <4 x i32> +define <4 x i32> @ternary_A_and_BC_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_and_BC_not_C_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %not = xor <4 x i32> %C, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %and, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, and(B, C), not(C)) for <2 x i64> +define <2 x i64> @ternary_A_and_BC_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_and_BC_not_C_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %not = xor <2 x i64> %C, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %and, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, and(B, C), not(C)) for <16 x i8> +define <16 x i8> @ternary_A_and_BC_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_and_BC_not_C_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
xxspltib v5, 7 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %not = xor <16 x i8> %C, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %and, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, and(B, C), not(C)) for <8 x i16> +define <8 x i16> @ternary_A_and_BC_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_and_BC_not_C_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %not = xor <8 x i16> %C, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %and, <8 x i16> %not + ret <8 x i16> %res +} + +; Function to test ternary(A, B, not(C)) for <4 x i32> +define <4 x i32> @ternary_A_B_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_B_not_C_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %not = xor <4 x i32> %C, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %B, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, B, not(C)) for <2 x i64> +define <2 x i64> @ternary_A_B_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_B_not_C_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %not = xor <2 x 
i64> %C, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %B, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, B, not(C)) for <16 x i8> +define <16 x i8> @ternary_A_B_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_B_not_C_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %not = xor <16 x i8> %C, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %B, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, B, not(C)) for <8 x i16> +define <8 x i16> @ternary_A_B_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_B_not_C_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %not = xor <8 x i16> %C, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %B, <8 x i16> %not + ret <8 x i16> %res +} + +; Function to test ternary(A, xor(B, C), not(C)) for <4 x i32> +define <4 x i32> @ternary_A_xor_BC_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_xor_BC_not_C_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <4 x i32> %B, %C + %not = xor <4 x i32> %C, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %xor, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, xor(B, C), not(C)) for <2 x i64> +define <2 x i64> @ternary_A_xor_BC_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: 
ternary_A_xor_BC_not_C_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <2 x i64> %B, %C + %not = xor <2 x i64> %C, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %xor, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, xor(B, C), not(C)) for <16 x i8> +define <16 x i8> @ternary_A_xor_BC_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_xor_BC_not_C_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <16 x i8> %B, %C + %not = xor <16 x i8> %C, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %xor, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, xor(B, C), not(C)) for <8 x i16> +define <8 x i16> @ternary_A_xor_BC_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_xor_BC_not_C_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlxor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <8 x i16> %B, %C + %not = xor <8 x i16> %C, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %xor, <8 x i16> %not + ret <8 x i16> %res +} + +; Function to test ternary(A, or(B, C), not(C)) for <4 x i32> +define <4 x i32> @ternary_A_or_BC_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_or_BC_not_C_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; 
CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %not = xor <4 x i32> %C, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %or, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, or(B, C), not(C)) for <2 x i64> +define <2 x i64> @ternary_A_or_BC_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_or_BC_not_C_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %not = xor <2 x i64> %C, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %or, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, or(B, C), not(C)) for <16 x i8> +define <16 x i8> @ternary_A_or_BC_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_or_BC_not_C_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <16 x i8> %B, %C + %not = xor <16 x i8> %C, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %or, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, or(B, C), not(C)) for <8 x i16> +define <8 x i16> @ternary_A_or_BC_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_or_BC_not_C_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah 
v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %or = or <8 x i16> %B, %C + %not = xor <8 x i16> %C, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %or, <8 x i16> %not + ret <8 x i16> %res +} + +; Function to test ternary(A, not(B), not(C)) for <4 x i32> +define <4 x i32> @ternary_A_not_B_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_not_B_not_C_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not_b = xor <4 x i32> %B, ; Vector not operation + %not_c = xor <4 x i32> %C, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %not_b, <4 x i32> %not_c + ret <4 x i32> %res +} + +; Function to test ternary(A, not(B), not(C)) for <2 x i64> +define <2 x i64> @ternary_A_not_B_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_not_B_not_C_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not_b = xor <2 x i64> %B, ; Vector not operation + %not_c = xor <2 x i64> %C, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %not_b, <2 x i64> %not_c + ret <2 x i64> %res +} + +; Function to test ternary(A, not(B), not(C)) for <16 x i8> +define <16 x i8> @ternary_A_not_B_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_not_B_not_C_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; 
CHECK-NEXT: blr +entry: + %not_b = xor <16 x i8> %B, ; Vector not operation + %not_c = xor <16 x i8> %C, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %not_b, <16 x i8> %not_c + ret <16 x i8> %res +} + +; Function to test ternary(A, not(B), not(C)) for <8 x i16> +define <8 x i16> @ternary_A_not_B_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_not_B_not_C_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not_b = xor <8 x i16> %B, ; Vector not operation + %not_c = xor <8 x i16> %C, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %not_b, <8 x i16> %not_c + ret <8 x i16> %res +} + +; Function to test ternary(A, nand(B, C), not(C)) for <4 x i32> +define <4 x i32> @ternary_A_nand_BC_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_nand_BC_not_C_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + %not = xor <4 x i32> %C, ; Vector not operation + %res = select <4 x i1> %A, <4 x i32> %nand, <4 x i32> %not + ret <4 x i32> %res +} + +; Function to test ternary(A, nand(B, C), not(C)) for <2 x i64> +define <2 x i64> @ternary_A_nand_BC_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_nand_BC_not_C_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel 
v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %not = xor <2 x i64> %C, ; Vector not operation + %res = select <2 x i1> %A, <2 x i64> %nand, <2 x i64> %not + ret <2 x i64> %res +} + +; Function to test ternary(A, nand(B, C), not(C)) for <16 x i8> +define <16 x i8> @ternary_A_nand_BC_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: ternary_A_nand_BC_not_C_16x8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v5, 7 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslb v2, v2, v5 +; CHECK-NEXT: vsrab v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <16 x i8> %B, %C + %nand = xor <16 x i8> %and, ; Vector nand operation + %not = xor <16 x i8> %C, ; Vector not operation + %res = select <16 x i1> %A, <16 x i8> %nand, <16 x i8> %not + ret <16 x i8> %res +} + +; Function to test ternary(A, nand(B, C), not(C)) for <8 x i16> +define <8 x i16> @ternary_A_nand_BC_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: ternary_A_nand_BC_not_C_8x16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltiw v5, 983055 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlnor vs1, v4, v4 +; CHECK-NEXT: vslh v2, v2, v5 +; CHECK-NEXT: vsrah v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <8 x i16> %B, %C + %nand = xor <8 x i16> %and, ; Vector nand operation + %not = xor <8 x i16> %C, ; Vector not operation + %res = select <8 x i1> %A, <8 x i16> %nand, <8 x i16> %not + ret <8 x i16> %res +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constantpool.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constantpool.ll index 1eeeb60c2eb40..cee04492dc441 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/constantpool.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/constantpool.ll @@ -15,47 +15,37 @@ define void @constpool_f32(ptr %p) { ; RV32-SMALL-LABEL: constpool_f32: ; 
RV32-SMALL: # %bb.0: -; RV32-SMALL-NEXT: lui a1, %hi(.LCPI0_0) -; RV32-SMALL-NEXT: lw a1, %lo(.LCPI0_0)(a1) +; RV32-SMALL-NEXT: lui a1, 260096 ; RV32-SMALL-NEXT: sw a1, 0(a0) ; RV32-SMALL-NEXT: ret ; ; RV32-MEDIUM-LABEL: constpool_f32: ; RV32-MEDIUM: # %bb.0: -; RV32-MEDIUM-NEXT: .Lpcrel_hi0: -; RV32-MEDIUM-NEXT: auipc a1, %pcrel_hi(.LCPI0_0) -; RV32-MEDIUM-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi0)(a1) +; RV32-MEDIUM-NEXT: lui a1, 260096 ; RV32-MEDIUM-NEXT: sw a1, 0(a0) ; RV32-MEDIUM-NEXT: ret ; ; RV32-PIC-LABEL: constpool_f32: ; RV32-PIC: # %bb.0: -; RV32-PIC-NEXT: .Lpcrel_hi0: -; RV32-PIC-NEXT: auipc a1, %pcrel_hi(.LCPI0_0) -; RV32-PIC-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi0)(a1) +; RV32-PIC-NEXT: lui a1, 260096 ; RV32-PIC-NEXT: sw a1, 0(a0) ; RV32-PIC-NEXT: ret ; ; RV64-SMALL-LABEL: constpool_f32: ; RV64-SMALL: # %bb.0: -; RV64-SMALL-NEXT: lui a1, %hi(.LCPI0_0) -; RV64-SMALL-NEXT: lw a1, %lo(.LCPI0_0)(a1) +; RV64-SMALL-NEXT: lui a1, 260096 ; RV64-SMALL-NEXT: sw a1, 0(a0) ; RV64-SMALL-NEXT: ret ; ; RV64-MEDIUM-LABEL: constpool_f32: ; RV64-MEDIUM: # %bb.0: -; RV64-MEDIUM-NEXT: .Lpcrel_hi0: -; RV64-MEDIUM-NEXT: auipc a1, %pcrel_hi(.LCPI0_0) -; RV64-MEDIUM-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi0)(a1) +; RV64-MEDIUM-NEXT: lui a1, 260096 ; RV64-MEDIUM-NEXT: sw a1, 0(a0) ; RV64-MEDIUM-NEXT: ret ; ; RV64-PIC-LABEL: constpool_f32: ; RV64-PIC: # %bb.0: -; RV64-PIC-NEXT: .Lpcrel_hi0: -; RV64-PIC-NEXT: auipc a1, %pcrel_hi(.LCPI0_0) -; RV64-PIC-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi0)(a1) +; RV64-PIC-NEXT: lui a1, 260096 ; RV64-PIC-NEXT: sw a1, 0(a0) ; RV64-PIC-NEXT: ret store float 1.0, ptr %p @@ -75,9 +65,9 @@ define void @constpool_f64(ptr %p) { ; ; RV32-MEDIUM-LABEL: constpool_f64: ; RV32-MEDIUM: # %bb.0: -; RV32-MEDIUM-NEXT: .Lpcrel_hi1: +; RV32-MEDIUM-NEXT: .Lpcrel_hi0: ; RV32-MEDIUM-NEXT: auipc a1, %pcrel_hi(.LCPI1_0) -; RV32-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi1) +; RV32-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi0) ; RV32-MEDIUM-NEXT: lw a2, 0(a1) ; RV32-MEDIUM-NEXT: lw a1, 
4(a1) ; RV32-MEDIUM-NEXT: sw a2, 0(a0) @@ -86,9 +76,9 @@ define void @constpool_f64(ptr %p) { ; ; RV32-PIC-LABEL: constpool_f64: ; RV32-PIC: # %bb.0: -; RV32-PIC-NEXT: .Lpcrel_hi1: +; RV32-PIC-NEXT: .Lpcrel_hi0: ; RV32-PIC-NEXT: auipc a1, %pcrel_hi(.LCPI1_0) -; RV32-PIC-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi1) +; RV32-PIC-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi0) ; RV32-PIC-NEXT: lw a2, 0(a1) ; RV32-PIC-NEXT: lw a1, 4(a1) ; RV32-PIC-NEXT: sw a2, 0(a0) @@ -97,26 +87,124 @@ define void @constpool_f64(ptr %p) { ; ; RV64-SMALL-LABEL: constpool_f64: ; RV64-SMALL: # %bb.0: -; RV64-SMALL-NEXT: lui a1, %hi(.LCPI1_0) -; RV64-SMALL-NEXT: ld a1, %lo(.LCPI1_0)(a1) +; RV64-SMALL-NEXT: li a1, 1023 +; RV64-SMALL-NEXT: slli a1, a1, 52 ; RV64-SMALL-NEXT: sd a1, 0(a0) ; RV64-SMALL-NEXT: ret ; ; RV64-MEDIUM-LABEL: constpool_f64: ; RV64-MEDIUM: # %bb.0: -; RV64-MEDIUM-NEXT: .Lpcrel_hi1: -; RV64-MEDIUM-NEXT: auipc a1, %pcrel_hi(.LCPI1_0) -; RV64-MEDIUM-NEXT: ld a1, %pcrel_lo(.Lpcrel_hi1)(a1) +; RV64-MEDIUM-NEXT: li a1, 1023 +; RV64-MEDIUM-NEXT: slli a1, a1, 52 ; RV64-MEDIUM-NEXT: sd a1, 0(a0) ; RV64-MEDIUM-NEXT: ret ; ; RV64-PIC-LABEL: constpool_f64: ; RV64-PIC: # %bb.0: -; RV64-PIC-NEXT: .Lpcrel_hi1: -; RV64-PIC-NEXT: auipc a1, %pcrel_hi(.LCPI1_0) -; RV64-PIC-NEXT: ld a1, %pcrel_lo(.Lpcrel_hi1)(a1) +; RV64-PIC-NEXT: li a1, 1023 +; RV64-PIC-NEXT: slli a1, a1, 52 ; RV64-PIC-NEXT: sd a1, 0(a0) ; RV64-PIC-NEXT: ret store double 1.0, ptr %p ret void } + +define void @constpool_f32_1234_5(ptr %p) { +; RV32-SMALL-LABEL: constpool_f32_1234_5: +; RV32-SMALL: # %bb.0: +; RV32-SMALL-NEXT: lui a1, 280997 +; RV32-SMALL-NEXT: sw a1, 0(a0) +; RV32-SMALL-NEXT: ret +; +; RV32-MEDIUM-LABEL: constpool_f32_1234_5: +; RV32-MEDIUM: # %bb.0: +; RV32-MEDIUM-NEXT: lui a1, 280997 +; RV32-MEDIUM-NEXT: sw a1, 0(a0) +; RV32-MEDIUM-NEXT: ret +; +; RV32-PIC-LABEL: constpool_f32_1234_5: +; RV32-PIC: # %bb.0: +; RV32-PIC-NEXT: lui a1, 280997 +; RV32-PIC-NEXT: sw a1, 0(a0) +; RV32-PIC-NEXT: ret +; +; 
RV64-SMALL-LABEL: constpool_f32_1234_5: +; RV64-SMALL: # %bb.0: +; RV64-SMALL-NEXT: lui a1, 280997 +; RV64-SMALL-NEXT: sw a1, 0(a0) +; RV64-SMALL-NEXT: ret +; +; RV64-MEDIUM-LABEL: constpool_f32_1234_5: +; RV64-MEDIUM: # %bb.0: +; RV64-MEDIUM-NEXT: lui a1, 280997 +; RV64-MEDIUM-NEXT: sw a1, 0(a0) +; RV64-MEDIUM-NEXT: ret +; +; RV64-PIC-LABEL: constpool_f32_1234_5: +; RV64-PIC: # %bb.0: +; RV64-PIC-NEXT: lui a1, 280997 +; RV64-PIC-NEXT: sw a1, 0(a0) +; RV64-PIC-NEXT: ret + store float 1.234500e+03, ptr %p + ret void +} + +define void @constpool_f64_1234_5(ptr %p) { +; RV32-SMALL-LABEL: constpool_f64_1234_5: +; RV32-SMALL: # %bb.0: +; RV32-SMALL-NEXT: lui a1, %hi(.LCPI3_0) +; RV32-SMALL-NEXT: addi a1, a1, %lo(.LCPI3_0) +; RV32-SMALL-NEXT: lw a2, 0(a1) +; RV32-SMALL-NEXT: lw a1, 4(a1) +; RV32-SMALL-NEXT: sw a2, 0(a0) +; RV32-SMALL-NEXT: sw a1, 4(a0) +; RV32-SMALL-NEXT: ret +; +; RV32-MEDIUM-LABEL: constpool_f64_1234_5: +; RV32-MEDIUM: # %bb.0: +; RV32-MEDIUM-NEXT: .Lpcrel_hi1: +; RV32-MEDIUM-NEXT: auipc a1, %pcrel_hi(.LCPI3_0) +; RV32-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi1) +; RV32-MEDIUM-NEXT: lw a2, 0(a1) +; RV32-MEDIUM-NEXT: lw a1, 4(a1) +; RV32-MEDIUM-NEXT: sw a2, 0(a0) +; RV32-MEDIUM-NEXT: sw a1, 4(a0) +; RV32-MEDIUM-NEXT: ret +; +; RV32-PIC-LABEL: constpool_f64_1234_5: +; RV32-PIC: # %bb.0: +; RV32-PIC-NEXT: .Lpcrel_hi1: +; RV32-PIC-NEXT: auipc a1, %pcrel_hi(.LCPI3_0) +; RV32-PIC-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi1) +; RV32-PIC-NEXT: lw a2, 0(a1) +; RV32-PIC-NEXT: lw a1, 4(a1) +; RV32-PIC-NEXT: sw a2, 0(a0) +; RV32-PIC-NEXT: sw a1, 4(a0) +; RV32-PIC-NEXT: ret +; +; RV64-SMALL-LABEL: constpool_f64_1234_5: +; RV64-SMALL: # %bb.0: +; RV64-SMALL-NEXT: lui a1, 517 +; RV64-SMALL-NEXT: addi a1, a1, -1627 +; RV64-SMALL-NEXT: slli a1, a1, 41 +; RV64-SMALL-NEXT: sd a1, 0(a0) +; RV64-SMALL-NEXT: ret +; +; RV64-MEDIUM-LABEL: constpool_f64_1234_5: +; RV64-MEDIUM: # %bb.0: +; RV64-MEDIUM-NEXT: lui a1, 517 +; RV64-MEDIUM-NEXT: addi a1, a1, -1627 +; 
RV64-MEDIUM-NEXT: slli a1, a1, 41 +; RV64-MEDIUM-NEXT: sd a1, 0(a0) +; RV64-MEDIUM-NEXT: ret +; +; RV64-PIC-LABEL: constpool_f64_1234_5: +; RV64-PIC: # %bb.0: +; RV64-PIC-NEXT: lui a1, 517 +; RV64-PIC-NEXT: addi a1, a1, -1627 +; RV64-PIC-NEXT: slli a1, a1, 41 +; RV64-PIC-NEXT: sd a1, 0(a0) +; RV64-PIC-NEXT: ret + store double 1.234500e+03, ptr %p + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll index 6864afe3855f4..225ceed9627b7 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll @@ -240,7 +240,6 @@ define i8 @udiv8_constant_add(i8 %a) nounwind { ; RV32-NEXT: zext.b a0, a0 ; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: zext.b a0, a0 ; RV32-NEXT: srli a0, a0, 2 ; RV32-NEXT: ret ; @@ -254,7 +253,6 @@ define i8 @udiv8_constant_add(i8 %a) nounwind { ; RV64-NEXT: zext.b a0, a0 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: zext.b a0, a0 ; RV64-NEXT: srli a0, a0, 2 ; RV64-NEXT: ret %1 = udiv i8 %a, 7 @@ -317,7 +315,6 @@ define i16 @udiv16_constant_add(i16 %a) nounwind { ; RV32IM-NEXT: and a0, a0, a2 ; RV32IM-NEXT: srli a0, a0, 1 ; RV32IM-NEXT: add a0, a0, a1 -; RV32IM-NEXT: and a0, a0, a2 ; RV32IM-NEXT: srli a0, a0, 2 ; RV32IM-NEXT: ret ; @@ -332,7 +329,6 @@ define i16 @udiv16_constant_add(i16 %a) nounwind { ; RV32IMZB-NEXT: zext.h a0, a0 ; RV32IMZB-NEXT: srli a0, a0, 1 ; RV32IMZB-NEXT: add a0, a0, a1 -; RV32IMZB-NEXT: zext.h a0, a0 ; RV32IMZB-NEXT: srli a0, a0, 2 ; RV32IMZB-NEXT: ret ; @@ -349,7 +345,6 @@ define i16 @udiv16_constant_add(i16 %a) nounwind { ; RV64IM-NEXT: and a0, a0, a2 ; RV64IM-NEXT: srli a0, a0, 1 ; RV64IM-NEXT: add a0, a0, a1 -; RV64IM-NEXT: and a0, a0, a2 ; RV64IM-NEXT: srli a0, a0, 2 ; RV64IM-NEXT: ret ; @@ -364,7 +359,6 @@ define i16 @udiv16_constant_add(i16 %a) nounwind { ; RV64IMZB-NEXT: zext.h a0, a0 ; RV64IMZB-NEXT: srli a0, a0, 1 ; 
RV64IMZB-NEXT: add a0, a0, a1 -; RV64IMZB-NEXT: zext.h a0, a0 ; RV64IMZB-NEXT: srli a0, a0, 2 ; RV64IMZB-NEXT: ret %1 = udiv i16 %a, 7 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll index 12684f30dbee0..4246aa545dd0e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll @@ -508,9 +508,8 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a0, %hi(.LCPI14_0) -; RV64I-NEXT: ld a1, %lo(.LCPI14_0)(a0) ; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a1, a1, 63 @@ -599,35 +598,31 @@ define double @fnmadd_d(double %a, double %b, double %c) nounwind { ; ; RV64I-LABEL: fnmadd_d: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI15_0) -; RV64I-NEXT: ld s1, %lo(.LCPI15_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 -; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a2, a1, 63 -; RV64I-NEXT: xor a1, s3, a2 +; RV64I-NEXT: xor a1, s2, a2 ; RV64I-NEXT: xor a2, 
a0, a2 ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: mv a1, s0 ; RV64I-NEXT: call fma -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret %a_ = fadd double 0.0, %a %c_ = fadd double 0.0, %c @@ -708,35 +703,31 @@ define double @fnmadd_d_2(double %a, double %b, double %c) nounwind { ; ; RV64I-LABEL: fnmadd_d_2: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI16_0) -; RV64I-NEXT: ld s1, %lo(.LCPI16_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 -; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a2, a1, 63 -; RV64I-NEXT: xor a1, s3, a2 +; RV64I-NEXT: xor a1, s2, a2 ; RV64I-NEXT: xor a2, a0, a2 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call fma -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded 
Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret %b_ = fadd double 0.0, %b %c_ = fadd double 0.0, %c @@ -869,9 +860,8 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI19_0) -; RV64I-NEXT: ld a1, %lo(.LCPI19_0)(a1) ; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a1, a1, 63 @@ -948,9 +938,8 @@ define double @fnmsub_d_2(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI20_0) -; RV64I-NEXT: ld a1, %lo(.LCPI20_0)(a1) ; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a1, a1, 63 @@ -1078,9 +1067,8 @@ define double @fmsub_d_contract(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a0, %hi(.LCPI22_0) -; RV64I-NEXT: ld a1, %lo(.LCPI22_0)(a0) ; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 @@ -1186,28 +1174,25 @@ define double @fnmadd_d_contract(double %a, double %b, double %c) nounwind { ; ; RV64I-LABEL: fnmadd_d_contract: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded 
Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI23_0) -; RV64I-NEXT: ld s1, %lo(.LCPI23_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: mv a1, s0 ; RV64I-NEXT: call __muldf3 ; RV64I-NEXT: li a1, -1 @@ -1215,12 +1200,11 @@ define double @fnmadd_d_contract(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __subdf3 -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret %a_ = fadd double 0.0, %a ; avoid negation using xor %b_ = fadd double 0.0, %b ; avoid negation using xor @@ -1302,34 +1286,30 @@ define double 
@fnmsub_d_contract(double %a, double %b, double %c) nounwind { ; ; RV64I-LABEL: fnmsub_d_contract: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI24_0) -; RV64I-NEXT: ld s1, %lo(.LCPI24_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv a1, a0 -; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __muldf3 ; RV64I-NEXT: mv a1, a0 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __subdf3 -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret %a_ = fadd double 0.0, %a ; avoid negation using xor %b_ = fadd double 0.0, %b ; avoid negation using xor diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll index 
739f225ad1525..3222849641baf 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll @@ -472,9 +472,8 @@ define float @fmsub_s(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a0, %hi(.LCPI14_0) -; RV32I-NEXT: lw a1, %lo(.LCPI14_0)(a0) ; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a2, 524288 ; RV32I-NEXT: xor a2, a0, a2 @@ -495,9 +494,8 @@ define float @fmsub_s(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a0, %hi(.LCPI14_0) -; RV64I-NEXT: lw a1, %lo(.LCPI14_0)(a0) ; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a2, 524288 ; RV64I-NEXT: xor a2, a0, a2 @@ -526,66 +524,58 @@ define float @fnmadd_s(float %a, float %b, float %c) nounwind { ; ; RV32I-LABEL: fnmadd_s: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: lui a1, %hi(.LCPI15_0) -; RV32I-NEXT: lw s1, %lo(.LCPI15_0)(a1) -; RV32I-NEXT: mv s2, a2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 -; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; 
RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: xor a1, s3, a2 +; RV32I-NEXT: xor a1, s2, a2 ; RV32I-NEXT: xor a2, a0, a2 ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: call fmaf -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: fnmadd_s: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI15_0) -; RV64I-NEXT: lw s1, %lo(.LCPI15_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 -; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a2, 524288 -; RV64I-NEXT: xor a1, s3, a2 +; RV64I-NEXT: xor a1, s2, a2 ; RV64I-NEXT: xor a2, a0, a2 ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: mv a1, s0 ; RV64I-NEXT: call fmaf -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload 
-; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret %a_ = fadd float 0.0, %a %c_ = fadd float 0.0, %c @@ -606,66 +596,58 @@ define float @fnmadd_s_2(float %a, float %b, float %c) nounwind { ; ; RV32I-LABEL: fnmadd_s_2: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lui a1, %hi(.LCPI16_0) -; RV32I-NEXT: lw s1, %lo(.LCPI16_0)(a1) -; RV32I-NEXT: mv s2, a2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 -; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: xor a1, s3, a2 +; RV32I-NEXT: xor a1, s2, a2 ; RV32I-NEXT: xor a2, a0, a2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call fmaf -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded 
Reload -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: fnmadd_s_2: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI16_0) -; RV64I-NEXT: lw s1, %lo(.LCPI16_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 -; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a2, 524288 -; RV64I-NEXT: xor a1, s3, a2 +; RV64I-NEXT: xor a1, s2, a2 ; RV64I-NEXT: xor a2, a0, a2 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call fmaf -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: 
ret %b_ = fadd float 0.0, %b %c_ = fadd float 0.0, %c @@ -778,9 +760,8 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: lui a1, %hi(.LCPI19_0) -; RV32I-NEXT: lw a1, %lo(.LCPI19_0)(a1) ; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a1, 524288 ; RV32I-NEXT: xor a0, a0, a1 @@ -800,9 +781,8 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI19_0) -; RV64I-NEXT: lw a1, %lo(.LCPI19_0)(a1) ; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a1, 524288 ; RV64I-NEXT: xor a0, a0, a1 @@ -836,9 +816,8 @@ define float @fnmsub_s_2(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: lui a1, %hi(.LCPI20_0) -; RV32I-NEXT: lw a1, %lo(.LCPI20_0)(a1) ; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a1, 524288 ; RV32I-NEXT: xor a1, a0, a1 @@ -859,9 +838,8 @@ define float @fnmsub_s_2(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI20_0) -; RV64I-NEXT: lw a1, %lo(.LCPI20_0)(a1) ; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a1, 524288 ; RV64I-NEXT: xor a1, a0, a1 @@ -935,9 +913,8 @@ define float @fmsub_s_contract(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a0, %hi(.LCPI22_0) -; RV32I-NEXT: lw a1, %lo(.LCPI22_0)(a0) ; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: li a1, 
0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: mv a0, s0 @@ -961,9 +938,8 @@ define float @fmsub_s_contract(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a0, %hi(.LCPI22_0) -; RV64I-NEXT: lw a1, %lo(.LCPI22_0)(a0) ; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 @@ -997,78 +973,70 @@ define float @fnmadd_s_contract(float %a, float %b, float %c) nounwind { ; ; RV32I-LABEL: fnmadd_s_contract: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: lui a1, %hi(.LCPI23_0) -; RV32I-NEXT: lw s1, %lo(.LCPI23_0)(a1) -; RV32I-NEXT: mv s2, a2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: call __mulsf3 ; RV32I-NEXT: lui a1, 524288 ; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __subsf3 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte 
Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: fnmadd_s_contract: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI23_0) -; RV64I-NEXT: lw s1, %lo(.LCPI23_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: mv a1, s0 ; RV64I-NEXT: call __mulsf3 ; RV64I-NEXT: lui a1, 524288 ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __subsf3 -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded 
Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret %a_ = fadd float 0.0, %a ; avoid negation using xor %b_ = fadd float 0.0, %b ; avoid negation using xor @@ -1090,66 +1058,58 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind { ; ; RV32I-LABEL: fnmsub_s_contract: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: lui a1, %hi(.LCPI24_0) -; RV32I-NEXT: lw s1, %lo(.LCPI24_0)(a1) -; RV32I-NEXT: mv s2, a2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv a1, a0 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __mulsf3 ; RV32I-NEXT: mv a1, a0 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __subsf3 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 +; 
RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: fnmsub_s_contract: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI24_0) -; RV64I-NEXT: lw s1, %lo(.LCPI24_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv s1, a2 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv a1, a0 -; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __mulsf3 ; RV64I-NEXT: mv a1, a0 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __subsf3 -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret %a_ = fadd float 0.0, %a ; avoid negation 
using xor %b_ = fadd float 0.0, %b ; avoid negation using xor diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant-f16.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant-f16.mir index 3028b6476e20b..a688153d44be5 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant-f16.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant-f16.mir @@ -57,8 +57,7 @@ body: | ; CHECK-LABEL: name: half_positive_zero ; CHECK: liveins: $x10 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x0 - ; CHECK-NEXT: [[FMV_H_X:%[0-9]+]]:fpr16 = FMV_H_X [[COPY]] + ; CHECK-NEXT: [[FMV_H_X:%[0-9]+]]:fpr16 = FMV_H_X $x0 ; CHECK-NEXT: $f10_h = COPY [[FMV_H_X]] ; CHECK-NEXT: PseudoRET implicit $f10_h %1:fprb(s16) = G_FCONSTANT half 0.000000e+00 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant.mir index 4db80c6c1141f..7dde7771f161b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant.mir @@ -56,8 +56,7 @@ body: | ; CHECK-LABEL: name: float_positive_zero ; CHECK: liveins: $x10 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x0 - ; CHECK-NEXT: [[FMV_W_X:%[0-9]+]]:fpr32 = FMV_W_X [[COPY]] + ; CHECK-NEXT: [[FMV_W_X:%[0-9]+]]:fpr32 = FMV_W_X $x0 ; CHECK-NEXT: $f10_f = COPY [[FMV_W_X]] ; CHECK-NEXT: PseudoRET implicit $f10_f %1:fprb(s32) = G_FCONSTANT float 0.000000e+00 @@ -171,8 +170,7 @@ body: | ; RV64-LABEL: name: double_positive_zero ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} - ; RV64-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x0 - ; RV64-NEXT: [[FMV_D_X:%[0-9]+]]:fpr64 = FMV_D_X [[COPY]] + ; RV64-NEXT: [[FMV_D_X:%[0-9]+]]:fpr64 = FMV_D_X $x0 ; RV64-NEXT: $f10_d = COPY [[FMV_D_X]] ; RV64-NEXT: PseudoRET implicit $f10_d %1:fprb(s64) = G_FCONSTANT double 0.000000e+00 diff --git 
a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sadde-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sadde-rv32.mir index 64800fedc9d2a..c67998eb50d4b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sadde-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sadde-rv32.mir @@ -136,38 +136,49 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $x14 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY2]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]] - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD1]], [[COPY]] - ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[ADD1]], [[COPY2]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[XOR1]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[AND1]](s32), [[C1]] - ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]] - ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[ICMP]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32) - ; CHECK-NEXT: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[ADD3]], [[COPY1]] - ; CHECK-NEXT: [[XOR3:%[0-9]+]]:_(s32) = G_XOR [[ADD3]], [[COPY3]] - ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[XOR2]], [[XOR3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[AND2]](s32), [[C1]] - ; CHECK-NEXT: $x10 = COPY [[COPY5]](s32) - ; CHECK-NEXT: $x11 = COPY [[COPY6]](s32) - ; CHECK-NEXT: $x12 = COPY [[ICMP1]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = 
G_ADD [[ADD1]], [[ICMP]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[DEF]], [[C]] + ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[COPY5]], [[AND]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[AND]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32) + ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[COPY6]], [[AND1]] + ; CHECK-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ICMP1]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD5]](s32) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY7]], [[COPY]] + ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[COPY8]], [[COPY1]] + ; CHECK-NEXT: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[COPY7]], [[COPY2]] + ; CHECK-NEXT: [[XOR3:%[0-9]+]]:_(s32) = G_XOR [[COPY8]], [[COPY3]] + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[XOR2]] + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[XOR3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[AND2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[AND3]](s32), [[C]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND3]](s32), [[C]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s32), [[ICMP2]], [[ICMP3]] + ; CHECK-NEXT: $x10 = COPY [[COPY7]](s32) + ; CHECK-NEXT: $x11 = COPY [[COPY8]](s32) + ; CHECK-NEXT: $x12 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12 %0:_(s32) = COPY $x10 %1:_(s32) = COPY $x11 - %2:_(s32) = COPY $x12 - %3:_(s32) = COPY $x13 - %4:_(s32) = COPY $x14 - %5:_(s1) = G_TRUNC %4(s32) - %6:_(s32), %7:_(s1) = G_SADDE %0, %2, %5 - %8:_(s32), %9:_(s1) = G_SADDE %1, %3, %7 - %10:_(s32) = 
G_ANYEXT %9(s1) - $x10 = COPY %6(s32) - $x11 = COPY %8(s32) - $x12 = COPY %10(s32) - + %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $x12 + %4:_(s32) = COPY $x13 + %5:_(s64) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = COPY $x14 + %7:_(s1) = G_TRUNC %6(s32) + %8:_(s64), %9:_(s1) = G_SADDE %2, %5, %7 + %10:_(s32), %11:_(s32) = G_UNMERGE_VALUES %8(s64) + %12:_(s32) = G_ANYEXT %9(s1) + $x10 = COPY %10(s32) + $x11 = COPY %11(s32) + $x12 = COPY %12(s32) PseudoRET implicit $x10, implicit $x11, implicit $x12 ... diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sadde-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sadde-rv64.mir index db1f50535b526..413aaff9b644b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sadde-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sadde-rv64.mir @@ -174,38 +174,52 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x13 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x14 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY2]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY4]], [[C]] - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[AND]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY [[ADD1]](s64) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ADD1]], [[COPY]] - ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s64) = G_XOR [[ADD1]], [[COPY2]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[XOR1]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[AND1]](s64), [[C1]] - ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[COPY1]], [[COPY3]] - ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s64) = G_ADD [[ADD2]], [[ICMP]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY [[ADD3]](s64) - ; CHECK-NEXT: [[XOR2:%[0-9]+]]:_(s64) = G_XOR [[ADD3]], [[COPY1]] - ; CHECK-NEXT: [[XOR3:%[0-9]+]]:_(s64) = G_XOR [[ADD3]], [[COPY3]] - ; 
CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[XOR2]], [[XOR3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[AND2]](s64), [[C1]] - ; CHECK-NEXT: $x10 = COPY [[COPY5]](s64) - ; CHECK-NEXT: $x11 = COPY [[COPY6]](s64) - ; CHECK-NEXT: $x12 = COPY [[ICMP1]](s64) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD]](s64), [[COPY2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY [[ADD]](s64) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[COPY1]], [[COPY3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[ADD1]], [[ICMP]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY [[ADD2]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY4]], [[C1]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[DEF]], [[C]] + ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s64) = G_ADD [[COPY5]], [[AND]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD3]](s64), [[AND]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY [[ADD3]](s64) + ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s64) = G_ADD [[COPY6]], [[AND1]] + ; CHECK-NEXT: [[ADD5:%[0-9]+]]:_(s64) = G_ADD [[ADD4]], [[ICMP1]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[ADD5]](s64) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY7]], [[COPY]] + ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s64) = G_XOR [[COPY8]], [[COPY1]] + ; CHECK-NEXT: [[XOR2:%[0-9]+]]:_(s64) = G_XOR [[COPY7]], [[COPY2]] + ; CHECK-NEXT: [[XOR3:%[0-9]+]]:_(s64) = G_XOR [[COPY8]], [[COPY3]] + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[XOR2]] + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[XOR1]], [[XOR3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND2]](s64), [[C]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[AND3]](s64), [[C]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND3]](s64), [[C]] + ; 
CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP3]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY7]](s64) + ; CHECK-NEXT: $x11 = COPY [[COPY8]](s64) + ; CHECK-NEXT: $x12 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 - %2:_(s64) = COPY $x12 - %3:_(s64) = COPY $x13 - %4:_(s64) = COPY $x14 - %5:_(s1) = G_TRUNC %4(s64) - %6:_(s64), %7:_(s1) = G_SADDE %0, %2, %5 - %8:_(s64), %9:_(s1) = G_SADDE %1, %3, %7 - %10:_(s64) = G_ANYEXT %9(s1) - $x10 = COPY %6(s64) - $x11 = COPY %8(s64) - $x12 = COPY %10(s64) - + %2:_(s128) = G_MERGE_VALUES %0(s64), %1(s64) + %3:_(s64) = COPY $x12 + %4:_(s64) = COPY $x13 + %5:_(s128) = G_MERGE_VALUES %3(s64), %4(s64) + %6:_(s64) = COPY $x14 + %7:_(s1) = G_TRUNC %6(s64) + %8:_(s128), %9:_(s1) = G_SADDE %2, %5, %7 + %10:_(s64), %11:_(s64) = G_UNMERGE_VALUES %8(s128) + %12:_(s64) = G_ANYEXT %9(s1) + $x10 = COPY %10(s64) + $x11 = COPY %11(s64) + $x12 = COPY %12(s64) PseudoRET implicit $x10, implicit $x11, implicit $x12 ... 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll index d9ddf655c283a..bb96ba7e5b1fb 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll @@ -437,8 +437,8 @@ define void @va1_caller() nounwind { ; LP64: # %bb.0: ; LP64-NEXT: addi sp, sp, -16 ; LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; LP64-NEXT: lui a0, %hi(.LCPI3_0) -; LP64-NEXT: ld a1, %lo(.LCPI3_0)(a0) +; LP64-NEXT: li a1, 1023 +; LP64-NEXT: slli a1, a1, 52 ; LP64-NEXT: li a2, 2 ; LP64-NEXT: call va1 ; LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -494,8 +494,8 @@ define void @va1_caller() nounwind { ; RV64-WITHFP-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-WITHFP-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; RV64-WITHFP-NEXT: addi s0, sp, 16 -; RV64-WITHFP-NEXT: lui a0, %hi(.LCPI3_0) -; RV64-WITHFP-NEXT: ld a1, %lo(.LCPI3_0)(a0) +; RV64-WITHFP-NEXT: li a1, 1023 +; RV64-WITHFP-NEXT: slli a1, a1, 52 ; RV64-WITHFP-NEXT: li a2, 2 ; RV64-WITHFP-NEXT: call va1 ; RV64-WITHFP-NEXT: ld ra, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index c7f70a9d266c2..ea08061221fd4 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -107,6 +107,9 @@ ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Machine Trace Metrics +; CHECK-NEXT: Lazy Machine Block Frequency Analysis +; CHECK-NEXT: Machine InstCombiner ; CHECK-NEXT: Machine Block Frequency Analysis ; CHECK-NEXT: Early Machine Loop Invariant Code Motion ; CHECK-NEXT: MachineDominator Tree Construction @@ -117,9 +120,6 @@ ; CHECK-NEXT: Machine code sinking ; CHECK-NEXT: Peephole Optimizations ; CHECK-NEXT: Remove dead machine instructions -; CHECK-NEXT: Machine Trace Metrics -; CHECK-NEXT: Lazy Machine Block Frequency Analysis -; 
CHECK-NEXT: Machine InstCombiner ; RV64-NEXT: RISC-V Optimize W Instructions ; CHECK-NEXT: RISC-V Pre-RA pseudo instruction expansion pass ; CHECK-NEXT: RISC-V Merge Base Offset diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index eacd5c9a88bba..154fb83172341 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -143,6 +143,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+supm %s -o - | FileCheck --check-prefix=RV32SUPM %s ; RUN: llc -mtriple=riscv32 -mattr=+smctr %s -o - | FileCheck --check-prefix=RV32SMCTR %s ; RUN: llc -mtriple=riscv32 -mattr=+ssctr %s -o - | FileCheck --check-prefix=RV32SSCTR %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zibi %s -o - | FileCheck --check-prefix=RV32ZIBI %s ; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s ; RUN: llc -mtriple=riscv64 -mattr=+m %s -o - | FileCheck --check-prefixes=CHECK,RV64M %s @@ -292,6 +293,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+sdext %s -o - | FileCheck --check-prefix=RV64SDEXT %s ; RUN: llc -mtriple=riscv64 -mattr=+sdtrig %s -o - | FileCheck --check-prefix=RV64SDTRIG %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-p %s -o - | FileCheck --check-prefix=RV64P %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zibi %s -o - | FileCheck --check-prefix=RV64ZIBI %s ; Tests for profile features. 
@@ -452,6 +454,7 @@ ; RV32SMCTR: .attribute 5, "rv32i2p1_smctr1p0_sscsrind1p0" ; RV32SSCTR: .attribute 5, "rv32i2p1_sscsrind1p0_ssctr1p0" ; RV32P: .attribute 5, "rv32i2p1_p0p15" +; RV32ZIBI: .attribute 5, "rv32i2p1_zibi0p1" ; RV64M: .attribute 5, "rv64i2p1_m2p0_zmmul1p0" ; RV64ZMMUL: .attribute 5, "rv64i2p1_zmmul1p0" @@ -599,6 +602,7 @@ ; RV64SDEXT: .attribute 5, "rv64i2p1_sdext1p0" ; RV64SDTRIG: .attribute 5, "rv64i2p1_sdtrig1p0" ; RV64P: .attribute 5, "rv64i2p1_p0p15" +; RV64ZIBI: .attribute 5, "rv64i2p1_zibi0p1" ; RVI20U32: .attribute 5, "rv32i2p1" ; RVI20U64: .attribute 5, "rv64i2p1" diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll index 4fb3dff88017c..9d95f1f5c9615 100644 --- a/llvm/test/CodeGen/RISCV/condops.ll +++ b/llvm/test/CodeGen/RISCV/condops.ll @@ -3,7 +3,7 @@ ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs < %s | FileCheck %s -check-prefix=RV64I ; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV32XVENTANACONDOPS ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV64XVENTANACONDOPS -; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV +; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+xtheadbs,+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV ; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV64ZICOND @@ -126,7 +126,7 @@ define i64 @zero_singlebit1(i64 %rs1, i64 %rs2) { ; ; RV64XTHEADCONDMOV-LABEL: zero_singlebit1: ; RV64XTHEADCONDMOV: # %bb.0: -; RV64XTHEADCONDMOV-NEXT: bexti a1, a1, 12 +; RV64XTHEADCONDMOV-NEXT: th.tst a1, a1, 12 ; RV64XTHEADCONDMOV-NEXT: th.mvnez a0, zero, a1 ; RV64XTHEADCONDMOV-NEXT: ret ; @@ 
-179,9 +179,8 @@ define i64 @zero_singlebit2(i64 %rs1, i64 %rs2) { ; ; RV64XTHEADCONDMOV-LABEL: zero_singlebit2: ; RV64XTHEADCONDMOV: # %bb.0: -; RV64XTHEADCONDMOV-NEXT: slli a1, a1, 51 -; RV64XTHEADCONDMOV-NEXT: srai a1, a1, 63 -; RV64XTHEADCONDMOV-NEXT: and a0, a1, a0 +; RV64XTHEADCONDMOV-NEXT: th.tst a1, a1, 12 +; RV64XTHEADCONDMOV-NEXT: th.mveqz a0, zero, a1 ; RV64XTHEADCONDMOV-NEXT: ret ; ; RV32ZICOND-LABEL: zero_singlebit2: @@ -4297,9 +4296,8 @@ define i64 @single_bit(i64 %x) { ; ; RV64XTHEADCONDMOV-LABEL: single_bit: ; RV64XTHEADCONDMOV: # %bb.0: # %entry -; RV64XTHEADCONDMOV-NEXT: slli a1, a0, 53 -; RV64XTHEADCONDMOV-NEXT: srai a1, a1, 63 -; RV64XTHEADCONDMOV-NEXT: and a0, a1, a0 +; RV64XTHEADCONDMOV-NEXT: andi a1, a0, 1024 +; RV64XTHEADCONDMOV-NEXT: th.mveqz a0, zero, a1 ; RV64XTHEADCONDMOV-NEXT: ret ; ; RV32ZICOND-LABEL: single_bit: @@ -4353,9 +4351,8 @@ define i64 @single_bit2(i64 %x) { ; ; RV64XTHEADCONDMOV-LABEL: single_bit2: ; RV64XTHEADCONDMOV: # %bb.0: # %entry -; RV64XTHEADCONDMOV-NEXT: slli a1, a0, 52 -; RV64XTHEADCONDMOV-NEXT: srai a1, a1, 63 -; RV64XTHEADCONDMOV-NEXT: and a0, a1, a0 +; RV64XTHEADCONDMOV-NEXT: th.tst a1, a0, 11 +; RV64XTHEADCONDMOV-NEXT: th.mveqz a0, zero, a1 ; RV64XTHEADCONDMOV-NEXT: ret ; ; RV32ZICOND-LABEL: single_bit2: diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 01b8c0eaadb05..a3b56c6fd3d77 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -52,6 +52,7 @@ ; CHECK-NEXT: experimental-xsfmclic - 'XSfmclic' (SiFive CLIC Machine-mode CSRs). ; CHECK-NEXT: experimental-xsfsclic - 'XSfsclic' (SiFive CLIC Supervisor-mode CSRs). ; CHECK-NEXT: experimental-zalasr - 'Zalasr' (Load-Acquire and Store-Release Instructions). +; CHECK-NEXT: experimental-zibi - 'Zibi' (Branch with Immediate). ; CHECK-NEXT: experimental-zicfilp - 'Zicfilp' (Landing pad). ; CHECK-NEXT: experimental-zicfiss - 'Zicfiss' (Shadow stack). 
; CHECK-NEXT: experimental-zvbc32e - 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements). diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll index 7a1c41c1839fa..69eca6dd7768a 100644 --- a/llvm/test/CodeGen/RISCV/machine-combiner.ll +++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll @@ -1094,33 +1094,19 @@ declare float @llvm.maxnum.f32(float, float) declare double @llvm.maxnum.f64(double, double) define double @test_fmadd_strategy(double %a0, double %a1, double %a2, double %a3, i64 %flag) { -; CHECK_LOCAL-LABEL: test_fmadd_strategy: -; CHECK_LOCAL: # %bb.0: # %entry -; CHECK_LOCAL-NEXT: fsub.d fa4, fa0, fa1 -; CHECK_LOCAL-NEXT: andi a0, a0, 1 -; CHECK_LOCAL-NEXT: fmv.d fa5, fa0 -; CHECK_LOCAL-NEXT: fmul.d fa0, fa4, fa2 -; CHECK_LOCAL-NEXT: beqz a0, .LBB76_2 -; CHECK_LOCAL-NEXT: # %bb.1: # %entry -; CHECK_LOCAL-NEXT: fmul.d fa4, fa5, fa1 -; CHECK_LOCAL-NEXT: fmadd.d fa5, fa5, fa1, fa0 -; CHECK_LOCAL-NEXT: fsub.d fa0, fa5, fa4 -; CHECK_LOCAL-NEXT: .LBB76_2: # %entry -; CHECK_LOCAL-NEXT: ret -; -; CHECK_GLOBAL-LABEL: test_fmadd_strategy: -; CHECK_GLOBAL: # %bb.0: # %entry -; CHECK_GLOBAL-NEXT: fsub.d fa4, fa0, fa1 -; CHECK_GLOBAL-NEXT: andi a0, a0, 1 -; CHECK_GLOBAL-NEXT: fmv.d fa5, fa0 -; CHECK_GLOBAL-NEXT: fmul.d fa0, fa4, fa2 -; CHECK_GLOBAL-NEXT: beqz a0, .LBB76_2 -; CHECK_GLOBAL-NEXT: # %bb.1: # %entry -; CHECK_GLOBAL-NEXT: fmul.d fa5, fa5, fa1 -; CHECK_GLOBAL-NEXT: fadd.d fa4, fa5, fa0 -; CHECK_GLOBAL-NEXT: fsub.d fa0, fa4, fa5 -; CHECK_GLOBAL-NEXT: .LBB76_2: # %entry -; CHECK_GLOBAL-NEXT: ret +; CHECK-LABEL: test_fmadd_strategy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsub.d fa5, fa0, fa1 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: beqz a0, .LBB76_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: fmul.d fa4, fa0, fa1 +; CHECK-NEXT: fmadd.d fa5, fa5, fa2, fa4 +; CHECK-NEXT: fsub.d fa0, fa5, fa4 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB76_2: +; CHECK-NEXT: fmul.d fa0, fa5, fa2 +; CHECK-NEXT: ret 
entry: %sub = fsub contract double %a0, %a1 %mul = fmul contract double %sub, %a2 @@ -1132,3 +1118,6 @@ entry: %retval.0 = select i1 %tobool.not, double %mul, double %sub3 ret double %retval.0 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK_GLOBAL: {{.*}} +; CHECK_LOCAL: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll index da81fe5708814..f9ccf7637eee9 100644 --- a/llvm/test/CodeGen/RISCV/neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/neg-abs.ll @@ -208,14 +208,14 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) { ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: snez a3, a0 -; RV32I-NEXT: neg a4, a1 -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: neg a4, a0 +; RV32I-NEXT: snez a4, a0 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: add a4, a1, a4 +; RV32I-NEXT: neg a4, a4 ; RV32I-NEXT: sw a0, 0(a2) ; RV32I-NEXT: sw a1, 4(a2) -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: neg_abs64_multiuse: @@ -227,14 +227,14 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) { ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: neg a0, a0 ; RV32ZBB-NEXT: .LBB5_2: -; RV32ZBB-NEXT: snez a3, a0 -; RV32ZBB-NEXT: neg a4, a1 -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: neg a4, a0 +; RV32ZBB-NEXT: snez a4, a0 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: add a4, a1, a4 +; RV32ZBB-NEXT: neg a4, a4 ; RV32ZBB-NEXT: sw a0, 0(a2) ; RV32ZBB-NEXT: sw a1, 4(a2) -; RV32ZBB-NEXT: mv a0, a4 -; RV32ZBB-NEXT: mv a1, a3 +; RV32ZBB-NEXT: mv a0, a3 +; RV32ZBB-NEXT: mv a1, a4 ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: neg_abs64_multiuse: diff --git a/llvm/test/CodeGen/RISCV/pr158121.ll b/llvm/test/CodeGen/RISCV/pr158121.ll new file mode 100644 index 0000000000000..2c018444e9c67 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pr158121.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 | FileCheck %s + +define i64 @f(ptr %p) { +; CHECK-LABEL: f: +; CHECK: # %bb.0: +; CHECK-NEXT: lb a0, 0(a0) +; CHECK-NEXT: srai a0, a0, 63 +; CHECK-NEXT: ret + %load = load i8, ptr %p, align 1 + %conv1 = zext i8 %load to i32 + %cmp = icmp ult i32 127, %conv1 + %conv2 = zext i1 %cmp to i32 + %sub = sub nsw i32 0, %conv2 + %conv3 = sext i32 %sub to i64 + ret i64 %conv3 +} diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll index 0fc0adbfa83d9..0e4a5c07020ee 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll @@ -656,12 +656,18 @@ define i32 @add8192(i32 %a) { } define i32 @addshl_5_6(i32 %a, i32 %b) { -; CHECK-LABEL: addshl_5_6: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 6 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: addshl_5_6: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: slli a1, a1, 6 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: addshl_5_6: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV32XTHEADBA-NEXT: slli a0, a0, 5 +; RV32XTHEADBA-NEXT: ret %c = shl i32 %a, 5 %d = shl i32 %b, 6 %e = add i32 %c, %d @@ -669,12 +675,18 @@ define i32 @addshl_5_6(i32 %a, i32 %b) { } define i32 @addshl_5_7(i32 %a, i32 %b) { -; CHECK-LABEL: addshl_5_7: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 7 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: addshl_5_7: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: slli a1, a1, 7 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: addshl_5_7: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV32XTHEADBA-NEXT: slli a0, a0, 5 +; RV32XTHEADBA-NEXT: ret %c = shl i32 %a, 5 %d = shl i32 %b, 7 %e = add i32 %c, %d @@ -682,12 +694,18 @@ 
define i32 @addshl_5_7(i32 %a, i32 %b) { } define i32 @addshl_5_8(i32 %a, i32 %b) { -; CHECK-LABEL: addshl_5_8: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: addshl_5_8: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: addshl_5_8: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV32XTHEADBA-NEXT: slli a0, a0, 5 +; RV32XTHEADBA-NEXT: ret %c = shl i32 %a, 5 %d = shl i32 %b, 8 %e = add i32 %c, %d diff --git a/llvm/test/CodeGen/RISCV/rv32zba.ll b/llvm/test/CodeGen/RISCV/rv32zba.ll index ab099103b4216..a6dbd94caad4f 100644 --- a/llvm/test/CodeGen/RISCV/rv32zba.ll +++ b/llvm/test/CodeGen/RISCV/rv32zba.ll @@ -1136,3 +1136,167 @@ define i32 @mul_neg8(i32 %a) { %c = mul i32 %a, -8 ret i32 %c } + +define i32 @select3i32(i1 zeroext %x) { +; RV32I-LABEL: select3i32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: andi a0, a0, 3 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: select3i32: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh1add a0, a0, a0 +; RV32ZBA-NEXT: ret +; +; RV32XANDESPERF-LABEL: select3i32: +; RV32XANDESPERF: # %bb.0: +; RV32XANDESPERF-NEXT: nds.lea.h a0, a0, a0 +; RV32XANDESPERF-NEXT: ret + %select = select i1 %x, i32 3, i32 0 + ret i32 %select +} + +define i32 @select5i32(i1 zeroext %x) { +; RV32I-LABEL: select5i32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: andi a0, a0, 5 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: select5i32: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh2add a0, a0, a0 +; RV32ZBA-NEXT: ret +; +; RV32XANDESPERF-LABEL: select5i32: +; RV32XANDESPERF: # %bb.0: +; RV32XANDESPERF-NEXT: nds.lea.w a0, a0, a0 +; RV32XANDESPERF-NEXT: ret + %select = select i1 %x, i32 5, i32 0 + ret i32 %select +} + +define i32 @select9i32(i1 zeroext %x) { +; RV32I-LABEL: select9i32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a0, a0 +; 
RV32I-NEXT: andi a0, a0, 9 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: select9i32: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh3add a0, a0, a0 +; RV32ZBA-NEXT: ret +; +; RV32XANDESPERF-LABEL: select9i32: +; RV32XANDESPERF: # %bb.0: +; RV32XANDESPERF-NEXT: nds.lea.d a0, a0, a0 +; RV32XANDESPERF-NEXT: ret + %select = select i1 %x, i32 9, i32 0 + ret i32 %select +} + +define i64 @select3i64(i1 zeroext %x) { +; RV32I-LABEL: select3i64: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: andi a0, a0, 3 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: select3i64: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh1add a0, a0, a0 +; RV32ZBA-NEXT: li a1, 0 +; RV32ZBA-NEXT: ret +; +; RV32XANDESPERF-LABEL: select3i64: +; RV32XANDESPERF: # %bb.0: +; RV32XANDESPERF-NEXT: nds.lea.h a0, a0, a0 +; RV32XANDESPERF-NEXT: li a1, 0 +; RV32XANDESPERF-NEXT: ret + %select = select i1 %x, i64 3, i64 0 + ret i64 %select +} + +define i64 @select5i64(i1 zeroext %x) { +; RV32I-LABEL: select5i64: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: andi a0, a0, 5 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: select5i64: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh2add a0, a0, a0 +; RV32ZBA-NEXT: li a1, 0 +; RV32ZBA-NEXT: ret +; +; RV32XANDESPERF-LABEL: select5i64: +; RV32XANDESPERF: # %bb.0: +; RV32XANDESPERF-NEXT: nds.lea.w a0, a0, a0 +; RV32XANDESPERF-NEXT: li a1, 0 +; RV32XANDESPERF-NEXT: ret + %select = select i1 %x, i64 5, i64 0 + ret i64 %select +} + +define i64 @select9i64(i1 zeroext %x) { +; RV32I-LABEL: select9i64: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: andi a0, a0, 9 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: select9i64: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh3add a0, a0, a0 +; RV32ZBA-NEXT: li a1, 0 +; RV32ZBA-NEXT: ret +; +; RV32XANDESPERF-LABEL: select9i64: +; RV32XANDESPERF: # %bb.0: +; RV32XANDESPERF-NEXT: nds.lea.d a0, a0, a0 +; RV32XANDESPERF-NEXT: li a1, 0 +; RV32XANDESPERF-NEXT: ret + %select = select i1 %x, 
i64 9, i64 0 + ret i64 %select +} + +define ptr @shl_add_knownbits(ptr %p, i32 %i) { +; RV32I-LABEL: shl_add_knownbits: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 18 +; RV32I-NEXT: srli a1, a1, 18 +; RV32I-NEXT: slli a2, a1, 1 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: srli a1, a1, 3 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: shl_add_knownbits: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: slli a1, a1, 18 +; RV32ZBA-NEXT: srli a1, a1, 18 +; RV32ZBA-NEXT: sh1add a1, a1, a1 +; RV32ZBA-NEXT: srli a1, a1, 2 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV32XANDESPERF-LABEL: shl_add_knownbits: +; RV32XANDESPERF: # %bb.0: +; RV32XANDESPERF-NEXT: nds.bfoz a1, a1, 13, 0 +; RV32XANDESPERF-NEXT: nds.lea.h a1, a1, a1 +; RV32XANDESPERF-NEXT: srli a1, a1, 2 +; RV32XANDESPERF-NEXT: add a0, a0, a1 +; RV32XANDESPERF-NEXT: ret + %and = and i32 %i, 16383 + %mul = mul i32 %and, 6 + %shr = lshr i32 %mul, 3 + %r = getelementptr i8, ptr %p, i32 %shr + ret ptr %r +} diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 3b3ef72e32aa7..a1a843a7c1ba7 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -423,100 +423,62 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { } define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind { -; RV32I-LABEL: ctpop_v2i32_ult_two: -; RV32I: # %bb.0: -; RV32I-NEXT: addi a2, a0, -1 -; RV32I-NEXT: addi a3, a1, -1 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: seqz a1, a1 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop_v2i32_ult_two: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: sltiu a1, a1, 2 -; RV32ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i32_ult_two: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: and a1, a1, a3 +; CHECK-NEXT: 
and a0, a0, a2 +; CHECK-NEXT: seqz a0, a0 +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) %2 = icmp ult <2 x i32> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { -; RV32I-LABEL: ctpop_v2i32_ugt_one: -; RV32I: # %bb.0: -; RV32I-NEXT: addi a2, a0, -1 -; RV32I-NEXT: addi a3, a1, -1 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: snez a0, a0 -; RV32I-NEXT: snez a1, a1 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop_v2i32_ugt_one: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: sltiu a1, a1, 2 -; RV32ZBB-NEXT: xori a0, a0, 1 -; RV32ZBB-NEXT: xori a1, a1, 1 -; RV32ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i32_ugt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: and a1, a1, a3 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: snez a0, a0 +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) %2 = icmp ugt <2 x i32> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind { -; RV32I-LABEL: ctpop_v2i32_eq_one: -; RV32I: # %bb.0: -; RV32I-NEXT: addi a2, a0, -1 -; RV32I-NEXT: addi a3, a1, -1 -; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: sltu a1, a3, a1 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop_v2i32_eq_one: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: addi a0, a0, -1 -; RV32ZBB-NEXT: addi a1, a1, -1 -; RV32ZBB-NEXT: seqz a0, a0 -; RV32ZBB-NEXT: seqz a1, a1 -; RV32ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i32_eq_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: xor a1, a1, a3 +; CHECK-NEXT: xor a0, a0, a2 +; CHECK-NEXT: sltu a0, a2, a0 +; CHECK-NEXT: sltu a1, a3, a1 +; CHECK-NEXT: ret %1 = call <2 x i32> 
@llvm.ctpop.v2i32(<2 x i32> %a) %2 = icmp eq <2 x i32> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { -; RV32I-LABEL: ctpop_v2i32_ne_one: -; RV32I: # %bb.0: -; RV32I-NEXT: addi a2, a0, -1 -; RV32I-NEXT: addi a3, a1, -1 -; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: sltu a1, a3, a1 -; RV32I-NEXT: xori a0, a0, 1 -; RV32I-NEXT: xori a1, a1, 1 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop_v2i32_ne_one: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: addi a0, a0, -1 -; RV32ZBB-NEXT: addi a1, a1, -1 -; RV32ZBB-NEXT: snez a0, a0 -; RV32ZBB-NEXT: snez a1, a1 -; RV32ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i32_ne_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: xor a1, a1, a3 +; CHECK-NEXT: xor a0, a0, a2 +; CHECK-NEXT: sltu a0, a2, a0 +; CHECK-NEXT: sltu a1, a3, a1 +; CHECK-NEXT: xori a0, a0, 1 +; CHECK-NEXT: xori a1, a1, 1 +; CHECK-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) %2 = icmp ne <2 x i32> %1, ret <2 x i1> %2 @@ -792,200 +754,130 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { } define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { -; RV32I-LABEL: ctpop_v2i64_ult_two: -; RV32I: # %bb.0: -; RV32I-NEXT: lw a1, 0(a0) -; RV32I-NEXT: lw a2, 8(a0) -; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: seqz a4, a1 -; RV32I-NEXT: seqz a5, a2 -; RV32I-NEXT: addi a6, a1, -1 -; RV32I-NEXT: addi a7, a2, -1 -; RV32I-NEXT: sub a4, a3, a4 -; RV32I-NEXT: sub a5, a0, a5 -; RV32I-NEXT: and a2, a2, a7 -; RV32I-NEXT: and a1, a1, a6 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: or a2, a2, a0 -; RV32I-NEXT: seqz a0, a1 -; RV32I-NEXT: seqz a1, a2 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop_v2i64_ult_two: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a1, 12(a0) -; RV32ZBB-NEXT: lw a2, 8(a0) -; 
RV32ZBB-NEXT: lw a3, 4(a0) -; RV32ZBB-NEXT: lw a0, 0(a0) -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: cpop a3, a3 -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: add a0, a0, a3 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: sltiu a1, a1, 2 -; RV32ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i64_ult_two: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a0) +; CHECK-NEXT: lw a2, 8(a0) +; CHECK-NEXT: lw a3, 4(a0) +; CHECK-NEXT: lw a0, 12(a0) +; CHECK-NEXT: seqz a4, a1 +; CHECK-NEXT: seqz a5, a2 +; CHECK-NEXT: addi a6, a1, -1 +; CHECK-NEXT: addi a7, a2, -1 +; CHECK-NEXT: sub a4, a3, a4 +; CHECK-NEXT: sub a5, a0, a5 +; CHECK-NEXT: and a2, a2, a7 +; CHECK-NEXT: and a1, a1, a6 +; CHECK-NEXT: and a0, a0, a5 +; CHECK-NEXT: and a3, a3, a4 +; CHECK-NEXT: or a1, a1, a3 +; CHECK-NEXT: or a2, a2, a0 +; CHECK-NEXT: seqz a0, a1 +; CHECK-NEXT: seqz a1, a2 +; CHECK-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ult <2 x i64> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { -; RV32I-LABEL: ctpop_v2i64_ugt_one: -; RV32I: # %bb.0: -; RV32I-NEXT: lw a1, 0(a0) -; RV32I-NEXT: lw a2, 8(a0) -; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: seqz a4, a1 -; RV32I-NEXT: seqz a5, a2 -; RV32I-NEXT: addi a6, a1, -1 -; RV32I-NEXT: addi a7, a2, -1 -; RV32I-NEXT: sub a4, a3, a4 -; RV32I-NEXT: sub a5, a0, a5 -; RV32I-NEXT: and a2, a2, a7 -; RV32I-NEXT: and a1, a1, a6 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: or a2, a2, a0 -; RV32I-NEXT: snez a0, a1 -; RV32I-NEXT: snez a1, a2 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop_v2i64_ugt_one: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a1, 12(a0) -; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 4(a0) -; RV32ZBB-NEXT: lw a0, 0(a0) -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: cpop a3, a3 -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: add 
a1, a2, a1 -; RV32ZBB-NEXT: add a0, a0, a3 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: sltiu a1, a1, 2 -; RV32ZBB-NEXT: xori a0, a0, 1 -; RV32ZBB-NEXT: xori a1, a1, 1 -; RV32ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i64_ugt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a0) +; CHECK-NEXT: lw a2, 8(a0) +; CHECK-NEXT: lw a3, 4(a0) +; CHECK-NEXT: lw a0, 12(a0) +; CHECK-NEXT: seqz a4, a1 +; CHECK-NEXT: seqz a5, a2 +; CHECK-NEXT: addi a6, a1, -1 +; CHECK-NEXT: addi a7, a2, -1 +; CHECK-NEXT: sub a4, a3, a4 +; CHECK-NEXT: sub a5, a0, a5 +; CHECK-NEXT: and a2, a2, a7 +; CHECK-NEXT: and a1, a1, a6 +; CHECK-NEXT: and a0, a0, a5 +; CHECK-NEXT: and a3, a3, a4 +; CHECK-NEXT: or a1, a1, a3 +; CHECK-NEXT: or a2, a2, a0 +; CHECK-NEXT: snez a0, a1 +; CHECK-NEXT: snez a1, a2 +; CHECK-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ugt <2 x i64> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { -; RV32I-LABEL: ctpop_v2i64_eq_one: -; RV32I: # %bb.0: -; RV32I-NEXT: mv a1, a0 -; RV32I-NEXT: lw a0, 0(a0) -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a2, 12(a1) -; RV32I-NEXT: beqz a3, .LBB22_3 -; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: sub a0, a3, a0 -; RV32I-NEXT: xor a3, a3, a0 -; RV32I-NEXT: sltu a0, a0, a3 -; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bnez a2, .LBB22_4 -; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: addi a2, a1, -1 -; RV32I-NEXT: xor a1, a1, a2 -; RV32I-NEXT: sltu a1, a2, a1 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB22_3: -; RV32I-NEXT: addi a3, a0, -1 -; RV32I-NEXT: xor a0, a0, a3 -; RV32I-NEXT: sltu a0, a3, a0 -; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: beqz a2, .LBB22_2 -; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: seqz a1, a1 -; RV32I-NEXT: sub a1, a2, a1 -; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a1, a1, a2 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop_v2i64_eq_one: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a1, 12(a0) -; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 4(a0) -; RV32ZBB-NEXT: lw 
a0, 0(a0) -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: cpop a3, a3 -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: add a0, a0, a3 -; RV32ZBB-NEXT: addi a0, a0, -1 -; RV32ZBB-NEXT: addi a1, a1, -1 -; RV32ZBB-NEXT: seqz a0, a0 -; RV32ZBB-NEXT: seqz a1, a1 -; RV32ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i64_eq_one: +; CHECK: # %bb.0: +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: lw a0, 0(a0) +; CHECK-NEXT: lw a3, 4(a1) +; CHECK-NEXT: lw a2, 12(a1) +; CHECK-NEXT: beqz a3, .LBB22_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: seqz a0, a0 +; CHECK-NEXT: sub a0, a3, a0 +; CHECK-NEXT: xor a3, a3, a0 +; CHECK-NEXT: sltu a0, a0, a3 +; CHECK-NEXT: lw a1, 8(a1) +; CHECK-NEXT: bnez a2, .LBB22_4 +; CHECK-NEXT: .LBB22_2: +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: xor a1, a1, a2 +; CHECK-NEXT: sltu a1, a2, a1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_3: +; CHECK-NEXT: addi a3, a0, -1 +; CHECK-NEXT: xor a0, a0, a3 +; CHECK-NEXT: sltu a0, a3, a0 +; CHECK-NEXT: lw a1, 8(a1) +; CHECK-NEXT: beqz a2, .LBB22_2 +; CHECK-NEXT: .LBB22_4: +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: sub a1, a2, a1 +; CHECK-NEXT: xor a2, a2, a1 +; CHECK-NEXT: sltu a1, a1, a2 +; CHECK-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp eq <2 x i64> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { -; RV32I-LABEL: ctpop_v2i64_ne_one: -; RV32I: # %bb.0: -; RV32I-NEXT: lw a2, 0(a0) -; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a1, 12(a0) -; RV32I-NEXT: beqz a3, .LBB23_2 -; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: seqz a2, a2 -; RV32I-NEXT: sub a2, a3, a2 -; RV32I-NEXT: xor a3, a3, a2 -; RV32I-NEXT: sltu a2, a2, a3 -; RV32I-NEXT: j .LBB23_3 -; RV32I-NEXT: .LBB23_2: -; RV32I-NEXT: addi a3, a2, -1 -; RV32I-NEXT: xor a2, a2, a3 -; RV32I-NEXT: sltu a2, a3, a2 -; RV32I-NEXT: .LBB23_3: -; RV32I-NEXT: lw a3, 8(a0) -; RV32I-NEXT: xori a0, a2, 1 -; RV32I-NEXT: beqz a1, .LBB23_5 -; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: seqz a2, a3 -; 
RV32I-NEXT: sub a2, a1, a2 -; RV32I-NEXT: xor a1, a1, a2 -; RV32I-NEXT: sltu a1, a2, a1 -; RV32I-NEXT: xori a1, a1, 1 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB23_5: -; RV32I-NEXT: addi a1, a3, -1 -; RV32I-NEXT: xor a3, a3, a1 -; RV32I-NEXT: sltu a1, a1, a3 -; RV32I-NEXT: xori a1, a1, 1 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop_v2i64_ne_one: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a1, 12(a0) -; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 4(a0) -; RV32ZBB-NEXT: lw a0, 0(a0) -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: cpop a3, a3 -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: add a0, a0, a3 -; RV32ZBB-NEXT: addi a0, a0, -1 -; RV32ZBB-NEXT: addi a1, a1, -1 -; RV32ZBB-NEXT: snez a0, a0 -; RV32ZBB-NEXT: snez a1, a1 -; RV32ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i64_ne_one: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a2, 0(a0) +; CHECK-NEXT: lw a3, 4(a0) +; CHECK-NEXT: lw a1, 12(a0) +; CHECK-NEXT: beqz a3, .LBB23_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: seqz a2, a2 +; CHECK-NEXT: sub a2, a3, a2 +; CHECK-NEXT: xor a3, a3, a2 +; CHECK-NEXT: sltu a2, a2, a3 +; CHECK-NEXT: j .LBB23_3 +; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: xor a2, a2, a3 +; CHECK-NEXT: sltu a2, a3, a2 +; CHECK-NEXT: .LBB23_3: +; CHECK-NEXT: lw a3, 8(a0) +; CHECK-NEXT: xori a0, a2, 1 +; CHECK-NEXT: beqz a1, .LBB23_5 +; CHECK-NEXT: # %bb.4: +; CHECK-NEXT: seqz a2, a3 +; CHECK-NEXT: sub a2, a1, a2 +; CHECK-NEXT: xor a1, a1, a2 +; CHECK-NEXT: sltu a1, a2, a1 +; CHECK-NEXT: xori a1, a1, 1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB23_5: +; CHECK-NEXT: addi a1, a3, -1 +; CHECK-NEXT: xor a3, a3, a1 +; CHECK-NEXT: sltu a1, a1, a3 +; CHECK-NEXT: xori a1, a1, 1 +; CHECK-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ne <2 x i64> %1, ret <2 x i1> %2 diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll index d20fb66dbbeea..50bd22bf5fd69 100644 --- 
a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll @@ -1104,12 +1104,18 @@ define i64 @add8192(i64 %a) { } define signext i32 @addshl32_5_6(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: addshl32_5_6: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 6 -; CHECK-NEXT: addw a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: addshl32_5_6: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: addshl32_5_6: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: slliw a0, a0, 5 +; RV64XTHEADBA-NEXT: ret %c = shl i32 %a, 5 %d = shl i32 %b, 6 %e = add i32 %c, %d @@ -1117,12 +1123,18 @@ define signext i32 @addshl32_5_6(i32 signext %a, i32 signext %b) { } define i64 @addshl64_5_6(i64 %a, i64 %b) { -; CHECK-LABEL: addshl64_5_6: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 6 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: addshl64_5_6: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: addshl64_5_6: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: slli a0, a0, 5 +; RV64XTHEADBA-NEXT: ret %c = shl i64 %a, 5 %d = shl i64 %b, 6 %e = add i64 %c, %d @@ -1130,12 +1142,18 @@ define i64 @addshl64_5_6(i64 %a, i64 %b) { } define signext i32 @addshl32_5_7(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: addshl32_5_7: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 7 -; CHECK-NEXT: addw a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: addshl32_5_7: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: slli a1, a1, 7 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: addshl32_5_7: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl 
a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: slliw a0, a0, 5 +; RV64XTHEADBA-NEXT: ret %c = shl i32 %a, 5 %d = shl i32 %b, 7 %e = add i32 %c, %d @@ -1143,12 +1161,18 @@ define signext i32 @addshl32_5_7(i32 signext %a, i32 signext %b) { } define i64 @addshl64_5_7(i64 %a, i64 %b) { -; CHECK-LABEL: addshl64_5_7: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 7 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: addshl64_5_7: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: slli a1, a1, 7 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: addshl64_5_7: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: slli a0, a0, 5 +; RV64XTHEADBA-NEXT: ret %c = shl i64 %a, 5 %d = shl i64 %b, 7 %e = add i64 %c, %d @@ -1156,12 +1180,18 @@ define i64 @addshl64_5_7(i64 %a, i64 %b) { } define signext i32 @addshl32_5_8(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: addshl32_5_8: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 8 -; CHECK-NEXT: addw a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: addshl32_5_8: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: addshl32_5_8: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: slliw a0, a0, 5 +; RV64XTHEADBA-NEXT: ret %c = shl i32 %a, 5 %d = shl i32 %b, 8 %e = add i32 %c, %d @@ -1169,12 +1199,18 @@ define signext i32 @addshl32_5_8(i32 signext %a, i32 signext %b) { } define i64 @addshl64_5_8(i64 %a, i64 %b) { -; CHECK-LABEL: addshl64_5_8: -; CHECK: # %bb.0: -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: slli a1, a1, 8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: addshl64_5_8: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: addshl64_5_8: 
+; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: slli a0, a0, 5 +; RV64XTHEADBA-NEXT: ret %c = shl i64 %a, 5 %d = shl i64 %b, 8 %e = add i64 %c, %d @@ -1192,9 +1228,8 @@ define i64 @sh6_sh3_add1(i64 noundef %x, i64 noundef %y, i64 noundef %z) { ; ; RV64XTHEADBA-LABEL: sh6_sh3_add1: ; RV64XTHEADBA: # %bb.0: # %entry -; RV64XTHEADBA-NEXT: slli a1, a1, 6 -; RV64XTHEADBA-NEXT: th.addsl a1, a1, a2, 3 -; RV64XTHEADBA-NEXT: add a0, a1, a0 +; RV64XTHEADBA-NEXT: th.addsl a1, a2, a1, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 ; RV64XTHEADBA-NEXT: ret entry: %shl = shl i64 %z, 3 @@ -1238,9 +1273,8 @@ define i64 @sh6_sh3_add3(i64 noundef %x, i64 noundef %y, i64 noundef %z) { ; ; RV64XTHEADBA-LABEL: sh6_sh3_add3: ; RV64XTHEADBA: # %bb.0: # %entry -; RV64XTHEADBA-NEXT: slli a1, a1, 6 -; RV64XTHEADBA-NEXT: th.addsl a1, a1, a2, 3 -; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a1, a2, a1, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 ; RV64XTHEADBA-NEXT: ret entry: %shl = shl i64 %z, 3 diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index b46f7cc440b7a..c028d25169749 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -4576,3 +4576,98 @@ define i64 @append_32ones(i64 %x) { %o = or i64 %s, 4294967295 ret i64 %o } + +define i32 @select3(i1 zeroext %x) { +; RV64I-LABEL: select3: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: andi a0, a0, 3 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: select3: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh1add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: select3: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %select = select i1 %x, i32 3, i32 0 + ret i32 %select +} + +define i32 @select5(i1 zeroext %x) { +; RV64I-LABEL: select5: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: andi a0, a0, 5 +; RV64I-NEXT: ret +; +; 
RV64ZBA-LABEL: select5: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh2add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: select5: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %select = select i1 %x, i32 5, i32 0 + ret i32 %select +} + +define i32 @select9(i1 zeroext %x) { +; RV64I-LABEL: select9: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: andi a0, a0, 9 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: select9: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh3add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: select9: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %select = select i1 %x, i32 9, i32 0 + ret i32 %select +} + +define ptr @shl_add_knownbits(ptr %p, i64 %i) { +; RV64I-LABEL: shl_add_knownbits: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 50 +; RV64I-NEXT: srli a1, a1, 50 +; RV64I-NEXT: slli a2, a1, 1 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: srli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: shl_add_knownbits: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a1, 50 +; RV64ZBA-NEXT: srli a1, a1, 50 +; RV64ZBA-NEXT: sh1add a1, a1, a1 +; RV64ZBA-NEXT: srli a1, a1, 2 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: shl_add_knownbits: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: nds.bfoz a1, a1, 13, 0 +; RV64XANDESPERF-NEXT: nds.lea.h a1, a1, a1 +; RV64XANDESPERF-NEXT: srli a1, a1, 2 +; RV64XANDESPERF-NEXT: add a0, a0, a1 +; RV64XANDESPERF-NEXT: ret + %and = and i64 %i, 16383 + %mul = mul i64 %and, 6 + %shr = lshr i64 %mul, 3 + %r = getelementptr i8, ptr %p, i64 %shr + ret ptr %r +} diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index d133f9d1db389..d8b7bfcbceb27 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -762,108 +762,70 @@ define <2 x 
i32> @ctpop_v2i32(<2 x i32> %a) nounwind { } define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind { -; RV64I-LABEL: ctpop_v2i32_ult_two: -; RV64I: # %bb.0: -; RV64I-NEXT: addi a2, a0, -1 -; RV64I-NEXT: addi a3, a1, -1 -; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: sext.w a1, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: seqz a0, a0 -; RV64I-NEXT: seqz a1, a1 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop_v2i32_ult_two: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: cpopw a1, a1 -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: sltiu a1, a1, 2 -; RV64ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i32_ult_two: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: and a1, a1, a3 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: sext.w a1, a1 +; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: seqz a0, a0 +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) %2 = icmp ult <2 x i32> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { -; RV64I-LABEL: ctpop_v2i32_ugt_one: -; RV64I: # %bb.0: -; RV64I-NEXT: addi a2, a0, -1 -; RV64I-NEXT: addi a3, a1, -1 -; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: sext.w a1, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: snez a0, a0 -; RV64I-NEXT: snez a1, a1 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop_v2i32_ugt_one: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: cpopw a1, a1 -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: sltiu a1, a1, 2 -; RV64ZBB-NEXT: xori a0, a0, 1 -; RV64ZBB-NEXT: xori a1, a1, 1 -; RV64ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i32_ugt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: and a1, a1, a3 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: sext.w a1, a1 +; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: snez a0, a0 +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: ret %1 = call <2 
x i32> @llvm.ctpop.v2i32(<2 x i32> %a) %2 = icmp ugt <2 x i32> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind { -; RV64I-LABEL: ctpop_v2i32_eq_one: -; RV64I: # %bb.0: -; RV64I-NEXT: addiw a2, a0, -1 -; RV64I-NEXT: addiw a3, a1, -1 -; RV64I-NEXT: xor a1, a1, a3 -; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: sext.w a1, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: sltu a1, a3, a1 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop_v2i32_eq_one: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: cpopw a1, a1 -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: addi a1, a1, -1 -; RV64ZBB-NEXT: seqz a0, a0 -; RV64ZBB-NEXT: seqz a1, a1 -; RV64ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i32_eq_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addiw a2, a0, -1 +; CHECK-NEXT: addiw a3, a1, -1 +; CHECK-NEXT: xor a1, a1, a3 +; CHECK-NEXT: xor a0, a0, a2 +; CHECK-NEXT: sext.w a1, a1 +; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: sltu a0, a2, a0 +; CHECK-NEXT: sltu a1, a3, a1 +; CHECK-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) %2 = icmp eq <2 x i32> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { -; RV64I-LABEL: ctpop_v2i32_ne_one: -; RV64I: # %bb.0: -; RV64I-NEXT: addiw a2, a0, -1 -; RV64I-NEXT: addiw a3, a1, -1 -; RV64I-NEXT: xor a1, a1, a3 -; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: sext.w a1, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: sltu a1, a3, a1 -; RV64I-NEXT: xori a0, a0, 1 -; RV64I-NEXT: xori a1, a1, 1 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop_v2i32_ne_one: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: cpopw a1, a1 -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: addi a1, a1, -1 -; RV64ZBB-NEXT: snez a0, a0 -; RV64ZBB-NEXT: snez a1, a1 -; RV64ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i32_ne_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addiw a2, a0, -1 +; CHECK-NEXT: addiw a3, a1, -1 +; CHECK-NEXT: xor a1, a1, 
a3 +; CHECK-NEXT: xor a0, a0, a2 +; CHECK-NEXT: sext.w a1, a1 +; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: sltu a0, a2, a0 +; CHECK-NEXT: sltu a1, a3, a1 +; CHECK-NEXT: xori a0, a0, 1 +; CHECK-NEXT: xori a1, a1, 1 +; CHECK-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) %2 = icmp ne <2 x i32> %1, ret <2 x i1> %2 @@ -1052,100 +1014,62 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { } define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { -; RV64I-LABEL: ctpop_v2i64_ult_two: -; RV64I: # %bb.0: -; RV64I-NEXT: addi a2, a0, -1 -; RV64I-NEXT: addi a3, a1, -1 -; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: seqz a0, a0 -; RV64I-NEXT: seqz a1, a1 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop_v2i64_ult_two: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: cpop a1, a1 -; RV64ZBB-NEXT: cpop a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: sltiu a1, a1, 2 -; RV64ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i64_ult_two: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: and a1, a1, a3 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: seqz a0, a0 +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ult <2 x i64> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { -; RV64I-LABEL: ctpop_v2i64_ugt_one: -; RV64I: # %bb.0: -; RV64I-NEXT: addi a2, a0, -1 -; RV64I-NEXT: addi a3, a1, -1 -; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: snez a0, a0 -; RV64I-NEXT: snez a1, a1 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop_v2i64_ugt_one: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: cpop a1, a1 -; RV64ZBB-NEXT: cpop a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: sltiu a1, a1, 2 -; RV64ZBB-NEXT: xori a0, a0, 1 -; RV64ZBB-NEXT: xori a1, a1, 1 -; RV64ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i64_ugt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: 
and a1, a1, a3 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: snez a0, a0 +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ugt <2 x i64> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { -; RV64I-LABEL: ctpop_v2i64_eq_one: -; RV64I: # %bb.0: -; RV64I-NEXT: addi a2, a0, -1 -; RV64I-NEXT: addi a3, a1, -1 -; RV64I-NEXT: xor a1, a1, a3 -; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: sltu a1, a3, a1 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop_v2i64_eq_one: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: cpop a1, a1 -; RV64ZBB-NEXT: cpop a0, a0 -; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: addi a1, a1, -1 -; RV64ZBB-NEXT: seqz a0, a0 -; RV64ZBB-NEXT: seqz a1, a1 -; RV64ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i64_eq_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: xor a1, a1, a3 +; CHECK-NEXT: xor a0, a0, a2 +; CHECK-NEXT: sltu a0, a2, a0 +; CHECK-NEXT: sltu a1, a3, a1 +; CHECK-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp eq <2 x i64> %1, ret <2 x i1> %2 } define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { -; RV64I-LABEL: ctpop_v2i64_ne_one: -; RV64I: # %bb.0: -; RV64I-NEXT: addi a2, a0, -1 -; RV64I-NEXT: addi a3, a1, -1 -; RV64I-NEXT: xor a1, a1, a3 -; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: sltu a1, a3, a1 -; RV64I-NEXT: xori a0, a0, 1 -; RV64I-NEXT: xori a1, a1, 1 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop_v2i64_ne_one: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: cpop a1, a1 -; RV64ZBB-NEXT: cpop a0, a0 -; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: addi a1, a1, -1 -; RV64ZBB-NEXT: snez a0, a0 -; RV64ZBB-NEXT: snez a1, a1 -; RV64ZBB-NEXT: ret +; CHECK-LABEL: ctpop_v2i64_ne_one: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, -1 +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: xor a1, a1, a3 +; CHECK-NEXT: xor a0, a0, a2 +; CHECK-NEXT: sltu a0, 
a2, a0 +; CHECK-NEXT: sltu a1, a3, a1 +; CHECK-NEXT: xori a0, a0, 1 +; CHECK-NEXT: xori a1, a1, 1 +; CHECK-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ne <2 x i64> %1, ret <2 x i1> %2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll index eb41ed413a0b4..5683476852683 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll @@ -840,7 +840,6 @@ define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) { ret <8 x i32> %out } -; TODO: This should be a single vslideup.vi define <8 x i32> @shuffle_spread4_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_spread4_singlesrc_e32: ; CHECK: # %bb.0: @@ -937,7 +936,6 @@ define <8 x i32> @shuffle_decompress_singlesrc_e32(<8 x i32> %v) { ret <8 x i32> %out } -; TODO: This should be a single vslideup.vi define <8 x i8> @shuffle_decompress_singlesrc_e8(<8 x i8> %v) { ; CHECK-LABEL: shuffle_decompress_singlesrc_e8: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index 83b435ddff902..056f55260b854 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -934,7 +934,7 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt ; CHECK-NEXT: add a1, a1, a5 ; CHECK-NEXT: slli a3, a3, 32 ; CHECK-NEXT: srli a3, a3, 32 -; CHECK-NEXT: add a0, a4, a0 +; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: addi a0, a0, 1 ; CHECK-NEXT: .LBB14_6: # %bb35 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll index 684eb609635ef..e6ca6875e1412 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll @@ -537,7 +537,7 @@ entry: %a.sext = sext <4 x i8> %a to <4 x i32> %b.sext = sext <4 x i8> %b to <4 x i32> %mul = mul <4 x i32> %a.sext, %b.sext - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) ret <1 x i32> %res } @@ -570,7 +570,7 @@ entry: %a.sext = zext <4 x i8> %a to <4 x i32> %b.sext = zext <4 x i8> %b to <4 x i32> %mul = mul <4 x i32> %a.sext, %b.sext - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) ret <1 x i32> %res } @@ -605,7 +605,7 @@ define <1 x i32> @vqdotu_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { entry: %a.ext = zext <4 x i8> %a to <4 x i32> %mul = mul <4 x i32> %a.ext, splat (i32 128) - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) ret <1 x i32> %res } @@ -641,7 +641,7 @@ define <1 x i32> @vqdot_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { entry: %a.ext = sext <4 x i8> %a to <4 x i32> %mul = mul <4 x i32> %a.ext, splat (i32 -128) - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) ret <1 x i32> %res } @@ -675,7 +675,7 @@ entry: %a.sext = sext <4 x i8> %a to <4 x i32> %b.sext = zext <4 x i8> %b to <4 x i32> %mul = mul <4 x i32> %a.sext, %b.sext - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) ret <1 
x i32> %res } @@ -709,7 +709,7 @@ entry: %a.ext = sext <4 x i8> %a to <4 x i32> %b.ext = zext <4 x i8> %b to <4 x i32> %mul = mul <4 x i32> %b.ext, %a.ext - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) ret <1 x i32> %res } @@ -732,7 +732,7 @@ define <1 x i32> @vqdotsu_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { entry: %a.ext = sext <4 x i8> %a to <4 x i32> %mul = mul <4 x i32> %a.ext, splat (i32 128) - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <4 x i32> %mul) ret <1 x i32> %res } @@ -768,7 +768,7 @@ entry: %a.sext = sext <8 x i8> %a to <8 x i32> %b.sext = sext <8 x i8> %b to <8 x i32> %mul = mul <8 x i32> %a.sext, %b.sext - %res = call <2 x i32> @llvm.experimental.vector.partial.reduce.add(<2 x i32> zeroinitializer, <8 x i32> %mul) + %res = call <2 x i32> @llvm.vector.partial.reduce.add(<2 x i32> zeroinitializer, <8 x i32> %mul) ret <2 x i32> %res } @@ -945,7 +945,7 @@ entry: %a.sext = sext <64 x i8> %a to <64 x i32> %b.sext = sext <64 x i8> %b to <64 x i32> %mul = mul <64 x i32> %a.sext, %b.sext - %res = call <2 x i32> @llvm.experimental.vector.partial.reduce.add(<2 x i32> zeroinitializer, <64 x i32> %mul) + %res = call <2 x i32> @llvm.vector.partial.reduce.add(<2 x i32> zeroinitializer, <64 x i32> %mul) ret <2 x i32> %res } @@ -980,7 +980,7 @@ entry: %a.sext = sext <16 x i8> %a to <16 x i32> %b.sext = sext <16 x i8> %b to <16 x i32> %mul = mul <16 x i32> %a.sext, %b.sext - %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> zeroinitializer, <16 x i32> %mul) + %res = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> zeroinitializer, <16 x i32> %mul) ret <4 x i32> %res } @@ -1023,7 +1023,7 @@ entry: %a.sext = sext <64 x 
i8> %a to <64 x i32> %b.sext = sext <64 x i8> %b to <64 x i32> %mul = mul <64 x i32> %a.sext, %b.sext - %res = call <16 x i32> @llvm.experimental.vector.partial.reduce.add(<16 x i32> zeroinitializer, <64 x i32> %mul) + %res = call <16 x i32> @llvm.vector.partial.reduce.add(<16 x i32> zeroinitializer, <64 x i32> %mul) ret <16 x i32> %res } @@ -1059,7 +1059,7 @@ entry: %a.sext = sext <16 x i8> %a to <16 x i32> %b.sext = sext <16 x i8> %b to <16 x i32> %mul = mul <16 x i32> %a.sext, %b.sext - %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %accum, <16 x i32> %mul) + %res = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %accum, <16 x i32> %mul) ret <4 x i32> %res } @@ -1075,7 +1075,7 @@ entry: %a.sext = sext <16 x i8> %a to <16 x i32> %b.sext = sext <16 x i8> %b to <16 x i32> %mul = mul <16 x i32> %a.sext, %b.sext - %res = call <16 x i32> @llvm.experimental.vector.partial.reduce.add.nvx8i32.nvx16i32.nvx16i32(<16 x i32> %mul, <16 x i32> zeroinitializer) + %res = call <16 x i32> @llvm.vector.partial.reduce.add.nvx8i32.nvx16i32.nvx16i32(<16 x i32> %mul, <16 x i32> zeroinitializer) ret <16 x i32> %res } @@ -1370,7 +1370,7 @@ entry: %a.ext = sext <256 x i8> %a to <256 x i32> %b.ext = zext <256 x i8> %b to <256 x i32> %mul = mul <256 x i32> %b.ext, %a.ext - %res = call <64 x i32> @llvm.experimental.vector.partial.reduce.add(<64 x i32> zeroinitializer, <256 x i32> %mul) + %res = call <64 x i32> @llvm.vector.partial.reduce.add(<64 x i32> zeroinitializer, <256 x i32> %mul) ret <64 x i32> %res } @@ -1419,7 +1419,7 @@ entry: %a.ext = sext <16 x i7> %a to <16 x i31> %b.ext = zext <16 x i7> %b to <16 x i31> %mul = mul <16 x i31> %b.ext, %a.ext - %res = call <4 x i31> @llvm.experimental.vector.partial.reduce.add(<4 x i31> zeroinitializer, <16 x i31> %mul) + %res = call <4 x i31> @llvm.vector.partial.reduce.add(<4 x i31> zeroinitializer, <16 x i31> %mul) ret <4 x i31> %res } @@ -1441,7 +1441,7 @@ entry: %a.ext = sext <2 x i8> %a to <2 x i32> 
%b.ext = zext <2 x i8> %b to <2 x i32> %mul = mul <2 x i32> %b.ext, %a.ext - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <2 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <2 x i32> %mul) ret <1 x i32> %res } @@ -1482,7 +1482,7 @@ entry: %a.ext = sext <8 x i8> %a to <8 x i32> %b.ext = zext <8 x i8> %b to <8 x i32> %mul = mul <8 x i32> %b.ext, %a.ext - %res = call <1 x i32> @llvm.experimental.vector.partial.reduce.add(<1 x i32> zeroinitializer, <8 x i32> %mul) + %res = call <1 x i32> @llvm.vector.partial.reduce.add(<1 x i32> zeroinitializer, <8 x i32> %mul) ret <1 x i32> %res } @@ -1516,7 +1516,7 @@ define <4 x i32> @partial_of_sext(<16 x i8> %a) { ; DOT-NEXT: ret entry: %a.ext = sext <16 x i8> %a to <16 x i32> - %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> zeroinitializer, <16 x i32> %a.ext) + %res = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> zeroinitializer, <16 x i32> %a.ext) ret <4 x i32> %res } @@ -1549,7 +1549,7 @@ define <4 x i32> @partial_of_zext(<16 x i8> %a) { ; DOT-NEXT: ret entry: %a.ext = zext <16 x i8> %a to <16 x i32> - %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> zeroinitializer, <16 x i32> %a.ext) + %res = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> zeroinitializer, <16 x i32> %a.ext) ret <4 x i32> %res } diff --git a/llvm/test/CodeGen/RISCV/rvv/partial-reduction-add.ll b/llvm/test/CodeGen/RISCV/rvv/partial-reduction-add.ll index ff8037502a4e3..1ef168b765346 100644 --- a/llvm/test/CodeGen/RISCV/rvv/partial-reduction-add.ll +++ b/llvm/test/CodeGen/RISCV/rvv/partial-reduction-add.ll @@ -9,7 +9,7 @@ define <4 x i32> @partial_reduce_add_v4i32_v4i32(<4 x i32> %accumulator, <4 x i3 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret entry: - %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %accumulator, <4 x i32> %0) + 
%partial.reduce = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %accumulator, <4 x i32> %0) ret <4 x i32> %partial.reduce } @@ -24,7 +24,7 @@ define <4 x i32> @partial_reduce_add_v4i32_v8i32(<4 x i32> %accumulator, <8 x i3 ; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret entry: - %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %accumulator, <8 x i32> %0) + %partial.reduce = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %accumulator, <8 x i32> %0) ret <4 x i32> %partial.reduce } @@ -35,7 +35,7 @@ define @partial_reduce_add_nvx4i32_nvx4i32( ; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.experimental.vector.partial.reduce.add( %accumulator, %0) + %partial.reduce = call @llvm.vector.partial.reduce.add( %accumulator, %0) ret %partial.reduce } @@ -47,7 +47,7 @@ define @partial_reduce_add_nvx4i32_nvx8i32( ; CHECK-NEXT: vadd.vv v8, v14, v8 ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.experimental.vector.partial.reduce.add( %accumulator, %0) + %partial.reduce = call @llvm.vector.partial.reduce.add( %accumulator, %0) ret %partial.reduce } @@ -61,7 +61,7 @@ define @partial_reduce_add_nvx4i32_nvx16i32( @llvm.experimental.vector.partial.reduce.add( %accumulator, %0) + %partial.reduce = call @llvm.vector.partial.reduce.add( %accumulator, %0) ret %partial.reduce } @@ -73,7 +73,7 @@ define @partial_reduce_add_nvx8i32_nvx16i32( @llvm.experimental.vector.partial.reduce.add( %accumulator, %0) + %partial.reduce = call @llvm.vector.partial.reduce.add( %accumulator, %0) ret %partial.reduce } diff --git a/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll b/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll index cca00bf58063d..2d64defe8c7b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll +++ b/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll @@ -6,7 +6,7 @@ target triple = "riscv64-unknown-linux-gnu" define i32 
@_ZN4Mesh12rezone_countESt6vectorIiSaIiEERiS3_( %wide.load, %0, %1, %2, %3) #0 { ; CHECK-LABEL: _ZN4Mesh12rezone_countESt6vectorIiSaIiEERiS3_: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 0, e32, m2, ta, ma ; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: vmv.v.i v10, 0 @@ -14,7 +14,7 @@ define i32 @_ZN4Mesh12rezone_countESt6vectorIiSaIiEERiS3_( %wi ; CHECK-NEXT: vmv.v.i v14, 0 ; CHECK-NEXT: .LBB0_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetivli zero, 0, e32, m2, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: vmv2r.v v16, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll index 20608cd6bed87..3844b984455c4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll @@ -238,3 +238,90 @@ define void @segmented_store_insert_subreg( %v0, , 3) %t2, ptr %p, iXLen %vl, iXLen 5) ret void } + +define void @recurrence( %v, ptr %p, iXLen %n, iXLen %vl) { +; CHECK-LABEL: recurrence: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: .LBB16_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vadd.vv v10, v10, v8 +; CHECK-NEXT: bnez a1, .LBB16_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: vse32.v v10, (a0) +; CHECK-NEXT: ret +entry: + br label %loop +loop: + %iv = phi iXLen [ 0, %entry ], [ %iv.next, %loop ] + %phi = phi [ zeroinitializer, %entry ], [ %x, %loop ] + %x = add %phi, %v + %iv.next = add iXLen %iv, 1 + %done = icmp eq iXLen %iv.next, %n + br i1 %done, label %exit, label %loop +exit: + call void @llvm.riscv.vse( %x, ptr %p, iXLen %vl) + ret void +} + +define void @recurrence_vleff( %v, ptr %p, iXLen %n, iXLen %vl) { +; CHECK-LABEL: recurrence_vleff: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: mv a3, a0 +; CHECK-NEXT: .LBB17_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v10, (a3) +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: bnez a1, .LBB17_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: ret +entry: + br label %loop +loop: + %iv = phi iXLen [ 0, %entry ], [ %iv.next, %loop ] + %phi = phi [ zeroinitializer, %entry ], [ %y, %loop ] + %gep = getelementptr i32, ptr %p, iXLen %iv + %vleff = call { , iXLen } @llvm.riscv.vleff( poison, ptr %gep, iXLen %vl) + %vleff.x = extractvalue { , iXLen } %vleff, 0 + %vleff.vl = extractvalue { , iXLen } %vleff, 1 + %y = add %phi, %vleff.x + call void @llvm.riscv.vse( %y, ptr %p, iXLen %vleff.vl) + %iv.next = add iXLen %iv, 1 + %done = icmp eq iXLen %iv.next, %n + br i1 %done, label %exit, label %loop +exit: + ret void +} + +define @join( %v, i1 %cond, iXLen %vl) { +; CHECK-LABEL: join: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: beqz a0, .LBB18_2 +; CHECK-NEXT: # %bb.1: # %foo +; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB18_2: # %bar +; CHECK-NEXT: vadd.vi v8, v8, 2 +; CHECK-NEXT: ret +entry: + %a = call @llvm.riscv.vadd( poison, %v, iXLen 1, iXLen -1) + br i1 %cond, label %foo, label %bar +foo: + %b = call @llvm.riscv.vadd( poison, %a, iXLen 1, iXLen 1) + ret %b +bar: + %c = call @llvm.riscv.vadd( poison, %a, iXLen 2, iXLen 2) + ret %c +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 086b3203ed5b0..9174b98de0aa9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ 
b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -699,3 +699,74 @@ body: | %11:vr = PseudoVADD_VV_M1 $noreg, %2, $noreg, 10, 5 /* e32 */, 3 /* ta, ma */ $v10 = COPY %11 PseudoRET implicit $v10 +... +--- +name: recurrence +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: recurrence + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %avl:gprnox0 = COPY $x8 + ; CHECK-NEXT: %start:vr = PseudoVMV_V_I_M1 $noreg, 0, %avl, 3 /* e8 */, 3 /* ta, ma */ + ; CHECK-NEXT: PseudoBR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %phi:vr = PHI %start, %bb.0, %inc, %bb.1 + ; CHECK-NEXT: %inc:vr = PseudoVADD_VI_M1 $noreg, %phi, 1, %avl, 3 /* e8 */, 3 /* ta, ma */ + ; CHECK-NEXT: BNE $noreg, $noreg, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoVSE8_V_M1 %inc, $noreg, %avl, 3 /* e8 */ + bb.0: + liveins: $x8 + %avl:gprnox0 = COPY $x8 + %start:vr = PseudoVMV_V_I_M1 $noreg, 0, -1, 3 /* e8 */, 3, /* ta, ma */ + PseudoBR %bb.1 + bb.1: + %phi:vr = PHI %start, %bb.0, %inc, %bb.1 + %inc:vr = PseudoVADD_VI_M1 $noreg, %phi, 1, -1, 3 /* e8 */, 3 /* ta, ma */ + BNE $noreg, $noreg, %bb.1 + bb.2: + PseudoVSE8_V_M1 %inc, $noreg, %avl, 3 /* e8 */ +... 
+--- +name: recurrence_cant_reduce +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: recurrence_cant_reduce + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $x8, $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %avl1:gprnox0 = COPY $x8 + ; CHECK-NEXT: %avl2:gprnox0 = COPY $x8 + ; CHECK-NEXT: %start:vr = PseudoVMV_V_I_M1 $noreg, 0, %avl1, 3 /* e8 */, 3 /* ta, ma */ + ; CHECK-NEXT: PseudoBR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %phi:vr = PHI %start, %bb.0, %inc, %bb.1 + ; CHECK-NEXT: %inc:vr = PseudoVADD_VI_M1 $noreg, %phi, 1, %avl1, 3 /* e8 */, 3 /* ta, ma */ + ; CHECK-NEXT: BNE $noreg, $noreg, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoVSE8_V_M1 %inc, $noreg, %avl2, 3 /* e8 */ + bb.0: + liveins: $x8, $x9 + %avl1:gprnox0 = COPY $x8 + %avl2:gprnox0 = COPY $x8 + %start:vr = PseudoVMV_V_I_M1 $noreg, 0, -1, 3 /* e8 */, 3, /* ta, ma */ + PseudoBR %bb.1 + bb.1: + %phi:vr = PHI %start, %bb.0, %inc, %bb.1 + %inc:vr = PseudoVADD_VI_M1 $noreg, %phi, 1, %avl1, 3 /* e8 */, 3 /* ta, ma */ + BNE $noreg, $noreg, %bb.1 + bb.2: + PseudoVSE8_V_M1 %inc, $noreg, %avl2, 3 /* e8 */ +... diff --git a/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll b/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll index 4b9f9a0579c48..3a05477e64ccd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll @@ -11,7 +11,7 @@ ; which was responsible for speeding it up. define @same_vl_imm( %passthru, %a, %b) { - ; CHECK: User VL is: 4 + ; CHECK: Trying to reduce VL for %{{.+}}:vrm2 = PseudoVADD_VV_M2 ; CHECK: Abort due to CommonVL == VLOp, no point in reducing. 
%v = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, %b, i64 4) %w = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %v, %a, i64 4) @@ -19,7 +19,7 @@ define @same_vl_imm( %passthru, @same_vl_reg( %passthru, %a, %b, i64 %vl) { - ; CHECK: User VL is: %3:gprnox0 + ; CHECK: Trying to reduce VL for %{{.+}}:vrm2 = PseudoVADD_VV_M2 ; CHECK: Abort due to CommonVL == VLOp, no point in reducing. %v = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, %b, i64 %vl) %w = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %v, %a, i64 %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index dddcd4f107e3b..ead79fcf53d8b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -18,13 +18,10 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV32-NEXT: # %bb.1: # %for.cond1.preheader.lr.ph ; RV32-NEXT: blez a6, .LBB0_17 ; RV32-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader -; RV32-NEXT: addi t0, a7, -1 +; RV32-NEXT: addi t3, a7, -1 ; RV32-NEXT: csrr t2, vlenb -; RV32-NEXT: mul t3, a1, t0 -; RV32-NEXT: mul t4, a3, t0 -; RV32-NEXT: mul t5, a5, t0 ; RV32-NEXT: slli t1, t2, 1 -; RV32-NEXT: li t6, 32 +; RV32-NEXT: li t4, 32 ; RV32-NEXT: mv t0, t1 ; RV32-NEXT: # %bb.3: # %for.cond1.preheader.us.preheader ; RV32-NEXT: li t0, 32 @@ -34,27 +31,32 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 0(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: .cfi_offset s2, -12 +; RV32-NEXT: .cfi_offset s3, -16 ; RV32-NEXT: .cfi_remember_state -; RV32-NEXT: add t3, a0, t3 -; RV32-NEXT: add t4, a2, t4 -; RV32-NEXT: add s0, a4, t5 -; RV32-NEXT: bltu t6, t1, .LBB0_6 +; 
RV32-NEXT: mul t5, a1, t3 +; RV32-NEXT: add s0, a0, a6 +; RV32-NEXT: mul t6, a3, t3 +; RV32-NEXT: add s2, a2, a6 +; RV32-NEXT: mul s1, a5, t3 +; RV32-NEXT: add s3, a4, a6 +; RV32-NEXT: bltu t4, t1, .LBB0_6 ; RV32-NEXT: # %bb.5: # %for.cond1.preheader.us.preheader ; RV32-NEXT: li t1, 32 ; RV32-NEXT: .LBB0_6: # %for.cond1.preheader.us.preheader -; RV32-NEXT: add t3, t3, a6 -; RV32-NEXT: add t5, t4, a6 -; RV32-NEXT: add t4, s0, a6 +; RV32-NEXT: add t3, s0, t5 +; RV32-NEXT: add t6, s2, t6 +; RV32-NEXT: add t4, s3, s1 ; RV32-NEXT: j .LBB0_8 ; RV32-NEXT: # %bb.7: # %for.cond1.preheader.us.preheader ; RV32-NEXT: mv t1, t0 ; RV32-NEXT: .LBB0_8: # %for.cond1.preheader.us.preheader ; RV32-NEXT: .cfi_restore_state ; RV32-NEXT: li t0, 0 -; RV32-NEXT: sltu t5, a0, t5 +; RV32-NEXT: sltu t5, a0, t6 ; RV32-NEXT: sltu t6, a2, t3 ; RV32-NEXT: and t5, t5, t6 ; RV32-NEXT: sltu t4, a0, t4 @@ -140,9 +142,11 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 0(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 ; RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: .cfi_restore s3 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: .LBB0_17: # %for.cond.cleanup @@ -190,7 +194,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64P670-NEXT: or t6, s0, s1 ; RV64P670-NEXT: sltu s1, a0, t5 ; RV64P670-NEXT: sltu s0, a4, t4 -; RV64P670-NEXT: mv t5, a0 +; RV64P670-NEXT: add t4, a0, a6 ; RV64P670-NEXT: and s0, s0, s1 ; RV64P670-NEXT: or s1, a1, a5 ; RV64P670-NEXT: srli s1, s1, 63 @@ -200,11 +204,11 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64P670-NEXT: or s0, t6, s0 ; RV64P670-NEXT: sltu s1, a6, s1 ; RV64P670-NEXT: or s0, s0, s1 -; RV64P670-NEXT: andi t4, s0, 1 +; 
RV64P670-NEXT: andi t5, s0, 1 ; RV64P670-NEXT: j .LBB0_4 ; RV64P670-NEXT: .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us ; RV64P670-NEXT: # in Loop: Header=BB0_4 Depth=1 -; RV64P670-NEXT: add t5, t5, a1 +; RV64P670-NEXT: add a0, a0, a1 ; RV64P670-NEXT: add a2, a2, a3 ; RV64P670-NEXT: add a4, a4, a5 ; RV64P670-NEXT: addiw t1, t1, 1 @@ -214,7 +218,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64P670-NEXT: # =>This Loop Header: Depth=1 ; RV64P670-NEXT: # Child Loop BB0_7 Depth 2 ; RV64P670-NEXT: # Child Loop BB0_10 Depth 2 -; RV64P670-NEXT: beqz t4, .LBB0_6 +; RV64P670-NEXT: beqz t5, .LBB0_6 ; RV64P670-NEXT: # %bb.5: # in Loop: Header=BB0_4 Depth=1 ; RV64P670-NEXT: li t6, 0 ; RV64P670-NEXT: j .LBB0_9 @@ -223,7 +227,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64P670-NEXT: slli s1, t2, 28 ; RV64P670-NEXT: mv s2, a2 ; RV64P670-NEXT: mv s3, a4 -; RV64P670-NEXT: mv s4, t5 +; RV64P670-NEXT: mv s4, a0 ; RV64P670-NEXT: sub s1, s1, t3 ; RV64P670-NEXT: vsetvli s0, zero, e8, m2, ta, ma ; RV64P670-NEXT: and t6, s1, a6 @@ -246,11 +250,10 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64P670-NEXT: .LBB0_9: # %for.body4.us.preheader ; RV64P670-NEXT: # in Loop: Header=BB0_4 Depth=1 ; RV64P670-NEXT: mul s2, a1, t0 -; RV64P670-NEXT: add s0, a0, a6 -; RV64P670-NEXT: add s1, t5, t6 +; RV64P670-NEXT: add s1, a0, t6 ; RV64P670-NEXT: add s4, a4, t6 ; RV64P670-NEXT: add t6, t6, a2 -; RV64P670-NEXT: add s2, s2, s0 +; RV64P670-NEXT: add s2, s2, t4 ; RV64P670-NEXT: .LBB0_10: # %for.body4.us ; RV64P670-NEXT: # Parent Loop BB0_4 Depth=1 ; RV64P670-NEXT: # => This Inner Loop Header: Depth=2 @@ -332,12 +335,12 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64X60-NEXT: or s0, t4, s0 ; RV64X60-NEXT: sltu s1, a6, s1 ; RV64X60-NEXT: or s0, s0, s1 -; RV64X60-NEXT: andi t4, s0, 1 -; RV64X60-NEXT: mv t5, a0 +; RV64X60-NEXT: add 
t4, a0, a6 +; RV64X60-NEXT: andi t5, s0, 1 ; RV64X60-NEXT: j .LBB0_4 ; RV64X60-NEXT: .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us ; RV64X60-NEXT: # in Loop: Header=BB0_4 Depth=1 -; RV64X60-NEXT: add t5, t5, a1 +; RV64X60-NEXT: add a0, a0, a1 ; RV64X60-NEXT: add a2, a2, a3 ; RV64X60-NEXT: addiw t1, t1, 1 ; RV64X60-NEXT: add a4, a4, a5 @@ -347,7 +350,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64X60-NEXT: # =>This Loop Header: Depth=1 ; RV64X60-NEXT: # Child Loop BB0_7 Depth 2 ; RV64X60-NEXT: # Child Loop BB0_10 Depth 2 -; RV64X60-NEXT: beqz t4, .LBB0_6 +; RV64X60-NEXT: beqz t5, .LBB0_6 ; RV64X60-NEXT: # %bb.5: # in Loop: Header=BB0_4 Depth=1 ; RV64X60-NEXT: li t6, 0 ; RV64X60-NEXT: j .LBB0_9 @@ -358,7 +361,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64X60-NEXT: and t6, s1, a6 ; RV64X60-NEXT: mv s2, a2 ; RV64X60-NEXT: mv s3, a4 -; RV64X60-NEXT: mv s4, t5 +; RV64X60-NEXT: mv s4, a0 ; RV64X60-NEXT: mv s1, t6 ; RV64X60-NEXT: vsetvli s0, zero, e8, m2, ta, ma ; RV64X60-NEXT: .LBB0_7: # %vector.body @@ -379,9 +382,8 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64X60-NEXT: .LBB0_9: # %for.body4.us.preheader ; RV64X60-NEXT: # in Loop: Header=BB0_4 Depth=1 ; RV64X60-NEXT: mul s2, a1, t0 -; RV64X60-NEXT: add s1, a0, a6 -; RV64X60-NEXT: add s0, t5, t6 -; RV64X60-NEXT: add s2, s2, s1 +; RV64X60-NEXT: add s0, a0, t6 +; RV64X60-NEXT: add s2, s2, t4 ; RV64X60-NEXT: add s4, a4, t6 ; RV64X60-NEXT: add t6, t6, a2 ; RV64X60-NEXT: .LBB0_10: # %for.body4.us @@ -466,16 +468,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: or s0, a1, a5 ; RV64-NEXT: srli s0, s0, 63 ; RV64-NEXT: or t5, t5, s0 +; RV64-NEXT: sltu s0, a6, t4 ; RV64-NEXT: or t5, t6, t5 -; RV64-NEXT: sltu t4, a6, t4 -; RV64-NEXT: or t4, t4, t5 -; RV64-NEXT: andi t4, t4, 1 -; RV64-NEXT: mv t5, a0 +; RV64-NEXT: add t4, a0, a6 +; RV64-NEXT: 
or t5, s0, t5 +; RV64-NEXT: andi t5, t5, 1 ; RV64-NEXT: csrwi vxrm, 0 ; RV64-NEXT: j .LBB0_6 ; RV64-NEXT: .LBB0_5: # %for.cond1.for.cond.cleanup3_crit_edge.us ; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1 -; RV64-NEXT: add t5, t5, a1 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: addiw t3, t3, 1 @@ -485,7 +487,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: # =>This Loop Header: Depth=1 ; RV64-NEXT: # Child Loop BB0_9 Depth 2 ; RV64-NEXT: # Child Loop BB0_12 Depth 2 -; RV64-NEXT: beqz t4, .LBB0_8 +; RV64-NEXT: beqz t5, .LBB0_8 ; RV64-NEXT: # %bb.7: # in Loop: Header=BB0_6 Depth=1 ; RV64-NEXT: li t6, 0 ; RV64-NEXT: j .LBB0_11 @@ -496,7 +498,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: and t6, t6, a6 ; RV64-NEXT: mv s0, a2 ; RV64-NEXT: mv s1, a4 -; RV64-NEXT: mv s2, t5 +; RV64-NEXT: mv s2, a0 ; RV64-NEXT: mv s3, t6 ; RV64-NEXT: vsetvli s4, zero, e8, m2, ta, ma ; RV64-NEXT: .LBB0_9: # %vector.body @@ -516,25 +518,24 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: beq t6, a6, .LBB0_5 ; RV64-NEXT: .LBB0_11: # %for.body4.us.preheader ; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1 -; RV64-NEXT: mul s1, a1, t2 -; RV64-NEXT: add s2, a0, a6 -; RV64-NEXT: add s0, t5, t6 -; RV64-NEXT: add s1, s2, s1 -; RV64-NEXT: add s2, a4, t6 +; RV64-NEXT: mul s2, a1, t2 +; RV64-NEXT: add s0, a0, t6 +; RV64-NEXT: add s1, a4, t6 +; RV64-NEXT: add s2, t4, s2 ; RV64-NEXT: add t6, a2, t6 ; RV64-NEXT: .LBB0_12: # %for.body4.us ; RV64-NEXT: # Parent Loop BB0_6 Depth=1 ; RV64-NEXT: # => This Inner Loop Header: Depth=2 ; RV64-NEXT: lbu s3, 0(t6) -; RV64-NEXT: lbu s4, 0(s2) +; RV64-NEXT: lbu s4, 0(s1) ; RV64-NEXT: add s3, s3, s4 ; RV64-NEXT: addi s3, s3, 1 ; RV64-NEXT: srli s3, s3, 1 ; RV64-NEXT: sb s3, 0(s0) ; RV64-NEXT: addi s0, s0, 1 -; RV64-NEXT: addi s2, s2, 1 +; RV64-NEXT: addi s1, s1, 1 ; 
RV64-NEXT: addi t6, t6, 1 -; RV64-NEXT: bne s0, s1, .LBB0_12 +; RV64-NEXT: bne s0, s2, .LBB0_12 ; RV64-NEXT: j .LBB0_5 ; RV64-NEXT: .LBB0_13: ; RV64-NEXT: ld s0, 40(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll index 87a984bda1fee..772895316ebc6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll @@ -528,7 +528,7 @@ entry: %a.sext = sext %a to %b.sext = sext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %mul) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %mul) ret %res } @@ -556,7 +556,7 @@ entry: %a.sext = sext %a to %b.sext = sext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %mul) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %mul) ret %res } @@ -584,7 +584,7 @@ entry: %a.sext = sext %a to %b.sext = sext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %mul) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %mul) ret %res } @@ -615,7 +615,7 @@ entry: %a.sext = sext %a to %b.sext = sext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %mul) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %mul) ret %res } @@ -664,7 +664,7 @@ entry: %a.sext = sext %a to %b.sext = sext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %mul) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %mul) ret %res } @@ -828,7 +828,7 @@ entry: %a.sext = sext %a to %b.sext = sext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %mul) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %mul) 
ret %res } @@ -856,7 +856,7 @@ entry: %a.sext = sext %a to %b.sext = sext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( %accum, %mul) + %res = call @llvm.vector.partial.reduce.add( %accum, %mul) ret %res } @@ -872,7 +872,7 @@ entry: %a.sext = sext %a to %b.sext = sext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add.nvx16i32.nvx16i32( %mul, zeroinitializer) + %res = call @llvm.vector.partial.reduce.add.nvx16i32.nvx16i32( %mul, zeroinitializer) ret %res } @@ -905,7 +905,7 @@ entry: %a.sext = zext %a to %b.sext = zext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %mul) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %mul) ret %res } @@ -938,7 +938,7 @@ entry: %a.sext = sext %a to %b.sext = zext %b to %mul = mul %a.sext, %b.sext - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %mul) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %mul) ret %res } @@ -965,7 +965,7 @@ define @partial_of_sext( %a) { ; DOT-NEXT: ret entry: %a.ext = sext %a to - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %a.ext) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %a.ext) ret %res } @@ -991,7 +991,7 @@ define @partial_of_zext( %a) { ; DOT-NEXT: ret entry: %a.ext = zext %a to - %res = call @llvm.experimental.vector.partial.reduce.add( zeroinitializer, %a.ext) + %res = call @llvm.vector.partial.reduce.add( zeroinitializer, %a.ext) ret %res } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: diff --git a/llvm/test/CodeGen/RISCV/select-zbb.ll b/llvm/test/CodeGen/RISCV/select-zbb.ll index 0af699aae3288..efc3f46376b4e 100644 --- a/llvm/test/CodeGen/RISCV/select-zbb.ll +++ b/llvm/test/CodeGen/RISCV/select-zbb.ll @@ -12,96 +12,80 @@ define i32 @select_umin_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IM-LABEL: select_umin_1: ; RV32IM: # %bb.0: # %entry -; RV32IM-NEXT: bgeu a1, a2, .LBB0_3 +; RV32IM-NEXT: addi a0, a0, -1 +; RV32IM-NEXT: or a1, a0, a1 +; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: bltu a2, a1, .LBB0_2 ; RV32IM-NEXT: # %bb.1: # %entry -; RV32IM-NEXT: beqz a0, .LBB0_4 -; RV32IM-NEXT: .LBB0_2: # %entry ; RV32IM-NEXT: mv a0, a1 -; RV32IM-NEXT: ret -; RV32IM-NEXT: .LBB0_3: # %entry -; RV32IM-NEXT: mv a1, a2 -; RV32IM-NEXT: bnez a0, .LBB0_2 -; RV32IM-NEXT: .LBB0_4: # %entry -; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: .LBB0_2: # %entry ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: select_umin_1: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: sext.w a3, a2 +; RV64IM-NEXT: mv a3, a0 +; RV64IM-NEXT: sext.w a0, a2 +; RV64IM-NEXT: addi a3, a3, -1 +; RV64IM-NEXT: or a1, a3, a1 ; RV64IM-NEXT: sext.w a1, a1 -; RV64IM-NEXT: bgeu a1, a3, .LBB0_3 +; RV64IM-NEXT: bltu a0, a1, .LBB0_2 ; RV64IM-NEXT: # %bb.1: # %entry -; RV64IM-NEXT: beqz a0, .LBB0_4 -; RV64IM-NEXT: .LBB0_2: # %entry ; RV64IM-NEXT: mv a0, a1 -; RV64IM-NEXT: ret -; RV64IM-NEXT: .LBB0_3: # %entry -; RV64IM-NEXT: mv a1, a3 -; RV64IM-NEXT: bnez a0, .LBB0_2 -; RV64IM-NEXT: .LBB0_4: # %entry -; RV64IM-NEXT: mv a0, a2 +; RV64IM-NEXT: .LBB0_2: # %entry ; RV64IM-NEXT: ret ; ; RV32IMZBB-LABEL: select_umin_1: ; RV32IMZBB: # %bb.0: # %entry -; RV32IMZBB-NEXT: beqz a0, .LBB0_2 -; RV32IMZBB-NEXT: # %bb.1: -; RV32IMZBB-NEXT: minu a2, a1, a2 -; RV32IMZBB-NEXT: .LBB0_2: # %entry -; RV32IMZBB-NEXT: mv a0, a2 +; RV32IMZBB-NEXT: addi a0, a0, -1 +; RV32IMZBB-NEXT: or a0, a0, a1 +; RV32IMZBB-NEXT: minu a0, a2, a0 ; RV32IMZBB-NEXT: ret ; ; RV64IMZBB-LABEL: select_umin_1: ; RV64IMZBB: # %bb.0: # %entry -; 
RV64IMZBB-NEXT: beqz a0, .LBB0_2 -; RV64IMZBB-NEXT: # %bb.1: ; RV64IMZBB-NEXT: sext.w a2, a2 -; RV64IMZBB-NEXT: sext.w a1, a1 -; RV64IMZBB-NEXT: minu a2, a1, a2 -; RV64IMZBB-NEXT: .LBB0_2: # %entry -; RV64IMZBB-NEXT: mv a0, a2 +; RV64IMZBB-NEXT: addi a0, a0, -1 +; RV64IMZBB-NEXT: or a0, a0, a1 +; RV64IMZBB-NEXT: sext.w a0, a0 +; RV64IMZBB-NEXT: minu a0, a2, a0 ; RV64IMZBB-NEXT: ret ; ; RV32IMZICOND-LABEL: select_umin_1: ; RV32IMZICOND: # %bb.0: # %entry -; RV32IMZICOND-NEXT: sltu a3, a1, a2 -; RV32IMZICOND-NEXT: czero.nez a4, a2, a3 -; RV32IMZICOND-NEXT: czero.eqz a1, a1, a3 -; RV32IMZICOND-NEXT: or a1, a1, a4 -; RV32IMZICOND-NEXT: czero.eqz a1, a1, a0 -; RV32IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV32IMZICOND-NEXT: addi a0, a0, -1 +; RV32IMZICOND-NEXT: or a0, a0, a1 +; RV32IMZICOND-NEXT: sltu a1, a2, a0 +; RV32IMZICOND-NEXT: czero.nez a0, a0, a1 +; RV32IMZICOND-NEXT: czero.eqz a1, a2, a1 ; RV32IMZICOND-NEXT: or a0, a1, a0 ; RV32IMZICOND-NEXT: ret ; ; RV64IMZICOND-LABEL: select_umin_1: ; RV64IMZICOND: # %bb.0: # %entry -; RV64IMZICOND-NEXT: sext.w a3, a2 -; RV64IMZICOND-NEXT: sext.w a1, a1 -; RV64IMZICOND-NEXT: sltu a4, a1, a3 -; RV64IMZICOND-NEXT: czero.nez a3, a3, a4 -; RV64IMZICOND-NEXT: czero.eqz a1, a1, a4 -; RV64IMZICOND-NEXT: or a1, a1, a3 -; RV64IMZICOND-NEXT: czero.eqz a1, a1, a0 -; RV64IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV64IMZICOND-NEXT: sext.w a2, a2 +; RV64IMZICOND-NEXT: addi a0, a0, -1 +; RV64IMZICOND-NEXT: or a0, a0, a1 +; RV64IMZICOND-NEXT: sext.w a0, a0 +; RV64IMZICOND-NEXT: sltu a1, a2, a0 +; RV64IMZICOND-NEXT: czero.nez a0, a0, a1 +; RV64IMZICOND-NEXT: czero.eqz a1, a2, a1 ; RV64IMZICOND-NEXT: or a0, a1, a0 ; RV64IMZICOND-NEXT: ret ; ; RV32IMBOTH-LABEL: select_umin_1: ; RV32IMBOTH: # %bb.0: # %entry -; RV32IMBOTH-NEXT: minu a1, a1, a2 -; RV32IMBOTH-NEXT: czero.nez a2, a2, a0 -; RV32IMBOTH-NEXT: czero.eqz a0, a1, a0 -; RV32IMBOTH-NEXT: or a0, a0, a2 +; RV32IMBOTH-NEXT: addi a0, a0, -1 +; RV32IMBOTH-NEXT: or a0, a0, a1 +; RV32IMBOTH-NEXT: minu 
a0, a2, a0 ; RV32IMBOTH-NEXT: ret ; ; RV64IMBOTH-LABEL: select_umin_1: ; RV64IMBOTH: # %bb.0: # %entry -; RV64IMBOTH-NEXT: sext.w a3, a2 -; RV64IMBOTH-NEXT: sext.w a1, a1 -; RV64IMBOTH-NEXT: minu a1, a1, a3 -; RV64IMBOTH-NEXT: czero.nez a2, a2, a0 -; RV64IMBOTH-NEXT: czero.eqz a0, a1, a0 -; RV64IMBOTH-NEXT: or a0, a0, a2 +; RV64IMBOTH-NEXT: sext.w a2, a2 +; RV64IMBOTH-NEXT: addi a0, a0, -1 +; RV64IMBOTH-NEXT: or a0, a0, a1 +; RV64IMBOTH-NEXT: sext.w a0, a0 +; RV64IMBOTH-NEXT: minu a0, a2, a0 ; RV64IMBOTH-NEXT: ret entry: %c = call i32 @llvm.umin(i32 %a, i32 %b) @@ -112,97 +96,80 @@ entry: define i32 @select_umin_2(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IM-LABEL: select_umin_2: ; RV32IM: # %bb.0: # %entry -; RV32IM-NEXT: mv a3, a1 -; RV32IM-NEXT: bgeu a1, a2, .LBB1_3 +; RV32IM-NEXT: neg a0, a0 +; RV32IM-NEXT: or a2, a0, a2 +; RV32IM-NEXT: mv a0, a1 +; RV32IM-NEXT: bltu a1, a2, .LBB1_2 ; RV32IM-NEXT: # %bb.1: # %entry -; RV32IM-NEXT: beqz a0, .LBB1_4 +; RV32IM-NEXT: mv a0, a2 ; RV32IM-NEXT: .LBB1_2: # %entry -; RV32IM-NEXT: mv a0, a1 -; RV32IM-NEXT: ret -; RV32IM-NEXT: .LBB1_3: # %entry -; RV32IM-NEXT: mv a3, a2 -; RV32IM-NEXT: bnez a0, .LBB1_2 -; RV32IM-NEXT: .LBB1_4: # %entry -; RV32IM-NEXT: mv a0, a3 ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: select_umin_2: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: sext.w a3, a2 -; RV64IM-NEXT: sext.w a2, a1 -; RV64IM-NEXT: bgeu a2, a3, .LBB1_3 +; RV64IM-NEXT: mv a3, a0 +; RV64IM-NEXT: sext.w a0, a1 +; RV64IM-NEXT: neg a1, a3 +; RV64IM-NEXT: or a1, a1, a2 +; RV64IM-NEXT: sext.w a1, a1 +; RV64IM-NEXT: bltu a0, a1, .LBB1_2 ; RV64IM-NEXT: # %bb.1: # %entry -; RV64IM-NEXT: beqz a0, .LBB1_4 -; RV64IM-NEXT: .LBB1_2: # %entry ; RV64IM-NEXT: mv a0, a1 -; RV64IM-NEXT: ret -; RV64IM-NEXT: .LBB1_3: # %entry -; RV64IM-NEXT: mv a2, a3 -; RV64IM-NEXT: bnez a0, .LBB1_2 -; RV64IM-NEXT: .LBB1_4: # %entry -; RV64IM-NEXT: mv a0, a2 +; RV64IM-NEXT: .LBB1_2: # %entry ; RV64IM-NEXT: ret ; ; RV32IMZBB-LABEL: select_umin_2: ; RV32IMZBB: # %bb.0: # %entry 
-; RV32IMZBB-NEXT: bnez a0, .LBB1_2 -; RV32IMZBB-NEXT: # %bb.1: # %entry -; RV32IMZBB-NEXT: minu a1, a1, a2 -; RV32IMZBB-NEXT: .LBB1_2: # %entry -; RV32IMZBB-NEXT: mv a0, a1 +; RV32IMZBB-NEXT: neg a0, a0 +; RV32IMZBB-NEXT: or a0, a0, a2 +; RV32IMZBB-NEXT: minu a0, a1, a0 ; RV32IMZBB-NEXT: ret ; ; RV64IMZBB-LABEL: select_umin_2: ; RV64IMZBB: # %bb.0: # %entry -; RV64IMZBB-NEXT: bnez a0, .LBB1_2 -; RV64IMZBB-NEXT: # %bb.1: # %entry -; RV64IMZBB-NEXT: sext.w a2, a2 ; RV64IMZBB-NEXT: sext.w a1, a1 -; RV64IMZBB-NEXT: minu a1, a1, a2 -; RV64IMZBB-NEXT: .LBB1_2: # %entry -; RV64IMZBB-NEXT: mv a0, a1 +; RV64IMZBB-NEXT: neg a0, a0 +; RV64IMZBB-NEXT: or a0, a0, a2 +; RV64IMZBB-NEXT: sext.w a0, a0 +; RV64IMZBB-NEXT: minu a0, a1, a0 ; RV64IMZBB-NEXT: ret ; ; RV32IMZICOND-LABEL: select_umin_2: ; RV32IMZICOND: # %bb.0: # %entry -; RV32IMZICOND-NEXT: sltu a3, a1, a2 -; RV32IMZICOND-NEXT: czero.nez a2, a2, a3 -; RV32IMZICOND-NEXT: czero.eqz a3, a1, a3 -; RV32IMZICOND-NEXT: or a2, a3, a2 -; RV32IMZICOND-NEXT: czero.nez a2, a2, a0 -; RV32IMZICOND-NEXT: czero.eqz a0, a1, a0 +; RV32IMZICOND-NEXT: neg a0, a0 ; RV32IMZICOND-NEXT: or a0, a0, a2 +; RV32IMZICOND-NEXT: sltu a2, a1, a0 +; RV32IMZICOND-NEXT: czero.nez a0, a0, a2 +; RV32IMZICOND-NEXT: czero.eqz a1, a1, a2 +; RV32IMZICOND-NEXT: or a0, a1, a0 ; RV32IMZICOND-NEXT: ret ; ; RV64IMZICOND-LABEL: select_umin_2: ; RV64IMZICOND: # %bb.0: # %entry -; RV64IMZICOND-NEXT: sext.w a2, a2 -; RV64IMZICOND-NEXT: sext.w a3, a1 -; RV64IMZICOND-NEXT: sltu a4, a3, a2 -; RV64IMZICOND-NEXT: czero.nez a2, a2, a4 -; RV64IMZICOND-NEXT: czero.eqz a3, a3, a4 -; RV64IMZICOND-NEXT: or a2, a3, a2 -; RV64IMZICOND-NEXT: czero.nez a2, a2, a0 -; RV64IMZICOND-NEXT: czero.eqz a0, a1, a0 +; RV64IMZICOND-NEXT: sext.w a1, a1 +; RV64IMZICOND-NEXT: neg a0, a0 ; RV64IMZICOND-NEXT: or a0, a0, a2 +; RV64IMZICOND-NEXT: sext.w a0, a0 +; RV64IMZICOND-NEXT: sltu a2, a1, a0 +; RV64IMZICOND-NEXT: czero.nez a0, a0, a2 +; RV64IMZICOND-NEXT: czero.eqz a1, a1, a2 +; 
RV64IMZICOND-NEXT: or a0, a1, a0 ; RV64IMZICOND-NEXT: ret ; ; RV32IMBOTH-LABEL: select_umin_2: ; RV32IMBOTH: # %bb.0: # %entry -; RV32IMBOTH-NEXT: minu a2, a1, a2 -; RV32IMBOTH-NEXT: czero.eqz a1, a1, a0 -; RV32IMBOTH-NEXT: czero.nez a0, a2, a0 -; RV32IMBOTH-NEXT: or a0, a1, a0 +; RV32IMBOTH-NEXT: neg a0, a0 +; RV32IMBOTH-NEXT: or a0, a0, a2 +; RV32IMBOTH-NEXT: minu a0, a1, a0 ; RV32IMBOTH-NEXT: ret ; ; RV64IMBOTH-LABEL: select_umin_2: ; RV64IMBOTH: # %bb.0: # %entry -; RV64IMBOTH-NEXT: sext.w a2, a2 -; RV64IMBOTH-NEXT: sext.w a3, a1 -; RV64IMBOTH-NEXT: minu a2, a3, a2 -; RV64IMBOTH-NEXT: czero.eqz a1, a1, a0 -; RV64IMBOTH-NEXT: czero.nez a0, a2, a0 -; RV64IMBOTH-NEXT: or a0, a1, a0 +; RV64IMBOTH-NEXT: sext.w a1, a1 +; RV64IMBOTH-NEXT: neg a0, a0 +; RV64IMBOTH-NEXT: or a0, a0, a2 +; RV64IMBOTH-NEXT: sext.w a0, a0 +; RV64IMBOTH-NEXT: minu a0, a1, a0 ; RV64IMBOTH-NEXT: ret entry: %c = call i32 @llvm.umin(i32 %a, i32 %b) @@ -213,99 +180,76 @@ entry: define i32 @select_umin_3(i1 zeroext %cond, i32 %a) { ; RV32IM-LABEL: select_umin_3: ; RV32IM: # %bb.0: # %entry -; RV32IM-NEXT: li a3, 32 -; RV32IM-NEXT: mv a2, a1 -; RV32IM-NEXT: bgeu a1, a3, .LBB2_3 -; RV32IM-NEXT: # %bb.1: # %entry -; RV32IM-NEXT: beqz a0, .LBB2_4 -; RV32IM-NEXT: .LBB2_2: # %entry +; RV32IM-NEXT: neg a0, a0 +; RV32IM-NEXT: ori a2, a0, 32 ; RV32IM-NEXT: mv a0, a1 -; RV32IM-NEXT: ret -; RV32IM-NEXT: .LBB2_3: # %entry -; RV32IM-NEXT: li a2, 32 -; RV32IM-NEXT: bnez a0, .LBB2_2 -; RV32IM-NEXT: .LBB2_4: # %entry +; RV32IM-NEXT: bltu a1, a2, .LBB2_2 +; RV32IM-NEXT: # %bb.1: # %entry ; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: .LBB2_2: # %entry ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: select_umin_3: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: sext.w a2, a1 -; RV64IM-NEXT: li a3, 32 -; RV64IM-NEXT: bgeu a2, a3, .LBB2_3 +; RV64IM-NEXT: mv a2, a0 +; RV64IM-NEXT: sext.w a0, a1 +; RV64IM-NEXT: neg a1, a2 +; RV64IM-NEXT: ori a1, a1, 32 +; RV64IM-NEXT: bltu a0, a1, .LBB2_2 ; RV64IM-NEXT: # %bb.1: # %entry -; RV64IM-NEXT: 
beqz a0, .LBB2_4 -; RV64IM-NEXT: .LBB2_2: # %entry ; RV64IM-NEXT: mv a0, a1 -; RV64IM-NEXT: ret -; RV64IM-NEXT: .LBB2_3: # %entry -; RV64IM-NEXT: li a2, 32 -; RV64IM-NEXT: bnez a0, .LBB2_2 -; RV64IM-NEXT: .LBB2_4: # %entry -; RV64IM-NEXT: mv a0, a2 +; RV64IM-NEXT: .LBB2_2: # %entry ; RV64IM-NEXT: ret ; ; RV32IMZBB-LABEL: select_umin_3: ; RV32IMZBB: # %bb.0: # %entry -; RV32IMZBB-NEXT: bnez a0, .LBB2_2 -; RV32IMZBB-NEXT: # %bb.1: # %entry -; RV32IMZBB-NEXT: li a0, 32 -; RV32IMZBB-NEXT: minu a1, a1, a0 -; RV32IMZBB-NEXT: .LBB2_2: # %entry -; RV32IMZBB-NEXT: mv a0, a1 +; RV32IMZBB-NEXT: neg a0, a0 +; RV32IMZBB-NEXT: ori a0, a0, 32 +; RV32IMZBB-NEXT: minu a0, a1, a0 ; RV32IMZBB-NEXT: ret ; ; RV64IMZBB-LABEL: select_umin_3: ; RV64IMZBB: # %bb.0: # %entry -; RV64IMZBB-NEXT: bnez a0, .LBB2_2 -; RV64IMZBB-NEXT: # %bb.1: # %entry ; RV64IMZBB-NEXT: sext.w a1, a1 -; RV64IMZBB-NEXT: li a0, 32 -; RV64IMZBB-NEXT: minu a1, a1, a0 -; RV64IMZBB-NEXT: .LBB2_2: # %entry -; RV64IMZBB-NEXT: mv a0, a1 +; RV64IMZBB-NEXT: neg a0, a0 +; RV64IMZBB-NEXT: ori a0, a0, 32 +; RV64IMZBB-NEXT: minu a0, a1, a0 ; RV64IMZBB-NEXT: ret ; ; RV32IMZICOND-LABEL: select_umin_3: ; RV32IMZICOND: # %bb.0: # %entry -; RV32IMZICOND-NEXT: sltiu a2, a1, 32 -; RV32IMZICOND-NEXT: addi a3, a1, -32 -; RV32IMZICOND-NEXT: czero.eqz a2, a3, a2 -; RV32IMZICOND-NEXT: addi a2, a2, 32 -; RV32IMZICOND-NEXT: czero.eqz a1, a1, a0 -; RV32IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV32IMZICOND-NEXT: neg a0, a0 +; RV32IMZICOND-NEXT: ori a0, a0, 32 +; RV32IMZICOND-NEXT: sltu a2, a1, a0 +; RV32IMZICOND-NEXT: czero.nez a0, a0, a2 +; RV32IMZICOND-NEXT: czero.eqz a1, a1, a2 ; RV32IMZICOND-NEXT: or a0, a1, a0 ; RV32IMZICOND-NEXT: ret ; ; RV64IMZICOND-LABEL: select_umin_3: ; RV64IMZICOND: # %bb.0: # %entry -; RV64IMZICOND-NEXT: sext.w a2, a1 -; RV64IMZICOND-NEXT: sltiu a3, a2, 32 -; RV64IMZICOND-NEXT: addi a2, a2, -32 -; RV64IMZICOND-NEXT: czero.eqz a2, a2, a3 -; RV64IMZICOND-NEXT: addi a2, a2, 32 -; RV64IMZICOND-NEXT: czero.eqz a1, a1, a0 
-; RV64IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV64IMZICOND-NEXT: sext.w a1, a1 +; RV64IMZICOND-NEXT: neg a0, a0 +; RV64IMZICOND-NEXT: ori a0, a0, 32 +; RV64IMZICOND-NEXT: sltu a2, a1, a0 +; RV64IMZICOND-NEXT: czero.nez a0, a0, a2 +; RV64IMZICOND-NEXT: czero.eqz a1, a1, a2 ; RV64IMZICOND-NEXT: or a0, a1, a0 ; RV64IMZICOND-NEXT: ret ; ; RV32IMBOTH-LABEL: select_umin_3: ; RV32IMBOTH: # %bb.0: # %entry -; RV32IMBOTH-NEXT: li a2, 32 -; RV32IMBOTH-NEXT: minu a2, a1, a2 -; RV32IMBOTH-NEXT: czero.eqz a1, a1, a0 -; RV32IMBOTH-NEXT: czero.nez a0, a2, a0 -; RV32IMBOTH-NEXT: or a0, a1, a0 +; RV32IMBOTH-NEXT: neg a0, a0 +; RV32IMBOTH-NEXT: ori a0, a0, 32 +; RV32IMBOTH-NEXT: minu a0, a1, a0 ; RV32IMBOTH-NEXT: ret ; ; RV64IMBOTH-LABEL: select_umin_3: ; RV64IMBOTH: # %bb.0: # %entry -; RV64IMBOTH-NEXT: sext.w a2, a1 -; RV64IMBOTH-NEXT: li a3, 32 -; RV64IMBOTH-NEXT: minu a2, a2, a3 -; RV64IMBOTH-NEXT: czero.eqz a1, a1, a0 -; RV64IMBOTH-NEXT: czero.nez a0, a2, a0 -; RV64IMBOTH-NEXT: or a0, a1, a0 +; RV64IMBOTH-NEXT: sext.w a1, a1 +; RV64IMBOTH-NEXT: neg a0, a0 +; RV64IMBOTH-NEXT: ori a0, a0, 32 +; RV64IMBOTH-NEXT: minu a0, a1, a0 ; RV64IMBOTH-NEXT: ret entry: %c = call i32 @llvm.umin(i32 %a, i32 32) @@ -316,94 +260,80 @@ entry: define i32 @select_umin_4(i1 zeroext %cond, i32 %x) { ; RV32IM-LABEL: select_umin_4: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a2, 128 -; RV32IM-NEXT: bgeu a1, a2, .LBB3_3 +; RV32IM-NEXT: neg a0, a0 +; RV32IM-NEXT: or a0, a0, a1 +; RV32IM-NEXT: li a1, 128 +; RV32IM-NEXT: bltu a0, a1, .LBB3_2 ; RV32IM-NEXT: # %bb.1: -; RV32IM-NEXT: beqz a0, .LBB3_4 +; RV32IM-NEXT: li a0, 128 ; RV32IM-NEXT: .LBB3_2: -; RV32IM-NEXT: mv a0, a2 -; RV32IM-NEXT: ret -; RV32IM-NEXT: .LBB3_3: -; RV32IM-NEXT: li a1, 128 -; RV32IM-NEXT: bnez a0, .LBB3_2 -; RV32IM-NEXT: .LBB3_4: -; RV32IM-NEXT: mv a0, a1 ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: select_umin_4: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a2, a1 +; RV64IM-NEXT: neg a0, a0 +; RV64IM-NEXT: or a0, a0, a1 +; RV64IM-NEXT: sext.w a0, a0 ; 
RV64IM-NEXT: li a1, 128 -; RV64IM-NEXT: bgeu a2, a1, .LBB3_3 +; RV64IM-NEXT: bltu a0, a1, .LBB3_2 ; RV64IM-NEXT: # %bb.1: -; RV64IM-NEXT: beqz a0, .LBB3_4 +; RV64IM-NEXT: li a0, 128 ; RV64IM-NEXT: .LBB3_2: -; RV64IM-NEXT: mv a0, a1 -; RV64IM-NEXT: ret -; RV64IM-NEXT: .LBB3_3: -; RV64IM-NEXT: li a2, 128 -; RV64IM-NEXT: bnez a0, .LBB3_2 -; RV64IM-NEXT: .LBB3_4: -; RV64IM-NEXT: mv a0, a2 ; RV64IM-NEXT: ret ; ; RV32IMZBB-LABEL: select_umin_4: ; RV32IMZBB: # %bb.0: -; RV32IMZBB-NEXT: mv a2, a0 -; RV32IMZBB-NEXT: li a0, 128 -; RV32IMZBB-NEXT: bnez a2, .LBB3_2 -; RV32IMZBB-NEXT: # %bb.1: -; RV32IMZBB-NEXT: minu a0, a1, a0 -; RV32IMZBB-NEXT: .LBB3_2: +; RV32IMZBB-NEXT: neg a0, a0 +; RV32IMZBB-NEXT: or a0, a0, a1 +; RV32IMZBB-NEXT: li a1, 128 +; RV32IMZBB-NEXT: minu a0, a0, a1 ; RV32IMZBB-NEXT: ret ; ; RV64IMZBB-LABEL: select_umin_4: ; RV64IMZBB: # %bb.0: -; RV64IMZBB-NEXT: mv a2, a0 -; RV64IMZBB-NEXT: li a0, 128 -; RV64IMZBB-NEXT: bnez a2, .LBB3_2 -; RV64IMZBB-NEXT: # %bb.1: -; RV64IMZBB-NEXT: sext.w a1, a1 -; RV64IMZBB-NEXT: minu a0, a1, a0 -; RV64IMZBB-NEXT: .LBB3_2: +; RV64IMZBB-NEXT: neg a0, a0 +; RV64IMZBB-NEXT: or a0, a0, a1 +; RV64IMZBB-NEXT: sext.w a0, a0 +; RV64IMZBB-NEXT: li a1, 128 +; RV64IMZBB-NEXT: minu a0, a0, a1 ; RV64IMZBB-NEXT: ret ; ; RV32IMZICOND-LABEL: select_umin_4: ; RV32IMZICOND: # %bb.0: -; RV32IMZICOND-NEXT: sltiu a2, a1, 128 -; RV32IMZICOND-NEXT: addi a1, a1, -128 -; RV32IMZICOND-NEXT: czero.eqz a1, a1, a2 -; RV32IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV32IMZICOND-NEXT: neg a0, a0 +; RV32IMZICOND-NEXT: or a0, a0, a1 +; RV32IMZICOND-NEXT: sltiu a1, a0, 128 +; RV32IMZICOND-NEXT: addi a0, a0, -128 +; RV32IMZICOND-NEXT: czero.eqz a0, a0, a1 ; RV32IMZICOND-NEXT: addi a0, a0, 128 ; RV32IMZICOND-NEXT: ret ; ; RV64IMZICOND-LABEL: select_umin_4: ; RV64IMZICOND: # %bb.0: -; RV64IMZICOND-NEXT: sext.w a1, a1 -; RV64IMZICOND-NEXT: sltiu a2, a1, 128 -; RV64IMZICOND-NEXT: addi a1, a1, -128 -; RV64IMZICOND-NEXT: czero.eqz a1, a1, a2 -; RV64IMZICOND-NEXT: 
czero.nez a0, a1, a0 +; RV64IMZICOND-NEXT: neg a0, a0 +; RV64IMZICOND-NEXT: or a0, a0, a1 +; RV64IMZICOND-NEXT: sext.w a0, a0 +; RV64IMZICOND-NEXT: sltiu a1, a0, 128 +; RV64IMZICOND-NEXT: addi a0, a0, -128 +; RV64IMZICOND-NEXT: czero.eqz a0, a0, a1 ; RV64IMZICOND-NEXT: addi a0, a0, 128 ; RV64IMZICOND-NEXT: ret ; ; RV32IMBOTH-LABEL: select_umin_4: ; RV32IMBOTH: # %bb.0: -; RV32IMBOTH-NEXT: li a2, 128 -; RV32IMBOTH-NEXT: minu a1, a1, a2 -; RV32IMBOTH-NEXT: addi a1, a1, -128 -; RV32IMBOTH-NEXT: czero.nez a0, a1, a0 -; RV32IMBOTH-NEXT: addi a0, a0, 128 +; RV32IMBOTH-NEXT: neg a0, a0 +; RV32IMBOTH-NEXT: or a0, a0, a1 +; RV32IMBOTH-NEXT: li a1, 128 +; RV32IMBOTH-NEXT: minu a0, a0, a1 ; RV32IMBOTH-NEXT: ret ; ; RV64IMBOTH-LABEL: select_umin_4: ; RV64IMBOTH: # %bb.0: -; RV64IMBOTH-NEXT: sext.w a1, a1 -; RV64IMBOTH-NEXT: li a2, 128 -; RV64IMBOTH-NEXT: minu a1, a1, a2 -; RV64IMBOTH-NEXT: addi a1, a1, -128 -; RV64IMBOTH-NEXT: czero.nez a0, a1, a0 -; RV64IMBOTH-NEXT: addi a0, a0, 128 +; RV64IMBOTH-NEXT: neg a0, a0 +; RV64IMBOTH-NEXT: or a0, a0, a1 +; RV64IMBOTH-NEXT: sext.w a0, a0 +; RV64IMBOTH-NEXT: li a1, 128 +; RV64IMBOTH-NEXT: minu a0, a0, a1 ; RV64IMBOTH-NEXT: ret %minmax = call i32 @llvm.umin(i32 %x, i32 128) %sel = select i1 %cond, i32 128, i32 %minmax @@ -413,96 +343,76 @@ define i32 @select_umin_4(i1 zeroext %cond, i32 %x) { define i32 @select_umax_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IM-LABEL: select_umax_1: ; RV32IM: # %bb.0: # %entry -; RV32IM-NEXT: bgeu a2, a1, .LBB4_3 +; RV32IM-NEXT: neg a0, a0 +; RV32IM-NEXT: and a1, a0, a1 +; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: bltu a1, a2, .LBB4_2 ; RV32IM-NEXT: # %bb.1: # %entry -; RV32IM-NEXT: beqz a0, .LBB4_4 -; RV32IM-NEXT: .LBB4_2: # %entry ; RV32IM-NEXT: mv a0, a1 -; RV32IM-NEXT: ret -; RV32IM-NEXT: .LBB4_3: # %entry -; RV32IM-NEXT: mv a1, a2 -; RV32IM-NEXT: bnez a0, .LBB4_2 -; RV32IM-NEXT: .LBB4_4: # %entry -; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: .LBB4_2: # %entry ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: 
select_umax_1: ; RV64IM: # %bb.0: # %entry +; RV64IM-NEXT: mv a3, a0 +; RV64IM-NEXT: sext.w a0, a2 +; RV64IM-NEXT: neg a2, a3 +; RV64IM-NEXT: and a1, a2, a1 ; RV64IM-NEXT: sext.w a1, a1 -; RV64IM-NEXT: sext.w a3, a2 -; RV64IM-NEXT: bgeu a3, a1, .LBB4_3 +; RV64IM-NEXT: bltu a1, a0, .LBB4_2 ; RV64IM-NEXT: # %bb.1: # %entry -; RV64IM-NEXT: beqz a0, .LBB4_4 -; RV64IM-NEXT: .LBB4_2: # %entry ; RV64IM-NEXT: mv a0, a1 -; RV64IM-NEXT: ret -; RV64IM-NEXT: .LBB4_3: # %entry -; RV64IM-NEXT: mv a1, a3 -; RV64IM-NEXT: bnez a0, .LBB4_2 -; RV64IM-NEXT: .LBB4_4: # %entry -; RV64IM-NEXT: mv a0, a2 +; RV64IM-NEXT: .LBB4_2: # %entry ; RV64IM-NEXT: ret ; ; RV32IMZBB-LABEL: select_umax_1: ; RV32IMZBB: # %bb.0: # %entry -; RV32IMZBB-NEXT: beqz a0, .LBB4_2 -; RV32IMZBB-NEXT: # %bb.1: -; RV32IMZBB-NEXT: maxu a2, a1, a2 -; RV32IMZBB-NEXT: .LBB4_2: # %entry -; RV32IMZBB-NEXT: mv a0, a2 +; RV32IMZBB-NEXT: neg a0, a0 +; RV32IMZBB-NEXT: and a0, a0, a1 +; RV32IMZBB-NEXT: maxu a0, a2, a0 ; RV32IMZBB-NEXT: ret ; ; RV64IMZBB-LABEL: select_umax_1: ; RV64IMZBB: # %bb.0: # %entry -; RV64IMZBB-NEXT: beqz a0, .LBB4_2 -; RV64IMZBB-NEXT: # %bb.1: ; RV64IMZBB-NEXT: sext.w a2, a2 -; RV64IMZBB-NEXT: sext.w a1, a1 -; RV64IMZBB-NEXT: maxu a2, a1, a2 -; RV64IMZBB-NEXT: .LBB4_2: # %entry -; RV64IMZBB-NEXT: mv a0, a2 +; RV64IMZBB-NEXT: neg a0, a0 +; RV64IMZBB-NEXT: and a0, a0, a1 +; RV64IMZBB-NEXT: sext.w a0, a0 +; RV64IMZBB-NEXT: maxu a0, a2, a0 ; RV64IMZBB-NEXT: ret ; ; RV32IMZICOND-LABEL: select_umax_1: ; RV32IMZICOND: # %bb.0: # %entry -; RV32IMZICOND-NEXT: sltu a3, a2, a1 -; RV32IMZICOND-NEXT: czero.nez a4, a2, a3 -; RV32IMZICOND-NEXT: czero.eqz a1, a1, a3 -; RV32IMZICOND-NEXT: or a1, a1, a4 -; RV32IMZICOND-NEXT: czero.eqz a1, a1, a0 -; RV32IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV32IMZICOND-NEXT: czero.eqz a0, a1, a0 +; RV32IMZICOND-NEXT: sltu a1, a0, a2 +; RV32IMZICOND-NEXT: czero.nez a0, a0, a1 +; RV32IMZICOND-NEXT: czero.eqz a1, a2, a1 ; RV32IMZICOND-NEXT: or a0, a1, a0 ; RV32IMZICOND-NEXT: ret ; ; 
RV64IMZICOND-LABEL: select_umax_1: ; RV64IMZICOND: # %bb.0: # %entry -; RV64IMZICOND-NEXT: sext.w a1, a1 -; RV64IMZICOND-NEXT: sext.w a3, a2 -; RV64IMZICOND-NEXT: sltu a4, a3, a1 -; RV64IMZICOND-NEXT: czero.nez a3, a3, a4 -; RV64IMZICOND-NEXT: czero.eqz a1, a1, a4 -; RV64IMZICOND-NEXT: or a1, a1, a3 -; RV64IMZICOND-NEXT: czero.eqz a1, a1, a0 -; RV64IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV64IMZICOND-NEXT: sext.w a2, a2 +; RV64IMZICOND-NEXT: czero.eqz a0, a1, a0 +; RV64IMZICOND-NEXT: sext.w a0, a0 +; RV64IMZICOND-NEXT: sltu a1, a0, a2 +; RV64IMZICOND-NEXT: czero.nez a0, a0, a1 +; RV64IMZICOND-NEXT: czero.eqz a1, a2, a1 ; RV64IMZICOND-NEXT: or a0, a1, a0 ; RV64IMZICOND-NEXT: ret ; ; RV32IMBOTH-LABEL: select_umax_1: ; RV32IMBOTH: # %bb.0: # %entry -; RV32IMBOTH-NEXT: maxu a1, a1, a2 -; RV32IMBOTH-NEXT: czero.nez a2, a2, a0 ; RV32IMBOTH-NEXT: czero.eqz a0, a1, a0 -; RV32IMBOTH-NEXT: or a0, a0, a2 +; RV32IMBOTH-NEXT: maxu a0, a2, a0 ; RV32IMBOTH-NEXT: ret ; ; RV64IMBOTH-LABEL: select_umax_1: ; RV64IMBOTH: # %bb.0: # %entry -; RV64IMBOTH-NEXT: sext.w a3, a2 -; RV64IMBOTH-NEXT: sext.w a1, a1 -; RV64IMBOTH-NEXT: maxu a1, a1, a3 -; RV64IMBOTH-NEXT: czero.nez a2, a2, a0 +; RV64IMBOTH-NEXT: sext.w a2, a2 ; RV64IMBOTH-NEXT: czero.eqz a0, a1, a0 -; RV64IMBOTH-NEXT: or a0, a0, a2 +; RV64IMBOTH-NEXT: sext.w a0, a0 +; RV64IMBOTH-NEXT: maxu a0, a2, a0 ; RV64IMBOTH-NEXT: ret entry: %c = call i32 @llvm.umax(i32 %a, i32 %b) @@ -513,97 +423,76 @@ entry: define i32 @select_umax_2(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IM-LABEL: select_umax_2: ; RV32IM: # %bb.0: # %entry -; RV32IM-NEXT: mv a3, a1 -; RV32IM-NEXT: bgeu a2, a1, .LBB5_3 +; RV32IM-NEXT: addi a0, a0, -1 +; RV32IM-NEXT: and a2, a0, a2 +; RV32IM-NEXT: mv a0, a1 +; RV32IM-NEXT: bltu a2, a1, .LBB5_2 ; RV32IM-NEXT: # %bb.1: # %entry -; RV32IM-NEXT: beqz a0, .LBB5_4 +; RV32IM-NEXT: mv a0, a2 ; RV32IM-NEXT: .LBB5_2: # %entry -; RV32IM-NEXT: mv a0, a1 -; RV32IM-NEXT: ret -; RV32IM-NEXT: .LBB5_3: # %entry -; RV32IM-NEXT: mv a3, a2 
-; RV32IM-NEXT: bnez a0, .LBB5_2 -; RV32IM-NEXT: .LBB5_4: # %entry -; RV32IM-NEXT: mv a0, a3 ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: select_umax_2: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: sext.w a3, a1 -; RV64IM-NEXT: sext.w a2, a2 -; RV64IM-NEXT: bgeu a2, a3, .LBB5_3 +; RV64IM-NEXT: mv a3, a0 +; RV64IM-NEXT: sext.w a0, a1 +; RV64IM-NEXT: addi a3, a3, -1 +; RV64IM-NEXT: and a1, a3, a2 +; RV64IM-NEXT: sext.w a1, a1 +; RV64IM-NEXT: bltu a1, a0, .LBB5_2 ; RV64IM-NEXT: # %bb.1: # %entry -; RV64IM-NEXT: beqz a0, .LBB5_4 -; RV64IM-NEXT: .LBB5_2: # %entry ; RV64IM-NEXT: mv a0, a1 -; RV64IM-NEXT: ret -; RV64IM-NEXT: .LBB5_3: # %entry -; RV64IM-NEXT: mv a3, a2 -; RV64IM-NEXT: bnez a0, .LBB5_2 -; RV64IM-NEXT: .LBB5_4: # %entry -; RV64IM-NEXT: mv a0, a3 +; RV64IM-NEXT: .LBB5_2: # %entry ; RV64IM-NEXT: ret ; ; RV32IMZBB-LABEL: select_umax_2: ; RV32IMZBB: # %bb.0: # %entry -; RV32IMZBB-NEXT: bnez a0, .LBB5_2 -; RV32IMZBB-NEXT: # %bb.1: # %entry -; RV32IMZBB-NEXT: maxu a1, a1, a2 -; RV32IMZBB-NEXT: .LBB5_2: # %entry -; RV32IMZBB-NEXT: mv a0, a1 +; RV32IMZBB-NEXT: addi a0, a0, -1 +; RV32IMZBB-NEXT: and a0, a0, a2 +; RV32IMZBB-NEXT: maxu a0, a1, a0 ; RV32IMZBB-NEXT: ret ; ; RV64IMZBB-LABEL: select_umax_2: ; RV64IMZBB: # %bb.0: # %entry -; RV64IMZBB-NEXT: bnez a0, .LBB5_2 -; RV64IMZBB-NEXT: # %bb.1: # %entry -; RV64IMZBB-NEXT: sext.w a2, a2 ; RV64IMZBB-NEXT: sext.w a1, a1 -; RV64IMZBB-NEXT: maxu a1, a1, a2 -; RV64IMZBB-NEXT: .LBB5_2: # %entry -; RV64IMZBB-NEXT: mv a0, a1 +; RV64IMZBB-NEXT: addi a0, a0, -1 +; RV64IMZBB-NEXT: and a0, a0, a2 +; RV64IMZBB-NEXT: sext.w a0, a0 +; RV64IMZBB-NEXT: maxu a0, a1, a0 ; RV64IMZBB-NEXT: ret ; ; RV32IMZICOND-LABEL: select_umax_2: ; RV32IMZICOND: # %bb.0: # %entry -; RV32IMZICOND-NEXT: sltu a3, a2, a1 -; RV32IMZICOND-NEXT: czero.nez a2, a2, a3 -; RV32IMZICOND-NEXT: czero.eqz a3, a1, a3 -; RV32IMZICOND-NEXT: or a2, a3, a2 -; RV32IMZICOND-NEXT: czero.nez a2, a2, a0 -; RV32IMZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32IMZICOND-NEXT: or a0, a0, a2 +; 
RV32IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV32IMZICOND-NEXT: sltu a2, a0, a1 +; RV32IMZICOND-NEXT: czero.nez a0, a0, a2 +; RV32IMZICOND-NEXT: czero.eqz a1, a1, a2 +; RV32IMZICOND-NEXT: or a0, a1, a0 ; RV32IMZICOND-NEXT: ret ; ; RV64IMZICOND-LABEL: select_umax_2: ; RV64IMZICOND: # %bb.0: # %entry -; RV64IMZICOND-NEXT: sext.w a3, a1 -; RV64IMZICOND-NEXT: sext.w a2, a2 -; RV64IMZICOND-NEXT: sltu a4, a2, a3 -; RV64IMZICOND-NEXT: czero.nez a2, a2, a4 -; RV64IMZICOND-NEXT: czero.eqz a3, a3, a4 -; RV64IMZICOND-NEXT: or a2, a3, a2 -; RV64IMZICOND-NEXT: czero.nez a2, a2, a0 -; RV64IMZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64IMZICOND-NEXT: or a0, a0, a2 +; RV64IMZICOND-NEXT: sext.w a1, a1 +; RV64IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV64IMZICOND-NEXT: sext.w a0, a0 +; RV64IMZICOND-NEXT: sltu a2, a0, a1 +; RV64IMZICOND-NEXT: czero.nez a0, a0, a2 +; RV64IMZICOND-NEXT: czero.eqz a1, a1, a2 +; RV64IMZICOND-NEXT: or a0, a1, a0 ; RV64IMZICOND-NEXT: ret ; ; RV32IMBOTH-LABEL: select_umax_2: ; RV32IMBOTH: # %bb.0: # %entry -; RV32IMBOTH-NEXT: maxu a2, a1, a2 -; RV32IMBOTH-NEXT: czero.eqz a1, a1, a0 ; RV32IMBOTH-NEXT: czero.nez a0, a2, a0 -; RV32IMBOTH-NEXT: or a0, a1, a0 +; RV32IMBOTH-NEXT: maxu a0, a1, a0 ; RV32IMBOTH-NEXT: ret ; ; RV64IMBOTH-LABEL: select_umax_2: ; RV64IMBOTH: # %bb.0: # %entry -; RV64IMBOTH-NEXT: sext.w a2, a2 -; RV64IMBOTH-NEXT: sext.w a3, a1 -; RV64IMBOTH-NEXT: maxu a2, a3, a2 -; RV64IMBOTH-NEXT: czero.eqz a1, a1, a0 +; RV64IMBOTH-NEXT: sext.w a1, a1 ; RV64IMBOTH-NEXT: czero.nez a0, a2, a0 -; RV64IMBOTH-NEXT: or a0, a1, a0 +; RV64IMBOTH-NEXT: sext.w a0, a0 +; RV64IMBOTH-NEXT: maxu a0, a1, a0 ; RV64IMBOTH-NEXT: ret entry: %c = call i32 @llvm.umax(i32 %a, i32 %b) @@ -614,99 +503,76 @@ entry: define i32 @select_umax_3(i1 zeroext %cond, i32 %a) { ; RV32IM-LABEL: select_umax_3: ; RV32IM: # %bb.0: # %entry -; RV32IM-NEXT: li a3, 32 -; RV32IM-NEXT: mv a2, a1 -; RV32IM-NEXT: bgeu a3, a1, .LBB6_3 -; RV32IM-NEXT: # %bb.1: # %entry -; RV32IM-NEXT: beqz a0, .LBB6_4 -; 
RV32IM-NEXT: .LBB6_2: # %entry +; RV32IM-NEXT: addi a0, a0, -1 +; RV32IM-NEXT: andi a2, a0, 32 ; RV32IM-NEXT: mv a0, a1 -; RV32IM-NEXT: ret -; RV32IM-NEXT: .LBB6_3: # %entry -; RV32IM-NEXT: li a2, 32 -; RV32IM-NEXT: bnez a0, .LBB6_2 -; RV32IM-NEXT: .LBB6_4: # %entry +; RV32IM-NEXT: bltu a2, a1, .LBB6_2 +; RV32IM-NEXT: # %bb.1: # %entry ; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: .LBB6_2: # %entry ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: select_umax_3: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: sext.w a2, a1 -; RV64IM-NEXT: li a3, 32 -; RV64IM-NEXT: bgeu a3, a2, .LBB6_3 +; RV64IM-NEXT: mv a2, a0 +; RV64IM-NEXT: sext.w a0, a1 +; RV64IM-NEXT: addi a2, a2, -1 +; RV64IM-NEXT: andi a1, a2, 32 +; RV64IM-NEXT: bltu a1, a0, .LBB6_2 ; RV64IM-NEXT: # %bb.1: # %entry -; RV64IM-NEXT: beqz a0, .LBB6_4 -; RV64IM-NEXT: .LBB6_2: # %entry ; RV64IM-NEXT: mv a0, a1 -; RV64IM-NEXT: ret -; RV64IM-NEXT: .LBB6_3: # %entry -; RV64IM-NEXT: li a2, 32 -; RV64IM-NEXT: bnez a0, .LBB6_2 -; RV64IM-NEXT: .LBB6_4: # %entry -; RV64IM-NEXT: mv a0, a2 +; RV64IM-NEXT: .LBB6_2: # %entry ; RV64IM-NEXT: ret ; ; RV32IMZBB-LABEL: select_umax_3: ; RV32IMZBB: # %bb.0: # %entry -; RV32IMZBB-NEXT: bnez a0, .LBB6_2 -; RV32IMZBB-NEXT: # %bb.1: # %entry -; RV32IMZBB-NEXT: li a0, 32 -; RV32IMZBB-NEXT: maxu a1, a1, a0 -; RV32IMZBB-NEXT: .LBB6_2: # %entry -; RV32IMZBB-NEXT: mv a0, a1 +; RV32IMZBB-NEXT: addi a0, a0, -1 +; RV32IMZBB-NEXT: andi a0, a0, 32 +; RV32IMZBB-NEXT: maxu a0, a1, a0 ; RV32IMZBB-NEXT: ret ; ; RV64IMZBB-LABEL: select_umax_3: ; RV64IMZBB: # %bb.0: # %entry -; RV64IMZBB-NEXT: bnez a0, .LBB6_2 -; RV64IMZBB-NEXT: # %bb.1: # %entry ; RV64IMZBB-NEXT: sext.w a1, a1 -; RV64IMZBB-NEXT: li a0, 32 -; RV64IMZBB-NEXT: maxu a1, a1, a0 -; RV64IMZBB-NEXT: .LBB6_2: # %entry -; RV64IMZBB-NEXT: mv a0, a1 +; RV64IMZBB-NEXT: addi a0, a0, -1 +; RV64IMZBB-NEXT: andi a0, a0, 32 +; RV64IMZBB-NEXT: maxu a0, a1, a0 ; RV64IMZBB-NEXT: ret ; ; RV32IMZICOND-LABEL: select_umax_3: ; RV32IMZICOND: # %bb.0: # %entry -; RV32IMZICOND-NEXT: 
sltiu a2, a1, 33 -; RV32IMZICOND-NEXT: addi a3, a1, -32 -; RV32IMZICOND-NEXT: czero.nez a2, a3, a2 -; RV32IMZICOND-NEXT: addi a2, a2, 32 -; RV32IMZICOND-NEXT: czero.eqz a1, a1, a0 -; RV32IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV32IMZICOND-NEXT: addi a0, a0, -1 +; RV32IMZICOND-NEXT: andi a0, a0, 32 +; RV32IMZICOND-NEXT: sltu a2, a0, a1 +; RV32IMZICOND-NEXT: czero.nez a0, a0, a2 +; RV32IMZICOND-NEXT: czero.eqz a1, a1, a2 ; RV32IMZICOND-NEXT: or a0, a1, a0 ; RV32IMZICOND-NEXT: ret ; ; RV64IMZICOND-LABEL: select_umax_3: ; RV64IMZICOND: # %bb.0: # %entry -; RV64IMZICOND-NEXT: sext.w a2, a1 -; RV64IMZICOND-NEXT: sltiu a3, a2, 33 -; RV64IMZICOND-NEXT: addi a2, a2, -32 -; RV64IMZICOND-NEXT: czero.nez a2, a2, a3 -; RV64IMZICOND-NEXT: addi a2, a2, 32 -; RV64IMZICOND-NEXT: czero.eqz a1, a1, a0 -; RV64IMZICOND-NEXT: czero.nez a0, a2, a0 +; RV64IMZICOND-NEXT: sext.w a1, a1 +; RV64IMZICOND-NEXT: addi a0, a0, -1 +; RV64IMZICOND-NEXT: andi a0, a0, 32 +; RV64IMZICOND-NEXT: sltu a2, a0, a1 +; RV64IMZICOND-NEXT: czero.nez a0, a0, a2 +; RV64IMZICOND-NEXT: czero.eqz a1, a1, a2 ; RV64IMZICOND-NEXT: or a0, a1, a0 ; RV64IMZICOND-NEXT: ret ; ; RV32IMBOTH-LABEL: select_umax_3: ; RV32IMBOTH: # %bb.0: # %entry -; RV32IMBOTH-NEXT: li a2, 32 -; RV32IMBOTH-NEXT: maxu a2, a1, a2 -; RV32IMBOTH-NEXT: czero.eqz a1, a1, a0 -; RV32IMBOTH-NEXT: czero.nez a0, a2, a0 -; RV32IMBOTH-NEXT: or a0, a1, a0 +; RV32IMBOTH-NEXT: addi a0, a0, -1 +; RV32IMBOTH-NEXT: andi a0, a0, 32 +; RV32IMBOTH-NEXT: maxu a0, a1, a0 ; RV32IMBOTH-NEXT: ret ; ; RV64IMBOTH-LABEL: select_umax_3: ; RV64IMBOTH: # %bb.0: # %entry -; RV64IMBOTH-NEXT: sext.w a2, a1 -; RV64IMBOTH-NEXT: li a3, 32 -; RV64IMBOTH-NEXT: maxu a2, a2, a3 -; RV64IMBOTH-NEXT: czero.eqz a1, a1, a0 -; RV64IMBOTH-NEXT: czero.nez a0, a2, a0 -; RV64IMBOTH-NEXT: or a0, a1, a0 +; RV64IMBOTH-NEXT: sext.w a1, a1 +; RV64IMBOTH-NEXT: addi a0, a0, -1 +; RV64IMBOTH-NEXT: andi a0, a0, 32 +; RV64IMBOTH-NEXT: maxu a0, a1, a0 ; RV64IMBOTH-NEXT: ret entry: %c = call i32 
@llvm.umax(i32 %a, i32 32) @@ -717,94 +583,76 @@ entry: define i32 @select_umax_4(i1 zeroext %cond, i32 %x) { ; RV32IM-LABEL: select_umax_4: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a2, 128 -; RV32IM-NEXT: bgeu a2, a1, .LBB7_3 +; RV32IM-NEXT: addi a0, a0, -1 +; RV32IM-NEXT: and a0, a0, a1 +; RV32IM-NEXT: li a1, 128 +; RV32IM-NEXT: bltu a1, a0, .LBB7_2 ; RV32IM-NEXT: # %bb.1: -; RV32IM-NEXT: beqz a0, .LBB7_4 +; RV32IM-NEXT: li a0, 128 ; RV32IM-NEXT: .LBB7_2: -; RV32IM-NEXT: mv a0, a2 -; RV32IM-NEXT: ret -; RV32IM-NEXT: .LBB7_3: -; RV32IM-NEXT: li a1, 128 -; RV32IM-NEXT: bnez a0, .LBB7_2 -; RV32IM-NEXT: .LBB7_4: -; RV32IM-NEXT: mv a0, a1 ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: select_umax_4: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a2, a1 +; RV64IM-NEXT: addi a0, a0, -1 +; RV64IM-NEXT: and a0, a0, a1 +; RV64IM-NEXT: sext.w a0, a0 ; RV64IM-NEXT: li a1, 128 -; RV64IM-NEXT: bgeu a1, a2, .LBB7_3 +; RV64IM-NEXT: bltu a1, a0, .LBB7_2 ; RV64IM-NEXT: # %bb.1: -; RV64IM-NEXT: beqz a0, .LBB7_4 +; RV64IM-NEXT: li a0, 128 ; RV64IM-NEXT: .LBB7_2: -; RV64IM-NEXT: mv a0, a1 -; RV64IM-NEXT: ret -; RV64IM-NEXT: .LBB7_3: -; RV64IM-NEXT: li a2, 128 -; RV64IM-NEXT: bnez a0, .LBB7_2 -; RV64IM-NEXT: .LBB7_4: -; RV64IM-NEXT: mv a0, a2 ; RV64IM-NEXT: ret ; ; RV32IMZBB-LABEL: select_umax_4: ; RV32IMZBB: # %bb.0: -; RV32IMZBB-NEXT: mv a2, a0 -; RV32IMZBB-NEXT: li a0, 128 -; RV32IMZBB-NEXT: bnez a2, .LBB7_2 -; RV32IMZBB-NEXT: # %bb.1: -; RV32IMZBB-NEXT: maxu a0, a1, a0 -; RV32IMZBB-NEXT: .LBB7_2: +; RV32IMZBB-NEXT: addi a0, a0, -1 +; RV32IMZBB-NEXT: and a0, a0, a1 +; RV32IMZBB-NEXT: li a1, 128 +; RV32IMZBB-NEXT: maxu a0, a0, a1 ; RV32IMZBB-NEXT: ret ; ; RV64IMZBB-LABEL: select_umax_4: ; RV64IMZBB: # %bb.0: -; RV64IMZBB-NEXT: mv a2, a0 -; RV64IMZBB-NEXT: li a0, 128 -; RV64IMZBB-NEXT: bnez a2, .LBB7_2 -; RV64IMZBB-NEXT: # %bb.1: -; RV64IMZBB-NEXT: sext.w a1, a1 -; RV64IMZBB-NEXT: maxu a0, a1, a0 -; RV64IMZBB-NEXT: .LBB7_2: +; RV64IMZBB-NEXT: addi a0, a0, -1 +; RV64IMZBB-NEXT: and a0, a0, a1 +; 
RV64IMZBB-NEXT: sext.w a0, a0 +; RV64IMZBB-NEXT: li a1, 128 +; RV64IMZBB-NEXT: maxu a0, a0, a1 ; RV64IMZBB-NEXT: ret ; ; RV32IMZICOND-LABEL: select_umax_4: ; RV32IMZICOND: # %bb.0: -; RV32IMZICOND-NEXT: sltiu a2, a1, 129 -; RV32IMZICOND-NEXT: addi a1, a1, -128 -; RV32IMZICOND-NEXT: czero.nez a1, a1, a2 ; RV32IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV32IMZICOND-NEXT: sltiu a1, a0, 129 +; RV32IMZICOND-NEXT: addi a0, a0, -128 +; RV32IMZICOND-NEXT: czero.nez a0, a0, a1 ; RV32IMZICOND-NEXT: addi a0, a0, 128 ; RV32IMZICOND-NEXT: ret ; ; RV64IMZICOND-LABEL: select_umax_4: ; RV64IMZICOND: # %bb.0: -; RV64IMZICOND-NEXT: sext.w a1, a1 -; RV64IMZICOND-NEXT: sltiu a2, a1, 129 -; RV64IMZICOND-NEXT: addi a1, a1, -128 -; RV64IMZICOND-NEXT: czero.nez a1, a1, a2 ; RV64IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV64IMZICOND-NEXT: sext.w a0, a0 +; RV64IMZICOND-NEXT: sltiu a1, a0, 129 +; RV64IMZICOND-NEXT: addi a0, a0, -128 +; RV64IMZICOND-NEXT: czero.nez a0, a0, a1 ; RV64IMZICOND-NEXT: addi a0, a0, 128 ; RV64IMZICOND-NEXT: ret ; ; RV32IMBOTH-LABEL: select_umax_4: ; RV32IMBOTH: # %bb.0: -; RV32IMBOTH-NEXT: li a2, 128 -; RV32IMBOTH-NEXT: maxu a1, a1, a2 -; RV32IMBOTH-NEXT: addi a1, a1, -128 ; RV32IMBOTH-NEXT: czero.nez a0, a1, a0 -; RV32IMBOTH-NEXT: addi a0, a0, 128 +; RV32IMBOTH-NEXT: li a1, 128 +; RV32IMBOTH-NEXT: maxu a0, a0, a1 ; RV32IMBOTH-NEXT: ret ; ; RV64IMBOTH-LABEL: select_umax_4: ; RV64IMBOTH: # %bb.0: -; RV64IMBOTH-NEXT: sext.w a1, a1 -; RV64IMBOTH-NEXT: li a2, 128 -; RV64IMBOTH-NEXT: maxu a1, a1, a2 -; RV64IMBOTH-NEXT: addi a1, a1, -128 ; RV64IMBOTH-NEXT: czero.nez a0, a1, a0 -; RV64IMBOTH-NEXT: addi a0, a0, 128 +; RV64IMBOTH-NEXT: sext.w a0, a0 +; RV64IMBOTH-NEXT: li a1, 128 +; RV64IMBOTH-NEXT: maxu a0, a0, a1 ; RV64IMBOTH-NEXT: ret %minmax = call i32 @llvm.umax(i32 %x, i32 128) %sel = select i1 %cond, i32 128, i32 %minmax diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll index 59a702ab6b17f..1bfeeb92e06dd 
100644 --- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll @@ -2075,14 +2075,14 @@ define i64 @abs_i64(i64 %x) { ; RV32SFB-LABEL: abs_i64: ; RV32SFB: # %bb.0: ; RV32SFB-NEXT: snez a2, a0 -; RV32SFB-NEXT: add a2, a2, a1 +; RV32SFB-NEXT: neg a3, a1 ; RV32SFB-NEXT: bgez a1, .LBB35_2 ; RV32SFB-NEXT: # %bb.1: ; RV32SFB-NEXT: neg a0, a0 ; RV32SFB-NEXT: .LBB35_2: ; RV32SFB-NEXT: bgez a1, .LBB35_4 ; RV32SFB-NEXT: # %bb.3: -; RV32SFB-NEXT: neg a1, a2 +; RV32SFB-NEXT: sub a1, a3, a2 ; RV32SFB-NEXT: .LBB35_4: ; RV32SFB-NEXT: ret %a = call i64 @llvm.abs.i64(i64 %x, i1 false) diff --git a/llvm/test/CodeGen/RISCV/xcvbitmanip.ll b/llvm/test/CodeGen/RISCV/xcvbitmanip.ll index d25ff28475c4b..b2cebabb7df8b 100644 --- a/llvm/test/CodeGen/RISCV/xcvbitmanip.ll +++ b/llvm/test/CodeGen/RISCV/xcvbitmanip.ll @@ -229,3 +229,50 @@ define i32 @test.llvm.bitrev(i32 %a) { %1 = call i32 @llvm.bitreverse(i32 %a) ret i32 %1 } + +define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind { +; CHECK-LABEL: ctpop_i32_ult_two: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.cnt a0, a0 +; CHECK-NEXT: sltiu a0, a0, 2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ult i32 %1, 2 + ret i1 %2 +} + +define i1 @ctpop_i32_ugt_one(i32 signext %a) nounwind { +; CHECK-LABEL: ctpop_i32_ugt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.cnt a0, a0 +; CHECK-NEXT: sltiu a0, a0, 2 +; CHECK-NEXT: xori a0, a0, 1 +; CHECK-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ugt i32 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i32_eq_one(i32 signext %a) nounwind { +; CHECK-LABEL: ctpop_i32_eq_one: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.cnt a0, a0 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: seqz a0, a0 +; CHECK-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp eq i32 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i32_ne_one(i32 signext %a) nounwind { +; CHECK-LABEL: ctpop_i32_ne_one: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.cnt a0, a0 +; CHECK-NEXT: 
addi a0, a0, -1 +; CHECK-NEXT: snez a0, a0 +; CHECK-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ne i32 %1, 1 + ret i1 %2 +} diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll index c76e1a9d64f17..918468bdf03d3 100644 --- a/llvm/test/CodeGen/RISCV/xqciac.ll +++ b/llvm/test/CodeGen/RISCV/xqciac.ll @@ -361,8 +361,8 @@ define dso_local i32 @shladdc1c2(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; RV32IMXQCIAC-LABEL: shladdc1c2: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 -; RV32IMXQCIAC-NEXT: slli a0, a0, 26 +; RV32IMXQCIAC-NEXT: slli a1, a1, 26 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladdc1c2: @@ -599,3 +599,23 @@ define i32 @add_shl_moreOneUse_4(i32 %x) { %add = add i32 %mul, %or ret i32 %add } + +define i32 @select65(i1 zeroext %x) { +; RV32IM-LABEL: select65: +; RV32IM: # %bb.0: +; RV32IM-NEXT: neg a0, a0 +; RV32IM-NEXT: andi a0, a0, 65 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: select65: +; RV32IMXQCIAC: # %bb.0: +; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a0, 6 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: select65: +; RV32IZBAMXQCIAC: # %bb.0: +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a0, 6 +; RV32IZBAMXQCIAC-NEXT: ret + %select = select i1 %x, i32 65, i32 0 + ret i32 %select +} diff --git a/llvm/test/CodeGen/SPIRV/basic_float_types.ll b/llvm/test/CodeGen/SPIRV/basic_float_types.ll index dfee1ace2205d..486f6358ce5de 100644 --- a/llvm/test/CodeGen/SPIRV/basic_float_types.ll +++ b/llvm/test/CodeGen/SPIRV/basic_float_types.ll @@ -1,6 +1,6 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUNx: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} define void @main() { entry: 
@@ -8,7 +8,8 @@ entry: ; CHECK-DAG: OpCapability Float16 ; CHECK-DAG: OpCapability Float64 -; CHECK-DAG: %[[#half:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#half:]] = OpTypeFloat 16{{$}} +; CHECK-DAG: %[[#bfloat:]] = OpTypeFloat 16 0{{$}} ; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 ; CHECK-DAG: %[[#double:]] = OpTypeFloat 64 @@ -16,6 +17,10 @@ entry: ; CHECK-DAG: %[[#v3half:]] = OpTypeVector %[[#half]] 3 ; CHECK-DAG: %[[#v4half:]] = OpTypeVector %[[#half]] 4 +; CHECK-DAG: %[[#v2bfloat:]] = OpTypeVector %[[#bfloat]] 2 +; CHECK-DAG: %[[#v3bfloat:]] = OpTypeVector %[[#bfloat]] 3 +; CHECK-DAG: %[[#v4bfloat:]] = OpTypeVector %[[#bfloat]] 4 + ; CHECK-DAG: %[[#v2float:]] = OpTypeVector %[[#float]] 2 ; CHECK-DAG: %[[#v3float:]] = OpTypeVector %[[#float]] 3 ; CHECK-DAG: %[[#v4float:]] = OpTypeVector %[[#float]] 4 @@ -25,11 +30,15 @@ entry: ; CHECK-DAG: %[[#v4double:]] = OpTypeVector %[[#double]] 4 ; CHECK-DAG: %[[#ptr_Function_half:]] = OpTypePointer Function %[[#half]] +; CHECK-DAG: %[[#ptr_Function_bfloat:]] = OpTypePointer Function %[[#bfloat]] ; CHECK-DAG: %[[#ptr_Function_float:]] = OpTypePointer Function %[[#float]] ; CHECK-DAG: %[[#ptr_Function_double:]] = OpTypePointer Function %[[#double]] ; CHECK-DAG: %[[#ptr_Function_v2half:]] = OpTypePointer Function %[[#v2half]] ; CHECK-DAG: %[[#ptr_Function_v3half:]] = OpTypePointer Function %[[#v3half]] ; CHECK-DAG: %[[#ptr_Function_v4half:]] = OpTypePointer Function %[[#v4half]] +; CHECK-DAG: %[[#ptr_Function_v2bfloat:]] = OpTypePointer Function %[[#v2bfloat]] +; CHECK-DAG: %[[#ptr_Function_v3bfloat:]] = OpTypePointer Function %[[#v3bfloat]] +; CHECK-DAG: %[[#ptr_Function_v4bfloat:]] = OpTypePointer Function %[[#v4bfloat]] ; CHECK-DAG: %[[#ptr_Function_v2float:]] = OpTypePointer Function %[[#v2float]] ; CHECK-DAG: %[[#ptr_Function_v3float:]] = OpTypePointer Function %[[#v3float]] ; CHECK-DAG: %[[#ptr_Function_v4float:]] = OpTypePointer Function %[[#v4float]] @@ -40,6 +49,9 @@ entry: ; CHECK: %[[#]] = OpVariable 
%[[#ptr_Function_half]] Function %half_Val = alloca half, align 2 +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_bfloat]] Function + %bfloat_Val = alloca bfloat, align 2 + ; CHECK: %[[#]] = OpVariable %[[#ptr_Function_float]] Function %float_Val = alloca float, align 4 @@ -55,6 +67,15 @@ entry: ; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v4half]] Function %half4_Val = alloca <4 x half>, align 8 +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v2bfloat]] Function + %bfloat2_Val = alloca <2 x bfloat>, align 4 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v3bfloat]] Function + %bfloat3_Val = alloca <3 x bfloat>, align 8 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v4bfloat]] Function + %bfloat4_Val = alloca <4 x bfloat>, align 8 + ; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v2float]] Function %float2_Val = alloca <2 x float>, align 8 diff --git a/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll b/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll index c6ffb92d60d8d..8e570f0e91a08 100644 --- a/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll +++ b/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll @@ -25,7 +25,7 @@ define internal i32 @table_switch(i32 %x) "branch-target-enforcement" { ; CHECK-NEXT: movs r0, #3 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB0_5: @ %bb4 -; CHECK-NEXT: movs r0, #4 +; CHECK-NEXT: movs r0, #5 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB0_6: @ %sw.epilog ; CHECK-NEXT: movs r0, #0 @@ -51,7 +51,7 @@ sw.epilog: br label %return return: - %ret = phi i32 [ 0, %sw.epilog ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ] + %ret = phi i32 [ 0, %sw.epilog ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 5, %bb4 ] ret i32 %ret } diff --git a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll new file mode 100644 index 0000000000000..47ea762864cc2 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll @@ -0,0 +1,609 @@ +; RUN: opt -mattr=+simd128 
-passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s +; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH + +target triple = "wasm32" + +define hidden i32 @accumulate_add_u8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_u8_u8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add + +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.010 + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.010 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = zext i8 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_s8_s8(ptr noundef readonly %a, ptr noundef 
readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_s8_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.010 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.010 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = sext i8 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_s8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_s8_u8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: 
i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.010 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.010 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = zext i8 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_s8_s16(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_s8_s16: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; 
MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.010 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i16, ptr %b, i32 %i.010 + %1 = load i16, ptr %arrayidx1, align 2 + %conv2 = sext i16 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_shr_u8(ptr noundef readonly %a, i32 noundef %N) { +; CHECK-LABEL: accumulate_shr_u8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i8x16.shr_u +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shr_u +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp4.not = icmp eq i32 %N, 0 + br i1 %cmp4.not, label %for.cond.cleanup, label %for.body 
+ +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.06 + %0 = load i8, ptr %arrayidx, align 1 + %1 = lshr i8 %0, 1 + %shr = zext nneg i8 %1 to i32 + %add = add i32 %result.05, %shr + %inc = add nuw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_shr_s8(ptr noundef readonly %a, i32 noundef %N) { +; CHECK-LABEL: accumulate_shr_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i8x16.shr_s +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shr_s +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp4.not = icmp eq i32 %N, 0 + br i1 %cmp4.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.06 + %0 = load i8, ptr %arrayidx, align 1 + %1 = ashr i8 %0, 1 + %shr = sext i8 %1 to i32 + %add = add nsw i32 %result.05, %shr + %inc = add nuw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_max_u8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_max_u8_u8: +; CHECK: loop +; CHECK: v128.load32_zero 
+; CHECK: v128.load32_zero +; CHECK: i8x16.max_u +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.max_u +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp17.not = icmp eq i32 %N, 0 + br i1 %cmp17.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.019 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.018 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.019 + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.019 + %1 = load i8, ptr %arrayidx1, align 1 + %. = tail call i8 @llvm.umax.i8(i8 %0, i8 %1) + %cond = zext i8 %. 
to i32 + %add = add i32 %result.018, %cond + %inc = add nuw i32 %i.019, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_min_s8_s8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_min_s8_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: v128.load32_zero +; CHECK: i8x16.min_s +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.min_s +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp17.not = icmp eq i32 %N, 0 + br i1 %cmp17.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.019 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.018 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.019 + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.019 + %1 = load i8, ptr %arrayidx1, align 1 + %. = tail call i8 @llvm.smin.i8(i8 %0, i8 %1) + %cond = sext i8 %. 
to i32 + %add = add nsw i32 %result.018, %cond + %inc = add nuw i32 %i.019, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_u16_u16(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_u16_u16: +; CHECK: loop +; CHECK: i32x4.load16x4_u +; CHECK: i32x4.add +; CHECK: i32x4.load16x4_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.010 + %0 = load i16, ptr %arrayidx, align 2 + %conv = zext i16 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i16, ptr %b, i32 %i.010 + %1 = load i16, ptr %arrayidx1, align 2 + %conv2 = zext i16 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_s16_s16(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_s16_s16: +; CHECK: loop +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.add +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: 
i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.010 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i16, ptr %b, i32 %i.010 + %1 = load i16, ptr %arrayidx1, align 2 + %conv2 = sext i16 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_shr_u16(ptr noundef readonly %a, i32 noundef %N) { +; CHECK-LABEL: accumulate_shr_u16: +; CHECK: loop +; CHECK: v128.load64_zero +; CHECK: i16x8.shr_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.shr_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp4.not = icmp eq i32 %N, 0 + br i1 %cmp4.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.06 + %0 = load i16, ptr %arrayidx, align 2 + %1 = lshr i16 %0, 1 + %shr = zext nneg i16 %1 to i32 + %add = add i32 %result.05, %shr + %inc = add nuw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, 
label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_shr_s16(ptr noundef readonly %a, i32 noundef %N) { +; CHECK-LABEL: accumulate_shr_s16: +; CHECK: loop +; CHECK: v128.load64_zero +; CHECK: i16x8.shr_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.shr_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp4.not = icmp eq i32 %N, 0 + br i1 %cmp4.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.06 + %0 = load i16, ptr %arrayidx, align 2 + %1 = ashr i16 %0, 1 + %shr = sext i16 %1 to i32 + %add = add nsw i32 %result.05, %shr + %inc = add nuw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_sub_s8_s8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_sub_s8_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.sub + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: 
i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +entry: + %cmp7.not = icmp eq i32 %N, 0 + br i1 %cmp7.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.09 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.09 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = sext i8 %1 to i32 + %sub = add i32 %result.08, %conv + %add = sub i32 %sub, %conv2 + %inc = add nuw i32 %i.09, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_sub_s16_s16(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_sub_s16_s16: +; CHECK: loop +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.add +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.sub + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load 
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +entry: + %cmp7.not = icmp eq i32 %N, 0 + br i1 %cmp7.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.09 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i16, ptr %b, i32 %i.09 + %1 = load i16, ptr %arrayidx1, align 2 + %conv2 = sext i16 %1 to i32 + %sub = add i32 %result.08, %conv + %add = sub i32 %sub, %conv2 + %inc = add nuw i32 %i.09, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare i8 @llvm.umax.i8(i8, i8) + +declare i8 @llvm.smin.i8(i8, i8) diff --git a/llvm/test/CodeGen/X86/apx/add.ll b/llvm/test/CodeGen/X86/apx/add.ll index 86343811901a9..4ab0edfba7ce8 100644 --- a/llvm/test/CodeGen/X86/apx/add.ll +++ b/llvm/test/CodeGen/X86/apx/add.ll @@ -36,12 +36,12 @@ entry: define i32 @add32rr(i32 noundef %a, i32 noundef %b) { ; CHECK-LABEL: add32rr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7] +; CHECK-NEXT: leal (%rdi,%rsi), %eax # encoding: [0x8d,0x04,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: add32rr: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xf7] +; NF-NEXT: leal (%rdi,%rsi), %eax # encoding: [0x8d,0x04,0x37] ; NF-NEXT: retq 
# encoding: [0xc3] entry: %add = add i32 %a, %b @@ -51,12 +51,12 @@ entry: define i64 @add64rr(i64 noundef %a, i64 noundef %b) { ; CHECK-LABEL: add64rr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7] +; CHECK-NEXT: leaq (%rdi,%rsi), %rax # encoding: [0x48,0x8d,0x04,0x37] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: add64rr: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0xf7] +; NF-NEXT: leaq (%rdi,%rsi), %rax # encoding: [0x48,0x8d,0x04,0x37] ; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i64 %a, %b @@ -145,12 +145,12 @@ entry: define i32 @add32ri8(i32 noundef %a) { ; CHECK-LABEL: add32ri8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b] +; CHECK-NEXT: leal 123(%rdi), %eax # encoding: [0x8d,0x47,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: add32ri8: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xc7,0x7b] +; NF-NEXT: leal 123(%rdi), %eax # encoding: [0x8d,0x47,0x7b] ; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i32 %a, 123 @@ -160,12 +160,12 @@ entry: define i64 @add64ri8(i64 noundef %a) { ; CHECK-LABEL: add64ri8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b] +; CHECK-NEXT: leaq 123(%rdi), %rax # encoding: [0x48,0x8d,0x47,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: add64ri8: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xc7,0x7b] +; NF-NEXT: leaq 123(%rdi), %rax # encoding: [0x48,0x8d,0x47,0x7b] ; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i64 %a, 123 @@ -207,14 +207,12 @@ entry: define i32 @add32ri(i32 noundef %a) { ; CHECK-LABEL: add32ri: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: 
addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00] -; CHECK-NEXT: # imm = 0x1E240 +; CHECK-NEXT: leal 123456(%rdi), %eax # encoding: [0x8d,0x87,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: add32ri: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0x40,0xe2,0x01,0x00] -; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: leal 123456(%rdi), %eax # encoding: [0x8d,0x87,0x40,0xe2,0x01,0x00] ; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i32 %a, 123456 @@ -224,14 +222,12 @@ entry: define i64 @add64ri(i64 noundef %a) { ; CHECK-LABEL: add64ri: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00] -; CHECK-NEXT: # imm = 0x1E240 +; CHECK-NEXT: leaq 123456(%rdi), %rax # encoding: [0x48,0x8d,0x87,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: add64ri: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xc7,0x40,0xe2,0x01,0x00] -; NF-NEXT: # imm = 0x1E240 +; NF-NEXT: leaq 123456(%rdi), %rax # encoding: [0x48,0x8d,0x87,0x40,0xe2,0x01,0x00] ; NF-NEXT: retq # encoding: [0xc3] entry: %add = add i64 %a, 123456 diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll index a29a92176f432..0bb3b179cc305 100644 --- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll @@ -1613,7 +1613,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: imulq %r23, %rdi ; EGPR-NDD-NEXT: addq %rdi, %rdx ; EGPR-NDD-NEXT: imulq 120(%r22), %r24, %rax -; EGPR-NDD-NEXT: addq %rax, %rdx, %r9 +; EGPR-NDD-NEXT: leaq (%rdx,%rax), %r9 ; EGPR-NDD-NEXT: movq 96(%r22), %r20 ; EGPR-NDD-NEXT: movq 104(%r22), %rdi ; EGPR-NDD-NEXT: imulq %rdi, %r26, %r10 @@ -1756,7 +1756,7 @@ define void @test_1024(ptr %a, 
ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: addq %rsi, %rdx ; EGPR-NDD-NEXT: movq 104(%r15), %r8 ; EGPR-NDD-NEXT: imulq %r10, %r8, %rax -; EGPR-NDD-NEXT: addq %rax, %rdx, %rsi +; EGPR-NDD-NEXT: leaq (%rdx,%rax), %rsi ; EGPR-NDD-NEXT: movq 112(%r15), %rax ; EGPR-NDD-NEXT: imulq %r23, %rax, %r9 ; EGPR-NDD-NEXT: mulq %r16 @@ -1793,7 +1793,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: movq %rax, %r9 ; EGPR-NDD-NEXT: addq %r8, %rdx ; EGPR-NDD-NEXT: imulq %r16, %r25, %rax -; EGPR-NDD-NEXT: addq %rax, %rdx, %r8 +; EGPR-NDD-NEXT: leaq (%rdx,%rax), %r8 ; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload ; EGPR-NDD-NEXT: imulq %r23, %r24, %r16 ; EGPR-NDD-NEXT: movq %r24, %rax diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir index 5be5ca8d71947..bfc0120765e53 100644 --- a/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir +++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir @@ -15,14 +15,14 @@ define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 { ; RCDEFAULT-LABEL: partial_write: ; RCDEFAULT: # %bb.0: # %entry - ; RCDEFAULT-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2] + ; RCDEFAULT-NEXT: leal (%rdx,%rsi), %eax # encoding: [0x8d,0x04,0x32] ; RCDEFAULT-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07] ; RCDEFAULT-NEXT: addw %cx, %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xc8] ; RCDEFAULT-NEXT: retq # encoding: [0xc3] ; ; RC1-LABEL: partial_write: ; RC1: # %bb.0: # %entry - ; RC1-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2] + ; RC1-NEXT: leal (%rdx,%rsi), %eax # encoding: [0x8d,0x04,0x32] ; RC1-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07] ; RC1-NEXT: addw %cx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xc8] ; RC1-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/apx/shl.ll b/llvm/test/CodeGen/X86/apx/shl.ll index 
896cd55bc7452..9c6229a483c73 100644 --- a/llvm/test/CodeGen/X86/apx/shl.ll +++ b/llvm/test/CodeGen/X86/apx/shl.ll @@ -396,12 +396,12 @@ entry: define i32 @shl32r1(i32 noundef %a) { ; CHECK-LABEL: shl32r1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff] +; CHECK-NEXT: leal (%rdi,%rdi), %eax # encoding: [0x8d,0x04,0x3f] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: shl32r1: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addl %edi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xff] +; NF-NEXT: leal (%rdi,%rdi), %eax # encoding: [0x8d,0x04,0x3f] ; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i32 %a, 1 @@ -411,12 +411,12 @@ entry: define i64 @shl64r1(i64 noundef %a) { ; CHECK-LABEL: shl64r1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq %rdi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xff] +; CHECK-NEXT: leaq (%rdi,%rdi), %rax # encoding: [0x48,0x8d,0x04,0x3f] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: shl64r1: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addq %rdi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0xff] +; NF-NEXT: leaq (%rdi,%rdi), %rax # encoding: [0x48,0x8d,0x04,0x3f] ; NF-NEXT: retq # encoding: [0xc3] entry: %shl = shl i64 %a, 1 diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll index d7914577634e7..75ee8cf31dee5 100644 --- a/llvm/test/CodeGen/X86/apx/sub.ll +++ b/llvm/test/CodeGen/X86/apx/sub.ll @@ -207,14 +207,12 @@ entry: define i32 @sub32ri(i32 noundef %a) { ; CHECK-LABEL: sub32ri: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addl $-123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0xc0,0x1d,0xfe,0xff] -; CHECK-NEXT: # imm = 0xFFFE1DC0 +; CHECK-NEXT: leal -123456(%rdi), %eax # encoding: [0x8d,0x87,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: sub32ri: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} addl $-123456, %edi, %eax # EVEX TO EVEX Compression 
encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0xc0,0x1d,0xfe,0xff] -; NF-NEXT: # imm = 0xFFFE1DC0 +; NF-NEXT: leal -123456(%rdi), %eax # encoding: [0x8d,0x87,0xc0,0x1d,0xfe,0xff] ; NF-NEXT: retq # encoding: [0xc3] entry: %sub = sub i32 %a, 123456 diff --git a/llvm/test/CodeGen/X86/avx512-fma.ll b/llvm/test/CodeGen/X86/avx512-fma.ll index 97f8e5f4ea16c..29120c8815aea 100644 --- a/llvm/test/CodeGen/X86/avx512-fma.ll +++ b/llvm/test/CodeGen/X86/avx512-fma.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; ALL-LABEL: test_x86_fmadd_ps_z: ; ALL: ## %bb.0: ; ALL-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; ALL-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fadd <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %res = fadd contract <16 x float> %x, %a2 ret <16 x float> %res } @@ -17,8 +17,8 @@ define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 ; ALL: ## %bb.0: ; ALL-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; ALL-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fsub <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %res = fsub contract <16 x float> %x, %a2 ret <16 x float> %res } @@ -27,8 +27,8 @@ define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <1 ; ALL: ## %bb.0: ; ALL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 ; 
ALL-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fsub <16 x float> %a2, %x + %x = fmul contract <16 x float> %a0, %a1 + %res = fsub contract <16 x float> %a2, %x ret <16 x float> %res } @@ -37,12 +37,12 @@ define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <1 ; ALL: ## %bb.0: ; ALL-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; ALL-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %y = fsub <16 x float> %a0, %a1 + %y = fsub contract <16 x float> , %x - %res = fsub <16 x float> %y, %a2 + %res = fsub contract <16 x float> %y, %a2 ret <16 x float> %res } @@ -51,8 +51,8 @@ define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 ; ALL: ## %bb.0: ; ALL-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; ALL-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fadd <8 x double> %x, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %res = fadd contract <8 x double> %x, %a2 ret <8 x double> %res } @@ -61,8 +61,8 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 ; ALL: ## %bb.0: ; ALL-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; ALL-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fsub <8 x double> %x, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %res = fsub contract <8 x double> %x, %a2 ret <8 x double> %res } @@ -71,8 +71,8 @@ define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) { ; ALL: ## %bb.0: ; ALL-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; ALL-NEXT: retq - %x = fmul double %a0, %a1 - %res = fsub double %x, %a2 + %x = fmul contract double %a0, %a1 + %res = fsub contract double %x, %a2 ret double %res } @@ -82,8 +82,8 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, ptr %a2_ptr) { ; ALL-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem ; ALL-NEXT: retq %a2 = load double , ptr%a2_ptr - %x = fmul double %a0, %a1 - %res = fsub double %x, %a2 + %x = fmul contract double %a0, %a1 + %res = fsub 
contract double %x, %a2 ret double %res } @@ -93,8 +93,8 @@ define double @test_x86_fmsub_231_m(double %a0, double %a1, ptr %a2_ptr) { ; ALL-NEXT: vfmsub132sd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 ; ALL-NEXT: retq %a2 = load double , ptr%a2_ptr - %x = fmul double %a0, %a2 - %res = fsub double %x, %a1 + %x = fmul contract double %a0, %a2 + %res = fsub contract double %x, %a1 ret double %res } @@ -103,8 +103,8 @@ define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind { ; ALL: ## %bb.0: ; ALL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1 ; ALL-NEXT: retq - %b1 = fmul <16 x float> %a1, - %b2 = fadd <16 x float> %b1, %a2 + %b1 = fmul contract <16 x float> %a1, + %b2 = fadd contract <16 x float> %b1, %a2 ret <16 x float> %b2 } @@ -113,8 +113,8 @@ define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind { ; ALL: ## %bb.0: ; ALL-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem ; ALL-NEXT: retq - %b1 = fmul <16 x float> %a1, %a2 - %b2 = fadd <16 x float> %b1, + %b1 = fmul contract <16 x float> %a1, %a2 + %b2 = fadd contract <16 x float> %b1, ret <16 x float> %b2 } @@ -135,8 +135,8 @@ define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, pt ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * mem) + zmm1 ; SKX-NEXT: retq %a2 = load <16 x float>,ptr%a2_ptrt,align 1 - %x = fmul <16 x float> %a0, %a2 - %y = fadd <16 x float> %x, %a1 + %x = fmul contract <16 x float> %a0, %a2 + %y = fadd contract <16 x float> %x, %a1 %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a0 ret <16 x float> %res } @@ -160,8 +160,8 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, pt ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %a2 = load <16 x float>,ptr%a2_ptrt,align 1 - %x = fmul <16 x float> %a0, %a2 - %y = fadd <16 x float> %x, %a1 + %x = fmul contract <16 x float> %a0, %a2 + %y = fadd contract <16 x float> %x, %a1 %res = select <16 x i1> %mask, <16 x float> %y, <16 x 
float> %a1 ret <16 x float> %res } @@ -185,8 +185,8 @@ define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, pt ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %a2 = load <16 x float>,ptr%a2_ptrt,align 1 - %x = fmul <16 x float> %a1, %a0 - %y = fadd <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a1, %a0 + %y = fadd contract <16 x float> %x, %a2 %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1 ret <16 x float> %res } diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll index 36b95e744ba14..f1477b57375c4 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ ; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag set. 
define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) { @@ -18,9 +18,9 @@ define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half entry: %0 = bitcast <32 x half> %a to <16 x float> %1 = bitcast <32 x half> %b to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %add.i = fadd <32 x half> %3, %acc + %add.i = fadd contract <32 x half> %3, %acc ret <32 x half> %add.i } @@ -39,9 +39,9 @@ define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half entry: %0 = bitcast <32 x half> %a to <16 x float> %1 = bitcast <32 x half> %b to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %add.i = fadd <32 x half> %3, %acc + %add.i = fadd contract <32 x half> %3, %acc ret <32 x half> %add.i } @@ -60,9 +60,9 @@ define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half entry: %0 = bitcast <16 x half> %a to <8 x float> %1 = bitcast <16 x half> %b to <8 x float> - %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) + %2 = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) %3 = bitcast <8 x float> %2 to <16 x half> - %add.i = fadd <16 x half> %3, %acc + %add.i = fadd contract <16 x half> %3, 
%acc ret <16 x half> %add.i } @@ -81,9 +81,9 @@ define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half entry: %0 = bitcast <16 x half> %a to <8 x float> %1 = bitcast <16 x half> %b to <8 x float> - %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) + %2 = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) %3 = bitcast <8 x float> %2 to <16 x half> - %add.i = fadd <16 x half> %3, %acc + %add.i = fadd contract <16 x half> %3, %acc ret <16 x half> %add.i } @@ -102,9 +102,9 @@ define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b entry: %0 = bitcast <8 x half> %a to <4 x float> %1 = bitcast <8 x half> %b to <4 x float> - %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) + %2 = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) %3 = bitcast <4 x float> %2 to <8 x half> - %add.i = fadd <8 x half> %3, %acc + %add.i = fadd contract <8 x half> %3, %acc ret <8 x half> %add.i } @@ -123,9 +123,9 @@ define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b entry: %0 = bitcast <8 x half> %a to <4 x float> %1 = bitcast <8 x half> %b to <4 x float> - %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) + %2 = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) %3 = bitcast <4 x float> %2 to <8 x half> - %add.i = fadd <8 x half> %3, %acc + %add.i = fadd contract <8 x half> %3, %acc ret <8 x half> %add.i } @@ -138,9 +138,9 @@ define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, 
<32 x hal entry: %0 = bitcast <32 x half> %a to <16 x float> %1 = bitcast <32 x half> %b to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> , i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> , i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %add.i = fadd <32 x half> %3, %acc + %add.i = fadd contract <32 x half> %3, %acc ret <32 x half> %add.i } @@ -152,9 +152,9 @@ define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x hal entry: %0 = bitcast <32 x half> %a to <16 x float> %1 = bitcast <32 x half> %b to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> , i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> , i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %add.i = fadd <32 x half> %3, %acc + %add.i = fadd contract <32 x half> %3, %acc ret <32 x half> %add.i } @@ -166,9 +166,9 @@ define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x hal entry: %0 = bitcast <16 x half> %a to <8 x float> %1 = bitcast <16 x half> %b to <8 x float> - %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> , i8 -1) + %2 = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> , i8 -1) %3 = bitcast <8 x float> %2 to <16 x half> - %add.i = fadd <16 x half> %3, %acc + %add.i = fadd contract <16 x half> %3, %acc ret <16 x half> %add.i } @@ -180,9 +180,9 @@ define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x hal entry: %0 = bitcast <16 x half> %a to <8 x float> %1 = bitcast <16 x half> %b to <8 x float> - %2 = tail call <8 x float> 
@llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> , i8 -1) + %2 = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> , i8 -1) %3 = bitcast <8 x float> %2 to <16 x half> - %add.i = fadd <16 x half> %3, %acc + %add.i = fadd contract <16 x half> %3, %acc ret <16 x half> %add.i } @@ -194,9 +194,9 @@ define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> % entry: %0 = bitcast <8 x half> %a to <4 x float> %1 = bitcast <8 x half> %b to <4 x float> - %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> , i8 -1) + %2 = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> , i8 -1) %3 = bitcast <4 x float> %2 to <8 x half> - %add.i = fadd <8 x half> %3, %acc + %add.i = fadd contract <8 x half> %3, %acc ret <8 x half> %add.i } @@ -208,9 +208,9 @@ define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> % entry: %0 = bitcast <8 x half> %a to <4 x float> %1 = bitcast <8 x half> %b to <4 x float> - %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> , i8 -1) + %2 = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> , i8 -1) %3 = bitcast <4 x float> %2 to <8 x half> - %add.i = fadd <8 x half> %3, %acc + %add.i = fadd contract <8 x half> %3, %acc ret <8 x half> %add.i } diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll index a509503584649..5d9784aa5d2eb 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll index 43f30da15b20d..b58bae93ed660 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll index 7b142ea170c22..92bdebb34979a 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: @@ -94,13 +94,13 @@ define dso_local <32 x half> 
@test6(<16 x i32> %a, <16 x float> %b) local_unname entry: %0 = xor <16 x i32> %a, splat (i32 -2147483648) %1 = bitcast <16 x i32> %0 to <16 x float> - %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) + %2 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) %3 = bitcast <16 x float> %2 to <32 x half> - %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %b, <16 x float> zeroinitializer, i16 -1, i32 4) + %4 = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %b, <16 x float> zeroinitializer, i16 -1, i32 4) %5 = bitcast <16 x float> %4 to <32 x half> - %6 = fadd <32 x half> %3, %5 + %6 = fadd contract <32 x half> %3, %5 %7 = bitcast <16 x float> %b to <32 x half> - %8 = fadd <32 x half> %6, %7 + %8 = fadd contract <32 x half> %6, %7 ret <32 x half> %8 } diff --git a/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll b/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll index 6291100f42c3d..3ebbf34dd8367 100644 --- a/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll +++ b/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=x86_64-- --start-before=x86-isel -mattr=+avx,+fma %s -o - | FileCheck %s -; RUN: llc -mtriple=x86_64-- --start-before=x86-isel -mattr=+avx,+fma %s -o - -fp-contract=fast | FileCheck %s define double @fma_folding(double %x) { ; CHECK-LABEL: fma_folding: diff --git a/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll b/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll index 638a65d4a8b0b..7542c1b8db327 100644 --- a/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll +++ 
b/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll @@ -1,5 +1,13 @@ ; RUN: llc -mtriple=x86_64 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck %s -; + +; Ensure that static local variable elemnt is placed in abstract subprogram DIE. +; CHECK: DW_TAG_subprogram +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_inline (DW_INL_inlined) +; CHECK-EMPTY: +; CHECK-NEXT: DW_TAG_variable +; CHECK-NEXT: DW_AT_name ("elemnt") + ; CHECK: [[SYM:[a-z0-9]+]]: DW_TAG_formal_parameter ; CHECK: DW_AT_name ("esym") ; CHECK: DW_AT_type ([[TYPE:[a-z0-9]+]] "CHARACTER_1") diff --git a/llvm/test/CodeGen/X86/fma-do-not-commute.ll b/llvm/test/CodeGen/X86/fma-do-not-commute.ll index 0dc8e62c56d0c..1b60c15cf2be0 100644 --- a/llvm/test/CodeGen/X86/fma-do-not-commute.ll +++ b/llvm/test/CodeGen/X86/fma-do-not-commute.ll @@ -1,4 +1,4 @@ -; RUN: llc -fp-contract=fast -mattr=+fma -disable-cgp < %s -o - | FileCheck %s +; RUN: llc -mattr=+fma -disable-cgp < %s -o - | FileCheck %s ; Check that the 2nd and 3rd arguments of fmaXXX231 reg1, reg2, mem3 are not commuted. 
; target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @@ -20,8 +20,8 @@ loop: %sum0 = phi float [ %fma, %loop ], [ %arg, %entry ] %addrVal = load float, ptr %addr, align 4 %addr2Val = load float, ptr %addr2, align 4 - %fmul = fmul float %addrVal, %addr2Val - %fma = fadd float %sum0, %fmul + %fmul = fmul contract float %addrVal, %addr2Val + %fma = fadd contract float %sum0, %fmul br i1 true, label %exit, label %loop exit: diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index dc35c8f8dc657..be5e23cd4cce3 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefixes=FMA,FMA-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefixes=FMA4,FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefixes=FMA4,FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast | FileCheck %s --check-prefixes=AVX512,AVX512-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA,FMA-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx,+fma | FileCheck %s --check-prefixes=FMA,FMA-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefixes=FMA4,FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefixes=FMA4,FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA,FMA-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS ; ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z) @@ -27,8 +27,8 @@ define float @test_f32_fmadd(float %a0, float %a1, float %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul float %a0, %a1 - %res = fadd float %x, %a2 + %x = fmul contract float %a0, %a1 + %res = fadd contract float %x, %a2 ret float %res } @@ -47,8 +47,8 @@ define <4 x float> @test_4f32_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul <4 x float> %a0, %a1 - %res = fadd <4 x float> %x, %a2 + %x = fmul contract <4 x float> %a0, %a1 + %res = fadd contract <4 x float> %x, %a2 ret <4 x float> %res } @@ -67,8 +67,8 @@ define <8 x float> @test_8f32_fmadd(<8 x float> %a0, <8 x float> %a1, <8 x float ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %x = fmul <8 x float> %a0, %a1 - %res = 
fadd <8 x float> %x, %a2 + %x = fmul contract <8 x float> %a0, %a1 + %res = fadd contract <8 x float> %x, %a2 ret <8 x float> %res } @@ -87,8 +87,8 @@ define double @test_f64_fmadd(double %a0, double %a1, double %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul double %a0, %a1 - %res = fadd double %x, %a2 + %x = fmul contract double %a0, %a1 + %res = fadd contract double %x, %a2 ret double %res } @@ -107,8 +107,8 @@ define <2 x double> @test_2f64_fmadd(<2 x double> %a0, <2 x double> %a1, <2 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul <2 x double> %a0, %a1 - %res = fadd <2 x double> %x, %a2 + %x = fmul contract <2 x double> %a0, %a1 + %res = fadd contract <2 x double> %x, %a2 ret <2 x double> %res } @@ -127,8 +127,8 @@ define <4 x double> @test_4f64_fmadd(<4 x double> %a0, <4 x double> %a1, <4 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %x = fmul <4 x double> %a0, %a1 - %res = fadd <4 x double> %x, %a2 + %x = fmul contract <4 x double> %a0, %a1 + %res = fadd contract <4 x double> %x, %a2 ret <4 x double> %res } @@ -151,8 +151,8 @@ define float @test_f32_fmsub(float %a0, float %a1, float %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul float %a0, %a1 - %res = fsub float %x, %a2 + %x = fmul contract float %a0, %a1 + %res = fsub contract float %x, %a2 ret float %res } @@ -171,8 +171,8 @@ define <4 x float> @test_4f32_fmsub(<4 x float> %a0, <4 x float> %a1, <4 x float ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul <4 x float> %a0, %a1 - %res = fsub <4 x float> %x, %a2 + %x = fmul contract <4 x float> %a0, %a1 + %res = fsub contract <4 x float> %x, %a2 ret <4 x float> %res } @@ -191,8 +191,8 @@ define <8 x float> 
@test_8f32_fmsub(<8 x float> %a0, <8 x float> %a1, <8 x float ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %x = fmul <8 x float> %a0, %a1 - %res = fsub <8 x float> %x, %a2 + %x = fmul contract <8 x float> %a0, %a1 + %res = fsub contract <8 x float> %x, %a2 ret <8 x float> %res } @@ -211,8 +211,8 @@ define double @test_f64_fmsub(double %a0, double %a1, double %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul double %a0, %a1 - %res = fsub double %x, %a2 + %x = fmul contract double %a0, %a1 + %res = fsub contract double %x, %a2 ret double %res } @@ -231,8 +231,8 @@ define <2 x double> @test_2f64_fmsub(<2 x double> %a0, <2 x double> %a1, <2 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul <2 x double> %a0, %a1 - %res = fsub <2 x double> %x, %a2 + %x = fmul contract <2 x double> %a0, %a1 + %res = fsub contract <2 x double> %x, %a2 ret <2 x double> %res } @@ -251,8 +251,8 @@ define <4 x double> @test_4f64_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %x = fmul <4 x double> %a0, %a1 - %res = fsub <4 x double> %x, %a2 + %x = fmul contract <4 x double> %a0, %a1 + %res = fsub contract <4 x double> %x, %a2 ret <4 x double> %res } @@ -275,8 +275,8 @@ define float @test_f32_fnmadd(float %a0, float %a1, float %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul float %a0, %a1 - %res = fsub float %a2, %x + %x = fmul contract float %a0, %a1 + %res = fsub contract float %a2, %x ret float %res } @@ -295,8 +295,8 @@ define <4 x float> @test_4f32_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x floa ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = 
fmul <4 x float> %a0, %a1 - %res = fsub <4 x float> %a2, %x + %x = fmul contract <4 x float> %a0, %a1 + %res = fsub contract <4 x float> %a2, %x ret <4 x float> %res } @@ -315,8 +315,8 @@ define <8 x float> @test_8f32_fnmadd(<8 x float> %a0, <8 x float> %a1, <8 x floa ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %x = fmul <8 x float> %a0, %a1 - %res = fsub <8 x float> %a2, %x + %x = fmul contract <8 x float> %a0, %a1 + %res = fsub contract <8 x float> %a2, %x ret <8 x float> %res } @@ -335,8 +335,8 @@ define double @test_f64_fnmadd(double %a0, double %a1, double %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul double %a0, %a1 - %res = fsub double %a2, %x + %x = fmul contract double %a0, %a1 + %res = fsub contract double %a2, %x ret double %res } @@ -355,8 +355,8 @@ define <2 x double> @test_2f64_fnmadd(<2 x double> %a0, <2 x double> %a1, <2 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %x = fmul <2 x double> %a0, %a1 - %res = fsub <2 x double> %a2, %x + %x = fmul contract <2 x double> %a0, %a1 + %res = fsub contract <2 x double> %a2, %x ret <2 x double> %res } @@ -375,8 +375,8 @@ define <4 x double> @test_4f64_fnmadd(<4 x double> %a0, <4 x double> %a1, <4 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %x = fmul <4 x double> %a0, %a1 - %res = fsub <4 x double> %a2, %x + %x = fmul contract <4 x double> %a0, %a1 + %res = fsub contract <4 x double> %a2, %x ret <4 x double> %res } @@ -399,9 +399,9 @@ define float @test_f32_fnmsub(float %a0, float %a1, float %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul float %a0, %a1 - %y = fsub float -0.000000e+00, %x - %res = fsub float %y, %a2 + %x = fmul contract float %a0, %a1 + %y = fsub 
contract float -0.000000e+00, %x + %res = fsub contract float %y, %a2 ret float %res } @@ -420,9 +420,9 @@ define <4 x float> @test_4f32_fnmsub(<4 x float> %a0, <4 x float> %a1, <4 x floa ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul <4 x float> %a0, %a1 - %y = fsub <4 x float> , %x - %res = fsub <4 x float> %y, %a2 + %x = fmul contract <4 x float> %a0, %a1 + %y = fsub contract <4 x float> , %x + %res = fsub contract <4 x float> %y, %a2 ret <4 x float> %res } @@ -441,9 +441,9 @@ define <8 x float> @test_8f32_fnmsub(<8 x float> %a0, <8 x float> %a1, <8 x floa ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %x = fmul <8 x float> %a0, %a1 - %y = fsub <8 x float> , %x - %res = fsub <8 x float> %y, %a2 + %x = fmul contract <8 x float> %a0, %a1 + %y = fsub contract <8 x float> , %x + %res = fsub contract <8 x float> %y, %a2 ret <8 x float> %res } @@ -462,9 +462,9 @@ define double @test_f64_fnmsub(double %a0, double %a1, double %a2) { ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul double %a0, %a1 - %y = fsub double -0.000000e+00, %x - %res = fsub double %y, %a2 + %x = fmul contract double %a0, %a1 + %y = fsub contract double -0.000000e+00, %x + %res = fsub contract double %y, %a2 ret double %res } @@ -483,9 +483,9 @@ define <2 x double> @test_2f64_fnmsub(<2 x double> %a0, <2 x double> %a1, <2 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; AVX512-NEXT: retq - %x = fmul <2 x double> %a0, %a1 - %y = fsub <2 x double> , %x - %res = fsub <2 x double> %y, %a2 + %x = fmul contract <2 x double> %a0, %a1 + %y = fsub contract <2 x double> , %x + %res = fsub contract <2 x double> %y, %a2 ret <2 x double> %res } @@ -504,9 +504,9 @@ define <4 x double> @test_4f64_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x d ; AVX512: # %bb.0: ; AVX512-NEXT: 
vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 ; AVX512-NEXT: retq - %x = fmul <4 x double> %a0, %a1 - %y = fsub <4 x double> , %x - %res = fsub <4 x double> %y, %a2 + %x = fmul contract <4 x double> %a0, %a1 + %y = fsub contract <4 x double> , %x + %res = fsub contract <4 x double> %y, %a2 ret <4 x double> %res } @@ -530,8 +530,8 @@ define <4 x float> @test_4f32_fmadd_load(ptr %a0, <4 x float> %a1, <4 x float> % ; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 ; AVX512-NEXT: retq %x = load <4 x float>, ptr %a0 - %y = fmul <4 x float> %x, %a1 - %res = fadd <4 x float> %y, %a2 + %y = fmul contract <4 x float> %x, %a1 + %res = fadd contract <4 x float> %y, %a2 ret <4 x float> %res } @@ -551,8 +551,8 @@ define <2 x double> @test_2f64_fmsub_load(ptr %a0, <2 x double> %a1, <2 x double ; AVX512-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 ; AVX512-NEXT: retq %x = load <2 x double>, ptr %a0 - %y = fmul <2 x double> %x, %a1 - %res = fsub <2 x double> %y, %a2 + %y = fmul contract <2 x double> %x, %a1 + %res = fsub contract <2 x double> %y, %a2 ret <2 x double> %res } @@ -593,8 +593,8 @@ define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %a, %y + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %a, %y ret <4 x float> %m } @@ -631,8 +631,8 @@ define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %y, %a + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } @@ -669,8 +669,8 @@ define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float ; AVX512-NOINFS: # %bb.0: ; 
AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %y, %a + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } @@ -707,8 +707,8 @@ define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %a, %y + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %a, %y ret <4 x float> %m } @@ -745,8 +745,8 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %y, %a + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } @@ -783,8 +783,8 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x fl ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <4 x float> %x, - %m = fmul <4 x float> %y, %a + %a = fadd contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } @@ -824,8 +824,8 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %s, %y + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } @@ -865,8 +865,8 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; 
AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -906,8 +906,8 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -947,8 +947,8 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %s, %y + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } @@ -988,8 +988,8 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1029,8 +1029,8 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x fl ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> , %x - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> , %x + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1067,8 +1067,8 @@ define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 
x float> %s, %y + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } @@ -1105,8 +1105,8 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) { ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1143,8 +1143,8 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1181,8 +1181,8 @@ define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %s, %y + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } @@ -1219,8 +1219,8 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } @@ -1257,8 +1257,8 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x fl ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <4 x float> %x, - %m = fmul <4 x float> %y, %s + %s = fsub contract <4 x float> %x, + %m = fmul contract <4 x 
float> %y, %s ret <4 x float> %m } @@ -1308,10 +1308,10 @@ define float @test_f32_interp(float %x, float %y, float %t) { ; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 ; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz float 1.0, %t - %tx = fmul nsz float %x, %t - %ty = fmul nsz float %y, %t1 - %r = fadd nsz float %tx, %ty + %t1 = fsub contract nsz float 1.0, %t + %tx = fmul contract nsz float %x, %t + %ty = fmul contract nsz float %y, %t1 + %r = fadd contract nsz float %tx, %ty ret float %r } @@ -1357,10 +1357,10 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <4 x float> , %t - %tx = fmul nsz <4 x float> %x, %t - %ty = fmul nsz <4 x float> %y, %t1 - %r = fadd nsz <4 x float> %tx, %ty + %t1 = fsub contract nsz <4 x float> , %t + %tx = fmul contract nsz <4 x float> %x, %t + %ty = fmul contract nsz <4 x float> %y, %t1 + %r = fadd contract nsz <4 x float> %tx, %ty ret <4 x float> %r } @@ -1406,10 +1406,10 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <8 x float> , %t - %tx = fmul nsz <8 x float> %x, %t - %ty = fmul nsz <8 x float> %y, %t1 - %r = fadd nsz <8 x float> %tx, %ty + %t1 = fsub contract nsz <8 x float> , %t + %tx = fmul contract nsz <8 x float> %x, %t + %ty = fmul contract nsz <8 x float> %y, %t1 + %r = fadd contract nsz <8 x float> %tx, %ty ret <8 x float> %r } @@ -1455,10 +1455,10 @@ define double @test_f64_interp(double %x, double %y, double %t) { ; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 ; 
AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz double 1.0, %t - %tx = fmul nsz double %x, %t - %ty = fmul nsz double %y, %t1 - %r = fadd nsz double %tx, %ty + %t1 = fsub contract nsz double 1.0, %t + %tx = fmul contract nsz double %x, %t + %ty = fmul contract nsz double %y, %t1 + %r = fadd contract nsz double %tx, %ty ret double %r } @@ -1507,10 +1507,10 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <2 x double> , %t - %tx = fmul nsz <2 x double> %x, %t - %ty = fmul nsz <2 x double> %y, %t1 - %r = fadd nsz <2 x double> %tx, %ty + %t1 = fsub contract nsz <2 x double> , %t + %tx = fmul contract nsz <2 x double> %x, %t + %ty = fmul contract nsz <2 x double> %y, %t1 + %r = fadd contract nsz <2 x double> %tx, %ty ret <2 x double> %r } @@ -1556,10 +1556,10 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <4 x double> , %t - %tx = fmul nsz <4 x double> %x, %t - %ty = fmul nsz <4 x double> %y, %t1 - %r = fadd nsz <4 x double> %tx, %ty + %t1 = fsub contract nsz <4 x double> , %t + %tx = fmul contract nsz <4 x double> %x, %t + %ty = fmul contract nsz <4 x double> %y, %t1 + %r = fadd contract nsz <4 x double> %tx, %ty ret <4 x double> %r } @@ -1603,9 +1603,9 @@ define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, < ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 ; AVX512-NEXT: retq - %mul = fmul nsz <4 x double> %a0, %a1 - %sub = fsub nsz <4 x double> %mul, %a2 - %neg = fsub nsz <4 x double> , %sub + %mul = fmul 
contract nsz <4 x double> %a0, %a1 + %sub = fsub contract nsz <4 x double> %mul, %a2 + %neg = fsub contract nsz <4 x double> , %sub ret <4 x double> %neg } @@ -1817,10 +1817,10 @@ define double @fadd_fma_fmul_1(double %a, double %b, double %c, double %d, doubl ; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4 ; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %m1 = fmul fast double %a, %b - %m2 = fmul fast double %c, %d - %a1 = fadd fast double %m1, %m2 - %a2 = fadd fast double %a1, %n1 + %m1 = fmul contract fast double %a, %b + %m2 = fmul contract fast double %c, %d + %a1 = fadd contract fast double %m1, %m2 + %a2 = fadd contract fast double %a1, %n1 ret double %a2 } @@ -1846,10 +1846,10 @@ define float @fadd_fma_fmul_fmf(float %a, float %b, float %c, float %d, float %n ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4 ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: retq - %m1 = fmul float %a, %b - %m2 = fmul float %c, %d + %m1 = fmul contract float %a, %b + %m2 = fmul contract float %c, %d %a1 = fadd contract float %m1, %m2 - %a2 = fadd reassoc float %n0, %a1 + %a2 = fadd contract reassoc float %n0, %a1 ret float %a2 } @@ -1876,8 +1876,8 @@ define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) ; AVX512-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm4, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul float %a, %b - %m2 = fmul float %c, %d + %m1 = fmul contract float %a, %b + %m2 = fmul contract float %c, %d %a1 = fadd contract float %m1, %m2 %a2 = fadd contract float %n0, %a1 ret float %a2 @@ -1911,13 +1911,13 @@ define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x do ; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 ; AVX512-NEXT: vmovapd %xmm2, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul fast <2 x double> %x1, %x2 - %m2 = fmul fast <2 x double> %x3, %x4 - %m3 = fmul fast <2 x 
double> %x5, %x6 - %m4 = fmul fast <2 x double> %x7, %x8 - %a1 = fadd fast <2 x double> %m1, %m2 - %a2 = fadd fast <2 x double> %m3, %m4 - %a3 = fadd fast <2 x double> %a1, %a2 + %m1 = fmul contract fast <2 x double> %x1, %x2 + %m2 = fmul contract fast <2 x double> %x3, %x4 + %m3 = fmul contract fast <2 x double> %x5, %x6 + %m4 = fmul contract fast <2 x double> %x7, %x8 + %a1 = fadd contract fast <2 x double> %m1, %m2 + %a2 = fadd contract fast <2 x double> %m3, %m4 + %a3 = fadd contract fast <2 x double> %a1, %a2 ret <2 x double> %a3 } @@ -1947,11 +1947,11 @@ define float @fadd_fma_fmul_extra_use_1(float %a, float %b, float %c, float %d, ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm0 ; AVX512-NEXT: vaddss %xmm2, %xmm4, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul fast float %a, %b + %m1 = fmul contract fast float %a, %b store float %m1, ptr %p - %m2 = fmul fast float %c, %d - %a1 = fadd fast float %m1, %m2 - %a2 = fadd fast float %n0, %a1 + %m2 = fmul contract fast float %c, %d + %a1 = fadd contract fast float %m1, %m2 + %a2 = fadd contract fast float %n0, %a1 ret float %a2 } @@ -1981,11 +1981,11 @@ define float @fadd_fma_fmul_extra_use_2(float %a, float %b, float %c, float %d, ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: vaddss %xmm0, %xmm4, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul fast float %a, %b - %m2 = fmul fast float %c, %d + %m1 = fmul contract fast float %a, %b + %m2 = fmul contract fast float %c, %d store float %m2, ptr %p - %a1 = fadd fast float %m1, %m2 - %a2 = fadd fast float %n0, %a1 + %a1 = fadd contract fast float %m1, %m2 + %a2 = fadd contract fast float %n0, %a1 ret float %a2 } @@ -2015,10 +2015,10 @@ define float @fadd_fma_fmul_extra_use_3(float %a, float %b, float %c, float %d, ; AVX512-NEXT: vmovss %xmm2, (%rdi) ; AVX512-NEXT: vaddss %xmm2, %xmm4, %xmm0 ; AVX512-NEXT: retq - %m1 = fmul fast float %a, %b - %m2 = fmul fast float %c, %d - %a1 = fadd fast float %m1, %m2 + %m1 = fmul contract fast float 
%a, %b + %m2 = fmul contract fast float %c, %d + %a1 = fadd contract fast float %m1, %m2 store float %a1, ptr %p - %a2 = fadd fast float %n0, %a1 + %a2 = fadd contract fast float %n0, %a1 ret float %a2 } diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll index d910110467ee0..f0af3945ae959 100644 --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA --check-prefix=FMA-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=FMA --check-prefix=FMA-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx,+fma4,+fma | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS ; ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z) @@ -29,8 +29,8 @@ define <16 x float> @test_16f32_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fadd <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %res = fadd contract <16 x float> %x, %a2 ret <16 x float> %res } @@ -51,8 +51,8 @@ define <8 x double> @test_8f64_fmadd(<8 x double> %a0, <8 x double> %a1, <8 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fadd <8 x double> %x, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %res = fadd contract <8 x double> %x, %a2 ret <8 x double> %res } @@ -77,8 +77,8 @@ define <16 x float> @test_16f32_fmsub(<16 x float> %a0, <16 x float> %a1, <16 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %x = fmul <16 x float> 
%a0, %a1 - %res = fsub <16 x float> %x, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %res = fsub contract <16 x float> %x, %a2 ret <16 x float> %res } @@ -99,8 +99,8 @@ define <8 x double> @test_8f64_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x do ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fsub <8 x double> %x, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %res = fsub contract <8 x double> %x, %a2 ret <8 x double> %res } @@ -125,8 +125,8 @@ define <16 x float> @test_16f32_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %res = fsub <16 x float> %a2, %x + %x = fmul contract <16 x float> %a0, %a1 + %res = fsub contract <16 x float> %a2, %x ret <16 x float> %res } @@ -147,8 +147,8 @@ define <8 x double> @test_8f64_fnmadd(<8 x double> %a0, <8 x double> %a1, <8 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %x = fmul <8 x double> %a0, %a1 - %res = fsub <8 x double> %a2, %x + %x = fmul contract <8 x double> %a0, %a1 + %res = fsub contract <8 x double> %a2, %x ret <8 x double> %res } @@ -173,9 +173,9 @@ define <16 x float> @test_16f32_fnmsub(<16 x float> %a0, <16 x float> %a1, <16 x ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %x = fmul <16 x float> %a0, %a1 - %y = fsub <16 x float> , %x - %res = fsub <16 x float> %y, %a2 + %x = fmul contract <16 x float> %a0, %a1 + %y = fsub contract <16 x float> , %x + %res = fsub contract <16 x float> %y, %a2 ret <16 x float> %res } @@ -196,9 +196,9 @@ define <8 x double> @test_8f64_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x d ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %x = fmul <8 x double> %a0, 
%a1 - %y = fsub <8 x double> , %x - %res = fsub <8 x double> %y, %a2 + %x = fmul contract <8 x double> %a0, %a1 + %y = fsub contract <8 x double> , %x + %res = fsub contract <8 x double> %y, %a2 ret <8 x double> %res } @@ -224,8 +224,8 @@ define <16 x float> @test_16f32_fmadd_load(ptr %a0, <16 x float> %a1, <16 x floa ; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1 ; AVX512-NEXT: retq %x = load <16 x float>, ptr %a0 - %y = fmul <16 x float> %x, %a1 - %res = fadd <16 x float> %y, %a2 + %y = fmul contract <16 x float> %x, %a1 + %res = fadd contract <16 x float> %y, %a2 ret <16 x float> %res } @@ -247,8 +247,8 @@ define <8 x double> @test_8f64_fmsub_load(ptr %a0, <8 x double> %a1, <8 x double ; AVX512-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * mem) - zmm1 ; AVX512-NEXT: retq %x = load <8 x double>, ptr %a0 - %y = fmul <8 x double> %x, %a1 - %res = fsub <8 x double> %y, %a2 + %y = fmul contract <8 x double> %x, %a1 + %res = fsub contract <8 x double> %y, %a2 ret <8 x double> %res } @@ -297,8 +297,8 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> % ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <16 x float> %x, - %m = fmul <16 x float> %a, %y + %a = fadd contract <16 x float> %x, + %m = fmul contract <16 x float> %a, %y ret <16 x float> %m } @@ -343,8 +343,8 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <8 x double> %x, - %m = fmul <8 x double> %y, %a + %a = fadd contract <8 x double> %x, + %m = fmul contract <8 x double> %y, %a ret <8 x double> %m } @@ -389,8 +389,8 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 ; 
AVX512-NOINFS-NEXT: retq - %a = fadd <16 x float> %x, - %m = fmul <16 x float> %a, %y + %a = fadd contract <16 x float> %x, + %m = fmul contract <16 x float> %a, %y ret <16 x float> %m } @@ -435,8 +435,8 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %a = fadd <8 x double> %x, - %m = fmul <8 x double> %y, %a + %a = fadd contract <8 x double> %x, + %m = fmul contract <8 x double> %y, %a ret <8 x double> %m } @@ -482,8 +482,8 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> % ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <16 x float> , %x - %m = fmul <16 x float> %s, %y + %s = fsub contract <16 x float> , %x + %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } @@ -529,8 +529,8 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <8 x double> , %x - %m = fmul <8 x double> %y, %s + %s = fsub contract <8 x double> , %x + %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } @@ -576,8 +576,8 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <16 x float> , %x - %m = fmul <16 x float> %s, %y + %s = fsub contract <16 x float> , %x + %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } @@ -623,8 +623,8 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <8 x double> 
, %x - %m = fmul <8 x double> %y, %s + %s = fsub contract <8 x double> , %x + %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } @@ -669,8 +669,8 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> % ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <16 x float> %x, - %m = fmul <16 x float> %s, %y + %s = fsub contract <16 x float> %x, + %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } @@ -715,8 +715,8 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <8 x double> %x, - %m = fmul <8 x double> %y, %s + %s = fsub contract <8 x double> %x, + %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } @@ -761,8 +761,8 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <16 x float> %x, - %m = fmul <16 x float> %s, %y + %s = fsub contract <16 x float> %x, + %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } @@ -807,8 +807,8 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> ; AVX512-NOINFS: # %bb.0: ; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq - %s = fsub <8 x double> %x, - %m = fmul <8 x double> %y, %s + %s = fsub contract <8 x double> %x, + %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } @@ -868,10 +868,10 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 ; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <16 x float> , %t - %tx 
= fmul nsz <16 x float> %x, %t - %ty = fmul nsz <16 x float> %y, %t1 - %r = fadd nsz <16 x float> %tx, %ty + %t1 = fsub contract nsz <16 x float> , %t + %tx = fmul contract nsz <16 x float> %x, %t + %ty = fmul contract nsz <16 x float> %y, %t1 + %r = fadd contract nsz <16 x float> %tx, %ty ret <16 x float> %r } @@ -927,10 +927,10 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 ; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 ; AVX512-NOINFS-NEXT: retq - %t1 = fsub nsz <8 x double> , %t - %tx = fmul nsz <8 x double> %x, %t - %ty = fmul nsz <8 x double> %y, %t1 - %r = fadd nsz <8 x double> %tx, %ty + %t1 = fsub contract nsz <8 x double> , %t + %tx = fmul contract nsz <8 x double> %x, %t + %ty = fmul contract nsz <8 x double> %y, %t1 + %r = fadd contract nsz <8 x double> %tx, %ty ret <8 x double> %r } @@ -955,9 +955,9 @@ define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %mul = fmul nsz <16 x float> %a0, %a1 - %add = fadd nsz <16 x float> %mul, %a2 - %neg = fsub nsz <16 x float> , %add + %mul = fmul contract nsz <16 x float> %a0, %a1 + %add = fadd contract nsz <16 x float> %mul, %a2 + %neg = fsub contract nsz <16 x float> , %add ret <16 x float> %neg } @@ -978,9 +978,9 @@ define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, < ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %mul = fmul nsz <8 x double> %a0, %a1 - %sub = fsub nsz <8 x double> %mul, %a2 - %neg = fsub nsz <8 x double> , %sub + %mul = fmul contract nsz <8 x double> %a0, %a1 + %sub = fsub contract nsz <8 x double> %mul, %a2 + %neg = fsub contract nsz <8 x double> , %sub ret <8 x double> %neg } @@ -1001,10 +1001,10 @@ define <16 x float> 
@test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %mul = fmul nsz <16 x float> %a0, %a1 - %neg0 = fsub nsz <16 x float> , %mul - %add = fadd nsz <16 x float> %neg0, %a2 - %neg1 = fsub nsz <16 x float> , %add + %mul = fmul contract nsz <16 x float> %a0, %a1 + %neg0 = fsub contract nsz <16 x float> , %mul + %add = fadd contract nsz <16 x float> %neg0, %a2 + %neg1 = fsub contract nsz <16 x float> , %add ret <16 x float> %neg1 } @@ -1025,10 +1025,10 @@ define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512-NEXT: retq - %mul = fmul nsz <8 x double> %a0, %a1 - %neg0 = fsub nsz <8 x double> , %mul - %sub = fsub nsz <8 x double> %neg0, %a2 - %neg1 = fsub nsz <8 x double> , %sub + %mul = fmul contract nsz <8 x double> %a0, %a1 + %neg0 = fsub contract nsz <8 x double> , %mul + %sub = fsub contract nsz <8 x double> %neg0, %a2 + %neg1 = fsub contract nsz <8 x double> , %sub ret <8 x double> %neg1 } @@ -1108,8 +1108,8 @@ define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %m = fmul nsz <16 x float> %x, %y - %n = fsub <16 x float> , %m + %m = fmul contract nsz <16 x float> %x, %y + %n = fsub contract <16 x float> , %m ret <16 x float> %n } @@ -1133,8 +1133,8 @@ define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 { ; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; AVX512-NEXT: retq - %m = fmul nsz <8 x double> %x, %y - %n = fsub <8 x double> , %m + %m = fmul contract nsz <8 x double> %x, %y + %n = fsub contract <8 x double> , %m ret <8 x double> %n } @@ -1162,8 +1162,8 @@ define <8 x double> 
@test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> % ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512-NEXT: retq - %m = fmul <8 x double> %x, %y - %n = fsub <8 x double> , %m + %m = fmul contract <8 x double> %x, %y + %n = fsub contract <8 x double> , %m ret <8 x double> %n } diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index d59b12c6d1231..81529aff39ff1 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-NO-FASTFMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-ONLY-AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-SKX declare i16 @llvm.umax.i16(i16, i16) declare i64 @llvm.umin.i64(i64, i64) @@ -23,18 +23,18 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) { ; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow2_4xfloat: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] -; CHECK-NO-FASTFMA-NEXT: 
vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow2_4xfloat: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_4xfloat: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-ONLY-AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow2_4xfloat: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %p2 = shl <4 x i32> , %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> %r = fmul <4 x float> , %p2_f @@ -371,34 +371,34 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow2_8xhalf: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; CHECK-NO-FASTFMA-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] -; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: 
vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vzeroupper -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow2_8xhalf: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-FMA-NEXT: vzeroupper -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_8xhalf: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-ONLY-AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] +; CHECK-ONLY-AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vzeroupper +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow2_8xhalf: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] +; CHECK-SKX-NEXT: vpsllvw %xmm0, %xmm1, %xmm0 +; 
CHECK-SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-SKX-NEXT: vcvtph2ps %xmm0, %ymm0 +; CHECK-SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-SKX-NEXT: vzeroupper +; CHECK-SKX-NEXT: retq %p2 = shl <8 x i16> , %i %p2_f = uitofp <8 x i16> %p2 to <8 x half> %r = fmul <8 x half> , %p2_f @@ -656,19 +656,19 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] -; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow2_8xhalf: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] -; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow2_8xhalf: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpsllw $10, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] +; CHECK-ONLY-AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow2_8xhalf: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpsllw $10, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] +; CHECK-SKX-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %p2 = shl <8 x i16> , %i %p2_f = uitofp <8 x i16> %p2 to <8 x half> %r = fdiv <8 x half> , %p2_f @@ -882,21 
+882,21 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rsi, %rcx -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rdi -; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: shlxq %rsi, %rdi, %rax -; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rsi, %rcx +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shlq %cl, %rdi +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: shlxq %rsi, %rdi, %rax +; CHECK-SKX-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i64 %v, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.000000e+00, %conv @@ -935,26 +935,26 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] 
-; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] -; CHECK-NO-FASTFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtqq2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] +; CHECK-ONLY-AVX512F-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 +; CHECK-ONLY-AVX512F-NEXT: vmovq %xmm0, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; CHECK-ONLY-AVX512F-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; CHECK-SKX-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: vcvtqq2ps %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x float> %mul = fmul <2 x float> , %conv @@ -974,17 +974,17 @@ 
define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv @@ -1007,21 +1007,59 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] -; CHECK-NO-FASTFMA-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] -; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 -; CHECK-FMA-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: 
vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] +; CHECK-ONLY-AVX512F-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm2 +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] +; CHECK-SKX-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 +; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 +; CHECK-SKX-NEXT: retq + %shl = shl nsw nuw <4 x i32> , %cnt + %conv = uitofp <4 x i32> %shl to <4 x float> + %mul = fmul contract <4 x float> , %conv + %res = fadd contract <4 x float> %mul, %add + ret <4 x float> %res +} + +define <4 x float> @fmul_pow_shl_cnt_vec_no_fma(<4 x i32> %cnt, <4 x float> %add) nounwind { +; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_no_fma: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: pslld $23, %xmm0 +; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: addps %xmm1, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_no_fma: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_no_fma: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-ONLY-AVX512F-NEXT: 
vpaddd %xmm2, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_no_fma: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nsw nuw <4 x i32> , %cnt %conv = uitofp <4 x i32> %shl to <4 x float> %mul = fmul <4 x float> , %conv @@ -1131,34 +1169,34 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2-NEXT: addq $56, %rsp ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NO-FASTFMA-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] -; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NO-FASTFMA-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] -; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vzeroupper -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] -; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, 
%xmm0 -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-FMA-NEXT: vzeroupper -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-ONLY-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] +; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; CHECK-ONLY-AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vzeroupper +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; CHECK-SKX-NEXT: vpsllvw %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-SKX-NEXT: vcvtph2ps %xmm0, %ymm0 +; CHECK-SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-SKX-NEXT: vzeroupper +; CHECK-SKX-NEXT: retq %shl = shl nsw nuw <2 x i16> , %cnt %conv = uitofp <2 x i16> %shl to <2 x half> %mul = fmul <2 x half> , %conv @@ -1195,23 +1233,23 @@ define double 
@fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shlq %cl, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxq %rdi, %rax, %rax +; CHECK-SKX-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.745314e+288, %conv @@ -1295,15 +1333,15 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rax ; CHECK-SSE-NEXT: testq %rax, %rax -; 
CHECK-SSE-NEXT: js .LBB23_1 +; CHECK-SSE-NEXT: js .LBB24_1 ; CHECK-SSE-NEXT: # %bb.2: ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-SSE-NEXT: jmp .LBB23_3 -; CHECK-SSE-NEXT: .LBB23_1: +; CHECK-SSE-NEXT: jmp .LBB24_3 +; CHECK-SSE-NEXT: .LBB24_1: ; CHECK-SSE-NEXT: shrq %rax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1 -; CHECK-SSE-NEXT: .LBB23_3: +; CHECK-SSE-NEXT: .LBB24_3: ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq @@ -1315,38 +1353,38 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax ; CHECK-AVX2-NEXT: testq %rax, %rax -; CHECK-AVX2-NEXT: js .LBB23_1 +; CHECK-AVX2-NEXT: js .LBB24_1 ; CHECK-AVX2-NEXT: # %bb.2: ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-AVX2-NEXT: jmp .LBB23_3 -; CHECK-AVX2-NEXT: .LBB23_1: +; CHECK-AVX2-NEXT: jmp .LBB24_3 +; CHECK-AVX2-NEXT: .LBB24_1: ; CHECK-AVX2-NEXT: shrq %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: .LBB23_3: +; CHECK-AVX2-NEXT: .LBB24_3: ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; CHECK-NO-FASTFMA-NEXT: movl $8, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $8, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, 
%rax -; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-ONLY-AVX512F-NEXT: movl $8, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shlq %cl, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $8, %eax +; CHECK-SKX-NEXT: shlxq %rdi, %rax, %rax +; CHECK-SKX-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl i64 8, %cnt %conv = uitofp i64 %shl to float %mul = fdiv float -9.000000e+00, %conv @@ -1376,25 +1414,25 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; CHECK-NO-FASTFMA-NEXT: movl $8, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $8, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtsi2ss %rax, 
%xmm15, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_neg_int: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-ONLY-AVX512F-NEXT: movl $8, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shlq %cl, %rax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_neg_int: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $8, %eax +; CHECK-SKX-NEXT: shlxq %rdi, %rax, %rax +; CHECK-SKX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl i64 8, %cnt %conv = sitofp i64 %shl to float %mul = fdiv float -9.000000e+00, %conv @@ -1460,31 +1498,31 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-AVX2-NEXT: popq %rax ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: -; CHECK-FMA: # 
%bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movl %edi, %ecx +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-ONLY-AVX512F-NEXT: shll %cl, %eax +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxl %edi, %eax, %eax +; CHECK-SKX-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to half %mul = fdiv half 0xH7000, %conv @@ -1571,33 +1609,33 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-AVX2-NEXT: popq %rax ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; 
CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movl %edi, %ecx +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-ONLY-AVX512F-NEXT: shll %cl, %eax +; CHECK-ONLY-AVX512F-NEXT: movzwl %ax, %eax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxl %edi, %eax, %eax +; CHECK-SKX-NEXT: movzwl %ax, %eax +; CHECK-SKX-NEXT: vcvtsi2ss %eax, 
%xmm15, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH4000, %conv @@ -1653,25 +1691,25 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movl %edi, %ecx +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-ONLY-AVX512F-NEXT: shll %cl, %eax +; CHECK-ONLY-AVX512F-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: +; 
CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxl %edi, %eax, %eax +; CHECK-SKX-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to float %mul = fdiv float 0x3a1fffff00000000, %conv diff --git a/llvm/test/CodeGen/X86/isel-ceil.ll b/llvm/test/CodeGen/X86/isel-ceil.ll index c82cfebd4814d..21df3f1160003 100644 --- a/llvm/test/CodeGen/X86/isel-ceil.ll +++ b/llvm/test/CodeGen/X86/isel-ceil.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 ; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 define float @ceil_f32(float %a) nounwind readnone { ; DAG-X64-LABEL: ceil_f32: @@ -29,7 +29,19 @@ define float @ceil_f32(float %a) nounwind readnone { ; ; GISEL-X64-LABEL: ceil_f32: ; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: jmp ceilf@PLT # TAILCALL +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: callq ceilf +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: ceil_f32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl %eax, (%esp) +; GISEL-X86-NEXT: calll ceilf +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call float 
@llvm.ceil.f32(float %a) ret float %c } @@ -57,7 +69,24 @@ define double @ceil_f64(double %a) nounwind readnone { ; ; GISEL-X64-LABEL: ceil_f64: ; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: jmp ceil@PLT # TAILCALL +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: callq ceil +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: ceil_f64: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl 4(%eax), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: addl %esp, %edx +; GISEL-X86-NEXT: movl %ecx, (%esp) +; GISEL-X86-NEXT: movl %eax, 4(%edx) +; GISEL-X86-NEXT: calll ceil +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call double @llvm.ceil.f64(double %a) ret double %c } @@ -86,10 +115,18 @@ define x86_fp80 @ceil_f80(x86_fp80 %a) nounwind readnone { ; GISEL-X64-NEXT: subq $24, %rsp ; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) ; GISEL-X64-NEXT: fstpt (%rsp) -; GISEL-X64-NEXT: callq ceill@PLT +; GISEL-X64-NEXT: callq ceill ; GISEL-X64-NEXT: addq $24, %rsp ; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: ceil_f80: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fstpt (%esp) +; GISEL-X86-NEXT: calll ceill +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call x86_fp80 @llvm.ceil.f80(x86_fp80 %a) ret x86_fp80 %c } - diff --git a/llvm/test/CodeGen/X86/isel-floor.ll b/llvm/test/CodeGen/X86/isel-floor.ll index 675925b611263..66eeee89169ba 100644 --- a/llvm/test/CodeGen/X86/isel-floor.ll +++ b/llvm/test/CodeGen/X86/isel-floor.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 ; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-linux-gnu 
-global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 define float @floor_f32(float %a) nounwind readnone { ; DAG-X64-LABEL: floor_f32: @@ -29,7 +29,19 @@ define float @floor_f32(float %a) nounwind readnone { ; ; GISEL-X64-LABEL: floor_f32: ; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: jmp floorf@PLT # TAILCALL +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: callq floorf +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: floor_f32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl %eax, (%esp) +; GISEL-X86-NEXT: calll floorf +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call float @llvm.floor.f32(float %a) ret float %c } @@ -57,7 +69,24 @@ define double @floor_f64(double %a) nounwind readnone { ; ; GISEL-X64-LABEL: floor_f64: ; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: jmp floor@PLT # TAILCALL +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: callq floor +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: floor_f64: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl 4(%eax), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: addl %esp, %edx +; GISEL-X86-NEXT: movl %ecx, (%esp) +; GISEL-X86-NEXT: movl %eax, 4(%edx) +; GISEL-X86-NEXT: calll floor +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call double @llvm.floor.f64(double %a) ret double %c } @@ -86,10 +115,18 @@ define x86_fp80 @floor_f80(x86_fp80 %a) nounwind readnone { 
; GISEL-X64-NEXT: subq $24, %rsp ; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) ; GISEL-X64-NEXT: fstpt (%rsp) -; GISEL-X64-NEXT: callq floorl@PLT +; GISEL-X64-NEXT: callq floorl ; GISEL-X64-NEXT: addq $24, %rsp ; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: floor_f80: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fstpt (%esp) +; GISEL-X86-NEXT: calll floorl +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call x86_fp80 @llvm.floor.f80(x86_fp80 %a) ret x86_fp80 %c } - diff --git a/llvm/test/CodeGen/X86/isel-ftrunc.ll b/llvm/test/CodeGen/X86/isel-ftrunc.ll index 9bf06193961a3..dcdb016d29aca 100644 --- a/llvm/test/CodeGen/X86/isel-ftrunc.ll +++ b/llvm/test/CodeGen/X86/isel-ftrunc.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 ; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 define float @trunc_f32(float %a) nounwind readnone { ; DAG-X64-LABEL: trunc_f32: @@ -29,7 +29,19 @@ define float @trunc_f32(float %a) nounwind readnone { ; ; GISEL-X64-LABEL: trunc_f32: ; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: jmp truncf@PLT # TAILCALL +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: callq truncf +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: trunc_f32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl %eax, (%esp) +; GISEL-X86-NEXT: calll truncf +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call float @llvm.trunc.f32(float %a) ret float %c } @@ -57,7 +69,24 @@ define double @trunc_f64(double %a) nounwind readnone { ; ; GISEL-X64-LABEL: trunc_f64: ; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: jmp trunc@PLT # TAILCALL +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: callq trunc +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: trunc_f64: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl 4(%eax), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: addl %esp, %edx +; GISEL-X86-NEXT: movl %ecx, (%esp) +; GISEL-X86-NEXT: movl %eax, 4(%edx) +; GISEL-X86-NEXT: calll trunc +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call double @llvm.trunc.f64(double %a) ret double %c } @@ -86,10 +115,18 @@ define x86_fp80 @trunc_f80(x86_fp80 %a) nounwind readnone { ; GISEL-X64-NEXT: subq $24, %rsp ; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) ; GISEL-X64-NEXT: fstpt (%rsp) -; GISEL-X64-NEXT: callq truncl@PLT +; GISEL-X64-NEXT: callq truncl ; GISEL-X64-NEXT: addq $24, %rsp ; GISEL-X64-NEXT: retq +; +; GISEL-X86-LABEL: trunc_f80: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fstpt (%esp) +; GISEL-X86-NEXT: calll truncl +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl %c = call x86_fp80 @llvm.trunc.f80(x86_fp80 %a) ret x86_fp80 %c } - diff --git a/llvm/test/CodeGen/X86/isel-smax.ll b/llvm/test/CodeGen/X86/isel-smax.ll new file mode 100644 index 0000000000000..9c9a48e3a1b3e --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-smax.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-linux-gnu 
| FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 + +define i8 @smax_i8(i8 %a, i8 %b) nounwind readnone { +; X64-LABEL: smax_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: smax_i8: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movzbl %dil, %ecx +; FASTISEL-X64-NEXT: movzbl %sil, %eax +; FASTISEL-X64-NEXT: cmpb %al, %cl +; FASTISEL-X64-NEXT: cmovgl %ecx, %eax +; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: smax_i8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: jg .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB0_2: +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: smax_i8: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpb %cl, %al +; FASTISEL-X86-NEXT: jg .LBB0_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB0_2: +; FASTISEL-X86-NEXT: retl + %ret = call i8 @llvm.smax.i8(i8 %a, i8 %b) + ret i8 %ret +} +define i16 @smax_i16(i16 %a, i16 %b) nounwind readnone { +; X64-LABEL: smax_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: # kill: def 
$ax killed $ax killed $eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: smax_i16: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movl %esi, %eax +; FASTISEL-X64-NEXT: cmpw %ax, %di +; FASTISEL-X64-NEXT: cmovgl %edi, %eax +; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: smax_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: jg .LBB1_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB1_2: +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: smax_i16: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpw %cx, %ax +; FASTISEL-X86-NEXT: jg .LBB1_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB1_2: +; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax +; FASTISEL-X86-NEXT: retl + %ret = call i16 @llvm.smax.i16(i16 %a, i16 %b) + ret i16 %ret +} +define i32 @smax_i32(i32 %a, i32 %b) nounwind readnone { +; X64-LABEL: smax_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: smax_i32: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movl %esi, %eax +; FASTISEL-X64-NEXT: cmpl %esi, %edi +; FASTISEL-X64-NEXT: cmovgl %edi, %eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: smax_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jg .LBB2_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB2_2: +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: smax_i32: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpl %ecx, %eax 
+; FASTISEL-X86-NEXT: jg .LBB2_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB2_2: +; FASTISEL-X86-NEXT: retl + %ret = call i32 @llvm.smax.i32(i32 %a, i32 %b) + ret i32 %ret +} +define i64 @smax_i64(i64 %a, i64 %b) nounwind readnone { +; X64-LABEL: smax_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovgq %rdi, %rax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: smax_i64: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movq %rsi, %rax +; FASTISEL-X64-NEXT: cmpq %rsi, %rdi +; FASTISEL-X64-NEXT: cmovgq %rdi, %rax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: smax_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: jl .LBB3_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: .LBB3_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: smax_i64: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: pushl %edi +; FASTISEL-X86-NEXT: pushl %esi +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpl %eax, %esi +; FASTISEL-X86-NEXT: movl %ecx, %edi +; FASTISEL-X86-NEXT: sbbl %edx, %edi +; FASTISEL-X86-NEXT: jl .LBB3_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %esi, %eax +; FASTISEL-X86-NEXT: movl %ecx, %edx +; FASTISEL-X86-NEXT: .LBB3_2: +; FASTISEL-X86-NEXT: popl %esi +; FASTISEL-X86-NEXT: popl %edi +; FASTISEL-X86-NEXT: retl + %ret = call i64 @llvm.smax.i64(i64 %a, i64 %b) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/X86/isel-smin.ll 
b/llvm/test/CodeGen/X86/isel-smin.ll new file mode 100644 index 0000000000000..7349a7c6a06f3 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-smin.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 + +define i8 @smin_i8(i8 %a, i8 %b) nounwind readnone { +; X64-LABEL: smin_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: smin_i8: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movzbl %dil, %ecx +; FASTISEL-X64-NEXT: movzbl %sil, %eax +; FASTISEL-X64-NEXT: cmpb %al, %cl +; FASTISEL-X64-NEXT: cmovll %ecx, %eax +; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: smin_i8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: jl .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB0_2: +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: smin_i8: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpb %cl, %al +; FASTISEL-X86-NEXT: jl .LBB0_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: 
.LBB0_2: +; FASTISEL-X86-NEXT: retl + %ret = call i8 @llvm.smin.i8(i8 %a, i8 %b) + ret i8 %ret +} +define i16 @smin_i16(i16 %a, i16 %b) nounwind readnone { +; X64-LABEL: smin_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: smin_i16: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movl %esi, %eax +; FASTISEL-X64-NEXT: cmpw %ax, %di +; FASTISEL-X64-NEXT: cmovll %edi, %eax +; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: smin_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: jl .LBB1_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB1_2: +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: smin_i16: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpw %cx, %ax +; FASTISEL-X86-NEXT: jl .LBB1_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB1_2: +; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax +; FASTISEL-X86-NEXT: retl + %ret = call i16 @llvm.smin.i16(i16 %a, i16 %b) + ret i16 %ret +} +define i32 @smin_i32(i32 %a, i32 %b) nounwind readnone { +; X64-LABEL: smin_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: smin_i32: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movl %esi, %eax +; FASTISEL-X64-NEXT: cmpl %esi, %edi +; FASTISEL-X64-NEXT: cmovll %edi, %eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: smin_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; 
X86-NEXT: jl .LBB2_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB2_2: +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: smin_i32: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpl %ecx, %eax +; FASTISEL-X86-NEXT: jl .LBB2_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB2_2: +; FASTISEL-X86-NEXT: retl + %ret = call i32 @llvm.smin.i32(i32 %a, i32 %b) + ret i32 %ret +} +define i64 @smin_i64(i64 %a, i64 %b) nounwind readnone { +; X64-LABEL: smin_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: smin_i64: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movq %rsi, %rax +; FASTISEL-X64-NEXT: cmpq %rsi, %rdi +; FASTISEL-X64-NEXT: cmovlq %rdi, %rax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: smin_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: jl .LBB3_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: .LBB3_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: smin_i64: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: pushl %edi +; FASTISEL-X86-NEXT: pushl %esi +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpl %esi, %eax +; FASTISEL-X86-NEXT: movl %edx, %edi +; FASTISEL-X86-NEXT: sbbl %ecx, %edi +; FASTISEL-X86-NEXT: jl .LBB3_2 +; FASTISEL-X86-NEXT: # %bb.1: +; 
FASTISEL-X86-NEXT: movl %esi, %eax +; FASTISEL-X86-NEXT: movl %ecx, %edx +; FASTISEL-X86-NEXT: .LBB3_2: +; FASTISEL-X86-NEXT: popl %esi +; FASTISEL-X86-NEXT: popl %edi +; FASTISEL-X86-NEXT: retl + %ret = call i64 @llvm.smin.i64(i64 %a, i64 %b) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/X86/isel-umax.ll b/llvm/test/CodeGen/X86/isel-umax.ll new file mode 100644 index 0000000000000..a90456cdbebb1 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-umax.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 + +define i8 @umax_i8(i8 %a, i8 %b) nounwind readnone { +; X64-LABEL: umax_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: umax_i8: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movzbl %dil, %ecx +; FASTISEL-X64-NEXT: movzbl %sil, %eax +; FASTISEL-X64-NEXT: cmpb %al, %cl +; FASTISEL-X64-NEXT: cmoval %ecx, %eax +; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: umax_i8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: ja .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB0_2: +; X86-NEXT: retl +; +; 
FASTISEL-X86-LABEL: umax_i8: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpb %cl, %al +; FASTISEL-X86-NEXT: ja .LBB0_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB0_2: +; FASTISEL-X86-NEXT: retl + %ret = call i8 @llvm.umax.i8(i8 %a, i8 %b) + ret i8 %ret +} +define i16 @umax_i16(i16 %a, i16 %b) nounwind readnone { +; X64-LABEL: umax_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: umax_i16: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movl %esi, %eax +; FASTISEL-X64-NEXT: cmpw %ax, %di +; FASTISEL-X64-NEXT: cmoval %edi, %eax +; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: umax_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: ja .LBB1_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB1_2: +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: umax_i16: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpw %cx, %ax +; FASTISEL-X86-NEXT: ja .LBB1_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB1_2: +; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax +; FASTISEL-X86-NEXT: retl + %ret = call i16 @llvm.umax.i16(i16 %a, i16 %b) + ret i16 %ret +} +define i32 @umax_i32(i32 %a, i32 %b) nounwind readnone { +; X64-LABEL: umax_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: umax_i32: +; 
FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movl %esi, %eax +; FASTISEL-X64-NEXT: cmpl %esi, %edi +; FASTISEL-X64-NEXT: cmoval %edi, %eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: umax_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: ja .LBB2_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB2_2: +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: umax_i32: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpl %ecx, %eax +; FASTISEL-X86-NEXT: ja .LBB2_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB2_2: +; FASTISEL-X86-NEXT: retl + %ret = call i32 @llvm.umax.i32(i32 %a, i32 %b) + ret i32 %ret +} +define i64 @umax_i64(i64 %a, i64 %b) nounwind readnone { +; X64-LABEL: umax_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovaq %rdi, %rax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: umax_i64: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movq %rsi, %rax +; FASTISEL-X64-NEXT: cmpq %rsi, %rdi +; FASTISEL-X64-NEXT: cmovaq %rdi, %rax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: umax_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: jb .LBB3_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: .LBB3_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: umax_i64: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: pushl %edi +; FASTISEL-X86-NEXT: pushl %esi +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpl %eax, %esi +; FASTISEL-X86-NEXT: movl %ecx, %edi +; FASTISEL-X86-NEXT: sbbl %edx, %edi +; FASTISEL-X86-NEXT: jb .LBB3_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %esi, %eax +; FASTISEL-X86-NEXT: movl %ecx, %edx +; FASTISEL-X86-NEXT: .LBB3_2: +; FASTISEL-X86-NEXT: popl %esi +; FASTISEL-X86-NEXT: popl %edi +; FASTISEL-X86-NEXT: retl + %ret = call i64 @llvm.umax.i64(i64 %a, i64 %b) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/X86/isel-umin.ll b/llvm/test/CodeGen/X86/isel-umin.ll new file mode 100644 index 0000000000000..53a0b277e6d7b --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-umin.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 + +define i8 @umin_i8(i8 %a, i8 %b) nounwind readnone { +; X64-LABEL: umin_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: umin_i8: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movzbl %dil, %ecx +; FASTISEL-X64-NEXT: movzbl %sil, %eax +; FASTISEL-X64-NEXT: cmpb %al, %cl +; FASTISEL-X64-NEXT: cmovbl %ecx, %eax +; FASTISEL-X64-NEXT: # kill: def $al killed 
$al killed $eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: umin_i8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: jb .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB0_2: +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: umin_i8: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpb %cl, %al +; FASTISEL-X86-NEXT: jb .LBB0_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB0_2: +; FASTISEL-X86-NEXT: retl + %ret = call i8 @llvm.umin.i8(i8 %a, i8 %b) + ret i8 %ret +} +define i16 @umin_i16(i16 %a, i16 %b) nounwind readnone { +; X64-LABEL: umin_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: umin_i16: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movl %esi, %eax +; FASTISEL-X64-NEXT: cmpw %ax, %di +; FASTISEL-X64-NEXT: cmovbl %edi, %eax +; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: umin_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: jb .LBB1_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB1_2: +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: umin_i16: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpw %cx, %ax +; FASTISEL-X86-NEXT: jb .LBB1_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB1_2: +; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax +; FASTISEL-X86-NEXT: retl 
+ %ret = call i16 @llvm.umin.i16(i16 %a, i16 %b) + ret i16 %ret +} +define i32 @umin_i32(i32 %a, i32 %b) nounwind readnone { +; X64-LABEL: umin_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: umin_i32: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movl %esi, %eax +; FASTISEL-X64-NEXT: cmpl %esi, %edi +; FASTISEL-X64-NEXT: cmovbl %edi, %eax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: umin_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jb .LBB2_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB2_2: +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: umin_i32: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpl %ecx, %eax +; FASTISEL-X86-NEXT: jb .LBB2_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %ecx, %eax +; FASTISEL-X86-NEXT: .LBB2_2: +; FASTISEL-X86-NEXT: retl + %ret = call i32 @llvm.umin.i32(i32 %a, i32 %b) + ret i32 %ret +} +define i64 @umin_i64(i64 %a, i64 %b) nounwind readnone { +; X64-LABEL: umin_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: retq +; +; FASTISEL-X64-LABEL: umin_i64: +; FASTISEL-X64: # %bb.0: +; FASTISEL-X64-NEXT: movq %rsi, %rax +; FASTISEL-X64-NEXT: cmpq %rsi, %rdi +; FASTISEL-X64-NEXT: cmovbq %rdi, %rax +; FASTISEL-X64-NEXT: retq +; +; X86-LABEL: umin_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: jb .LBB3_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %ecx, 
%eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: .LBB3_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; FASTISEL-X86-LABEL: umin_i64: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: pushl %edi +; FASTISEL-X86-NEXT: pushl %esi +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: cmpl %esi, %eax +; FASTISEL-X86-NEXT: movl %edx, %edi +; FASTISEL-X86-NEXT: sbbl %ecx, %edi +; FASTISEL-X86-NEXT: jb .LBB3_2 +; FASTISEL-X86-NEXT: # %bb.1: +; FASTISEL-X86-NEXT: movl %esi, %eax +; FASTISEL-X86-NEXT: movl %ecx, %edx +; FASTISEL-X86-NEXT: .LBB3_2: +; FASTISEL-X86-NEXT: popl %esi +; FASTISEL-X86-NEXT: popl %edi +; FASTISEL-X86-NEXT: retl + %ret = call i64 @llvm.umin.i64(i64 %a, i64 %b) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/X86/symbol-name.ll b/llvm/test/CodeGen/X86/symbol-name.ll new file mode 100644 index 0000000000000..dd9be14fb053e --- /dev/null +++ b/llvm/test/CodeGen/X86/symbol-name.ll @@ -0,0 +1,5 @@ +; RUN: llc < %s -mtriple=x86_64 -relocation-model=pic | FileCheck %s + +; CHECK: .globl "\\\"" +; CHECK-NEXT: "\\\"": +@"\\\22" = constant i8 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index f7764b1593b51..298858a8fcc73 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -1092,3 +1092,116 @@ define void @packss_zext_v8i1() { store <16 x i16> %tmp11, ptr undef, align 2 ret void } + +define <32 x i16> @PR158415(<8 x i8> %arg) { +; X86-AVX2-LABEL: PR158415: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4] +; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; X86-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X86-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24],zero,ymm0[25],zero,ymm0[30],zero,ymm0[31],zero,ymm0[u,u,u,u,u,u,u,u] +; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] +; X86-AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[12,13,14,15],zero,zero,ymm1[4,5,u,u,u,u,u,u,u,u,28,29,30,31],zero,zero,ymm1[20,21],zero,zero,ymm1[26,27,28,29,30,31] +; X86-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,2] +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpbroadcastw %xmm1, %ymm3 +; X86-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4,5,6,7,8,9],ymm3[10],ymm0[11,12,13,14,15] +; X86-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; X86-AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,xmm2[u,u],zero,zero +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: PR158415: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4] +; X86-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; X86-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
X86-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; X86-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X86-AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; X86-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; X86-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X86-AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; X86-AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X86-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 +; X86-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X86-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X86-AVX512-NEXT: vpsrld $16, %xmm2, %xmm2 +; X86-AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; X86-AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] +; X86-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; X86-AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX2-LABEL: PR158415: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4] +; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24],zero,ymm0[25],zero,ymm0[30],zero,ymm0[31],zero,ymm0[u,u,u,u,u,u,u,u] +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] +; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[12,13,14,15],zero,zero,ymm1[4,5,u,u,u,u,u,u,u,u,28,29,30,31],zero,zero,ymm1[20,21],zero,zero,ymm1[26,27,28,29,30,31] +; X64-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,2] +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpbroadcastw %xmm1, %ymm3 +; X64-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4,5,6,7,8,9],ymm3[10],ymm0[11,12,13,14,15] +; X64-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; X64-AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,xmm2[u,u],zero,zero +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: PR158415: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4] +; X64-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
X64-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; X64-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X64-AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; X64-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; X64-AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X64-AVX512-NEXT: vpsrld $16, %xmm2, %xmm2 +; X64-AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; X64-AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] +; X64-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; X64-AVX512-NEXT: retq +entry: + %shuffle2 = shufflevector <8 x i8> %arg, <8 x i8> zeroinitializer, <32 x i32> + %conv3 = zext <32 x i8> %shuffle2 to <32 x i16> + %shuffle4 = shufflevector <32 x i16> zeroinitializer, <32 x i16> %conv3, <32 x i32> + %not = xor <32 x i16> %shuffle4, splat (i16 1) + %shuffle5 = shufflevector <32 x i16> zeroinitializer, <32 x i16> %not, <32 x i32> + 
ret <32 x i16> %shuffle5 +} diff --git a/llvm/test/CodeGen/X86/win64-tailcall-memory.ll b/llvm/test/CodeGen/X86/win64-tailcall-memory.ll new file mode 100644 index 0000000000000..568f4fe04fea9 --- /dev/null +++ b/llvm/test/CodeGen/X86/win64-tailcall-memory.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=x86_64-unknown-windows-gnu < %s | FileCheck %s + +; Check calling convention is correct for win64 when doing a tailcall +; for a pointer loaded from memory. + +declare void @foo(i64, ptr) + +define void @do_tailcall(ptr %objp) nounwind { +; CHECK-LABEL: do_tailcall: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rsi +; CHECK-NEXT: subq $32, %rsp +; CHECK-NEXT: movq %rcx, %rsi +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: callq foo +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: addq $32, %rsp +; CHECK-NEXT: popq %rsi +; CHECK-NEXT: rex64 jmpq *(%rax) # TAILCALL + tail call void @foo(i64 0, ptr null) + %fptr = load ptr, ptr %objp, align 8 + tail call void %fptr(ptr null) + ret void +} + +; Make sure aliases of ccc are also treated as win64 functions +define fastcc void @do_tailcall_fastcc(ptr %objp) nounwind { +; CHECK-LABEL: do_tailcall_fastcc: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rsi +; CHECK-NEXT: subq $32, %rsp +; CHECK-NEXT: movq %rcx, %rsi +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: callq foo +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: addq $32, %rsp +; CHECK-NEXT: popq %rsi +; CHECK-NEXT: rex64 jmpq *(%rax) # TAILCALL + tail call void @foo(i64 0, ptr null) + %fptr = load ptr, ptr %objp, align 8 + tail call fastcc void %fptr(ptr null) + ret void +} diff --git a/llvm/test/DebugInfo/AArch64/debug-types.ll b/llvm/test/DebugInfo/AArch64/debug-types.ll new file mode 100644 index 0000000000000..0d0fd33f49fdf --- /dev/null +++ 
b/llvm/test/DebugInfo/AArch64/debug-types.ll @@ -0,0 +1,59 @@ +; Check that composite type DIEs go to debug_types section. + +; RUN: llc -generate-type-units -filetype=obj %s -o - | llvm-dwarfdump -debug-info -debug-types - | FileCheck %s + +; CHECK: .debug_info contents: +; CHECK: DW_TAG_compile_unit +; CHECK: DW_TAG_class_type +; CHECK: DW_AT_signature ([[SIG_A:0x[0-9a-f]+]]) +; CHECK: DW_TAG_subprogram +; CHECK: NULL +; CHECK: DW_TAG_subprogram +; CHECK: "_ZN1A6AppendEv" +; CHECK: DW_TAG_class_type +; CHECK: DW_AT_signature ([[SIG_LAMBDA:0x[0-9a-f]+]]) +; CHECK: DW_TAG_variable +; CHECK: NULL +; CHECK: DW_TAG_subprogram +; CHECK: DW_TAG_inlined_subroutine +; CHECK: NULL +; CHECK: NULL + +; CHECK: .debug_types contents: +; CHECK: Type Unit: {{.*}} type_signature = [[SIG_A]] +; CHECK: DW_TAG_class_type +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_name ("A") +; CHECK: Type Unit: {{.*}} type_signature = [[SIG_LAMBDA]] +; CHECK: DW_TAG_class_type +; CHECK: DW_TAG_class_type +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_decl_line (7) + +target triple = "aarch64-unknown-linux-gnu" + +define void @_Z1f1A() !dbg !4 { +entry: + ret void, !dbg !8 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, emissionKind: FullDebug, globals: !2) +!1 = !DIFile(filename: "", directory: "") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "f", linkageName: "_Z1f1A", scope: !5, file: !5, line: 14, type: !6, scopeLine: 14, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!5 = !DIFile(filename: "repro.ii", directory: "") +!6 = distinct !DISubroutineType(types: !7) +!7 = !{null} +!8 = !DILocation(line: 8, column: 12, scope: !9, inlinedAt: !16) +!9 = distinct !DISubprogram(name: "Append", linkageName: "_ZN1A6AppendEv", scope: !10, file: !5, line: 6, type: !11, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | 
DISPFlagOptimized, unit: !0, declaration: !12, retainedNodes: !13) +!10 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !5, line: 3, size: 32, flags: DIFlagTypePassByValue, elements: !2, identifier: "_ZTS1A") +!11 = distinct !DISubroutineType(types: !7) +!12 = !DISubprogram(name: "Append", linkageName: "_ZN1A6AppendEv", scope: !10, file: !5, line: 6, type: !11, scopeLine: 6, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!13 = !{!14} +!14 = !DILocalVariable(name: "raw_append", scope: !9, file: !5, line: 7, type: !15) +!15 = distinct !DICompositeType(tag: DW_TAG_class_type, scope: !9, file: !5, line: 7, size: 8, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !2, identifier: "_ZTSZN1A6AppendEvEUlvE_") +!16 = distinct !DILocation(line: 14, column: 15, scope: !4) diff --git a/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll b/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll new file mode 100644 index 0000000000000..20cc98a1bfdcd --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll @@ -0,0 +1,67 @@ +; Check that abstract DIEs for inlined subprograms and lexical scopes +; are populated only once. 
+ +; RUN: llc -filetype=obj %s -o - | llvm-dwarfdump - -o - | FileCheck --implicit-check-not=DW_TAG_lexical_scope --implicit-check-not DW_TAG_subprogram %s + +; CHECK: DW_TAG_compile_unit +; CHECK: DW_TAG_namespace +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_declaration (true) +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_declaration (true) +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_declaration (true) +; CHECK: NULL + +; CHECK: [[ABSTRACT_SP:0x[0-9a-f]+]]: DW_TAG_subprogram +; CHECK: DW_AT_inline (DW_INL_inlined) + +; CHECK: DW_TAG_lexical_block +; CHECK: DW_TAG_imported_module +; CHECK: NULL + +; CHECK: NULL + +; CHECK: DW_TAG_subprogram +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_AT_abstract_origin ([[ABSTRACT_SP]] +; CHECK: NULL +; CHECK: DW_TAG_subprogram +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_AT_abstract_origin ([[ABSTRACT_SP]] +; CHECK: NULL + +target triple = "aarch64-unknown-linux-gnu" + +define void @_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE() !dbg !4 { +entry: + ret void, !dbg !8 +} + +define void @_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE() !dbg !15 { +entry: + ret void, !dbg !17 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "CodeGenPGO.cpp", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "TraverseIfStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE", scope: !5, file: !1, line: 364, type: !6, scopeLine: 364, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !7, retainedNodes: !2, keyInstructions: true) +!5 = !DINamespace(name: "llvm", scope: null) +!6 = distinct 
!DISubroutineType(types: !2) +!7 = !DISubprogram(name: "TraverseIfStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE", scope: !5, file: !1, line: 364, type: !6, scopeLine: 364, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized) +!8 = !DILocation(line: 982, column: 39, scope: !9, inlinedAt: !14, atomGroup: 6, atomRank: 2) +!9 = distinct !DISubprogram(name: "combine", linkageName: "_ZN12_GLOBAL__N_17PGOHash7combineENS0_8HashTypeE", scope: !5, file: !1, line: 966, type: !6, scopeLine: 966, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !10, retainedNodes: !11, keyInstructions: true) +!10 = !DISubprogram(name: "combine", linkageName: "_ZN12_GLOBAL__N_17PGOHash7combineENS0_8HashTypeE", scope: !5, file: !1, line: 140, type: !6, scopeLine: 140, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized) +!11 = !{!12} +!12 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !13, entity: !5, file: !1, line: 973) +!13 = distinct !DILexicalBlock(scope: !9, file: !1, line: 972, column: 7) +!14 = distinct !DILocation(line: 393, column: 10, scope: !4) +!15 = distinct !DISubprogram(name: "VisitStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE", scope: !5, file: !1, line: 355, type: !6, scopeLine: 355, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !16, retainedNodes: !2, keyInstructions: true) +!16 = !DISubprogram(name: "VisitStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE", scope: !5, file: !1, line: 355, type: !6, scopeLine: 355, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized) +!17 = !DILocation(line: 982, column: 13, scope: !9, inlinedAt: !18) +!18 = distinct !DILocation(line: 360, column: 12, 
scope: !15) diff --git a/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll index 8b54f709eec7a..1f13282a1f04c 100644 --- a/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll +++ b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll @@ -14,17 +14,15 @@ define void @_Z12lane_pc_testj() #0 !dbg !9 { ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: ; %bb.1: ; %lab +; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: .Ltmp0: ; GCN-NEXT: .loc 0 12 1 prologue_end ; t.cpp:12:1 -; GCN-NEXT: s_mov_b64 s[4:5], src_private_base -; GCN-NEXT: s_mov_b32 s6, 32 -; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 -; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_mov_b32 s5, -1 +; GCN-NEXT: s_mov_b64 s[6:7], src_private_base +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_lshr_b32 s8, s32, 5 -; GCN-NEXT: s_cmp_lg_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s5, s4, s7 -; GCN-NEXT: s_cselect_b32 s4, s8, s6 +; GCN-NEXT: s_cmp_lg_u32 s8, s6 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cselect_b32 s4, s8, s4 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: .loc 0 13 1 ; t.cpp:13:1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/DebugInfo/Generic/inlined-static-var.ll b/llvm/test/DebugInfo/Generic/inlined-static-var.ll new file mode 100644 index 0000000000000..1d24646896d80 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/inlined-static-var.ll @@ -0,0 +1,93 @@ +; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck --implicit-check-not "{{DW_TAG|NULL}}" %s + +; inline __attribute__((always_inline)) +; int removed() { static int A; return A++; } +; +; __attribute__((always_inline)) +; int not_removed() { static int B; return B++; } +; +; int foo() { return removed() + not_removed(); } + +; Ensure that global variables belong to the correct subprograms even if those +; subprograms are inlined. 
+ +; CHECK: DW_TAG_compile_unit +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_abstract_origin {{.*}} "_Z11not_removedv" +; TODO: This variable should be emitted in abstract subprogram DIE. +; CHECK: DW_TAG_variable +; CHECK: DW_AT_name ("B") +; CHECK: NULL +; CHECK: DW_TAG_base_type +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_name ("removed") +; CHECK: DW_TAG_variable +; CHECK: DW_AT_name ("A") +; CHECK: NULL +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_name ("not_removed") +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_name ("foo") +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_TAG_inlined_subroutine +; CHECK: NULL +; CHECK: NULL + +@_ZZ11not_removedvE1A = internal global i32 0, align 4, !dbg !0 +@_ZZ7removedvE1A = linkonce_odr dso_local global i32 0, align 4, !dbg !10 + +define dso_local i32 @_Z11not_removedv() !dbg !2 { + %1 = load i32, i32* @_ZZ11not_removedvE1A, align 4, !dbg !24 + %2 = add nsw i32 %1, 1, !dbg !24 + store i32 %2, i32* @_ZZ11not_removedvE1A, align 4, !dbg !24 + ret i32 %1, !dbg !25 +} + +define dso_local i32 @_Z3foov() !dbg !26 { + %1 = load i32, i32* @_ZZ7removedvE1A, align 4, !dbg !27 + %2 = add nsw i32 %1, 1, !dbg !27 + store i32 %2, i32* @_ZZ7removedvE1A, align 4, !dbg !27 + %3 = load i32, i32* @_ZZ11not_removedvE1A, align 4, !dbg !29 + %4 = add nsw i32 %3, 1, !dbg !29 + store i32 %4, i32* @_ZZ11not_removedvE1A, align 4, !dbg !29 + %5 = add nsw i32 %1, %3, !dbg !31 + ret i32 %5, !dbg !32 +} + +!llvm.dbg.cu = !{!7} +!llvm.module.flags = !{!14, !15, !16, !17, !18, !19, !20, !21, !22} +!llvm.ident = !{!23} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "B", scope: !2, file: !3, line: 5, type: !6, isLocal: true, isDefinition: true) +!2 = distinct !DISubprogram(name: "not_removed", linkageName: "_Z11not_removedv", scope: !3, file: !3, line: 5, type: !4, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13) +!3 = !DIFile(filename: "example.cpp", 
directory: "") +!4 = !DISubroutineType(types: !5) +!5 = !{!6} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !8, producer: "clang version 14.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !9, splitDebugInlining: false, nameTableKind: None) +!8 = !DIFile(filename: "example.cpp", directory: "") +!9 = !{!0, !10} +!10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression()) +!11 = distinct !DIGlobalVariable(name: "A", scope: !12, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true) +!12 = distinct !DISubprogram(name: "removed", linkageName: "_Z7removedv", scope: !3, file: !3, line: 2, type: !4, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13) +!13 = !{} +!14 = !{i32 7, !"Dwarf Version", i32 4} +!15 = !{i32 2, !"Debug Info Version", i32 3} +!16 = !{i32 1, !"wchar_size", i32 4} +!17 = !{i32 1, !"branch-target-enforcement", i32 0} +!18 = !{i32 1, !"sign-return-address", i32 0} +!19 = !{i32 1, !"sign-return-address-all", i32 0} +!20 = !{i32 1, !"sign-return-address-with-bkey", i32 0} +!21 = !{i32 7, !"uwtable", i32 1} +!22 = !{i32 7, !"frame-pointer", i32 1} +!23 = !{!"clang version 14.0.0"} +!24 = !DILocation(line: 5, column: 43, scope: !2) +!25 = !DILocation(line: 5, column: 35, scope: !2) +!26 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !3, file: !3, line: 7, type: !4, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13) +!27 = !DILocation(line: 2, column: 39, scope: !12, inlinedAt: !28) +!28 = distinct !DILocation(line: 7, column: 20, scope: !26) +!29 = !DILocation(line: 5, column: 43, scope: !2, inlinedAt: !30) +!30 = distinct !DILocation(line: 7, column: 32, scope: !26) +!31 = !DILocation(line: 7, column: 30, scope: !26) +!32 = !DILocation(line: 7, column: 13, scope: !26) diff --git 
a/llvm/test/DebugInfo/RISCV/dw_op_entry_value_32bit.ll b/llvm/test/DebugInfo/RISCV/dw_op_entry_value_32bit.ll new file mode 100644 index 0000000000000..cb7c61df77646 --- /dev/null +++ b/llvm/test/DebugInfo/RISCV/dw_op_entry_value_32bit.ll @@ -0,0 +1,65 @@ +;; Test RISC-V 32 bit: +; RUN: llc -emit-call-site-info -stop-after=livedebugvalues -mtriple=riscv32-linux-gnu -o - %s | FileCheck %s --check-prefix=CHECK32 + +;; Built from source: +;; extern long fn1(long,long,long); +;; long fn2(long a, long b, long c) { +;; long local = fn1(a+b, c, b+10); +;; if (local > 10) +;; return local + 10; +;; return b; +;; } +;; Using command: +;; clang -g -O2 -target riscv32-linux-gnu m.c -c -S -emit-llvm +;; Confirm that info from callSites attribute is used as entry_value in DIExpression. + +;; Test riscv32: +; CHECK32: $x10 = nsw ADD $x11, killed renamable $x10 +; CHECK32-NEXT: DBG_VALUE $x10, $noreg, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1) + +; ModuleID = 'm.c' +source_filename = "m.c" +target datalayout = "e-m:e-p:32:32-i64:64-n32-S128" +target triple = "riscv32-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @fn2(i32 noundef %a, i32 noundef %b, i32 noundef %c) !dbg !14 { +entry: + #dbg_value(i32 %a, !20, !DIExpression(), !23) + #dbg_value(i32 %b, !21, !DIExpression(), !23) + #dbg_value(i32 %c, !22, !DIExpression(), !23) + %add = add nsw i32 %b, %a + %add1 = add nsw i32 %b, 10 + %call = tail call i32 @fn1(i32 noundef %add, i32 noundef %c, i32 noundef %add1) + #dbg_value(i32 %call, !22, !DIExpression(), !23) + %cmp = icmp sgt i32 %call, 10 + %add2 = add nuw nsw i32 %call, 10 + %retval.0 = select i1 %cmp, i32 %add2, i32 %b + ret i32 %retval.0, !dbg !29 +} + +declare !dbg !30 i32 @fn1(i32 noundef, i32 noundef, i32 noundef) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, 
splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "m.c", directory: ".") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{!"clang"} +!14 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 2, type: !15, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !18) +!15 = !DISubroutineType(types: !16) +!16 = !{!17, !17, !17, !17} +!17 = !DIBasicType(name: "long", size: 64, encoding: DW_ATE_signed) +!18 = !{!19, !20, !21, !22} +!19 = !DILocalVariable(name: "a", arg: 1, scope: !14, file: !1, line: 2, type: !17) +!20 = !DILocalVariable(name: "b", arg: 2, scope: !14, file: !1, line: 2, type: !17) +!21 = !DILocalVariable(name: "c", arg: 3, scope: !14, file: !1, line: 2, type: !17) +!22 = !DILocalVariable(name: "local", scope: !14, file: !1, line: 3, type: !17) +!23 = !DILocation(line: 0, scope: !14) +!29 = !DILocation(line: 7, column: 1, scope: !14) +!30 = !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !15, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + diff --git a/llvm/test/DebugInfo/RISCV/dw_op_entry_value_64bit.ll b/llvm/test/DebugInfo/RISCV/dw_op_entry_value_64bit.ll new file mode 100644 index 0000000000000..cd6a7650780e6 --- /dev/null +++ b/llvm/test/DebugInfo/RISCV/dw_op_entry_value_64bit.ll @@ -0,0 +1,65 @@ +;; Test RISC-V 64 bit: +; RUN: llc -emit-call-site-info -stop-after=livedebugvalues -mtriple=riscv64-linux-gnu -o - %s | FileCheck %s --check-prefix=CHECK64 + +;; Built from source: +;; extern long fn1(long,long,long); +;; long fn2(long a, long b, long c) { +;; long local = fn1(a+b, c, b+10); +;; if (local > 10) +;; return local + 10; +;; return b; +;; } +;; Using command: +;; clang -g -O2 -target riscv64-linux-gnu m.c -c -S -emit-llvm +;; Confirm that info from callSites attribute is used as entry_value in DIExpression. 
+ +;; Test riscv64: +; CHECK64: $x10 = nsw ADD $x11, killed renamable $x10 +; CHECK64-NEXT: DBG_VALUE $x10, $noreg, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1) + +; ModuleID = 'm.c' +source_filename = "m.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i64 @fn2(i64 noundef %a, i64 noundef %b, i64 noundef %c) !dbg !14 { +entry: + #dbg_value(i64 %a, !19, !DIExpression(), !23) + #dbg_value(i64 %b, !20, !DIExpression(), !23) + #dbg_value(i64 %c, !21, !DIExpression(), !23) + %add = add nsw i64 %b, %a + %add1 = add nsw i64 %b, 10 + %call = tail call i64 @fn1(i64 noundef %add, i64 noundef %c, i64 noundef %add1) + #dbg_value(i64 %call, !22, !DIExpression(), !23) + %cmp = icmp sgt i64 %call, 10 + %add2 = add nuw nsw i64 %call, 10 + %retval.0 = select i1 %cmp, i64 %add2, i64 %b + ret i64 %retval.0, !dbg !29 +} + +declare !dbg !30 i64 @fn1(i64 noundef, i64 noundef, i64 noundef) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "m.c", directory: ".") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{!"clang"} +!14 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 2, type: !15, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !18) +!15 = !DISubroutineType(types: !16) +!16 = !{!17, !17, !17, !17} +!17 = !DIBasicType(name: "long", size: 64, encoding: DW_ATE_signed) +!18 = !{!19, !20, !21, !22} +!19 = !DILocalVariable(name: "a", arg: 1, scope: !14, file: !1, line: 2, type: !17) +!20 = !DILocalVariable(name: "b", arg: 2, scope: !14, file: 
!1, line: 2, type: !17) +!21 = !DILocalVariable(name: "c", arg: 3, scope: !14, file: !1, line: 2, type: !17) +!22 = !DILocalVariable(name: "local", scope: !14, file: !1, line: 3, type: !17) +!23 = !DILocation(line: 0, scope: !14) +!29 = !DILocation(line: 7, column: 1, scope: !14) +!30 = !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !15, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + diff --git a/llvm/test/DebugInfo/X86/DW_AT_LLVM_stmt_seq_sec_offset.ll b/llvm/test/DebugInfo/X86/DW_AT_LLVM_stmt_seq_sec_offset.ll index 58f6495924b90..f17c6e5429b6b 100644 --- a/llvm/test/DebugInfo/X86/DW_AT_LLVM_stmt_seq_sec_offset.ll +++ b/llvm/test/DebugInfo/X86/DW_AT_LLVM_stmt_seq_sec_offset.ll @@ -14,7 +14,7 @@ ; STMT_SEQ: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x00000043) ; STMT_SEQ: DW_AT_name {{.*}}func01 ; STMT_SEQ: DW_TAG_subprogram [[[ABBREV_CODE2]]] -; STMT_SEQ: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x00000056) +; STMT_SEQ: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x00000058) ; STMT_SEQ: DW_AT_name {{.*}}main ;; Check the entire line sequence to see that it's correct @@ -29,22 +29,23 @@ ; STMT_SEQ-NEXT: 0x00000050: 05 DW_LNS_set_column (3) ; STMT_SEQ-NEXT: 0x00000052: 67 address += 6, line += 1, op-index += 0 ; STMT_SEQ-NEXT: 0x0000000000000006 6 3 0 0 0 0 is_stmt -; STMT_SEQ-NEXT: 0x00000053: 00 DW_LNE_end_sequence -; STMT_SEQ-NEXT: 0x0000000000000006 6 3 0 0 0 0 is_stmt end_sequence -; STMT_SEQ-NEXT: 0x00000056: 04 DW_LNS_set_file (0) -; STMT_SEQ-NEXT: 0x00000058: 00 DW_LNE_set_address (0x00000008) -; STMT_SEQ-NEXT: 0x0000005f: 03 DW_LNS_advance_line (10) -; STMT_SEQ-NEXT: 0x00000061: 01 DW_LNS_copy +; STMT_SEQ-NEXT: 0x00000053: 02 DW_LNS_advance_pc (addr += 2, op-index += 0) +; STMT_SEQ-NEXT: 0x00000055: 00 DW_LNE_end_sequence +; STMT_SEQ-NEXT: 0x0000000000000008 6 3 0 0 0 0 is_stmt end_sequence +; STMT_SEQ-NEXT: 0x00000058: 04 DW_LNS_set_file (0) +; STMT_SEQ-NEXT: 0x0000005a: 00 DW_LNE_set_address (0x00000008) +; 
STMT_SEQ-NEXT: 0x00000061: 03 DW_LNS_advance_line (10) +; STMT_SEQ-NEXT: 0x00000063: 01 DW_LNS_copy ; STMT_SEQ-NEXT: 0x0000000000000008 10 0 0 0 0 0 is_stmt -; STMT_SEQ-NEXT: 0x00000062: 05 DW_LNS_set_column (10) -; STMT_SEQ-NEXT: 0x00000064: 0a DW_LNS_set_prologue_end -; STMT_SEQ-NEXT: 0x00000065: 83 address += 8, line += 1, op-index += 0 +; STMT_SEQ-NEXT: 0x00000064: 05 DW_LNS_set_column (10) +; STMT_SEQ-NEXT: 0x00000066: 0a DW_LNS_set_prologue_end +; STMT_SEQ-NEXT: 0x00000067: 83 address += 8, line += 1, op-index += 0 ; STMT_SEQ-NEXT: 0x0000000000000010 11 10 0 0 0 0 is_stmt prologue_end -; STMT_SEQ-NEXT: 0x00000066: 05 DW_LNS_set_column (3) -; STMT_SEQ-NEXT: 0x00000068: 9f address += 10, line += 1, op-index += 0 +; STMT_SEQ-NEXT: 0x00000068: 05 DW_LNS_set_column (3) +; STMT_SEQ-NEXT: 0x0000006a: 9f address += 10, line += 1, op-index += 0 ; STMT_SEQ-NEXT: 0x000000000000001a 12 3 0 0 0 0 is_stmt -; STMT_SEQ-NEXT: 0x00000069: 02 DW_LNS_advance_pc (addr += 5, op-index += 0) -; STMT_SEQ-NEXT: 0x0000006b: 00 DW_LNE_end_sequence +; STMT_SEQ-NEXT: 0x0000006b: 02 DW_LNS_advance_pc (addr += 5, op-index += 0) +; STMT_SEQ-NEXT: 0x0000006d: 00 DW_LNE_end_sequence ; STMT_SEQ-NEXT: 0x000000000000001f 12 3 0 0 0 0 is_stmt end_sequence ; generated from: diff --git a/llvm/test/DebugInfo/unrolled-loop-remainder.ll b/llvm/test/DebugInfo/unrolled-loop-remainder.ll index f2bd855015e77..c6035ffa65e08 100644 --- a/llvm/test/DebugInfo/unrolled-loop-remainder.ll +++ b/llvm/test/DebugInfo/unrolled-loop-remainder.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=loop-unroll -unroll-runtime -unroll-allow-remainder -unroll-count=4 -unroll-remainder -S %s -o - | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -12,13 +12,14 @@ target triple = 
"x86_64-unknown-linux-gnu" define i32 @func_c() local_unnamed_addr #0 !dbg !14 { ; -; CHECK-LABEL: @func_c( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr @b, align 4, !dbg [[DBG17:![0-9]+]], !tbaa [[TBAA20:![0-9]+]] +; CHECK-LABEL: define i32 @func_c( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG14:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr @b, align 4, !dbg [[DBG17:![0-9]+]], !tbaa [[INT_TBAA20:![0-9]+]] ; CHECK-NEXT: [[TOBOOL1:%.*]] = icmp eq i32 [[DOTPR]], 0, !dbg [[DBG24:![0-9]+]] -; CHECK-NEXT: br i1 [[TOBOOL1]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]], !dbg [[DBG24]] -; CHECK: for.body.lr.ph: -; CHECK-NEXT: [[A_PROMOTED:%.*]] = load ptr, ptr @a, align 8, !dbg [[DBG25:![0-9]+]], !tbaa [[TBAA26:![0-9]+]] +; CHECK-NEXT: br i1 [[TOBOOL1]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]], !dbg [[DBG24]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[A_PROMOTED:%.*]] = load ptr, ptr @a, align 8, !dbg [[DBG25:![0-9]+]], !tbaa [[ANYPTR_TBAA26:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = sub i32 -2, [[DOTPR]], !dbg [[DBG24]] ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], -2, !dbg [[DBG24]] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[DOTPR]], [[TMP1]], !dbg [[DBG24]] @@ -26,77 +27,77 @@ define i32 @func_c() local_unnamed_addr #0 !dbg !14 { ; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[TMP3]], 1, !dbg [[DBG24]] ; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP4]], 3, !dbg [[DBG24]] ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0, !dbg [[DBG24]] -; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY_PROL_PREHEADER:%.*]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], !dbg [[DBG24]] -; CHECK: for.body.prol.preheader: -; CHECK-NEXT: br label [[FOR_BODY_PROL:%.*]], !dbg [[DBG24]] -; CHECK: for.body.prol: +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_PROL_PREHEADER:.*]], label %[[FOR_BODY_PROL_LOOPEXIT:.*]], !dbg [[DBG24]] +; CHECK: [[FOR_BODY_PROL_PREHEADER]]: +; CHECK-NEXT: br label 
%[[FOR_BODY_PROL:.*]], !dbg [[DBG24]] +; CHECK: [[FOR_BODY_PROL]]: ; CHECK-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i32, ptr [[A_PROMOTED]], i64 1, !dbg [[DBG28:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_PROL]], align 4, !dbg [[DBG28]], !tbaa [[TBAA20]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_PROL]], align 4, !dbg [[DBG28]], !tbaa [[INT_TBAA20]] ; CHECK-NEXT: [[CONV_PROL:%.*]] = sext i32 [[TMP5]] to i64, !dbg [[DBG28]] ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[CONV_PROL]] to ptr, !dbg [[DBG28]] ; CHECK-NEXT: [[ADD_PROL:%.*]] = add nsw i32 [[DOTPR]], 2, !dbg [[DBG29:![0-9]+]] ; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i32 1, [[XTRAITER]], !dbg [[DBG24]] -; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_1:%.*]], label [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !dbg [[DBG24]] -; CHECK: for.body.prol.1: +; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label %[[FOR_BODY_PROL_1:.*]], label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA:.*]], !dbg [[DBG24]] +; CHECK: [[FOR_BODY_PROL_1]]: ; CHECK-NEXT: [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1, !dbg [[DBG28]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_PROL_1]], align 4, !dbg [[DBG28]], !tbaa [[TBAA20]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_PROL_1]], align 4, !dbg [[DBG28]], !tbaa [[INT_TBAA20]] ; CHECK-NEXT: [[CONV_PROL_1:%.*]] = sext i32 [[TMP7]] to i64, !dbg [[DBG28]] ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[CONV_PROL_1]] to ptr, !dbg [[DBG28]] ; CHECK-NEXT: [[ADD_PROL_1:%.*]] = add nsw i32 [[DOTPR]], 4, !dbg [[DBG29]] ; CHECK-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i32 2, [[XTRAITER]], !dbg [[DBG24]] -; CHECK-NEXT: br i1 [[PROL_ITER_CMP_1]], label [[FOR_BODY_PROL_2:%.*]], label [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]], !dbg [[DBG24]] -; CHECK: for.body.prol.2: +; CHECK-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[FOR_BODY_PROL_2:.*]], label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]], !dbg [[DBG24]] +; CHECK: 
[[FOR_BODY_PROL_2]]: ; CHECK-NEXT: [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 1, !dbg [[DBG28]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX_PROL_2]], align 4, !dbg [[DBG28]], !tbaa [[TBAA20]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX_PROL_2]], align 4, !dbg [[DBG28]], !tbaa [[INT_TBAA20]] ; CHECK-NEXT: [[CONV_PROL_2:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG28]] ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[CONV_PROL_2]] to ptr, !dbg [[DBG28]] ; CHECK-NEXT: [[ADD_PROL_2:%.*]] = add nsw i32 [[DOTPR]], 6, !dbg [[DBG29]] -; CHECK-NEXT: br label [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] -; CHECK: for.body.prol.loopexit.unr-lcssa: -; CHECK-NEXT: [[DOTLCSSA_UNR_PH:%.*]] = phi ptr [ [[TMP6]], [[FOR_BODY_PROL]] ], [ [[TMP8]], [[FOR_BODY_PROL_1]] ], [ [[TMP10]], [[FOR_BODY_PROL_2]] ] -; CHECK-NEXT: [[DOTUNR_PH:%.*]] = phi ptr [ [[TMP6]], [[FOR_BODY_PROL]] ], [ [[TMP8]], [[FOR_BODY_PROL_1]] ], [ [[TMP10]], [[FOR_BODY_PROL_2]] ] -; CHECK-NEXT: [[DOTUNR1_PH:%.*]] = phi i32 [ [[ADD_PROL]], [[FOR_BODY_PROL]] ], [ [[ADD_PROL_1]], [[FOR_BODY_PROL_1]] ], [ [[ADD_PROL_2]], [[FOR_BODY_PROL_2]] ] -; CHECK-NEXT: br label [[FOR_BODY_PROL_LOOPEXIT]], !dbg [[DBG24]] -; CHECK: for.body.prol.loopexit: -; CHECK-NEXT: [[DOTLCSSA_UNR:%.*]] = phi ptr [ poison, [[FOR_BODY_LR_PH]] ], [ [[DOTLCSSA_UNR_PH]], [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ] -; CHECK-NEXT: [[DOTUNR:%.*]] = phi ptr [ [[A_PROMOTED]], [[FOR_BODY_LR_PH]] ], [ [[DOTUNR_PH]], [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ] -; CHECK-NEXT: [[DOTUNR1:%.*]] = phi i32 [ [[DOTPR]], [[FOR_BODY_LR_PH]] ], [ [[DOTUNR1_PH]], [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: br label %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]], !dbg [[DBG24]] +; CHECK: [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[DOTLCSSA_UNR_PH:%.*]] = phi ptr [ [[TMP6]], %[[FOR_BODY_PROL]] ], [ [[TMP8]], %[[FOR_BODY_PROL_1]] ], [ [[TMP10]], %[[FOR_BODY_PROL_2]] ] +; CHECK-NEXT: [[DOTUNR_PH:%.*]] = phi ptr [ [[TMP6]], 
%[[FOR_BODY_PROL]] ], [ [[TMP8]], %[[FOR_BODY_PROL_1]] ], [ [[TMP10]], %[[FOR_BODY_PROL_2]] ] +; CHECK-NEXT: [[DOTUNR1_PH:%.*]] = phi i32 [ [[ADD_PROL]], %[[FOR_BODY_PROL]] ], [ [[ADD_PROL_1]], %[[FOR_BODY_PROL_1]] ], [ [[ADD_PROL_2]], %[[FOR_BODY_PROL_2]] ] +; CHECK-NEXT: br label %[[FOR_BODY_PROL_LOOPEXIT]], !dbg [[DBG24]] +; CHECK: [[FOR_BODY_PROL_LOOPEXIT]]: +; CHECK-NEXT: [[DOTLCSSA_UNR:%.*]] = phi ptr [ poison, %[[FOR_BODY_LR_PH]] ], [ [[DOTLCSSA_UNR_PH]], %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[DOTUNR:%.*]] = phi ptr [ [[A_PROMOTED]], %[[FOR_BODY_LR_PH]] ], [ [[DOTUNR_PH]], %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[DOTUNR1:%.*]] = phi i32 [ [[DOTPR]], %[[FOR_BODY_LR_PH]] ], [ [[DOTUNR1_PH]], %[[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP3]], 3, !dbg [[DBG24]] -; CHECK-NEXT: br i1 [[TMP11]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY_LR_PH_NEW:%.*]], !dbg [[DBG24]] -; CHECK: for.body.lr.ph.new: -; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG24]] -; CHECK: for.body: -; CHECK-NEXT: [[TMP12:%.*]] = phi ptr [ [[DOTUNR]], [[FOR_BODY_LR_PH_NEW]] ], [ [[TMP21:%.*]], [[FOR_BODY]] ], !dbg [[DBG28]] -; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[DOTUNR1]], [[FOR_BODY_LR_PH_NEW]] ], [ [[ADD_3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br i1 [[TMP11]], label %[[FOR_COND_FOR_END_CRIT_EDGE:.*]], label %[[FOR_BODY_LR_PH_NEW:.*]], !dbg [[DBG24]] +; CHECK: [[FOR_BODY_LR_PH_NEW]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]], !dbg [[DBG24]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi ptr [ [[DOTUNR]], %[[FOR_BODY_LR_PH_NEW]] ], [ [[TMP21:%.*]], %[[FOR_BODY]] ], !dbg [[DBG28]] +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[DOTUNR1]], %[[FOR_BODY_LR_PH_NEW]] ], [ [[ADD_3:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 1, !dbg [[DBG28]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG28]], !tbaa [[TBAA20]] 
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG28]], !tbaa [[INT_TBAA20]] ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG28]] ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[CONV]] to ptr, !dbg [[DBG28]] ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 1, !dbg [[DBG28]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !dbg [[DBG28]], !tbaa [[TBAA20]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !dbg [[DBG28]], !tbaa [[INT_TBAA20]] ; CHECK-NEXT: [[CONV_1:%.*]] = sext i32 [[TMP16]] to i64, !dbg [[DBG28]] ; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[CONV_1]] to ptr, !dbg [[DBG28]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 1, !dbg [[DBG28]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !dbg [[DBG28]], !tbaa [[TBAA20]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !dbg [[DBG28]], !tbaa [[INT_TBAA20]] ; CHECK-NEXT: [[CONV_2:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG28]] ; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[CONV_2]] to ptr, !dbg [[DBG28]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 1, !dbg [[DBG28]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !dbg [[DBG28]], !tbaa [[TBAA20]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !dbg [[DBG28]], !tbaa [[INT_TBAA20]] ; CHECK-NEXT: [[CONV_3:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG28]] ; CHECK-NEXT: [[TMP21]] = inttoptr i64 [[CONV_3]] to ptr, !dbg [[DBG28]] ; CHECK-NEXT: [[ADD_3]] = add nsw i32 [[TMP13]], 8, !dbg [[DBG29]] ; CHECK-NEXT: [[TOBOOL_3:%.*]] = icmp eq i32 [[ADD_3]], 0, !dbg [[DBG24]] -; CHECK-NEXT: br i1 [[TOBOOL_3]], label [[FOR_COND_FOR_END_CRIT_EDGE_UNR_LCSSA:%.*]], label [[FOR_BODY]], !dbg [[DBG24]], !llvm.loop [[LOOP30:![0-9]+]] -; CHECK: for.cond.for.end_crit_edge.unr-lcssa: -; CHECK-NEXT: 
[[DOTLCSSA_PH:%.*]] = phi ptr [ [[TMP21]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_FOR_END_CRIT_EDGE]], !dbg [[DBG24]] -; CHECK: for.cond.for.end_crit_edge: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[DOTLCSSA_UNR]], [[FOR_BODY_PROL_LOOPEXIT]] ], [ [[DOTLCSSA_PH]], [[FOR_COND_FOR_END_CRIT_EDGE_UNR_LCSSA]] ], !dbg [[DBG28]] +; CHECK-NEXT: br i1 [[TOBOOL_3]], label %[[FOR_COND_FOR_END_CRIT_EDGE_UNR_LCSSA:.*]], label %[[FOR_BODY]], !dbg [[DBG24]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: [[FOR_COND_FOR_END_CRIT_EDGE_UNR_LCSSA]]: +; CHECK-NEXT: [[DOTLCSSA_PH:%.*]] = phi ptr [ [[TMP21]], %[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_COND_FOR_END_CRIT_EDGE]], !dbg [[DBG24]] +; CHECK: [[FOR_COND_FOR_END_CRIT_EDGE]]: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[DOTLCSSA_UNR]], %[[FOR_BODY_PROL_LOOPEXIT]] ], [ [[DOTLCSSA_PH]], %[[FOR_COND_FOR_END_CRIT_EDGE_UNR_LCSSA]] ], !dbg [[DBG28]] ; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP2]], 2, !dbg [[DBG24]] -; CHECK-NEXT: store ptr [[DOTLCSSA]], ptr @a, align 8, !dbg [[DBG25]], !tbaa [[TBAA26]] -; CHECK-NEXT: store i32 [[TMP22]], ptr @b, align 4, !dbg [[DBG33:![0-9]+]], !tbaa [[TBAA20]] -; CHECK-NEXT: br label [[FOR_END]], !dbg [[DBG24]] -; CHECK: for.end: +; CHECK-NEXT: store ptr [[DOTLCSSA]], ptr @a, align 8, !dbg [[DBG25]], !tbaa [[ANYPTR_TBAA26]] +; CHECK-NEXT: store i32 [[TMP22]], ptr @b, align 4, !dbg [[DBG33:![0-9]+]], !tbaa [[INT_TBAA20]] +; CHECK-NEXT: br label %[[FOR_END]], !dbg [[DBG24]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret i32 undef, !dbg [[DBG34:![0-9]+]] ; entry: @@ -134,8 +135,9 @@ for.end: define void @func_d() local_unnamed_addr #1 !dbg !34 { ; -; CHECK-LABEL: @func_d( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @func_d( +; CHECK-SAME: ) local_unnamed_addr !dbg [[DBG35:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: ret void, !dbg [[DBG38:![0-9]+]] ; entry: diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll 
b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll index a2f1d65e7cd41..b2a4f0e582f9e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll @@ -28,8 +28,6 @@ ; - llvm.x86.avx512.mul.pd.512, llvm.x86.avx512.mul.ps.512 ; - llvm.x86.avx512.permvar.df.512, llvm.x86.avx512.permvar.sf.512 ; - llvm.x86.avx512.pternlog.d.512, llvm.x86.avx512.pternlog.q.512 -; - llvm.x86.avx512.rcp14.pd.512, llvm.x86.avx512.rcp14.ps.512 -; - llvm.x86.avx512.rsqrt14.ps.512 ; - llvm.x86.avx512.sitofp.round.v16f32.v16i32 ; - llvm.x86.avx512.sqrt.pd.512, llvm.x86.avx512.sqrt.ps.512 ; - llvm.x86.avx512.sub.ps.512 @@ -682,15 +680,11 @@ define <16 x float> @test_rcp_ps_512(<16 x float> %a0) #0 { ; CHECK-LABEL: @test_rcp_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> splat (i1 true), <16 x i32> [[TMP3]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[RES]] ; %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] 
@@ -702,15 +696,11 @@ define <8 x double> @test_rcp_pd_512(<8 x double> %a0) #0 { ; CHECK-LABEL: @test_rcp_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <8 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> splat (i1 true), <8 x i64> [[TMP3]], <8 x i64> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x double> [[RES]] ; %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1] @@ -1021,15 +1011,11 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) #0 { ; CHECK-LABEL: @test_rsqrt_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i32> +; 
CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> splat (i1 true), <16 x i32> [[TMP3]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[RES]] ; %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll index 51dad35a1edbc..7bd35182d5c90 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll @@ -5,8 +5,6 @@ ; ; Strictly handled: ; - llvm.x86.avx512.dbpsadbw.512 -; - llvm.x86.avx512.packssdw.512, llvm.x86.avx512.packsswb.512 -; - llvm.x86.avx512.packusdw.512, llvm.x86.avx512.packuswb.512 ; ; Heuristically handled: ; - llvm.sadd.sat.v32i16, llvm.sadd.sat.v64i8 @@ -2039,19 +2037,14 @@ define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) no ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; 
CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <32 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A]], <16 x i32> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A1:%.*]], <16 x i32> [[B1:%.*]]) +; CHECK-NEXT: store <32 x i16> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <32 x i16> [[TMP8]] ; %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -2064,25 +2057,20 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, < ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <16 x i1> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A]], <16 x i32> [[B]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A1:%.*]], <16 x i32> [[B1:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP17]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[PASSTHRU]] +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP17]], <32 x i16> [[PASSTHRU]] ; CHECK-NEXT: store <32 x i16> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP16]] ; @@ -2096,25 +2084,20 @@ define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <16 x i1> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A]], <16 x i32> [[B]]) +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A1:%.*]], <16 x i32> [[B1:%.*]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP8]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: 
[[TMP13:%.*]] = or <32 x i16> [[TMP12]], [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> [[TMP14]], <32 x i16> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP8]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP16]], <32 x i16> zeroinitializer ; CHECK-NEXT: store <32 x i16> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP15]] ; @@ -2138,18 +2121,13 @@ define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) nounw ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP13]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: store <32 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP12]] ; %b = load <16 x i32>, ptr %ptr_b @@ -2175,22 +2153,17 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP22]], <16 x i32> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP11]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <32 x i16> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: 
[[TMP19:%.*]] = or <32 x i16> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i16> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP20]], <32 x i16> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP14]], <32 x i16> [[PASSTHRU]] @@ -2219,22 +2192,17 @@ define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i32 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP21]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP10]], 
<32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <32 x i16> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP14]], <32 x i16> [[TMP19]], <32 x i16> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP13]], <32 x i16> zeroinitializer @@ -2266,18 +2234,13 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) noun ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP13]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> 
[[B]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP12]] ; %q = load i32, ptr %ptr_b @@ -2309,22 +2272,17 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <32 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP22]], <16 x i32> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <32 x 
i1> [[TMP16]], <32 x i16> [[TMP11]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <32 x i16> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i16> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP20]], <32 x i16> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP14]], <32 x i16> [[PASSTHRU]] @@ -2359,22 +2317,17 @@ define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i3 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP21]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> 
[[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP10]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <32 x i16> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP14]], <32 x i16> [[TMP19]], <32 x i16> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP13]], <32 x i16> zeroinitializer @@ -2395,19 +2348,14 @@ define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nou ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) -; CHECK-NEXT: store <64 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <64 x i8> [[TMP7]] +; 
CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A]], <32 x i16> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A1:%.*]], <32 x i16> [[B1:%.*]]) +; CHECK-NEXT: store <64 x i8> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <64 x i8> [[TMP8]] ; %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res @@ -2420,25 +2368,20 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x 
i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <32 x i1> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A]], <32 x i16> [[B]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A1:%.*]], <32 x i16> [[B1:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <64 x i8> [[TMP9]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP9]], <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <64 x i8> [[TMP17]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = or <64 x i8> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP15]], <64 x i8> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP9]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP17]], <64 x i8> [[PASSTHRU]] ; CHECK-NEXT: store <64 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP16]] ; @@ -2452,25 +2395,20 @@ define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP4]] to <32 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <32 x i1> [[TMP6]] to <32 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A]], <32 x i16> [[B]]) +; CHECK-NEXT: [[TMP16:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A1:%.*]], <32 x i16> [[B1:%.*]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <64 x i8> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP8]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <64 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP12]], [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP9]], <64 x i8> [[TMP14]], <64 x i8> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP8]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP16]], <64 x 
i8> zeroinitializer ; CHECK-NEXT: store <64 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP15]] ; @@ -2494,18 +2432,13 @@ define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwi ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP13]], <32 x i16> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) -; CHECK-NEXT: store <64 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <64 x i8> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP12]] ; %b = load <32 x i16>, ptr %ptr_b @@ -2531,22 +2464,17 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 x ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP9]], align 
64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP12]] to <32 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP22]], <32 x i16> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> [[TMP11]], <64 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <64 x i8> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <64 x i8> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP20]], <64 x i8> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> [[TMP14]], <64 x i8> [[PASSTHRU]] @@ -2575,22 +2503,17 @@ define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i64 ; CHECK-NEXT: 
[[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <32 x i1> [[TMP9]] to <32 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP21]], <32 x i16> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP10]], <64 x i8> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <64 x i8> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP14]], <64 x i8> [[TMP19]], <64 x i8> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <64 
x i1> [[TMP15]], <64 x i8> [[TMP13]], <64 x i8> zeroinitializer @@ -2610,18 +2533,13 @@ define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) n ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP8]], <16 x i32> [[TMP6]]) ; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) @@ -2635,22 +2553,17 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i1> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP17]], <16 x i32> [[TMP8]]) ; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP6]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP6]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]] ; CHECK-NEXT: [[TMP16:%.*]] = select 
<32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[PASSTHRU]] @@ -2667,22 +2580,17 @@ define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <16 x i1> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP16]], <16 x i32> [[TMP7]]) ; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP5]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> 
[[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], [[TMP5]] ; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> [[TMP14]], <32 x i16> [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP8]], <32 x i16> zeroinitializer @@ -2709,18 +2617,13 @@ define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) noun ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP13]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP12]] ; %b = load <16 x i32>, ptr %ptr_b @@ -2746,22 +2649,17 @@ define <32 x i16> 
@test_mask_packus_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP22]], <16 x i32> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP11]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <32 x i16> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i16> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> 
[[TMP20]], <32 x i16> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP14]], <32 x i16> [[PASSTHRU]] @@ -2790,22 +2688,17 @@ define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i3 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP21]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP10]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <32 x i16> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: 
[[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP14]], <32 x i16> [[TMP19]], <32 x i16> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP13]], <32 x i16> zeroinitializer @@ -2837,18 +2730,13 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) nou ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP13]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x 
i16> [[TMP12]] ; %q = load i32, ptr %ptr_b @@ -2880,22 +2768,17 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <3 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP22]], <16 x i32> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP11]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <32 x i16> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], 
zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i16> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP20]], <32 x i16> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP14]], <32 x i16> [[PASSTHRU]] @@ -2930,22 +2813,17 @@ define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP21]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: 
[[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP10]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <32 x i16> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP14]], <32 x i16> [[TMP19]], <32 x i16> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP13]], <32 x i16> zeroinitializer @@ -2966,18 +2844,13 @@ define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) no ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sext <32 x i1> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP8]], <32 x i16> [[TMP6]]) ; CHECK-NEXT: 
[[TMP7:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) -; CHECK-NEXT: store <64 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <64 x i8> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP7]] ; %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) @@ -2991,22 +2864,17 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, < ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sext <32 x i1> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP17]], <32 x i16> [[TMP8]]) ; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x 
i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP6]], <64 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = xor <64 x i8> [[TMP9]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], [[TMP6]] ; CHECK-NEXT: [[TMP15:%.*]] = or <64 x i8> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP15]], <64 x i8> [[TMP12]] ; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP9]], <64 x i8> [[PASSTHRU]] @@ -3023,22 +2891,17 @@ define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <32 x i1> [[TMP4]] to <32 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <32 x i1> [[TMP6]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP16]], 
<32 x i16> [[TMP7]]) ; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP5]], <64 x i8> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = xor <64 x i8> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP12]], [[TMP5]] ; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP9]], <64 x i8> [[TMP14]], <64 x i8> [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP8]], <64 x i8> zeroinitializer @@ -3065,18 +2928,13 @@ define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounw ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], 
zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP13]], <32 x i16> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) -; CHECK-NEXT: store <64 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <64 x i8> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP12]] ; %b = load <32 x i16>, ptr %ptr_b @@ -3102,22 +2960,17 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP12]] to <32 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP22]], <32 x i16> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 
[[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> [[TMP11]], <64 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <64 x i8> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <64 x i8> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP20]], <64 x i8> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> [[TMP14]], <64 x i8> [[PASSTHRU]] @@ -3146,22 +2999,17 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <32 x i1> [[TMP9]] to <32 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP21]], <32 x i16> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i8> 
@llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP10]], <64 x i8> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <64 x i8> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP14]], <64 x i8> [[TMP19]], <64 x i8> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP13]], <64 x i8> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll index c6c7e002213bd..8bf6d5acc21ba 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll @@ -7,8 +7,6 @@ ; - llvm.x86.avx512.dbpsadbw.512 ; - llvm.x86.avx512.ktestc.d, llvm.x86.avx512.ktestc.q, llvm.x86.avx512.ktestz.d, llvm.x86.avx512.ktestz.q ; - llvm.x86.avx512.mask.pmov.wb.mem.512 -; - llvm.x86.avx512.packssdw.512, llvm.x86.avx512.packsswb.512 -; - llvm.x86.avx512.packusdw.512, llvm.x86.avx512.packuswb.512 ; - llvm.x86.avx512.psad.bw.512 ; ; Heuristically handled: @@ -295,19 +293,14 @@ define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() 
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <32 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A]], <16 x i32> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A1:%.*]], <16 x i32> [[B1:%.*]]) +; CHECK-NEXT: store <32 x i16> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <32 x i16> [[TMP8]] ; %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b) ret <32 x i16> %1 @@ -320,25 +313,20 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, < ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = 
icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <16 x i1> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A]], <16 x i32> [[B]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A1:%.*]], <16 x i32> [[B1:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP17]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[PASSTHRU]] +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> 
[[TMP11]], <32 x i16> [[TMP17]], <32 x i16> [[PASSTHRU]] ; CHECK-NEXT: store <32 x i16> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP16]] ; @@ -354,25 +342,20 @@ define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <16 x i1> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A]], <16 x i32> [[B]]) +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A1:%.*]], <16 x i32> [[B1:%.*]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer -; 
CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP8]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> [[TMP14]], <32 x i16> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP8]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP16]], <32 x i16> zeroinitializer ; CHECK-NEXT: store <32 x i16> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP15]] ; @@ -398,18 +381,13 @@ define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 
x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP13]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP12]] ; %b = load <16 x i32>, ptr %ptr_b @@ -435,22 +413,17 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP22]], <16 x i32> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> 
[[TMP16]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP11]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <32 x i16> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i16> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP20]], <32 x i16> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP14]], <32 x i16> [[PASSTHRU]] @@ -481,22 +454,17 @@ define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i32 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP21]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; 
CHECK-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP10]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <32 x i16> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP14]], <32 x i16> [[TMP19]], <32 x i16> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP13]], <32 x i16> zeroinitializer @@ -530,18 +498,13 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> 
[[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP13]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP12]] ; %q = load i32, ptr %ptr_b @@ -573,22 +536,17 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <32 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP22]], <16 x i32> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i16> 
@llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP11]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <32 x i16> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i16> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP20]], <32 x i16> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP14]], <32 x i16> [[PASSTHRU]] @@ -625,22 +583,17 @@ define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i3 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i32> +; 
CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP21]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP10]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <32 x i16> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP14]], <32 x i16> [[TMP19]], <32 x i16> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP13]], <32 x i16> zeroinitializer @@ -663,19 +616,14 @@ define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) #0 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; 
CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) -; CHECK-NEXT: store <64 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <64 x i8> [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A]], <32 x i16> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A1:%.*]], <32 x i16> [[B1:%.*]]) +; CHECK-NEXT: store <64 x i8> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <64 x i8> [[TMP8]] ; %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b) ret <64 x i8> %1 @@ -688,25 +636,20 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 
8: -; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <32 x i1> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A]], <32 x i16> [[B]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A1:%.*]], <32 x i16> [[B1:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <64 x i8> [[TMP9]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP9]], <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <64 x i8> [[TMP17]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = or <64 x i8> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP15]], <64 x i8> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP9]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP17]], <64 x i8> [[PASSTHRU]] ; CHECK-NEXT: store <64 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP16]] ; @@ -722,25 +665,20 @@ define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; 
CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP4]] to <32 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = sext <32 x i1> [[TMP6]] to <32 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A]], <32 x i16> [[B]]) +; CHECK-NEXT: [[TMP16:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A1:%.*]], <32 x i16> [[B1:%.*]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <64 x i8> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP8]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <64 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP12]], [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], 
zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP9]], <64 x i8> [[TMP14]], <64 x i8> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP8]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP16]], <64 x i8> zeroinitializer ; CHECK-NEXT: store <64 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP15]] ; @@ -766,18 +704,13 @@ define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP13]], <32 x i16> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) -; CHECK-NEXT: store <64 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <64 x i8> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP12]] ; %b = load <32 x 
i16>, ptr %ptr_b @@ -803,22 +736,17 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 x ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP12]] to <32 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP22]], <32 x i16> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> [[TMP11]], <64 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <64 x i8> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <64 x i8> [[TMP19]], [[TMP4]] ; CHECK-NEXT: 
[[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP20]], <64 x i8> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> [[TMP14]], <64 x i8> [[PASSTHRU]] @@ -849,22 +777,17 @@ define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <32 x i1> [[TMP9]] to <32 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP21]], <32 x i16> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP10]], <64 x i8> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <64 x i8> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> 
[[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP14]], <64 x i8> [[TMP19]], <64 x i8> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP13]], <64 x i8> zeroinitializer @@ -886,18 +809,13 @@ define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) # ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP8]], <16 x i32> [[TMP6]]) ; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %1 = call <32 x i16> 
@llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b) @@ -911,22 +829,17 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i1> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP17]], <16 x i32> [[TMP8]]) ; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP6]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; 
CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP6]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]] ; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[PASSTHRU]] @@ -945,22 +858,17 @@ define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <16 x i1> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP16]], <16 x i32> [[TMP7]]) ; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> 
zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP5]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], [[TMP5]] ; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> [[TMP14]], <32 x i16> [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP8]], <32 x i16> zeroinitializer @@ -989,18 +897,13 @@ define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP13]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) -; CHECK-NEXT: 
store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP12]] ; %b = load <16 x i32>, ptr %ptr_b @@ -1026,22 +929,17 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP22]], <16 x i32> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP11]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <32 x i16> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: 
[[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i16> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP20]], <32 x i16> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP14]], <32 x i16> [[PASSTHRU]] @@ -1072,22 +970,17 @@ define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i3 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP21]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; 
CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP10]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <32 x i16> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP14]], <32 x i16> [[TMP19]], <32 x i16> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP13]], <32 x i16> zeroinitializer @@ -1121,18 +1014,13 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP13]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call 
<32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP12]] ; %q = load i32, ptr %ptr_b @@ -1164,22 +1052,17 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <3 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP22]], <16 x i32> [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> 
zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP11]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <32 x i16> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i16> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP20]], <32 x i16> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <32 x i1> [[TMP16]], <32 x i16> [[TMP14]], <32 x i16> [[PASSTHRU]] @@ -1216,22 +1099,17 @@ define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> poison, i32 [[Q]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer ; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[_MSPROP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP21]], <16 x i32> [[TMP12]]) ; 
CHECK-NEXT: [[TMP13:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> [[A:%.*]], <16 x i32> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP10]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <32 x i16> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i16> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP14]], <32 x i16> [[TMP19]], <32 x i16> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <32 x i1> [[TMP15]], <32 x i16> [[TMP13]], <32 x i16> zeroinitializer @@ -1254,18 +1132,13 @@ define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) #0 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sext <32 x i1> [[TMP3]] to <32 x i16> +; CHECK-NEXT: 
[[TMP5:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP8]], <32 x i16> [[TMP6]]) ; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) -; CHECK-NEXT: store <64 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <64 x i8> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP7]] ; %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b) @@ -1279,22 +1152,17 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, < ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sext <32 x i1> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP17]], <32 x i16> [[TMP8]]) ; CHECK-NEXT: [[TMP9:%.*]] = 
call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP6]], <64 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = xor <64 x i8> [[TMP9]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], [[TMP6]] ; CHECK-NEXT: [[TMP15:%.*]] = or <64 x i8> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP15]], <64 x i8> [[TMP12]] ; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP11]], <64 x i8> [[TMP9]], <64 x i8> [[PASSTHRU]] @@ -1313,22 +1181,17 @@ define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <32 x i1> [[TMP4]] to <32 x i16> +; CHECK-NEXT: 
[[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <32 x i1> [[TMP6]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP16]], <32 x i16> [[TMP7]]) ; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP5]], <64 x i8> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = xor <64 x i8> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP12]], [[TMP5]] ; CHECK-NEXT: [[TMP14:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP9]], <64 x i8> [[TMP14]], <64 x i8> [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = select <64 x i1> [[TMP10]], <64 x i8> [[TMP8]], <64 x i8> zeroinitializer @@ -1357,18 +1220,13 @@ define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; 
CHECK: 11: +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP13]], <32 x i16> [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) -; CHECK-NEXT: store <64 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <64 x i8> [[TMP9]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP12]] ; %b = load <32 x i16>, ptr %ptr_b @@ -1394,22 +1252,17 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 13: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP12]] to <32 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP22]], <32 x i16> [[TMP13]]) ; 
CHECK-NEXT: [[TMP14:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> [[TMP11]], <64 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = xor <64 x i8> [[TMP14]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = or <64 x i8> [[TMP19]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP20]], <64 x i8> [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = select <64 x i1> [[TMP16]], <64 x i8> [[TMP14]], <64 x i8> [[PASSTHRU]] @@ -1440,22 +1293,17 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i16> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <32 x i1> [[TMP9]] to <32 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = 
sext <32 x i1> [[TMP11]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP21]], <32 x i16> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[A:%.*]], <32 x i16> [[B]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP10]], <64 x i8> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <64 x i8> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], [[TMP10]] ; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP14]], <64 x i8> [[TMP19]], <64 x i8> [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = select <64 x i1> [[TMP15]], <64 x i8> [[TMP13]], <64 x i8> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll index c5d91adf64cb3..e5cbe8c132238 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll @@ -19,7 +19,6 @@ ; - llvm.x86.avx512fp16.mask.reduce.sh ; - llvm.x86.avx512fp16.mask.rndscale.ph.512 ; - llvm.x86.avx512fp16.mask.rndscale.sh -; - llvm.x86.avx512fp16.mask.rsqrt.ph.512 ; - llvm.x86.avx512fp16.mask.rsqrt.sh ; - llvm.x86.avx512fp16.mask.scalef.ph.512 ; - llvm.x86.avx512fp16.mask.scalef.sh @@ -442,15 +441,11 @@ define <32 x half> @test_rsqrt_ph_512(<32 x half> %a0) #0 { ; CHECK-SAME: <32 x half> [[A0:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: 
call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <32 x i1> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = select <32 x i1> splat (i1 true), <32 x i16> [[TMP3]], <32 x i16> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> [[A0]], <32 x half> zeroinitializer, i32 -1) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x half> [[RES]] ; %res = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 -1) @@ -681,24 +676,22 @@ declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half define <32 x half> @test_rcp_ph_512(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { ; CHECK-LABEL: define <32 x half> @test_rcp_ph_512( ; CHECK-SAME: <32 x half> [[A0:%.*]], <32 x half> [[A1:%.*]], i32 [[MASK:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[MASK]] to <32 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP4]], <32 x i16> [[TMP6]], <32 x i16> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> [[A0]], <32 x half> [[A1]], i32 [[MASK]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x half> [[RES]] ; %res = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 %mask) @@ -3260,3 +3253,6 @@ define <32 x half> @test_mm512_castph256_ph512_freeze(<16 x half> %a0) nounwind } attributes #0 = { sanitize_memory } +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll index e2dc8cbdca968..20114fe7d3151 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll @@ -63,10 +63,6 @@ ; - llvm.x86.avx512.permvar.df.256 ; - llvm.x86.avx512.pternlog.d.128, llvm.x86.avx512.pternlog.d.256 ; - llvm.x86.avx512.pternlog.q.128, llvm.x86.avx512.pternlog.q.256 -; - llvm.x86.avx512.rcp14.pd.128, llvm.x86.avx512.rcp14.pd.256 -; - llvm.x86.avx512.rcp14.ps.128, llvm.x86.avx512.rcp14.ps.256 -; - llvm.x86.avx512.rsqrt14.pd.128, llvm.x86.avx512.rsqrt14.pd.256 -; - llvm.x86.avx512.rsqrt14.ps.128, llvm.x86.avx512.rsqrt14.ps.256 ; ; Handled heuristically: (none) @@ -8066,15 +8062,11 @@ define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) #0 { ; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> splat (i1 true), <8 x i32> [[TMP3]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; 
%res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) @@ -8085,20 +8077,21 @@ define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrkz( ; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP5]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: +; CHECK-NEXT: br i1 [[_MSCMP1]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK: [[BB8]]: ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 [[MASK]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) @@ -8109,24 
+8102,22 @@ define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 % ; ; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrk( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP6]], <8 x i32> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> [[A1]], i8 [[MASK]]) -; CHECK-NEXT: store <8 x i32> 
zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) @@ -8138,15 +8129,11 @@ define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) #0 { ; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> splat (i1 true), <4 x i32> [[TMP3]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 -1) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) @@ -8157,20 +8144,22 @@ define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrkz( ; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: 
[[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: +; CHECK-NEXT: br i1 [[_MSCMP1]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 [[MASK]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) @@ -8181,24 +8170,23 @@ define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 % ; ; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrk( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP7]], <4 x i32> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB10]]: ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> [[A1]], i8 [[MASK]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) @@ -8213,15 +8201,11 @@ define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) #0 { ; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr 
@__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> splat (i1 true), <8 x i32> [[TMP3]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) @@ -8232,20 +8216,21 @@ define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrkz( ; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = select 
<8 x i1> [[TMP3]], <8 x i32> [[TMP5]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: +; CHECK-NEXT: br i1 [[_MSCMP1]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK: [[BB8]]: ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 [[MASK]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) @@ -8256,24 +8241,22 @@ define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %ma ; ; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrk( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: 
[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP6]], <8 x i32> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> [[A1]], i8 [[MASK]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) @@ -8285,15 +8268,11 @@ define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) #0 { ; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> splat 
(i1 true), <4 x i32> [[TMP3]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 -1) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) @@ -8304,20 +8283,22 @@ define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrkz( ; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: +; CHECK-NEXT: br i1 [[_MSCMP1]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] 
= call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 [[MASK]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) @@ -8328,24 +8309,23 @@ define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %ma ; ; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrk( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP7]], <4 x i32> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], 
label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB10]]: ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> [[A1]], i8 [[MASK]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) @@ -8360,15 +8340,11 @@ define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) #0 { ; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> splat (i1 true), <4 x i64> [[TMP3]], <4 x i64> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> 
zeroinitializer, i8 -1) @@ -8379,20 +8355,22 @@ define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrkz( ; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> zeroinitializer ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: +; CHECK-NEXT: br i1 [[_MSCMP1]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 [[MASK]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) @@ -8403,24 +8381,23 @@ define 
<4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i ; ; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrk( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB10]]: ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> [[A1]], i8 [[MASK]]) -; 
CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) @@ -8432,15 +8409,11 @@ define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) #0 { ; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> splat (i1 true), <2 x i64> [[TMP3]], <2 x i64> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 -1) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) @@ -8451,20 +8424,22 @@ define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrkz( ; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
16) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i2 [[TMP3]] to <2 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[TMP6]], <2 x i64> zeroinitializer ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: +; CHECK-NEXT: br i1 [[_MSCMP1]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 [[MASK]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) @@ -8475,24 +8450,23 @@ define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i ; ; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrk( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; 
CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i2 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i2 [[TMP4]] to <2 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <2 x i1> [[TMP6]] to <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> [[TMP7]], <2 x i64> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB10]]: ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> [[A1]], i8 [[MASK]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask) @@ -8507,15 +8481,11 @@ define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) #0 { ; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] { ; 
CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> splat (i1 true), <4 x i64> [[TMP3]], <4 x i64> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) @@ -8526,20 +8496,22 @@ define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrkz( ; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[TMP8]], 
zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> zeroinitializer ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: +; CHECK-NEXT: br i1 [[_MSCMP1]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 [[MASK]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) @@ -8550,24 +8522,23 @@ define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 ; ; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrk( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB10]]: ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> [[A1]], i8 [[MASK]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) @@ -8579,15 +8550,11 @@ define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) #0 { ; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; 
CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> splat (i1 true), <2 x i64> [[TMP3]], <2 x i64> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 -1) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) @@ -8598,20 +8565,22 @@ define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrkz( ; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i2 [[TMP3]] to <2 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[TMP6]], <2 x i64> zeroinitializer ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: +; CHECK-NEXT: br i1 [[_MSCMP1]], label %[[BB8:.*]], label 
%[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 [[MASK]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) @@ -8622,24 +8591,23 @@ define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 ; ; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrk( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i2 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i2 [[TMP4]] to <2 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <2 x i1> [[TMP6]] to <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = select <2 
x i1> [[TMP5]], <2 x i64> [[TMP7]], <2 x i64> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB10]]: ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> [[A1]], i8 [[MASK]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask) diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-track-origins-neon.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-track-origins-neon.ll index 05d4d2a6551f5..48de5d1717134 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector-track-origins-neon.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector-track-origins-neon.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes="msan" -msan-instrumentation-with-call-threshold=0 | FileCheck %s ; ; This test illustrates a bug in MemorySanitizer that will shortly be fixed @@ -16,7 +16,7 @@ define dso_local void @_Z1cv() local_unnamed_addr #0 { ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[DOTPRE:%.*]] = load <4 x i16>, ptr @_Z1cv, align 8, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: [[DOTPRE:%.*]] = load 
<4 x i16>, ptr @_Z1cv, align 8, !tbaa [[CHAR_TBAA1:![0-9]+]] ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @_Z1cv to i64), i64 193514046488576) to ptr), align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 xor (i64 ptrtoint (ptr @_Z1cv to i64), i64 193514046488576), i64 35184372088832) to ptr), align 8 ; CHECK-NEXT: br label %[[FOR_COND:.*]] @@ -36,7 +36,7 @@ define dso_local void @_Z1cv() local_unnamed_addr #0 { ; CHECK-NEXT: [[CALL:%.*]] = tail call noundef i32 @_Z1b11__Int16x4_tS_(<4 x i16> noundef [[TMP1]], <4 x i16> noundef [[LANE]]) ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[CALL]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[CONV]] to ptr -; CHECK-NEXT: [[TMP5]] = load <4 x i16>, ptr [[TMP4]], align 8, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP5]] = load <4 x i16>, ptr [[TMP4]], align 8, !tbaa [[CHAR_TBAA1]] ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP4]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr @@ -47,7 +47,7 @@ define dso_local void @_Z1cv() local_unnamed_addr #0 { ; CHECK-NEXT: store <4 x i16> [[_MSLD3]], ptr inttoptr (i64 xor (i64 ptrtoint (ptr @_Z1cv to i64), i64 193514046488576) to ptr), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64 ; CHECK-NEXT: call void @__msan_maybe_store_origin_8(i64 zeroext [[TMP12]], ptr @_Z1cv, i32 zeroext [[TMP11]]) -; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr @_Z1cv, align 8, !tbaa [[TBAA1]] +; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr @_Z1cv, align 8, !tbaa [[CHAR_TBAA1]] ; CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] ; entry: @@ -76,7 +76,7 @@ attributes #0 = { mustprogress noreturn nounwind sanitize_memory "no-trapping-ma !5 = distinct !{!5, !6} !6 = !{!"llvm.loop.mustprogress"} ;. 
-; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[CHAR_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"Simple C++ TBAA"} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} diff --git a/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll index 56cf3f528f836..84e0f7307c7ec 100644 --- a/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll +++ b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -passes='tysan' -S %s | FileCheck %s ;. @@ -12,8 +12,9 @@ ; CHECK: @__tysan_app_memory_mask = external global i64 ;. define ptr @test_load_offset(ptr %argv) { -; CHECK-LABEL: @test_load_offset( -; CHECK-NEXT: entry: +; CHECK-LABEL: define ptr @test_load_offset( +; CHECK-SAME: ptr [[ARGV:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 4 ; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 4 ; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 0, [[APP_MEM_MASK]] @@ -22,8 +23,8 @@ define ptr @test_load_offset(ptr %argv) { ; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr ; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[DESC_SET:%.*]] = icmp eq ptr [[SHADOW_DESC]], null -; CHECK-NEXT: br i1 [[DESC_SET]], label [[SET_TYPE:%.*]], label [[TMP0:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: set.type: +; CHECK-NEXT: br i1 [[DESC_SET]], label %[[SET_TYPE:.*]], label %[[BB0:.*]], !prof [[PROF0:![0-9]+]] +; CHECK: [[SET_TYPE]]: ; CHECK-NEXT: store ptr 
@__tysan_v1_any_20pointer_o_0, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr @@ -46,9 +47,9 @@ define ptr @test_load_offset(ptr %argv) { ; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 ; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr ; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8 -; CHECK-NEXT: br label [[TMP0]] -; CHECK: 0: -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr null, align 8, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: br label %[[BB0]] +; CHECK: [[BB0]]: +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr null, align 8, !tbaa [[ANYPTR_TBAA1:![0-9]+]] ; CHECK-NEXT: ret ptr [[L]] ; entry: @@ -64,7 +65,7 @@ entry: ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } ;. ; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000} -; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[ANYPTR_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} ; CHECK: [[META2]] = !{!"any pointer", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} ; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} diff --git a/llvm/test/Instrumentation/TypeSanitizer/anon.ll b/llvm/test/Instrumentation/TypeSanitizer/anon.ll index 37de1b71e0c7e..1f0f1bd7ace15 100644 --- a/llvm/test/Instrumentation/TypeSanitizer/anon.ll +++ b/llvm/test/Instrumentation/TypeSanitizer/anon.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; Test basic type sanitizer instrumentation. 
; ; RUN: opt -passes='tysan' -S %s | FileCheck %s @@ -23,22 +23,23 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: @llvm.used = appending global [6 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24], section "llvm.metadata" ;. define void @test_anon_ns(ptr %a, ptr %b) sanitize_type { -; CHECK-LABEL: @test_anon_ns( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test_anon_ns( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 ; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 -; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]] ; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 ; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]] ; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr ; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24 -; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: 0: +; CHECK-NEXT: br i1 [[BAD_DESC]], label %[[BB0:.*]], label %[[BB22:.*]], !prof [[PROF0:![0-9]+]] +; CHECK: [[BB0]]: ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null -; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] -; CHECK: 2: +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB20:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[TMP3:%.*]] 
= add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 @@ -54,11 +55,11 @@ define void @test_anon_ns(ptr %a, ptr %b) sanitize_type { ; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]] -; CHECK: 18: +; CHECK-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF0]] +; CHECK: [[BB18]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24, i32 2) -; CHECK-NEXT: br label [[TMP19]] -; CHECK: 19: +; CHECK-NEXT: br label %[[BB19]] +; CHECK: [[BB19]]: ; CHECK-NEXT: store ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr @@ -69,13 +70,13 @@ define void @test_anon_ns(ptr %a, ptr %b) sanitize_type { ; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 ; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr ; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 -; CHECK-NEXT: br label [[TMP21:%.*]] -; CHECK: 20: +; CHECK-NEXT: br label %[[BB21:.*]] +; CHECK: [[BB20]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24, i32 2) -; CHECK-NEXT: br label [[TMP21]] -; CHECK: 21: -; CHECK-NEXT: br label [[TMP43:%.*]] -; CHECK: 22: +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: br label %[[BB43:.*]] +; CHECK: [[BB22]]: ; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr ; CHECK-NEXT: 
[[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 @@ -94,26 +95,26 @@ define void @test_anon_ns(ptr %a, ptr %b) sanitize_type { ; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64 ; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0 ; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]] -; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]] -; CHECK: 41: +; CHECK-NEXT: br i1 [[TMP40]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF0]] +; CHECK: [[BB41]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24, i32 2) -; CHECK-NEXT: br label [[TMP42]] -; CHECK: 42: -; CHECK-NEXT: br label [[TMP43]] -; CHECK: 43: -; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]] -; CHECK-NEXT: [[APP_PTR_INT1:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: br label %[[BB42]] +; CHECK: [[BB42]]: +; CHECK-NEXT: br label %[[BB43]] +; CHECK: [[BB43]]: +; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[INT_TBAA1:![0-9]+]] +; CHECK-NEXT: [[APP_PTR_INT1:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[APP_PTR_MASKED2:%.*]] = and i64 [[APP_PTR_INT1]], [[APP_MEM_MASK]] ; CHECK-NEXT: [[APP_PTR_SHIFTED3:%.*]] = shl i64 [[APP_PTR_MASKED2]], 3 ; CHECK-NEXT: [[SHADOW_PTR_INT4:%.*]] = add i64 [[APP_PTR_SHIFTED3]], [[SHADOW_BASE]] ; CHECK-NEXT: [[SHADOW_PTR5:%.*]] = inttoptr i64 [[SHADOW_PTR_INT4]] to ptr ; CHECK-NEXT: [[SHADOW_DESC6:%.*]] = load ptr, ptr [[SHADOW_PTR5]], align 8 ; CHECK-NEXT: [[BAD_DESC7:%.*]] = icmp ne ptr [[SHADOW_DESC6]], @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24 -; CHECK-NEXT: br i1 [[BAD_DESC7]], label [[TMP44:%.*]], label [[TMP66:%.*]], !prof [[PROF0]] -; CHECK: 44: +; CHECK-NEXT: br i1 [[BAD_DESC7]], label %[[BB44:.*]], label %[[BB66:.*]], !prof [[PROF0]] +; CHECK: [[BB44]]: ; CHECK-NEXT: [[TMP45:%.*]] = icmp eq ptr [[SHADOW_DESC6]], null -; CHECK-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP64:%.*]] -; CHECK: 46: +; 
CHECK-NEXT: br i1 [[TMP45]], label %[[BB46:.*]], label %[[BB64:.*]] +; CHECK: [[BB46]]: ; CHECK-NEXT: [[TMP47:%.*]] = add i64 [[SHADOW_PTR_INT4]], 8 ; CHECK-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr ; CHECK-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP48]], align 8 @@ -129,11 +130,11 @@ define void @test_anon_ns(ptr %a, ptr %b) sanitize_type { ; CHECK-NEXT: [[TMP59:%.*]] = load ptr, ptr [[TMP58]], align 8 ; CHECK-NEXT: [[TMP60:%.*]] = icmp ne ptr [[TMP59]], null ; CHECK-NEXT: [[TMP61:%.*]] = or i1 [[TMP56]], [[TMP60]] -; CHECK-NEXT: br i1 [[TMP61]], label [[TMP62:%.*]], label [[TMP63:%.*]], !prof [[PROF0]] -; CHECK: 62: +; CHECK-NEXT: br i1 [[TMP61]], label %[[BB62:.*]], label %[[BB63:.*]], !prof [[PROF0]] +; CHECK: [[BB62]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[B]], i32 4, ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24, i32 2) -; CHECK-NEXT: br label [[TMP63]] -; CHECK: 63: +; CHECK-NEXT: br label %[[BB63]] +; CHECK: [[BB63]]: ; CHECK-NEXT: store ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24, ptr [[SHADOW_PTR5]], align 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET8:%.*]] = add i64 [[SHADOW_PTR_INT4]], 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_PTR9:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET8]] to ptr @@ -144,13 +145,13 @@ define void @test_anon_ns(ptr %a, ptr %b) sanitize_type { ; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET12:%.*]] = add i64 [[SHADOW_PTR_INT4]], 24 ; CHECK-NEXT: [[SHADOW_BYTE_3_PTR13:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET12]] to ptr ; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR13]], align 8 -; CHECK-NEXT: br label [[TMP65:%.*]] -; CHECK: 64: +; CHECK-NEXT: br label %[[BB65:.*]] +; CHECK: [[BB64]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[B]], i32 4, ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24, i32 2) -; CHECK-NEXT: br label [[TMP65]] -; CHECK: 65: -; CHECK-NEXT: br label [[TMP87:%.*]] -; CHECK: 66: +; CHECK-NEXT: br label %[[BB65]] +; CHECK: [[BB65]]: +; CHECK-NEXT: br label %[[BB87:.*]] +; CHECK: 
[[BB66]]: ; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT4]], 8 ; CHECK-NEXT: [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr ; CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8 @@ -169,14 +170,14 @@ define void @test_anon_ns(ptr %a, ptr %b) sanitize_type { ; CHECK-NEXT: [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64 ; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0 ; CHECK-NEXT: [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]] -; CHECK-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]] -; CHECK: 85: +; CHECK-NEXT: br i1 [[TMP84]], label %[[BB85:.*]], label %[[BB86:.*]], !prof [[PROF0]] +; CHECK: [[BB85]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[B]], i32 4, ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24, i32 2) -; CHECK-NEXT: br label [[TMP86]] -; CHECK: 86: -; CHECK-NEXT: br label [[TMP87]] -; CHECK: 87: -; CHECK-NEXT: store i32 43, ptr [[B]], align 4, !tbaa [[TBAA6:![0-9]+]] +; CHECK-NEXT: br label %[[BB86]] +; CHECK: [[BB86]]: +; CHECK-NEXT: br label %[[BB87]] +; CHECK: [[BB87]]: +; CHECK-NEXT: store i32 43, ptr [[B]], align 4, !tbaa [[INT_TBAA6:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -187,22 +188,23 @@ entry: } define void @test_anon_type(ptr %a) sanitize_type { -; CHECK-LABEL: @test_anon_type( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test_anon_type( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 ; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 -; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]] ; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 ; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]] ; CHECK-NEXT: [[SHADOW_PTR:%.*]] = 
inttoptr i64 [[SHADOW_PTR_INT]] to ptr ; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24 -; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0]] -; CHECK: 0: +; CHECK-NEXT: br i1 [[BAD_DESC]], label %[[BB0:.*]], label %[[BB22:.*]], !prof [[PROF0]] +; CHECK: [[BB0]]: ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null -; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] -; CHECK: 2: +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB20:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 @@ -218,11 +220,11 @@ define void @test_anon_type(ptr %a) sanitize_type { ; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]] -; CHECK: 18: +; CHECK-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF0]] +; CHECK: [[BB18]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24, i32 2) -; CHECK-NEXT: br label [[TMP19]] -; CHECK: 19: +; CHECK-NEXT: br label %[[BB19]] +; CHECK: [[BB19]]: ; CHECK-NEXT: store ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr @@ -233,13 +235,13 @@ define void @test_anon_type(ptr %a) sanitize_type { ; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 ; 
CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr ; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 -; CHECK-NEXT: br label [[TMP21:%.*]] -; CHECK: 20: +; CHECK-NEXT: br label %[[BB21:.*]] +; CHECK: [[BB20]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24, i32 2) -; CHECK-NEXT: br label [[TMP21]] -; CHECK: 21: -; CHECK-NEXT: br label [[TMP43:%.*]] -; CHECK: 22: +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: br label %[[BB43:.*]] +; CHECK: [[BB22]]: ; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr ; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 @@ -258,14 +260,14 @@ define void @test_anon_type(ptr %a) sanitize_type { ; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64 ; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0 ; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]] -; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]] -; CHECK: 41: +; CHECK-NEXT: br i1 [[TMP40]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF0]] +; CHECK: [[BB41]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24, i32 2) -; CHECK-NEXT: br label [[TMP42]] -; CHECK: 42: -; CHECK-NEXT: br label [[TMP43]] -; CHECK: 43: -; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: br label %[[BB42]] +; CHECK: [[BB42]]: +; CHECK-NEXT: br label %[[BB43]] +; CHECK: [[BB43]]: +; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[INT_TBAA8:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -284,17 +286,17 @@ entry: !11 = !{!"", !2, i64 24} !12 = !{!11, !2, i64 24} ;. 
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type } +; CHECK: attributes #[[ATTR0]] = { sanitize_type } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ;. ; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000} -; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META3:![0-9]+]], i64 24} +; CHECK: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META3:![0-9]+]], i64 24} ; CHECK: [[META2]] = !{!"_ZTSN12_GLOBAL__N_11zE", [[META3]], i64 24} ; CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} ; CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} ; CHECK: [[META5]] = !{!"Simple C++ TBAA"} -; CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META3]], i64 24} +; CHECK: [[INT_TBAA6]] = !{[[META7:![0-9]+]], [[META3]], i64 24} ; CHECK: [[META7]] = !{!"_ZTS1yIN12_GLOBAL__N_11zEE", [[META3]], i64 24} -; CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META3]], i64 24} +; CHECK: [[INT_TBAA8]] = !{[[META9:![0-9]+]], [[META3]], i64 24} ; CHECK: [[META9]] = !{!"", [[META3]], i64 24} ;. diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll index 8ddc5738a673d..c1a452d629b7b 100644 --- a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll +++ b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 6 ; Test basic type sanitizer instrumentation. ; RUN: opt -passes='tysan' -S %s | FileCheck %s @@ -31,19 +31,20 @@ entry: ; CHECK: @__tysan_shadow_memory_address = external global i64 ; CHECK: @__tysan_app_memory_mask = external global i64 ;. 
-; CHECK-LABEL: @test_load_nsan( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test_load_nsan( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 ; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 -; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]] ; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 ; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]] ; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr ; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[DESC_SET:%.*]] = icmp eq ptr [[SHADOW_DESC]], null -; CHECK-NEXT: br i1 [[DESC_SET]], label [[SET_TYPE:%.*]], label [[TMP0:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: set.type: +; CHECK-NEXT: br i1 [[DESC_SET]], label %[[SET_TYPE:.*]], label %[[BB0:.*]], !prof [[PROF0:![0-9]+]] +; CHECK: [[SET_TYPE]]: ; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr @@ -54,25 +55,26 @@ entry: ; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 ; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr ; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 -; CHECK-NEXT: br label [[TMP0]] -; CHECK: 0: -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: br label %[[BB0]] +; CHECK: [[BB0]]: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[INT_TBAA1:![0-9]+]] ; CHECK-NEXT: ret i32 [[TMP1]] ; ; -; CHECK-LABEL: 
@test_store_nsan( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test_store_nsan( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 ; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 -; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]] ; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 ; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]] ; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr ; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[DESC_SET:%.*]] = icmp eq ptr [[SHADOW_DESC]], null -; CHECK-NEXT: br i1 [[DESC_SET]], label [[SET_TYPE:%.*]], label [[TMP0:%.*]], !prof [[PROF0]] -; CHECK: set.type: +; CHECK-NEXT: br i1 [[DESC_SET]], label %[[SET_TYPE:.*]], label %[[BB0:.*]], !prof [[PROF0]] +; CHECK: [[SET_TYPE]]: ; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr @@ -83,21 +85,22 @@ entry: ; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 ; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr ; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 -; CHECK-NEXT: br label [[TMP0]] -; CHECK: 0: -; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: br label %[[BB0]] +; CHECK: [[BB0]]: +; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: @tysan.module_ctor( +; CHECK-LABEL: define internal void @tysan.module_ctor( +; 
CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: call void @__tysan_init() ; CHECK-NEXT: ret void ; ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } +; CHECK: attributes #[[ATTR0]] = { nounwind } ;. ; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000} -; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} ; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} ; CHECK: [[META4]] = !{!"Simple C++ TBAA"} diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic.ll b/llvm/test/Instrumentation/TypeSanitizer/basic.ll index b40b64664502a..ae7ac5304dc08 100644 --- a/llvm/test/Instrumentation/TypeSanitizer/basic.ll +++ b/llvm/test/Instrumentation/TypeSanitizer/basic.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; Test basic type sanitizer instrumentation. ; ; RUN: opt -passes='tysan' -S %s | FileCheck %s @@ -21,22 +21,23 @@ declare i32 @declaration_only(i32 %a) sanitize_type ; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata" ;. 
define i32 @test_load(ptr %a) sanitize_type { -; CHECK-LABEL: @test_load( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test_load( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 ; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 -; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]] ; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 ; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]] ; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr ; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], @__tysan_v1_int_o_0 -; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: 0: +; CHECK-NEXT: br i1 [[BAD_DESC]], label %[[BB0:.*]], label %[[BB22:.*]], !prof [[PROF0:![0-9]+]] +; CHECK: [[BB0]]: ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null -; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] -; CHECK: 2: +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB20:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 @@ -52,11 +53,11 @@ define i32 @test_load(ptr %a) sanitize_type { ; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]] -; CHECK: 18: +; CHECK-NEXT: br i1 
[[TMP17]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF0]] +; CHECK: [[BB18]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) -; CHECK-NEXT: br label [[TMP19]] -; CHECK: 19: +; CHECK-NEXT: br label %[[BB19]] +; CHECK: [[BB19]]: ; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr @@ -67,13 +68,13 @@ define i32 @test_load(ptr %a) sanitize_type { ; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 ; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr ; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 -; CHECK-NEXT: br label [[TMP21:%.*]] -; CHECK: 20: +; CHECK-NEXT: br label %[[BB21:.*]] +; CHECK: [[BB20]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) -; CHECK-NEXT: br label [[TMP21]] -; CHECK: 21: -; CHECK-NEXT: br label [[TMP43:%.*]] -; CHECK: 22: +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: br label %[[BB43:.*]] +; CHECK: [[BB22]]: ; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr ; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 @@ -92,14 +93,14 @@ define i32 @test_load(ptr %a) sanitize_type { ; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64 ; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0 ; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]] -; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]] -; CHECK: 41: +; CHECK-NEXT: br i1 [[TMP40]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF0]] +; CHECK: [[BB41]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) -; CHECK-NEXT: br label [[TMP42]] -; 
CHECK: 42: -; CHECK-NEXT: br label [[TMP43]] -; CHECK: 43: -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: br label %[[BB42]] +; CHECK: [[BB42]]: +; CHECK-NEXT: br label %[[BB43]] +; CHECK: [[BB43]]: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[INT_TBAA1:![0-9]+]] ; CHECK-NEXT: ret i32 [[TMP1]] ; entry: @@ -108,22 +109,23 @@ entry: } define void @test_store(ptr %a) sanitize_type { -; CHECK-LABEL: @test_store( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test_store( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 ; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 -; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]] ; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 ; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]] ; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr ; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], @__tysan_v1___ZTS1v_o_12 -; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0]] -; CHECK: 0: +; CHECK-NEXT: br i1 [[BAD_DESC]], label %[[BB0:.*]], label %[[BB22:.*]], !prof [[PROF0]] +; CHECK: [[BB0]]: ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null -; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] -; CHECK: 2: +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB20:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr ; CHECK-NEXT: [[TMP5:%.*]] = load 
ptr, ptr [[TMP4]], align 8 @@ -139,11 +141,11 @@ define void @test_store(ptr %a) sanitize_type { ; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]] -; CHECK: 18: +; CHECK-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF0]] +; CHECK: [[BB18]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) -; CHECK-NEXT: br label [[TMP19]] -; CHECK: 19: +; CHECK-NEXT: br label %[[BB19]] +; CHECK: [[BB19]]: ; CHECK-NEXT: store ptr @__tysan_v1___ZTS1v_o_12, ptr [[SHADOW_PTR]], align 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr @@ -154,13 +156,13 @@ define void @test_store(ptr %a) sanitize_type { ; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 ; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr ; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 -; CHECK-NEXT: br label [[TMP21:%.*]] -; CHECK: 20: +; CHECK-NEXT: br label %[[BB21:.*]] +; CHECK: [[BB20]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) -; CHECK-NEXT: br label [[TMP21]] -; CHECK: 21: -; CHECK-NEXT: br label [[TMP43:%.*]] -; CHECK: 22: +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: br label %[[BB43:.*]] +; CHECK: [[BB22]]: ; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr ; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 @@ -179,14 +181,14 @@ define void @test_store(ptr %a) sanitize_type { ; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64 ; CHECK-NEXT: [[TMP39:%.*]] = 
icmp sge i64 [[TMP38]], 0 ; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]] -; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]] -; CHECK: 41: +; CHECK-NEXT: br i1 [[TMP40]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF0]] +; CHECK: [[BB41]]: ; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) -; CHECK-NEXT: br label [[TMP42]] -; CHECK: 42: -; CHECK-NEXT: br label [[TMP43]] -; CHECK: 43: -; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: br label %[[BB42]] +; CHECK: [[BB42]]: +; CHECK-NEXT: br label %[[BB43]] +; CHECK: [[BB43]]: +; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -202,15 +204,15 @@ entry: !5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16} !6 = !{!5, !2, i64 12} ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type } +; CHECK: attributes #[[ATTR0]] = { sanitize_type } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ;. ; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000} -; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} ; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} ; CHECK: [[META4]] = !{!"Simple C++ TBAA"} -; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META2]], i64 12} +; CHECK: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META2]], i64 12} ; CHECK: [[META6]] = !{!"_ZTS1v", [[META2]], i64 8, [[META2]], i64 12, [[META7:![0-9]+]], i64 16} ; CHECK: [[META7]] = !{!"_ZTS1x", [[META2]], i64 0, [[META2]], i64 4} ;. 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll index c7c153e140fc2..d0ae3bcb435ba 100644 --- a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll +++ b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; Test basic type sanitizer instrumentation. ; ; RUN: opt -passes='tysan' -S %s | FileCheck %s @@ -10,9 +10,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] ;. define i32 @test_load(ptr %a) sanitize_type { -; CHECK-LABEL: @test_load( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]], !nosanitize [[META4:![0-9]+]] +; CHECK-LABEL: define i32 @test_load( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[INT_TBAA0:![0-9]+]], !nosanitize [[META4:![0-9]+]] ; CHECK-NEXT: ret i32 [[TMP1]] ; entry: @@ -28,10 +29,10 @@ entry: !5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16} !6 = !{!5, !2, i64 12} ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type } +; CHECK: attributes #[[ATTR0]] = { sanitize_type } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ;. 
-; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"Simple C++ TBAA"} diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopd_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopd_err.s index 3c5905b14e06c..497fd86251fb4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopd_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopd_err.s @@ -22,7 +22,7 @@ v_dual_fmamk_f32 v122, v74, 0xa0172923, v161 :: v_dual_lshlrev_b32 v247, 0 v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xbabe // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX11-NEXT:{{^}}v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xbabe -// GFX11-NEXT:{{^}} ^ +// GFX11-NEXT:{{^}} ^ v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 0xbabe, v1, 0xbabe // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed @@ -32,12 +32,12 @@ v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 0x v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX11-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 -// GFX11-NEXT:{{^}} ^ +// GFX11-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX11-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 -// GFX11-NEXT:{{^}} ^ +// GFX11-NEXT:{{^}} ^ //===----------------------------------------------------------------------===// // Check that KImm operands are counted as literals @@ -52,12 
+52,12 @@ v_dual_fmamk_f32 v122, v74, 0, v161 :: v_dual_lshlrev_b32 v247, 0 v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 1.0 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX11-NEXT:{{^}}v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 1.0 -// GFX11-NEXT:{{^}} ^ +// GFX11-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0xdeadbeef, 2, v161 :: v_dual_fmamk_f32 v123, s0, 1, v162 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX11-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 2, v161 :: v_dual_fmamk_f32 v123, s0, 1, v162 -// GFX11-NEXT:{{^}} ^ +// GFX11-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, v1, 2, v161 :: v_dual_fmamk_f32 v123, s0, 1, v162 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed @@ -71,7 +71,7 @@ v_dual_fmamk_f32 v122, v1, 2, v161 :: v_dual_fmamk_f32 v123, s0, v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX11-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 -// GFX11-NEXT:{{^}} ^ +// GFX11-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0x1234, 0xdeadbeef, v162 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed @@ -81,7 +81,7 @@ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v12 v_dual_fmamk_f32 v122, 0xdeadbeef, 0x1234, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX11-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0x1234, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 -// GFX11-NEXT:{{^}} ^ +// GFX11-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0x1234, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 
0xdeadbeef, v162 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s index 13f1bb036188d..d3b44eb788444 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s @@ -713,6 +713,9 @@ v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 // GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00] +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:8 +// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:8 ; encoding: [0x0a,0x40,0xab,0xd6,0x14,0x11,0x02,0x00] + v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 // GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00] @@ -758,6 +761,9 @@ v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 // GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00] +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:8 +// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:8 ; encoding: [0x0a,0x40,0xad,0xd6,0x14,0x11,0x02,0x00] + v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 // GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index 1441f3806987c..b4d4e365d0453 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -713,6 +713,9 @@ v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 // GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: 
[0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00] +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:8 +// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:8 ; encoding: [0x0a,0x40,0xab,0xd6,0x14,0x11,0x02,0x00] + v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 // GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00] @@ -758,6 +761,9 @@ v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 // GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00] +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:8 +// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:8 ; encoding: [0x0a,0x40,0xad,0xd6,0x14,0x11,0x02,0x00] + v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 // GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s index e87943224e8f5..cce8e1ef24f5f 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s @@ -277,9 +277,9 @@ v_cvt_sr_fp8_f16 v1, v2, v3 mul:2 // GFX125X-ERR-NEXT:{{^}}v_cvt_sr_fp8_f16 v1, v2, v3 mul:2 // GFX125X-ERR-NEXT:{{^}} ^ -v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:8 +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:16 // GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid scale_sel value. 
-// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:8 +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:16 // GFX125X-ERR-NEXT:{{^}} ^ v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:4 diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_errs.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_errs.s index 81b79cb8c28da..f8cdf31c48e17 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_errs.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_errs.s @@ -22,7 +22,7 @@ v_dual_fmamk_f32 v122, v74, 0xa0172923, v161 :: v_dual_lshlrev_b32 v247, 0 v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xbabe // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xbabe -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 0xbabe, v1, 0xbabe // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed @@ -32,12 +32,12 @@ v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 0x v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ //===----------------------------------------------------------------------===// // Check that assembler detects a different literal regardless of its location. 
@@ -46,7 +46,7 @@ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v12 v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0x1234, 0xdeadbeef, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed @@ -56,7 +56,7 @@ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v12 v_dual_fmamk_f32 v122, 0xdeadbeef, 0x1234, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0x1234, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0x1234, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed @@ -114,7 +114,7 @@ v_dual_cndmask_b32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s2 v_dual_cndmask_b32 v1, s2, v3, vcc_lo :: v_dual_cndmask_b32 v2, s3, v4, vcc_lo // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) // GFX12-NEXT:{{^}}v_dual_cndmask_b32 v1, s2, v3, vcc_lo :: v_dual_cndmask_b32 v2, s3, v4, vcc_lo -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ // SGPR + LITERAL + VCC diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index 90a449173320a..871925547224e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -1239,37 +1239,37 @@ v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, 
|v3| quad_perm:[3,2,1,0] row_mask:0xe bank_m // GFX12: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0xff,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0xff,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] v_cvt_pk_fp8_f32_e64_dpp 
v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +// GFX12: 
v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| quad_perm:[0,1,2,3] -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd // GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index a73aa40a2751a..75d4dd060f575 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -763,19 +763,19 @@ v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0, // GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,2,3,0,1] -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: 
[0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] v_cvt_pk_fp8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] v_cvt_pk_fp8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] v_cvt_pk_fp8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cvt_pk_fp8_f32_e64_dpp v255.h, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] -// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v255.h, -|v255|, -|v255| op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +// GFX12: v_cvt_pk_fp8_f32_e64_dpp v255.h, -|v255|, -|v255| op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cvt_pk_bf8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cvt_pk_bf8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_vopd_errs.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopd_errs.s index 5751258fe85d8..aa71e4d2969fc 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopd_errs.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopd_errs.s @@ -22,7 +22,7 @@ v_dual_fmamk_f32 v122, v74, 0xa0172923, v161 :: v_dual_lshlrev_b32 v247, 0 v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xbabe // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xbabe -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 0xbabe, v1, 0xbabe // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed @@ -32,12 +32,12 @@ v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 0x v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ //===----------------------------------------------------------------------===// // Check that assembler detects a different literal regardless of its location. 
@@ -46,7 +46,7 @@ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v12 v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0x1234, 0xdeadbeef, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed @@ -56,7 +56,7 @@ v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v12 v_dual_fmamk_f32 v122, 0xdeadbeef, 0x1234, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0x1234, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 -// GFX12-NEXT:{{^}} ^ +// GFX12-NEXT:{{^}} ^ v_dual_fmamk_f32 v122, 0x1234, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed diff --git a/llvm/test/MC/AMDGPU/hsa-metadata-kernel-attrs-v6.s b/llvm/test/MC/AMDGPU/hsa-metadata-kernel-attrs-v6.s new file mode 100644 index 0000000000000..b91888d0bba6f --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-metadata-kernel-attrs-v6.s @@ -0,0 +1,35 @@ +// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx1250 --amdhsa-code-object-version=6 -show-encoding %s | FileCheck %s + +// CHECK: .amdgpu_metadata +// CHECK: amdhsa.kernels: +// CHECK: - .cluster_dims: +// CHECK-NEXT: - 4 +// CHECK-NEXT: - 2 +// CHECK-NEXT: - 1 +.amdgpu_metadata + amdhsa.version: + - 1 + - 0 + amdhsa.printf: + - '1:1:4:%d\n' + - '2:1:8:%g\n' + amdhsa.kernels: + - .name: test_kernel + .symbol: test_kernel@kd + .language: OpenCL C + 
.language_version: + - 2 + - 0 + .kernarg_segment_size: 8 + .group_segment_fixed_size: 16 + .private_segment_fixed_size: 32 + .kernarg_segment_align: 64 + .wavefront_size: 128 + .sgpr_count: 14 + .vgpr_count: 40 + .max_flat_workgroup_size: 256 + .cluster_dims: + - 4 + - 2 + - 1 +.end_amdgpu_metadata diff --git a/llvm/test/MC/AMDGPU/mai-gfx950-err.s b/llvm/test/MC/AMDGPU/mai-gfx950-err.s index e700b0b3cabfe..747deab3bfcae 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950-err.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950-err.s @@ -7,7 +7,9 @@ v_mfma_ld_scale_b32 65, v0 // CHECK: :[[@LINE-1]]:21: error: literal operands are not supported v_mfma_ld_scale_b32 65, 65 -// CHECK: :[[@LINE-1]]:25: error: literal operands are not supported +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported +// CHECK-NEXT:{{^}}v_mfma_ld_scale_b32 65, 65 +// CHECK-NEXT:{{^}} ^ v_mfma_ld_scale_b32 s0, s1 // CHECK: :[[@LINE-1]]:25: error: invalid operand (violates constant bus restrictions) @@ -156,3 +158,51 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:9], v[0:3] v20, v21 blgp v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:11], v[0:3] v20, v21 blgp:4 // CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 4 + + +// Workaround a hardware bug to disallow sgpr/inline constants as scale operands + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 +// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], 
v[12:19], v[20:23], 9, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, 9 +// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, 4.0 +// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 +// CHECK: :[[@LINE-1]]:78: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, v24 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, v24 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s index 2d3a56703674a..c9035033912ac 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -405,58 +405,6 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU 
v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, 
v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this 
GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16 - // GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1 @@ -585,22 +533,6 @@ v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 
op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0 - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16 - // op_sel combinations // GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] diff --git a/llvm/test/MC/AMDGPU/vop3-literal.s b/llvm/test/MC/AMDGPU/vop3-literal.s index ba683e5423c72..56e71b9cfcfd1 100644 --- a/llvm/test/MC/AMDGPU/vop3-literal.s +++ b/llvm/test/MC/AMDGPU/vop3-literal.s @@ -26,25 +26,25 @@ v_bfe_u32 v0, v1, s1, 0x3039 v_bfe_u32 v0, 0x3039, 0x3039, s1 // GFX10: v_bfe_u32 v0, 0x3039, 0x3039, s1 ; encoding: [0x00,0x00,0x48,0xd5,0xff,0xfe,0x05,0x00,0x39,0x30,0x00,0x00] // GFX1250: v_bfe_u32 v0, 0x3039, 0x3039, s1 ; encoding: [0x00,0x00,0x10,0xd6,0xff,0xfe,0x05,0x00,0x39,0x30,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:23: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:15: error: literal operands are not supported v_bfe_u32 v0, 0x3039, s1, 0x3039 // GFX10: v_bfe_u32 v0, 
0x3039, s1, 0x3039 ; encoding: [0x00,0x00,0x48,0xd5,0xff,0x02,0xfc,0x03,0x39,0x30,0x00,0x00] // GFX1250: v_bfe_u32 v0, 0x3039, s1, 0x3039 ; encoding: [0x00,0x00,0x10,0xd6,0xff,0x02,0xfc,0x03,0x39,0x30,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:27: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:15: error: literal operands are not supported v_bfe_u32 v0, v1, 0x3039, 0x3039 // GFX10: v_bfe_u32 v0, v1, 0x3039, 0x3039 ; encoding: [0x00,0x00,0x48,0xd5,0x01,0xff,0xfd,0x03,0x39,0x30,0x00,0x00] // GFX1250: v_bfe_u32 v0, v1, 0x3039, 0x3039 ; encoding: [0x00,0x00,0x10,0xd6,0x01,0xff,0xfd,0x03,0x39,0x30,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:27: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:19: error: literal operands are not supported v_bfe_u32 v0, 0x3039, 0x3039, 0x3039 // GFX10: v_bfe_u32 v0, 0x3039, 0x3039, 0x3039 ; encoding: [0x00,0x00,0x48,0xd5,0xff,0xfe,0xfd,0x03,0x39,0x30,0x00,0x00] // GFX1250: v_bfe_u32 v0, 0x3039, 0x3039, 0x3039 ; encoding: [0x00,0x00,0x10,0xd6,0xff,0xfe,0xfd,0x03,0x39,0x30,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:31: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:15: error: literal operands are not supported v_bfe_u32 v0, 0x3039, s1, 0x3038 -// GFX9-ERR: :[[@LINE-1]]:27: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:15: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:27: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:27: error: only one unique literal operand is allowed @@ -54,7 +54,7 @@ v_bfe_u32 v0, 0x3039, v1, v2 // GFX9-ERR: :[[@LINE-3]]:15: error: literal operands are not supported v_bfe_u32 v0, 0x3039, 0x12345, v2 -// GFX9-ERR: :[[@LINE-1]]:23: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:15: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:23: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:23: error: only one unique literal operand is 
allowed @@ -81,10 +81,10 @@ v_bfm_b32_e64 v0, 0x3039, v1 v_bfm_b32_e64 v0, 0x3039, 0x3039 // GFX10: v_bfm_b32 v0, 0x3039, 0x3039 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0xfe,0x01,0x00,0x39,0x30,0x00,0x00] // GFX1250: v_bfm_b32 v0, 0x3039, 0x3039 ; encoding: [0x00,0x00,0x1d,0xd7,0xff,0xfe,0x01,0x00,0x39,0x30,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:27: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:19: error: literal operands are not supported v_bfm_b32_e64 v0, 0x3039, 0x3038 -// GFX9-ERR: :[[@LINE-1]]:27: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:19: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:27: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:27: error: only one unique literal operand is allowed @@ -106,10 +106,10 @@ v_pk_add_f16 v1, -200, v2 v_pk_add_f16 v1, 25.0, 25.0 // GFX10: v_pk_add_f16 v1, 0x4e40, 0x4e40 ; encoding: [0x01,0x40,0x0f,0xcc,0xff,0xfe,0x01,0x18,0x40,0x4e,0x00,0x00] // GFX1250: v_pk_add_f16 v1, 0x4e40, 0x4e40 ; encoding: [0x01,0x40,0x0f,0xcc,0xff,0xfe,0x01,0x18,0x40,0x4e,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:24: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:18: error: literal operands are not supported v_pk_add_f16 v1, 25.0, 25.1 -// GFX9-ERR: :[[@LINE-1]]:24: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:18: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:24: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:24: error: only one unique literal operand is allowed @@ -146,7 +146,7 @@ v_pk_add_u16 v1, -100, v2 v_pk_add_u16 v1, -100, -100 // GFX10: v_pk_add_u16 v1, 0xffffff9c, 0xffffff9c ; encoding: [0x01,0x40,0x0a,0xcc,0xff,0xfe,0x01,0x18,0x9c,0xff,0xff,0xff] // GFX1250: v_pk_add_u16 v1, 0xffffff9c, 0xffffff9c ; encoding: [0x01,0x40,0x0a,0xcc,0xff,0xfe,0x01,0x18,0x9c,0xff,0xff,0xff] -// GFX9-ERR: :[[@LINE-3]]:24: error: literal operands are not supported 
+// GFX9-ERR: :[[@LINE-3]]:18: error: literal operands are not supported v_add_f32_e64 v1, neg(abs(0x123)), v3 // GFX10: v_add_f32_e64 v1, -|0x123|, v3 ; encoding: [0x01,0x01,0x03,0xd5,0xff,0x06,0x02,0x20,0x23,0x01,0x00,0x00] @@ -161,7 +161,7 @@ v_add_f32_e64 v1, v3, neg(0x123) v_add_f32_e64 v1, neg(abs(0x12345678)), neg(0x12345678) // GFX10: v_add_f32_e64 v1, -|0x12345678|, neg(0x12345678) ; encoding: [0x01,0x01,0x03,0xd5,0xff,0xfe,0x01,0x60,0x78,0x56,0x34,0x12] // GFX1250: v_add_f32_e64 v1, -|0x12345678|, neg(0x12345678) ; encoding: [0x01,0x01,0x03,0xd5,0xff,0xfe,0x01,0x60,0x78,0x56,0x34,0x12] -// GFX9-ERR: :[[@LINE-3]]:45: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:27: error: literal operands are not supported v_add_f16_e64 v0, v0, 0xfe0b // GFX10: v_add_f16_e64 v0, v0, 0xfe0b ; encoding: [0x00,0x00,0x32,0xd5,0x00,0xff,0x01,0x00,0x0b,0xfe,0x00,0x00] @@ -181,7 +181,7 @@ v_add_f16_e64 v0, 0x3456, v0 v_add_f16_e64 v0, 0xfe0b, neg(0xfe0b) // GFX10: v_add_f16_e64 v0, 0xfe0b, neg(0xfe0b) ; encoding: [0x00,0x00,0x32,0xd5,0xff,0xfe,0x01,0x40,0x0b,0xfe,0x00,0x00] // GFX1250: v_add_f16_e64 v0, 0xfe0b, neg(0xfe0b) ; encoding: [0x00,0x00,0x32,0xd5,0xff,0xfe,0x01,0x40,0x0b,0xfe,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:31: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:19: error: literal operands are not supported v_add_f64 v[0:1], 1.23456, v[0:1] // GFX10: v_add_f64 v[0:1], 0x3ff3c0c1, v[0:1] ; encoding: [0x00,0x00,0x64,0xd5,0xff,0x00,0x02,0x00,0xc1,0xc0,0xf3,0x3f] @@ -196,10 +196,10 @@ v_add_f64 v[0:1], v[0:1], -abs(1.23456) v_add_f64 v[0:1], 1.23456, -abs(1.23456) // GFX10: v_add_f64 v[0:1], 0x3ff3c0c1, -|0x3ff3c0c1| ; encoding: [0x00,0x02,0x64,0xd5,0xff,0xfe,0x01,0x40,0xc1,0xc0,0xf3,0x3f] // GFX1250: v_add_f64_e64 v[0:1], 0x3ff3c0c1, -|0x3ff3c0c1| ; encoding: [0x00,0x02,0x02,0xd5,0xff,0xfe,0x01,0x40,0xc1,0xc0,0xf3,0x3f] -// GFX9-ERR: :[[@LINE-3]]:33: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:19: 
error: literal operands are not supported v_add_f64 v[0:1], 1.23456, -abs(1.2345) -// GFX9-ERR: :[[@LINE-1]]:33: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:19: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:33: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:33: error: only one unique literal operand is allowed @@ -216,7 +216,7 @@ v_max_i16_e64 v5, v1, 0x123 v_max_i16_e64 v5, 0x1234, 0x1234 // GFX10: v_max_i16 v5, 0x1234, 0x1234 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0xfe,0x01,0x00,0x34,0x12,0x00,0x00] // GFX1250: v_max_i16 v5, 0x1234, 0x1234 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0xfe,0x01,0x00,0x34,0x12,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:27: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:19: error: literal operands are not supported v_min3_i16 v5, 0xfe0b, v2, v3 // GFX10: v_min3_i16 v5, 0xfe0b, v2, v3 ; encoding: [0x05,0x00,0x52,0xd7,0xff,0x04,0x0e,0x04,0x0b,0xfe,0x00,0x00] @@ -236,15 +236,15 @@ v_min3_i16 v5, v1, v2, 0x5678 v_min3_i16 v5, 0x5678, 0x5678, 0x5678 // GFX10: v_min3_i16 v5, 0x5678, 0x5678, 0x5678 ; encoding: [0x05,0x00,0x52,0xd7,0xff,0xfe,0xfd,0x03,0x78,0x56,0x00,0x00] // GFX1250: v_min3_i16 v5, 0x5678, 0x5678, 0x5678 ; encoding: [0x05,0x00,0x4a,0xd6,0xff,0xfe,0xfd,0x03,0x78,0x56,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:32: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:16: error: literal operands are not supported v_min3_i16 v5, 0x5678, 0x5679, 0x5678 -// GFX9-ERR: :[[@LINE-1]]:32: error: literal operands are not supported -// GFX10-ERR: :[[@LINE-2]]:32: error: only one unique literal operand is allowed -// GFX1250-ERR: :[[@LINE-3]]:32: error: only one unique literal operand is allowed +// GFX9-ERR: :[[@LINE-1]]:16: error: literal operands are not supported +// GFX10-ERR: :[[@LINE-2]]:24: error: only one unique literal operand is allowed +// GFX1250-ERR: :[[@LINE-3]]:24: error: only one unique literal operand is allowed v_min3_i16 
v5, 0x5678, 0x5678, 0x5679 -// GFX9-ERR: :[[@LINE-1]]:32: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:16: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:32: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:32: error: only one unique literal operand is allowed @@ -286,7 +286,7 @@ v_mad_u16 v5, v1, v2, 0x5678 v_mad_u16 v5, 0x5678, 0x5678, 0x5678 // GFX10: v_mad_u16 v5, 0x5678, 0x5678, 0x5678 ; encoding: [0x05,0x00,0x40,0xd7,0xff,0xfe,0xfd,0x03,0x78,0x56,0x00,0x00] // GFX1250: v_mad_u16 v5, 0x5678, 0x5678, 0x5678 ; encoding: [0x05,0x00,0x41,0xd6,0xff,0xfe,0xfd,0x03,0x78,0x56,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:31: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:15: error: literal operands are not supported v_mad_legacy_f32 v5, 0xaf123456, v2, v3 // GFX10: v_mad_legacy_f32 v5, 0xaf123456, v2, v3 ; encoding: [0x05,0x00,0x40,0xd5,0xff,0x04,0x0e,0x04,0x56,0x34,0x12,0xaf] @@ -305,7 +305,7 @@ v_mad_legacy_f32 v5, v1, v2, 0xaf123456 v_mad_legacy_f32 v5, 0xaf123456, 0xaf123456, 0xaf123456 // GFX10: v_mad_legacy_f32 v5, 0xaf123456, 0xaf123456, 0xaf123456 ; encoding: [0x05,0x00,0x40,0xd5,0xff,0xfe,0xfd,0x03,0x56,0x34,0x12,0xaf] -// GFX9-ERR: :[[@LINE-2]]:46: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-2]]:22: error: literal operands are not supported // GFX1250-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU v_cmp_eq_i32_e64 s[10:11], 0xaf123456, v2 @@ -321,10 +321,10 @@ v_cmp_eq_i32_e64 s[10:11], v1, 0xaf123456 v_cmp_eq_i32_e64 s[10:11], 0xaf123456, 0xaf123456 // GFX10: v_cmp_eq_i32_e64 s[10:11], 0xaf123456, 0xaf123456 ; encoding: [0x0a,0x00,0x82,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] // GFX1250: v_cmp_eq_i32_e64 s[10:11], 0xaf123456, 0xaf123456 ; encoding: [0x0a,0x00,0x42,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -// GFX9-ERR: :[[@LINE-3]]:40: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:28: error: 
literal operands are not supported v_cmp_eq_i32_e64 s[10:11], 0xaf123456, 0xaf123455 -// GFX9-ERR: :[[@LINE-1]]:40: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:28: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:40: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:40: error: only one unique literal operand is allowed @@ -341,7 +341,7 @@ v_cmp_eq_u64_e64 s[10:11], v[2:3], 0x3f717273 v_cmp_eq_u64_e64 s[10:11], 0x3f717273, 0x3f717273 // GFX10: v_cmp_eq_u64_e64 s[10:11], 0x3f717273, 0x3f717273 ; encoding: [0x0a,0x00,0xe2,0xd4,0xff,0xfe,0x01,0x00,0x73,0x72,0x71,0x3f] // GFX1250: v_cmp_eq_u64_e64 s[10:11], 0x3f717273, 0x3f717273 ; encoding: [0x0a,0x00,0x5a,0xd4,0xff,0xfe,0x01,0x00,0x73,0x72,0x71,0x3f] -// GFX9-ERR: :[[@LINE-3]]:40: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:28: error: literal operands are not supported v_cmpx_class_f32_e64 0xaf123456, v2 // GFX10: v_cmpx_class_f32_e64 0xaf123456, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0xff,0x04,0x02,0x00,0x56,0x34,0x12,0xaf] @@ -441,7 +441,7 @@ v_pk_add_f16 v5, v1, 0xbf717273 v_pk_add_f16 v5, 0x3f717273, 0x3f717273 // GFX10: v_pk_add_f16 v5, 0x3f717273, 0x3f717273 ; encoding: [0x05,0x40,0x0f,0xcc,0xff,0xfe,0x01,0x18,0x73,0x72,0x71,0x3f] // GFX1250: v_pk_add_f16 v5, 0x3f717273, 0x3f717273 ; encoding: [0x05,0x40,0x0f,0xcc,0xff,0xfe,0x01,0x18,0x73,0x72,0x71,0x3f] -// GFX9-ERR: :[[@LINE-3]]:30: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:18: error: literal operands are not supported v_pk_add_i16 v5, 0x7b, v2 // GFX10: v_pk_add_i16 v5, 0x7b, v2 ; encoding: [0x05,0x40,0x02,0xcc,0xff,0x04,0x02,0x18,0x7b,0x00,0x00,0x00] @@ -456,10 +456,10 @@ v_pk_add_i16 v5, v1, 0x7b v_pk_add_i16 v5, 0xab7b, 0xab7b // GFX10: v_pk_add_i16 v5, 0xab7b, 0xab7b ; encoding: [0x05,0x40,0x02,0xcc,0xff,0xfe,0x01,0x18,0x7b,0xab,0x00,0x00] // GFX1250: v_pk_add_i16 v5, 0xab7b, 0xab7b ; encoding: 
[0x05,0x40,0x02,0xcc,0xff,0xfe,0x01,0x18,0x7b,0xab,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:26: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:18: error: literal operands are not supported v_pk_add_i16 v5, 0xab7b, 0xab7a -// GFX9-ERR: :[[@LINE-1]]:26: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:18: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:26: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:26: error: only one unique literal operand is allowed @@ -471,12 +471,12 @@ v_div_fmas_f32 v5, v1, 0x123, v3 v_div_fmas_f32 v5, v1, 0x123, 0x123 // GFX10: v_div_fmas_f32 v5, v1, 0x123, 0x123 ; encoding: [0x05,0x00,0x6f,0xd5,0x01,0xff,0xfd,0x03,0x23,0x01,0x00,0x00] // GFX1250: v_div_fmas_f32 v5, v1, 0x123, 0x123 ; encoding: [0x05,0x00,0x37,0xd6,0x01,0xff,0xfd,0x03,0x23,0x01,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:31: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:24: error: literal operands are not supported v_div_fmas_f32 v5, 0x123, 0x123, 0x123 // GFX10: v_div_fmas_f32 v5, 0x123, 0x123, 0x123 ; encoding: [0x05,0x00,0x6f,0xd5,0xff,0xfe,0xfd,0x03,0x23,0x01,0x00,0x00] // GFX1250: v_div_fmas_f32 v5, 0x123, 0x123, 0x123 ; encoding: [0x05,0x00,0x37,0xd6,0xff,0xfe,0xfd,0x03,0x23,0x01,0x00,0x00] -// GFX9-ERR: :[[@LINE-3]]:34: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:20: error: literal operands are not supported v_div_fmas_f64 v[4:5], 0x12345678, v[2:3], v[4:5] // GFX10: v_div_fmas_f64 v[4:5], 0x12345678, v[2:3], v[4:5] ; encoding: [0x04,0x00,0x70,0xd5,0xff,0x04,0x12,0x04,0x78,0x56,0x34,0x12] @@ -486,10 +486,10 @@ v_div_fmas_f64 v[4:5], 0x12345678, v[2:3], v[4:5] v_div_fmas_f64 v[6:7], 0x12345678, 0x12345678, 0x12345678 // GFX10: v_div_fmas_f64 v[6:7], 0x12345678, 0x12345678, 0x12345678 ; encoding: [0x06,0x00,0x70,0xd5,0xff,0xfe,0xfd,0x03,0x78,0x56,0x34,0x12] // GFX1250: v_div_fmas_f64 v[6:7], 0x12345678, 0x12345678, 0x12345678 ; encoding: 
[0x06,0x00,0x38,0xd6,0xff,0xfe,0xfd,0x03,0x78,0x56,0x34,0x12] -// GFX9-ERR: :[[@LINE-3]]:48: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:24: error: literal operands are not supported v_div_fmas_f64 v[4:5], v[2:3], 0x123457, 0x123456 -// GFX9-ERR: :[[@LINE-1]]:42: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:32: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:42: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:42: error: only one unique literal operand is allowed @@ -501,9 +501,9 @@ v_ldexp_f64 v[4:5], 0.12345, v2 v_ldexp_f64 v[6:7], 0.12345, 0x3fbf9a6b // GFX10: v_ldexp_f64 v[6:7], 0x3fbf9a6b, 0x3fbf9a6b ; encoding: [0x06,0x00,0x68,0xd5,0xff,0xfe,0x01,0x00,0x6b,0x9a,0xbf,0x3f] // GFX1250: v_ldexp_f64 v[6:7], 0x3fbf9a6b, 0x3fbf9a6b ; encoding: [0x06,0x00,0x2b,0xd7,0xff,0xfe,0x01,0x00,0x6b,0x9a,0xbf,0x3f] -// GFX9-ERR: :[[@LINE-3]]:30: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-3]]:21: error: literal operands are not supported v_ldexp_f64 v[4:5], 0.12345, 0x3fbf9a6c -// GFX9-ERR: :[[@LINE-1]]:30: error: literal operands are not supported +// GFX9-ERR: :[[@LINE-1]]:21: error: literal operands are not supported // GFX10-ERR: :[[@LINE-2]]:30: error: only one unique literal operand is allowed // GFX1250-ERR: :[[@LINE-3]]:30: error: only one unique literal operand is allowed diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10-vop3-literal.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10-vop3-literal.txt index 015ce3e963fb3..e6410a1b6b8a8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10-vop3-literal.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10-vop3-literal.txt @@ -51,7 +51,7 @@ # GFX10: v_add_nc_i16 v5, v1, 0xcdab ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0xff,0x01,0x00,0xab,0xcd,0xff,0xff] 0x05,0x00,0x0d,0xd7,0x01,0xff,0x01,0x00,0xab,0xcd,0xff,0xff -# GFX10: v_ceil_f16_e64 v255, 0xabcd clamp ; encoding: 
[0xff,0x80,0xdc,0xd5,0xff,0x00,0x00,0x00,0xcd,0xab,0xff,0xff] +# GFX10: v_ceil_f16_e64 v255, 0xabcd clamp ; encoding: [0xff,0x80,0xdc,0xd5,0xff,0x00,0x00,0x00,0xcd,0xab,0x00,0x00] 0xff,0x80,0xdc,0xd5,0xff,0x00,0x00,0x00,0xcd,0xab,0xff,0xff # GFX10: v_min_u16 v5, v1, 0xabcd ; encoding: [0x05,0x00,0x0b,0xd7,0x01,0xff,0x01,0x00,0xcd,0xab,0xff,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt index 4b44c27570af5..29bfa54f2c10d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt @@ -761,6 +761,9 @@ 0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00 # GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00] +0x0a,0x40,0xab,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:8 ; encoding: [0x0a,0x40,0xab,0xd6,0x14,0x11,0x02,0x00] + 0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 # GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] @@ -800,6 +803,9 @@ 0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00 # GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00] +0x0a,0x40,0xad,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:8 ; encoding: [0x0a,0x40,0xad,0xd6,0x14,0x11,0x02,0x00] + 0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 # GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt index c5751de810d90..d2da087a44743 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt @@ -34,17 +34,17 @@ 0xff 0x06 0x02 0x3e 0x00 0x01 0x00 0x00 # non-zero unused bits in constant -# VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x01,0x00] +# VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00] 0xff 0x06 0x02 0x3e 0x41 0x00 0x01 0x00 -# VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x01] +# VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00] 0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x01 # FIXME: This should be able to round trip with literal after instruction # VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e] 0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00 -# VI: v_add_f16_e32 v1, 0xffcd, v3 ; encoding: [0xff,0x06,0x02,0x3e,0xcd,0xff,0xff,0xff] +# VI: v_add_f16_e32 v1, 0xffcd, v3 ; encoding: [0xff,0x06,0x02,0x3e,0xcd,0xff,0x00,0x00] 0xff 0x06 0x02 0x3e 0xcd 0xff 0xff 0xff # VI: v_mul_lo_u16_e32 v2, 0xffcd, v2 ; encoding: [0xff,0x04,0x04,0x52,0xcd,0xff,0xff,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt index 77b87ac63f335..e191455beb64d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt @@ -392,27 +392,6 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], 
v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 @@ -422,15 +401,6 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], 
v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84 @@ -467,18 +437,6 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - # GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c @@ -581,18 +539,6 @@ # GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] 
-0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - # GFX950: v_mfma_i32_16x16x64_i8 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xb6,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x80,0xb6,0xd3,0x00,0x01,0x02,0x1c diff --git a/llvm/test/MC/Disassembler/Mips/mips64/sc-ptr64.txt b/llvm/test/MC/Disassembler/Mips/mips64/sc-ptr64.txt new file mode 100644 index 0000000000000..3964301b0827e --- /dev/null +++ b/llvm/test/MC/Disassembler/Mips/mips64/sc-ptr64.txt @@ -0,0 +1,3 @@ +# RUN: llvm-mc -triple=mips64el-unknown-linux -disassemble -mattr=+ptr64 -mcpu=mips3 %s | FileCheck %s + +0xd8 0x49 0x6f 0xe2 # CHECK: sc $15, 18904($19) diff --git a/llvm/test/MC/ELF/cfi-sframe-encoding.s b/llvm/test/MC/ELF/cfi-sframe-encoding.s new file mode 100644 index 0000000000000..e13e11c51c05a --- /dev/null +++ b/llvm/test/MC/ELF/cfi-sframe-encoding.s @@ -0,0 +1,87 @@ +// TODO: Add other architectures as they gain sframe support +// REQUIRES: x86-registered-target +// RUN: llvm-mc --assemble --filetype=obj --gsframe -triple x86_64 %s -o %t.o +// RUN: llvm-readelf --sframe %t.o | FileCheck %s + +// Tests selection for the proper FDE AddrX encoding at the boundaries +// between uint8_t, uint16_t, and uint32_t. The first FRE always fits +// anywhere, because its address-offset is zero. The last FRE +// determines the smallest AddrX it is possible to use. 
Align +// functions to 1024 to make it easier to interpet offsets. + + .cfi_sections .sframe + + .align 1024 +fde0_uses_addr1: +// CHECK: FuncDescEntry [0] { +// CHECK: Start FRE Offset: 0x0 +// CHECK-NEXT: Num FREs: 2 +// CHECK: FRE Type: Addr1 (0x0) + .cfi_startproc +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x0 +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B1 (0x0) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 8 +// CHECK-NEXT: RA Offset: -8 +// CHECK-NEXT: } + .fill 0xFF + .cfi_def_cfa_offset 16 +// CHECK-NEXT: Frame Row Entry { +// CHECK-NEXT: Start Address: 0xFF +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B1 (0x0) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 16 +// CHECK-NEXT: RA Offset: -8 + nop + .cfi_endproc + + .align 1024 +fde1_uses_addr2: +// CHECK: FuncDescEntry [1] { +// CHECK: Start FRE Offset: 0x6 +// CHECK-NEXT: Num FREs: 2 +// CHECK: FRE Type: Addr2 (0x1) + .cfi_startproc +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x400 + .fill 0xFF + 1 + .cfi_def_cfa_offset 16 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x500 + .cfi_endproc + +.align 1024 +fde2_uses_addr2: +// CHECK: FuncDescEntry [2] { +// CHECK: Start FRE Offset: 0xE +// CHECK-NEXT: Num FREs: 2 +// CHECK: FRE Type: Addr2 (0x1) + .cfi_startproc +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x800 + .fill 0xFFFF + .cfi_def_cfa_offset 16 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x107FF + nop + .cfi_endproc + + .align 1024 +fde3_uses_addr4: +// CHECK: FuncDescEntry [3] { +// CHECK: Start FRE Offset: 0x16 +// CHECK-NEXT: Num FREs: 2 +// CHECK: FRE Type: Addr4 (0x2) + .cfi_startproc +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x10800 + .fill 0xFFFF + 1 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x20800 + .cfi_def_cfa_offset 16 + nop + .cfi_endproc + diff --git 
a/llvm/test/MC/ELF/cfi-sframe-fre-cases.s b/llvm/test/MC/ELF/cfi-sframe-fre-cases.s new file mode 100644 index 0000000000000..61ed58f8a9d0e --- /dev/null +++ b/llvm/test/MC/ELF/cfi-sframe-fre-cases.s @@ -0,0 +1,114 @@ +// REQUIRES: x86-registered-target +// RUN: llvm-mc --assemble --filetype=obj --gsframe -triple x86_64 %s -o %t.o +// RUN: llvm-readelf --sframe %t.o | FileCheck %s + +// Tests selection for the proper FRE::BX encoding at the boundaries +// between int8_t, int16_t, and int32_t. Ensures the largest offset +// between CFA, RA, and FP governs. Align functions to 1024 to make it +// easier to interpet offsets. Some directives require alignment, so +// it isn't always possible to test exact boundaries. + +// Also, check that irrelevant cfi directives don't create new fres, +// or affect the current ones. Checking the Start Address ensures that +// the proper FRE gets the proper checks. Using .long makes addresses +// architecture independent. + + .align 1024 +fde4_fre_offset_sizes: +// CHECK: FuncDescEntry [0] { +// CHECK: Start FRE Offset: 0 +// CHECK: FRE Type: Addr1 (0x0) + .cfi_startproc +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x0 +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B1 (0x0) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 8 +// CHECK-NEXT: RA Offset: -8 + .long 0 +// Uninteresting register no new fre, no effect on cfa + .cfi_offset 0, 8 + .long 0 + .cfi_def_cfa_offset 0x78 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x8 +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B1 (0x0) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 120 +// CHECK-NEXT: RA Offset: -8 + .long 0 +// Uninteresting register no new fre, no effect on cfa + .cfi_rel_offset 1, 8 + .long 0 + .cfi_def_cfa_offset 0x80 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x10 +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B2 
(0x1) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 128 +// CHECK-NEXT: RA Offset: -8 + .long 0 +// Uninteresting register no new fre, no effect on cfa + .cfi_val_offset 1, 8 + .long 0 + .cfi_def_cfa_offset 0x7FFF +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x18 +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B2 (0x1) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 32767 +// CHECK-NEXT: RA Offset: -8 + .long 0 + .cfi_def_cfa_offset 0x8000 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x1C +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B4 (0x2) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 32768 +// CHECK-NEXT: RA Offset: -8 + .long 0 + .cfi_def_cfa_offset 0x8 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x20 +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B1 (0x0) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 8 +// CHECK-NEXT: RA Offset: -8 + .long 0 + .cfi_adjust_cfa_offset 0x8 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x24 +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B1 (0x0) +// CHECK-NEXT: Base Register: SP (0x1) +// CHECK-NEXT: CFA Offset: 16 +// CHECK-NEXT: RA Offset: -8 + .long 0 + .cfi_def_cfa_register 6 # switch to fp +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x28 +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B1 (0x0) +// CHECK-NEXT: Base Register: FP (0x0) +// CHECK-NEXT: CFA Offset: 16 +// CHECK-NEXT: RA Offset: -8 + .long 0 + .cfi_offset 7, 32 + # sp not the cfa but with large offset still changes encoding. 
+ .cfi_offset 6, 0x7FF8 +// CHECK: Frame Row Entry { +// CHECK-NEXT: Start Address: 0x2C +// CHECK-NEXT: Return Address Signed: No +// CHECK-NEXT: Offset Size: B2 (0x1) +// CHECK-NEXT: Base Register: FP (0x0) +// CHECK-NEXT: CFA Offset: 16 +// CHECK-NEXT: RA Offset: -8 +// CHECK-NEXT: FP Offset: 32760 + .long 0 + .cfi_endproc diff --git a/llvm/test/MC/ELF/cfi-sframe.s b/llvm/test/MC/ELF/cfi-sframe.s index ecf77bc3ea6b3..28ee3cbf33b16 100644 --- a/llvm/test/MC/ELF/cfi-sframe.s +++ b/llvm/test/MC/ELF/cfi-sframe.s @@ -4,27 +4,35 @@ // RUN: llvm-readelf --sframe %t.o | FileCheck %s .cfi_sections .sframe -f1: - .cfi_startproc // FRE 0 +f0: + .cfi_startproc # FRE 0 nop - .cfi_def_cfa_offset 16 // FRE 1 - .cfi_def_cfa_offset 8 // location didn't change. No new FRE, but new offset. + .cfi_def_cfa_offset 16 # FRE 1 + .cfi_def_cfa_offset 8 # location didn't change. No new FRE, but new offset. nop - .cfi_def_cfa_offset 8 // offset didn't change. No new FRE. + .cfi_def_cfa_offset 8 # offset didn't change. No new FRE. nop - .cfi_def_cfa_offset 16 // FRE 2. new location, new offset. + .cfi_def_cfa_offset 16 # FRE 2. new location, new offset. nop - .cfi_register 0, 1 // Uninteresting register. No new FRE. + .cfi_register 0, 1 # Uninteresting register. No new FRE. 
nop .cfi_endproc -f2: +f1: .cfi_startproc nop nop .cfi_endproc +f2: + .cfi_startproc + .cfi_endproc + +f3: + .cfi_startproc simple + .cfi_endproc + // CHECK: SFrame section '.sframe' { // CHECK-NEXT: Header { // CHECK-NEXT: Magic: 0xDEE2 @@ -32,11 +40,11 @@ f2: // CHECK-NEXT: Flags [ (0x4) // CHECK: ABI: AMD64EndianLittle (0x3) // CHECK-NEXT: CFA fixed FP offset (unused): 0 -// CHECK-NEXT: CFA fixed RA offset: 0 +// CHECK-NEXT: CFA fixed RA offset: -8 // CHECK-NEXT: Auxiliary header length: 0 // CHECK-NEXT: Num FDEs: 2 -// CHECK-NEXT: Num FREs: 0 -// CHECK-NEXT: FRE subsection length: 0 +// CHECK-NEXT: Num FREs: 4 +// CHECK-NEXT: FRE subsection length: 12 // CHECK-NEXT: FDE subsection offset: 0 // CHECK-NEXT: FRE subsection offset: 40 // CHECK: Function Index [ @@ -48,7 +56,7 @@ f2: // CHECK-NEXT: } // CHECK-NEXT: Size: 0x5 // CHECK-NEXT: Start FRE Offset: 0x0 -// CHECK-NEXT: Num FREs: 0 +// CHECK-NEXT: Num FREs: 3 // CHECK-NEXT: Info { // CHECK-NEXT: FRE Type: Addr1 (0x0) // CHECK-NEXT: FDE Type: PCInc (0x0) @@ -56,18 +64,18 @@ f2: // CHECK-NEXT: } // CHECK-NEXT: Repetitive block size (unused): 0x0 // CHECK-NEXT: Padding2: 0x0 -// CHECK-NEXT: FREs [ -// CHECK-NEXT: ] -// CHECK-NEXT: } -// CHECK-NEXT: FuncDescEntry [1] { + +// Contents of FREs are tested elsewhere + +// CHECK: FuncDescEntry [1] { // CHECK-NEXT: PC { // CHECK-NEXT: Relocation: {{.*}}PC32{{.*}} // CHECK-NEXT: Symbol Name: .text // CHECK-NEXT: Start Address: {{.*}} // CHECK-NEXT: } // CHECK-NEXT: Size: 0x2 -// CHECK-NEXT: Start FRE Offset: 0x0 -// CHECK-NEXT: Num FREs: 0 +// CHECK-NEXT: Start FRE Offset: 0x9 +// CHECK-NEXT: Num FREs: 1 // CHECK-NEXT: Info { // CHECK-NEXT: FRE Type: Addr1 (0x0) // CHECK-NEXT: FDE Type: PCInc (0x0) @@ -75,8 +83,5 @@ f2: // CHECK-NEXT: } // CHECK-NEXT: Repetitive block size (unused): 0x0 // CHECK-NEXT: Padding2: 0x0 -// CHECK-NEXT: FREs [ -// CHECK-NEXT: ] -// CHECK-NEXT: } -// CHECK-NEXT: ] -// CHECK-NEXT: } + + diff --git a/llvm/test/MC/ELF/cgprofile.s 
b/llvm/test/MC/ELF/cgprofile.s index f8469ddc68877..28d8b72185556 100644 --- a/llvm/test/MC/ELF/cgprofile.s +++ b/llvm/test/MC/ELF/cgprofile.s @@ -5,11 +5,11 @@ a: .word b .cg_profile a, b, 32 .cg_profile freq, a, 11 - .cg_profile late, late2, 20 + .cg_profile "late\\", late2, 20 .cg_profile .L.local, b, 42 - .globl late -late: + .globl "late\\" +"late\\": late2: .word 0 late3: .L.local: @@ -31,7 +31,7 @@ late3: # CHECK-NEXT: 0010: 14000000 00000000 2A000000 00000000 # CHECK-NEXT: ) -# CHECK: Name: .rel.llvm.call-graph-profile (28) +# CHECK: Name: .rel.llvm.call-graph-profile # CHECK-NEXT: Type: SHT_REL (0x9) # CHECK-NEXT: Flags [ (0x40) # CHECK-NEXT: SHF_INFO_LINK @@ -83,7 +83,7 @@ late3: # CHECK-NEXT: Type: # CHECK-NEXT: Other: # CHECK-NEXT: Section: Undefined -# CHECK: Name: late +# CHECK: Name: late\ ([[#]]) # CHECK-NEXT: Value: # CHECK-NEXT: Size: # CHECK-NEXT: Binding: Global diff --git a/llvm/test/MC/ELF/debug-loc-label.s b/llvm/test/MC/ELF/debug-loc-label.s index 6b5d04777bef4..4200b1192107b 100644 --- a/llvm/test/MC/ELF/debug-loc-label.s +++ b/llvm/test/MC/ELF/debug-loc-label.s @@ -17,43 +17,47 @@ # CHECK-LINE-TABLE-NEXT: 0x0000002a: 00 DW_LNE_set_address (0x0000000000000000) # CHECK-LINE-TABLE-NEXT: 0x00000035: 01 DW_LNS_copy # CHECK-LINE-TABLE-NEXT: 0x0000000000000000 1 1 1 0 0 0 is_stmt -# CHECK-LINE-TABLE-NEXT: 0x00000036: 00 DW_LNE_end_sequence -# CHECK-LINE-TABLE-NEXT: 0x0000000000000000 1 1 1 0 0 0 is_stmt end_sequence -# CHECK-LINE-TABLE-NEXT: 0x00000039: 05 DW_LNS_set_column (2) -# CHECK-LINE-TABLE-NEXT: 0x0000003b: 00 DW_LNE_set_address (0x0000000000000008) -# CHECK-LINE-TABLE-NEXT: 0x00000046: 01 DW_LNS_copy +# CHECK-LINE-TABLE-NEXT: 0x00000036: 02 DW_LNS_advance_pc (addr += 8, op-index += 0) +# CHECK-LINE-TABLE-NEXT: 0x00000038: 00 DW_LNE_end_sequence +# CHECK-LINE-TABLE-NEXT: 0x0000000000000008 1 1 1 0 0 0 is_stmt end_sequence +# CHECK-LINE-TABLE-NEXT: 0x0000003b: 05 DW_LNS_set_column (2) +# CHECK-LINE-TABLE-NEXT: 0x0000003d: 00 
DW_LNE_set_address (0x0000000000000008) +# CHECK-LINE-TABLE-NEXT: 0x00000048: 01 DW_LNS_copy # CHECK-LINE-TABLE-NEXT: 0x0000000000000008 1 2 1 0 0 0 is_stmt -# CHECK-LINE-TABLE-NEXT: 0x00000047: 00 DW_LNE_end_sequence -# CHECK-LINE-TABLE-NEXT: 0x0000000000000008 1 2 1 0 0 0 is_stmt end_sequence -# CHECK-LINE-TABLE-NEXT: 0x0000004a: 05 DW_LNS_set_column (3) -# CHECK-LINE-TABLE-NEXT: 0x0000004c: 00 DW_LNE_set_address (0x0000000000000010) -# CHECK-LINE-TABLE-NEXT: 0x00000057: 01 DW_LNS_copy +# CHECK-LINE-TABLE-NEXT: 0x00000049: 02 DW_LNS_advance_pc (addr += 8, op-index += 0) +# CHECK-LINE-TABLE-NEXT: 0x0000004b: 00 DW_LNE_end_sequence +# CHECK-LINE-TABLE-NEXT: 0x0000000000000010 1 2 1 0 0 0 is_stmt end_sequence +# CHECK-LINE-TABLE-NEXT: 0x0000004e: 05 DW_LNS_set_column (3) +# CHECK-LINE-TABLE-NEXT: 0x00000050: 00 DW_LNE_set_address (0x0000000000000010) +# CHECK-LINE-TABLE-NEXT: 0x0000005b: 01 DW_LNS_copy # CHECK-LINE-TABLE-NEXT: 0x0000000000000010 1 3 1 0 0 0 is_stmt -# CHECK-LINE-TABLE-NEXT: 0x00000058: 00 DW_LNE_end_sequence -# CHECK-LINE-TABLE-NEXT: 0x0000000000000010 1 3 1 0 0 0 is_stmt end_sequence -# CHECK-LINE-TABLE-NEXT: 0x0000005b: 05 DW_LNS_set_column (4) -# CHECK-LINE-TABLE-NEXT: 0x0000005d: 00 DW_LNE_set_address (0x0000000000000018) -# CHECK-LINE-TABLE-NEXT: 0x00000068: 01 DW_LNS_copy +# CHECK-LINE-TABLE-NEXT: 0x0000005c: 02 DW_LNS_advance_pc (addr += 8, op-index += 0) +# CHECK-LINE-TABLE-NEXT: 0x0000005e: 00 DW_LNE_end_sequence +# CHECK-LINE-TABLE-NEXT: 0x0000000000000018 1 3 1 0 0 0 is_stmt end_sequence +# CHECK-LINE-TABLE-NEXT: 0x00000061: 05 DW_LNS_set_column (4) +# CHECK-LINE-TABLE-NEXT: 0x00000063: 00 DW_LNE_set_address (0x0000000000000018) +# CHECK-LINE-TABLE-NEXT: 0x0000006e: 01 DW_LNS_copy # CHECK-LINE-TABLE-NEXT: 0x0000000000000018 1 4 1 0 0 0 is_stmt -# CHECK-LINE-TABLE-NEXT: 0x00000069: 05 DW_LNS_set_column (5) -# CHECK-LINE-TABLE-NEXT: 0x0000006b: 01 DW_LNS_copy +# CHECK-LINE-TABLE-NEXT: 0x0000006f: 05 DW_LNS_set_column (5) +# 
CHECK-LINE-TABLE-NEXT: 0x00000071: 01 DW_LNS_copy # CHECK-LINE-TABLE-NEXT: 0x0000000000000018 1 5 1 0 0 0 is_stmt -# CHECK-LINE-TABLE-NEXT: 0x0000006c: 00 DW_LNE_end_sequence -# CHECK-LINE-TABLE-NEXT: 0x0000000000000018 1 5 1 0 0 0 is_stmt end_sequence +# CHECK-LINE-TABLE-NEXT: 0x00000072: 02 DW_LNS_advance_pc (addr += 8, op-index += 0) +# CHECK-LINE-TABLE-NEXT: 0x00000074: 00 DW_LNE_end_sequence +# CHECK-LINE-TABLE-NEXT: 0x0000000000000020 1 5 1 0 0 0 is_stmt end_sequence # CHECK-SYM: Symbol table '.symtab' contains 9 entries: # CHECK-SYM-NEXT: Num: Value Size Type Bind Vis Ndx Name # CHECK-SYM-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND # CHECK-SYM-NEXT: 1: 0000000000000000 0 FILE LOCAL DEFAULT ABS test.c # CHECK-SYM-NEXT: 2: 0000000000000000 0 SECTION LOCAL DEFAULT 2 .text -# CHECK-SYM-NEXT: 3: 0000000000000039 0 NOTYPE LOCAL DEFAULT 3 my_label_02 -# CHECK-SYM-NEXT: 4: 000000000000004a 0 NOTYPE LOCAL DEFAULT 3 my_label_03 -# CHECK-SYM-NEXT: 5: 000000000000005b 0 NOTYPE LOCAL DEFAULT 3 my_label_04 -# CHECK-SYM-NEXT: 6: 000000000000004a 0 NOTYPE LOCAL DEFAULT 3 my_label_03.1 -# CHECK-SYM-NEXT: 7: 000000000000006f 0 NOTYPE LOCAL DEFAULT 3 my_label_05 +# CHECK-SYM-NEXT: 3: 000000000000003b 0 NOTYPE LOCAL DEFAULT 3 my_label_02 +# CHECK-SYM-NEXT: 4: 000000000000004e 0 NOTYPE LOCAL DEFAULT 3 my_label_03 +# CHECK-SYM-NEXT: 5: 0000000000000061 0 NOTYPE LOCAL DEFAULT 3 my_label_04 +# CHECK-SYM-NEXT: 6: 000000000000004e 0 NOTYPE LOCAL DEFAULT 3 my_label_03.1 +# CHECK-SYM-NEXT: 7: 0000000000000077 0 NOTYPE LOCAL DEFAULT 3 my_label_05 # CHECK-SYM-NEXT: 8: 0000000000000000 0 FUNC GLOBAL DEFAULT 2 foo -# CHECK-OFFSETS: 0000 39000000 4a000000 5b000000 +# CHECK-OFFSETS: 0000 3b000000 4e000000 61000000 .text .file "test.c" diff --git a/llvm/test/MC/ELF/symbol-names.s b/llvm/test/MC/ELF/symbol-names.s index 427187c329acf..f1593dd2f8099 100644 --- a/llvm/test/MC/ELF/symbol-names.s +++ b/llvm/test/MC/ELF/symbol-names.s @@ -5,6 +5,7 @@ // CHECK-LABEL:SYMBOL TABLE: // 
CHECK-NEXT: 0000000000000001 l F .text 0000000000000000 a"b\{{$}} // CHECK-NEXT: 0000000000000006 l .text 0000000000000000 a\{{$}} +// CHECK-NEXT: 000000000000000b l .text 0000000000000000 a\\{{$}} // CHECK-NEXT: 0000000000000000 g F .text 0000000000000000 foo?bar // CHECK-NEXT: 0000000000000000 *UND* 0000000000000000 a"b\q{{$}} // CHECK-EMPTY: @@ -26,3 +27,5 @@ ret "a\\": /// GAS emits a warning for \q call "a\"b\q" + +"a\\\\" = . diff --git a/llvm/test/MC/RISCV/zibi-invalid.s b/llvm/test/MC/RISCV/zibi-invalid.s new file mode 100644 index 0000000000000..50e5f0709fa6c --- /dev/null +++ b/llvm/test/MC/RISCV/zibi-invalid.s @@ -0,0 +1,34 @@ +# RUN: not llvm-mc -triple=riscv32 --mattr=+experimental-zibi %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: not llvm-mc -triple=riscv64 --mattr=+experimental-zibi %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +beqi a0, 0x0, 0x400 +# CHECK-ERROR: [[@LINE-1]]:10: error: immediate must be non-zero in the range [-1, 31] +# CHECK-ERROR-LABEL: beqi a0, 0x0, 0x400 +beqi a0, 0x21, 0x400 +# CHECK-ERROR: [[@LINE-1]]:10: error: immediate must be non-zero in the range [-1, 31] +# CHECK-ERROR-LABEL: beqi a0, 0x21, 0x400 +beqi a2, 0x10, -0x1f000 +# CHECK-ERROR: [[@LINE-1]]:16: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] +# CHECK-ERROR-LABEL: beqi a2, 0x10, -0x1f000 +beqi a2, 0x10, 0x1000 +# CHECK-ERROR: [[@LINE-1]]:16: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] +# CHECK-ERROR-LABEL: beqi a2, 0x10, 0x1000 +beqi a2, 0x10, 0x111 +# CHECK-ERROR: [[@LINE-1]]:16: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] +# CHECK-ERROR-LABEL: beqi a2, 0x10, 0x111 +bnei a0, 0x0, 0x400 +# CHECK-ERROR: [[@LINE-1]]:10: error: immediate must be non-zero in the range [-1, 31] +# CHECK-ERROR-LABEL: bnei a0, 0x0, 0x400 +bnei a0, 0x21, 0x400 +# CHECK-ERROR: [[@LINE-1]]:10: error: immediate must be non-zero in the range [-1, 31] +# CHECK-ERROR-LABEL: 
bnei a0, 0x21, 0x400 +bnei a2, 0x10, -0x1f000 +# CHECK-ERROR: [[@LINE-1]]:16: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] +# CHECK-ERROR-LABEL: bnei a2, 0x10, -0x1f000 +bnei a2, 0x10, 0x1000 +# CHECK-ERROR: [[@LINE-1]]:16: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] +# CHECK-ERROR-LABEL: bnei a2, 0x10, 0x1000 +bnei a2, 0x10, 0x111 +# CHECK-ERROR: [[@LINE-1]]:16: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] +# CHECK-ERROR-LABEL: bnei a2, 0x10, 0x111 diff --git a/llvm/test/MC/RISCV/zibi-valid.s b/llvm/test/MC/RISCV/zibi-valid.s new file mode 100644 index 0000000000000..b062c4cf1efb3 --- /dev/null +++ b/llvm/test/MC/RISCV/zibi-valid.s @@ -0,0 +1,63 @@ +# RUN: llvm-mc -triple=riscv32 -show-encoding --mattr=+experimental-zibi %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-ASM +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+experimental-zibi %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-ASM +# RUN: not llvm-mc -triple=riscv32 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc -triple=riscv32 -filetype=obj --mattr=+experimental-zibi %s \ +# RUN: | llvm-objdump -d --mattr=+experimental-zibi --no-print-imm-hex - \ +# RUN: | FileCheck %s --check-prefix=CHECK-OBJ +# RUN: llvm-mc -triple=riscv32 -filetype=obj --mattr=+experimental-zibi %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+experimental-zibi %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN +beqi a0, 1, 1024 +# CHECK-OBJ: beqi a0, 1, 0x400 +# CHECK-ASM: beqi a0, 1, 1024 +# CHECK-ENCODING: [0x63,0x20,0x15,0x40] +# CHECK-ERROR: instruction requires the following: 'Zibi' (Branch with Immediate){{$}} +# CHECK-UNKNOWN: 40152063 
+beqi a5, -1, -1024 +# CHECK-OBJ: beqi a5, -1, 0xfffffc04 +# CHECK-ASM: beqi a5, -1, -1024 +# CHECK-ENCODING: [0xe3,0xa0,0x07,0xc0] +# CHECK-ERROR: instruction requires the following: 'Zibi' (Branch with Immediate){{$}} +# CHECK-UNKNOWN: c007a0e3 +beqi s0, 22, 0xffe +# CHECK-OBJ: beqi s0, 22, 0x1006 +# CHECK-ASM: beqi s0, 22, 4094 +# CHECK-ENCODING: [0xe3,0x2f,0x64,0x7f] +# CHECK-ERROR: instruction requires the following: 'Zibi' (Branch with Immediate){{$}} +# CHECK-UNKNOWN: 7f642fe3 +beqi s1, 11, -4096 +# CHECK-OBJ: beqi s1, 11, 0xfffff00c +# CHECK-ASM: beqi s1, 11, -4096 +# CHECK-ENCODING: [0x63,0xa0,0xb4,0x80] +# CHECK-ERROR: instruction requires the following: 'Zibi' (Branch with Immediate){{$}} +# CHECK-UNKNOWN: 80b4a063 +bnei a0, 1, 1024 +# CHECK-OBJ: bnei a0, 1, 0x410 +# CHECK-ASM: bnei a0, 1, 1024 +# CHECK-ENCODING: [0x63,0x30,0x15,0x40] +# CHECK-ERROR: instruction requires the following: 'Zibi' (Branch with Immediate){{$}} +# CHECK-UNKNOWN: 40153063 +bnei a5, -1, -1024 +# CHECK-OBJ: bnei a5, -1, 0xfffffc14 +# CHECK-ASM: bnei a5, -1, -1024 +# CHECK-ENCODING: [0xe3,0xb0,0x07,0xc0] +# CHECK-ERROR: instruction requires the following: 'Zibi' (Branch with Immediate){{$}} +# CHECK-UNKNOWN: c007b0e3 +bnei s0, 22, 0xffe +# CHECK-OBJ: bnei s0, 22, 0x1016 +# CHECK-ASM: bnei s0, 22, 4094 +# CHECK-ENCODING: [0xe3,0x3f,0x64,0x7f] +# CHECK-ERROR: instruction requires the following: 'Zibi' (Branch with Immediate){{$}} +# CHECK-UNKNOWN: 7f643fe3 +bnei s1, 11, -4096 +# CHECK-OBJ: bnei s1, 11, 0xfffff01c +# CHECK-ASM: bnei s1, 11, -4096 +# CHECK-ENCODING: [0x63,0xb0,0xb4,0x80] +# CHECK-ERROR: instruction requires the following: 'Zibi' (Branch with Immediate){{$}} +# CHECK-UNKNOWN: 80b4b063 diff --git a/llvm/test/MC/WebAssembly/tag-section-decoding.ll b/llvm/test/MC/WebAssembly/tag-section-decoding.ll deleted file mode 100644 index 4e3b7688e4806..0000000000000 --- a/llvm/test/MC/WebAssembly/tag-section-decoding.ll +++ /dev/null @@ -1,342 +0,0 @@ -; RUN: llc -filetype=obj 
-wasm-enable-eh -exception-model=wasm -mattr=+exception-handling %s -o - | obj2yaml | FileCheck %s - -; This is a regression test for a decoding bug that happens when a tag's -; sigindex is greater than 63, so we put 63 dummy functions with different -; signatures before the function that contains the 'throw' instruction to make -; the tag's sigindex 64. - -target triple = "wasm32-unknown-unknown" - -declare void @llvm.wasm.throw(i32, ptr) - -define i32 @dummy0() { -entry: - ret i32 0 -} - -define i32 @dummy1(i32) { -entry: - ret i32 0 -} - -define i32 @dummy2(i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy3(i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy4(i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy5(i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy6(i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy7(i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy8(i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy9(i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy10(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy11(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy12(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy13(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy14(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy15(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy16(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy17(i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy18(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy19(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy20(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy21(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy22(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy23(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy24(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy25(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy26(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy27(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy28(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy29(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy30(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy31(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy32(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy33(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy34(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy35(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy36(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy37(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy38(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { 
-entry: - ret i32 0 -} - -define i32 @dummy39(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy40(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy41(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy42(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy43(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy44(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy45(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy46(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy47(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy48(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy49(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy50(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy51(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy52(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy53(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy54(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy55(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy56(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy57(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy58(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy59(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy60(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy61(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy62(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @dummy63(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { -entry: - ret i32 0 -} - -define i32 @test_throw(ptr %p) { - call void @llvm.wasm.throw(i32 0, ptr %p) - ret i32 0 -} - -; Checks to see if the tag index is correctly decoded in ULEB128. If it is -; decoded with LEB128, 64 will not be correctly decoded. 64 is the smallest -; number with which its LEB128 and ULEB128 encodings are different, because its -; 7th least significant bit is not 0. 
-; CHECK: - Type: TAG -; CHECK-NEXT: TagTypes: [ 64 ] diff --git a/llvm/test/MC/WebAssembly/tag-section-decoding.s b/llvm/test/MC/WebAssembly/tag-section-decoding.s new file mode 100644 index 0000000000000..a3e2cf1e2a6f6 --- /dev/null +++ b/llvm/test/MC/WebAssembly/tag-section-decoding.s @@ -0,0 +1,409 @@ +# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+exception-handling -filetype=obj %s | obj2yaml | FileCheck %s + +# This is a regression test for a decoding bug that happens when a tag's +# sigindex is greater than 63, so we put 63 dummy functions with different +# signatures before the function that contains the 'throw' instruction to make +# the tag's sigindex 64. + +.tagtype my_exception i32 + +.globl dummy0 +dummy0: + .functype dummy0 () -> (i32) + i32.const 0 + end_function + +.globl dummy1 +dummy1: + .functype dummy1 (i32) -> (i32) + i32.const 0 + end_function + +.globl dummy2 +dummy2: + .functype dummy2 (i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy3 +dummy3: + .functype dummy3 (i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy4 +dummy4: + .functype dummy4 (i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy5 +dummy5: + .functype dummy5 (i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy6 +dummy6: + .functype dummy6 (i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy7 +dummy7: + .functype dummy7 (i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy8 +dummy8: + .functype dummy8 (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy9 +dummy9: + .functype dummy9 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy10 +dummy10: + .functype dummy10 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy11 +dummy11: + .functype dummy11 (i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy12 +dummy12: + .functype dummy12 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy13 +dummy13: + .functype dummy13 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy14 +dummy14: + .functype dummy14 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy15 +dummy15: + .functype dummy15 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy16 +dummy16: + .functype dummy16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy17 +dummy17: + .functype dummy17 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy18 +dummy18: + .functype dummy18 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy19 +dummy19: + .functype dummy19 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy20 +dummy20: + .functype dummy20 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy21 +dummy21: + .functype dummy21 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy22 +dummy22: + .functype dummy22 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy23 +dummy23: + .functype dummy23 
(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy24 +dummy24: + .functype dummy24 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy25 +dummy25: + .functype dummy25 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy26 +dummy26: + .functype dummy26 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy27 +dummy27: + .functype dummy27 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy28 +dummy28: + .functype dummy28 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy29 +dummy29: + .functype dummy29 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy30 +dummy30: + .functype dummy30 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy31 +dummy31: + .functype dummy31 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy32 +dummy32: + .functype dummy32 
(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy33 +dummy33: + .functype dummy33 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy34 +dummy34: + .functype dummy34 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy35 +dummy35: + .functype dummy35 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy36 +dummy36: + .functype dummy36 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy37 +dummy37: + .functype dummy37 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy38 +dummy38: + .functype dummy38 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy39 +dummy39: + .functype dummy39 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy40 +dummy40: + .functype dummy40 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy41 +dummy41: + .functype dummy41 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy42 +dummy42: + .functype dummy42 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy43 +dummy43: + .functype dummy43 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy44 +dummy44: + .functype dummy44 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy45 +dummy45: + .functype dummy45 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy46 +dummy46: + .functype dummy46 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy47 +dummy47: + .functype dummy47 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy48 +dummy48: + .functype dummy48 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy49 +dummy49: + .functype dummy49 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy50 +dummy50: + .functype dummy50 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy51 +dummy51: + .functype dummy51 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy52 +dummy52: + .functype dummy52 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy53 +dummy53: + .functype dummy53 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy54 +dummy54: + .functype dummy54 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy55 +dummy55: + .functype dummy55 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy56 +dummy56: + .functype dummy56 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy57 +dummy57: + .functype dummy57 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + 
end_function + +.globl dummy58 +dummy58: + .functype dummy58 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy59 +dummy59: + .functype dummy59 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy60 +dummy60: + .functype dummy60 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy61 +dummy61: + .functype dummy61 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy62 +dummy62: + .functype dummy62 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl dummy63 +dummy63: + .functype dummy63 
(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) + i32.const 0 + end_function + +.globl test_throw + +test_throw: + .functype test_throw (i32) -> (i32) + local.get 0 + throw my_exception + end_function + +my_exception: + +# Checks to see if the tag index is correctly decoded in ULEB128. If it is +# decoded with LEB128, 64 will not be correctly decoded. 64 is the smallest +# number with which its LEB128 and ULEB128 encodings are different, because its +# 7th least significant bit is not 0. +# CHECK: - Type: TAG +# CHECK-NEXT: TagTypes: [ 64 ] diff --git a/llvm/test/MC/WebAssembly/tag-section.ll b/llvm/test/MC/WebAssembly/tag-section.ll deleted file mode 100644 index 56738ec80c8dc..0000000000000 --- a/llvm/test/MC/WebAssembly/tag-section.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc -filetype=obj -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling %s -o - | obj2yaml | FileCheck %s -; RUN: llc -filetype=obj -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling %s -o - | llvm-readobj -S - | FileCheck -check-prefix=SEC %s - -target triple = "wasm32-unknown-unknown" - -declare void @llvm.wasm.throw(i32, ptr) - -define i32 @test_throw0(ptr %p) { - call void @llvm.wasm.throw(i32 0, ptr %p) - ret i32 0 -} - -define i32 @test_throw1(ptr %p) { - call void @llvm.wasm.throw(i32 0, ptr %p) - ret i32 1 -} - -; CHECK: Sections: -; CHECK-NEXT: - Type: TYPE -; CHECK-NEXT: Signatures: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: - I32 -; CHECK-NEXT: ReturnTypes: -; CHECK-NEXT: - I32 -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: - I32 -; CHECK-NEXT: ReturnTypes: [] - -; CHECK: - Type: TAG -; CHECK-NEXT: TagTypes: [ 1 ] - -; CHECK-NEXT: - Type: 
CODE -; CHECK-NEXT: Relocations: -; CHECK-NEXT: - Type: R_WASM_TAG_INDEX_LEB -; CHECK-NEXT: Index: 1 -; CHECK-NEXT: Offset: 0x6 -; CHECK-NEXT: - Type: R_WASM_TAG_INDEX_LEB -; CHECK-NEXT: Index: 1 -; CHECK-NEXT: Offset: 0x11 - -; CHECK: - Type: CUSTOM -; CHECK-NEXT: Name: linking -; CHECK-NEXT: Version: 2 -; CHECK-NEXT: SymbolTable: - -; CHECK: - Index: 1 -; CHECK-NEXT: Kind: TAG -; CHECK-NEXT: Name: __cpp_exception -; CHECK-NEXT: Flags: [ BINDING_WEAK ] -; CHECK-NEXT: Tag: 0 - -; SEC: Type: TAG (0xD) -; SEC-NEXT: Size: 3 -; SEC-NEXT: Offset: 69 diff --git a/llvm/test/MC/WebAssembly/tag-section.s b/llvm/test/MC/WebAssembly/tag-section.s new file mode 100644 index 0000000000000..40339c5fa3f37 --- /dev/null +++ b/llvm/test/MC/WebAssembly/tag-section.s @@ -0,0 +1,63 @@ +# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+exception-handling -filetype=obj %s | obj2yaml | FileCheck %s +# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+exception-handling -filetype=obj %s | llvm-readobj -S - | FileCheck -check-prefix=SEC %s + +.tagtype my_exception i32 + +.globl test_throw0 +test_throw0: + .functype test_throw0 (i32) -> (i32) + i32.const 0 + throw my_exception + i32.const 0 + end_function + +.globl test_throw1 +test_throw1: + .functype test_throw1 (i32) -> (i32) + i32.const 0 + throw my_exception + i32.const 0 + end_function + +.globl my_exception +my_exception: + +# CHECK: Sections: +# CHECK-NEXT: - Type: TYPE +# CHECK-NEXT: Signatures: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: - I32 +# CHECK-NEXT: ReturnTypes: +# CHECK-NEXT: - I32 +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: - I32 +# CHECK-NEXT: ReturnTypes: [] + +# CHECK: - Type: TAG +# CHECK-NEXT: TagTypes: [ 1 ] + +# CHECK-NEXT: - Type: CODE +# CHECK-NEXT: Relocations: +# CHECK-NEXT: - Type: R_WASM_TAG_INDEX_LEB +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: Offset: 0x6 +# CHECK-NEXT: - Type: R_WASM_TAG_INDEX_LEB +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: Offset: 0x13 + +# 
CHECK: - Type: CUSTOM +# CHECK-NEXT: Name: linking +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: SymbolTable: + +# CHECK: - Index: 1 +# CHECK-NEXT: Kind: TAG +# CHECK-NEXT: Name: my_exception +# CHECK-NEXT: Flags: [ ] +# CHECK-NEXT: Tag: 0 + +# SEC: Type: TAG (0xD) +# SEC-NEXT: Size: 3 +# SEC-NEXT: Offset: 69 diff --git a/llvm/test/MachineVerifier/AMDGPU/test_copy_physregs_llt_virtreg.mir b/llvm/test/MachineVerifier/AMDGPU/test_copy_physregs_llt_virtreg.mir new file mode 100644 index 0000000000000..0fd50391a7e3a --- /dev/null +++ b/llvm/test/MachineVerifier/AMDGPU/test_copy_physregs_llt_virtreg.mir @@ -0,0 +1,58 @@ +# RUN: not --crash llc -mtriple=amdgcn -run-pass=none -filetype=null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" %s + +--- +name: test_valid_copies +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vcc + %0:_(s32) = COPY $vgpr0 + %1:_(s16) = COPY $vgpr0 + %2:_(s64) = COPY $vcc + %3:_(s1) = COPY $vcc + $vgpr0 = COPY %0 + $vgpr0 = COPY %0 + $vcc = COPY %2 + $vcc = COPY %3 +... 
+ +--- +name: test_invalid_copies +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2, $vcc + + ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; CHECK: - instruction: %0:_(s32) = COPY $vgpr0_vgpr1 + %0:_(s32) = COPY $vgpr0_vgpr1 + + ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; CHECK: - instruction: %1:_(s64) = COPY $vgpr2 + %1:_(s64) = COPY $vgpr2 + + ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; CHECK: - instruction: %2:_(s32) = COPY $vcc + %2:_(s32) = COPY $vcc + + ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; CHECK: - instruction: %3:_(s8) = COPY $vgpr2 + %3:_(s8) = COPY $vgpr2 + + ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; CHECK: - instruction: $vgpr0_vgpr1 = COPY %0:_(s32) + $vgpr0_vgpr1 = COPY %0 + + ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; CHECK: - instruction: $vgpr2 = COPY %1:_(s64) + $vgpr2 = COPY %1 + + ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; CHECK: - instruction: $vcc = COPY %2:_(s32) + $vcc = COPY %2 + + ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; CHECK: - instruction: $vgpr2 = COPY %3:_(s8) + $vgpr2 = COPY %3 + +... 
diff --git a/llvm/test/MachineVerifier/test_copy_physregs_x86.mir b/llvm/test/MachineVerifier/test_copy_physregs_x86.mir index a239379a34e62..f3323c4353142 100644 --- a/llvm/test/MachineVerifier/test_copy_physregs_x86.mir +++ b/llvm/test/MachineVerifier/test_copy_physregs_x86.mir @@ -29,34 +29,26 @@ body: | liveins: $xmm0, $xmm1, $xmm2, $xmm3 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** - ; CHECK: - instruction: %0:_(s16) = COPY $xmm0 - %0:_(s16) = COPY $xmm0 + ; CHECK: - instruction: %0:_(<4 x s16>) = COPY $xmm1 + %0:_(<4 x s16>) = COPY $xmm1 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** - ; CHECK: - instruction: %1:_(<4 x s16>) = COPY $xmm1 - %1:_(<4 x s16>) = COPY $xmm1 + ; CHECK: - instruction: %1:_(s256) = COPY $xmm2 + %1:_(s256) = COPY $xmm2 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** - ; CHECK: - instruction: %2:_(s256) = COPY $xmm2 - %2:_(s256) = COPY $xmm2 + ; CHECK: - instruction: %2:_(<8 x s32>) = COPY $xmm3 + %2:_(<8 x s32>) = COPY $xmm3 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** - ; CHECK: - instruction: %3:_(<8 x s32>) = COPY $xmm3 - %3:_(<8 x s32>) = COPY $xmm3 + ; CHECK: - instruction: $xmm1 = COPY %0:_(<4 x s16>) + $xmm1 = COPY %0 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** - ; CHECK: - instruction: $xmm0 = COPY %0:_(s16) - $xmm0 = COPY %0 - - ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** - ; CHECK: - instruction: $xmm1 = COPY %1:_(<4 x s16>) - $xmm1 = COPY %1 + ; CHECK: - instruction: $xmm2 = COPY %1:_(s256) + $xmm2 = COPY %1 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** - ; CHECK: - instruction: $xmm2 = COPY %2:_(s256) - $xmm2 = COPY %2 - - ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** - ; CHECK: - instruction: $xmm3 = COPY 
%3:_(<8 x s32>) - $xmm3 = COPY %3 + ; CHECK: - instruction: $xmm3 = COPY %2:_(<8 x s32>) + $xmm3 = COPY %2 ... diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Descriptor1.0.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Descriptor1.0.yaml index 70dc35287ba91..530ed79a95ebb 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Descriptor1.0.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Descriptor1.0.yaml @@ -19,8 +19,8 @@ Parts: NumStaticSamplers: 0 StaticSamplersOffset: 44 Parameters: - - ParameterType: 2 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: CBV + ShaderVisibility: Domain Descriptor: ShaderRegister: 31 RegisterSpace: 32 @@ -36,8 +36,8 @@ Parts: # CHECK-NEXT: NumStaticSamplers: 0 # CHECK-NEXT: StaticSamplersOffset: 44 # CHECK-NEXT: Parameters: -# CHECK-NEXT: - ParameterType: 2 -# CHECK-NEXT: ShaderVisibility: 3 +# CHECK-NEXT: - ParameterType: CBV +# CHECK-NEXT: ShaderVisibility: Domain # CHECK-NEXT: Descriptor: # CHECK-NEXT: RegisterSpace: 32 # CHECK-NEXT: ShaderRegister: 31 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Descriptor1.1.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Descriptor1.1.yaml index 33a74dbf6a3f4..2e8df2eaed7a8 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Descriptor1.1.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Descriptor1.1.yaml @@ -19,8 +19,8 @@ Parts: NumStaticSamplers: 0 StaticSamplersOffset: 48 Parameters: - - ParameterType: 2 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: CBV + ShaderVisibility: Domain Descriptor: ShaderRegister: 31 RegisterSpace: 32 @@ -37,8 +37,8 @@ Parts: # CHECK-NEXT: NumStaticSamplers: 0 # CHECK-NEXT: StaticSamplersOffset: 48 # CHECK-NEXT: Parameters: -# CHECK-NEXT: - ParameterType: 2 -# CHECK-NEXT: ShaderVisibility: 3 +# CHECK-NEXT: - ParameterType: CBV +# CHECK-NEXT: ShaderVisibility: Domain # CHECK-NEXT: Descriptor: # CHECK-NEXT: RegisterSpace: 32 # CHECK-NEXT: ShaderRegister: 31 diff --git 
a/llvm/test/ObjectYAML/DXContainer/RootSignature-DescriptorTable1.0.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-DescriptorTable1.0.yaml index b04549fde88f7..88d941f75682b 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-DescriptorTable1.0.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-DescriptorTable1.0.yaml @@ -20,12 +20,12 @@ Parts: NumStaticSamplers: 0 StaticSamplersOffset: 64 Parameters: - - ParameterType: 0 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: DescriptorTable + ShaderVisibility: Domain Table: NumRanges: 1 Ranges: - - RangeType: 0 + - RangeType: SRV NumDescriptors: -1 BaseShaderRegister: 42 RegisterSpace: 43 @@ -42,13 +42,13 @@ Parts: # CHECK-NEXT: NumStaticSamplers: 0 # CHECK-NEXT: StaticSamplersOffset: 64 # CHECK-NEXT: Parameters: -# CHECK-NEXT: - ParameterType: 0 -# CHECK-NEXT: ShaderVisibility: 3 +# CHECK-NEXT: - ParameterType: DescriptorTable +# CHECK-NEXT: ShaderVisibility: Domain # CHECK-NEXT: Table: # CHECK-NEXT: NumRanges: 1 # CHECK-NEXT: RangesOffset: 44 # CHECK-NEXT: Ranges: -# CHECK-NEXT: - RangeType: 0 +# CHECK-NEXT: - RangeType: SRV # CHECK-NEXT: NumDescriptors: -1 # CHECK-NEXT: BaseShaderRegister: 42 # CHECK-NEXT: RegisterSpace: 43 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-DescriptorTable1.1.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-DescriptorTable1.1.yaml index d8f399010053e..c09726defe4a5 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-DescriptorTable1.1.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-DescriptorTable1.1.yaml @@ -20,12 +20,12 @@ Parts: NumStaticSamplers: 0 StaticSamplersOffset: 68 Parameters: - - ParameterType: 0 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: DescriptorTable + ShaderVisibility: Domain Table: NumRanges: 1 Ranges: - - RangeType: 0 + - RangeType: SRV NumDescriptors: -1 BaseShaderRegister: 42 RegisterSpace: 43 @@ -43,13 +43,13 @@ Parts: # CHECK-NEXT: NumStaticSamplers: 0 # CHECK-NEXT: StaticSamplersOffset: 
68 # CHECK-NEXT: Parameters: -# CHECK-NEXT: - ParameterType: 0 -# CHECK-NEXT: ShaderVisibility: 3 +# CHECK-NEXT: - ParameterType: DescriptorTable +# CHECK-NEXT: ShaderVisibility: Domain # CHECK-NEXT: Table: # CHECK-NEXT: NumRanges: 1 # CHECK-NEXT: RangesOffset: 44 # CHECK-NEXT: Ranges: -# CHECK-NEXT: - RangeType: 0 +# CHECK-NEXT: - RangeType: SRV # CHECK-NEXT: NumDescriptors: -1 # CHECK-NEXT: BaseShaderRegister: 42 # CHECK-NEXT: RegisterSpace: 43 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Invalid-StaticSamplersOffset.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Invalid-StaticSamplersOffset.yaml index e805526ea7c51..1322a4ef365ad 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Invalid-StaticSamplersOffset.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Invalid-StaticSamplersOffset.yaml @@ -21,8 +21,8 @@ Parts: NumStaticSamplers: 0 StaticSamplersOffset: 0 Parameters: - - ParameterType: 2 - ShaderVisibility: 3 + - ParameterType: SRV + ShaderVisibility: Domain Descriptor: ShaderRegister: 31 RegisterSpace: 32 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml index 26d56536b9e44..684ada465d8fc 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml @@ -19,30 +19,30 @@ Parts: NumStaticSamplers: 0 StaticSamplersOffset: 140 Parameters: - - ParameterType: 1 # Constants32Bit - ShaderVisibility: 2 # Hull + - ParameterType: Constants32Bit + ShaderVisibility: Hull Constants: Num32BitValues: 16 ShaderRegister: 15 RegisterSpace: 14 - - ParameterType: 1 # Constants32Bit - ShaderVisibility: 4 # Geometry + - ParameterType: Constants32Bit + ShaderVisibility: Geometry Constants: Num32BitValues: 21 ShaderRegister: 22 RegisterSpace: 23 - - ParameterType: 2 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: SRV + 
ShaderVisibility: Domain Descriptor: ShaderRegister: 31 RegisterSpace: 32 DATA_STATIC_WHILE_SET_AT_EXECUTE: true - - ParameterType: 0 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: DescriptorTable + ShaderVisibility: Domain Table: NumRanges: 1 Ranges: - - RangeType: 0 + - RangeType: SRV NumDescriptors: -1 BaseShaderRegister: 42 RegisterSpace: 43 @@ -60,31 +60,31 @@ Parts: # CHECK-NEXT: NumStaticSamplers: 0 # CHECK-NEXT: StaticSamplersOffset: 140 # CHECK-NEXT: Parameters: -# CHECK-NEXT: - ParameterType: 1 -# CHECK-NEXT: ShaderVisibility: 2 +# CHECK-NEXT: - ParameterType: Constants32Bit +# CHECK-NEXT: ShaderVisibility: Hull # CHECK-NEXT: Constants: # CHECK-NEXT: Num32BitValues: 16 # CHECK-NEXT: RegisterSpace: 14 # CHECK-NEXT: ShaderRegister: 15 -# CHECK-NEXT: - ParameterType: 1 -# CHECK-NEXT: ShaderVisibility: 4 +# CHECK-NEXT: - ParameterType: Constants32Bit +# CHECK-NEXT: ShaderVisibility: Geometry # CHECK-NEXT: Constants: # CHECK-NEXT: Num32BitValues: 21 # CHECK-NEXT: RegisterSpace: 23 # CHECK-NEXT: ShaderRegister: 22 -# CHECK-NEXT: - ParameterType: 2 -# CHECK-NEXT: ShaderVisibility: 3 +# CHECK-NEXT: - ParameterType: SRV +# CHECK-NEXT: ShaderVisibility: Domain # CHECK-NEXT: Descriptor: # CHECK-NEXT: RegisterSpace: 32 # CHECK-NEXT: ShaderRegister: 31 # CHECK-NEXT: DATA_STATIC_WHILE_SET_AT_EXECUTE: true -# CHECK-NEXT: - ParameterType: 0 -# CHECK-NEXT: ShaderVisibility: 3 +# CHECK-NEXT: - ParameterType: DescriptorTable +# CHECK-NEXT: ShaderVisibility: Domain # CHECK-NEXT: Table: # CHECK-NEXT: NumRanges: 1 # CHECK-NEXT: RangesOffset: 116 # CHECK-NEXT: Ranges: -# CHECK-NEXT: - RangeType: 0 +# CHECK-NEXT: - RangeType: SRV # CHECK-NEXT: NumDescriptors: -1 # CHECK-NEXT: BaseShaderRegister: 42 # CHECK-NEXT: RegisterSpace: 43 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-OptionalOffsets.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-OptionalOffsets.yaml index 88d7c632968be..00bc190c0903d 100644 --- 
a/llvm/test/ObjectYAML/DXContainer/RootSignature-OptionalOffsets.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-OptionalOffsets.yaml @@ -31,24 +31,24 @@ Parts: NumRootParameters: 3 NumStaticSamplers: 0 Parameters: - - ParameterType: 1 # RootConstants - ShaderVisibility: 0 + - ParameterType: Constants32Bit + ShaderVisibility: All Constants: Num32BitValues: 16 ShaderRegister: 15 RegisterSpace: 14 - - ParameterType: 2 # SRV - ShaderVisibility: 0 + - ParameterType: SRV + ShaderVisibility: All Descriptor: ShaderRegister: 31 RegisterSpace: 32 DATA_STATIC_WHILE_SET_AT_EXECUTE: true - - ParameterType: 0 # Descriptor Table - ShaderVisibility: 0 + - ParameterType: DescriptorTable + ShaderVisibility: All Table: NumRanges: 1 Ranges: - - RangeType: 0 # CBV + - RangeType: CBuffer NumDescriptors: -1 BaseShaderRegister: 42 RegisterSpace: 43 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplerOffset1.0.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplerOffset1.0.yaml index 347d8f3be1710..eb940865e7c66 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplerOffset1.0.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplerOffset1.0.yaml @@ -33,24 +33,24 @@ Parts: NumStaticSamplers: 0 StaticSamplersOffset: 108 Parameters: - - ParameterType: 1 # RootConstants - ShaderVisibility: 0 + - ParameterType: Constants32Bit + ShaderVisibility: All Constants: Num32BitValues: 16 ShaderRegister: 15 RegisterSpace: 14 - - ParameterType: 2 # SRV - ShaderVisibility: 0 + - ParameterType: CBV + ShaderVisibility: All Descriptor: ShaderRegister: 31 RegisterSpace: 32 DATA_STATIC_WHILE_SET_AT_EXECUTE: true - - ParameterType: 0 # Descriptor Table - ShaderVisibility: 0 + - ParameterType: DescriptorTable + ShaderVisibility: All Table: NumRanges: 1 Ranges: - - RangeType: 0 # CBV + - RangeType: CBuffer NumDescriptors: -1 BaseShaderRegister: 42 RegisterSpace: 43 diff --git 
a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplerOffset1.1.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplerOffset1.1.yaml index 8e03e1a8b29be..73e89c2dbe336 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplerOffset1.1.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplerOffset1.1.yaml @@ -33,24 +33,24 @@ Parts: NumStaticSamplers: 0 StaticSamplersOffset: 116 Parameters: - - ParameterType: 1 # RootConstants - ShaderVisibility: 0 + - ParameterType: Constants32Bit + ShaderVisibility: All Constants: Num32BitValues: 16 ShaderRegister: 15 RegisterSpace: 14 - - ParameterType: 2 # SRV - ShaderVisibility: 0 + - ParameterType: SRV + ShaderVisibility: All Descriptor: ShaderRegister: 31 RegisterSpace: 32 DATA_STATIC_WHILE_SET_AT_EXECUTE: true - - ParameterType: 0 # Descriptor Table - ShaderVisibility: 0 + - ParameterType: DescriptorTable + ShaderVisibility: All Table: NumRanges: 1 Ranges: - - RangeType: 0 # CBV + - RangeType: CBuffer NumDescriptors: -1 BaseShaderRegister: 42 RegisterSpace: 43 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers-Defaults.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers-Defaults.yaml index 5df7da87aafd2..a45e3b025a5c0 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers-Defaults.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers-Defaults.yaml @@ -22,7 +22,7 @@ Parts: Samplers: - ShaderRegister: 31 RegisterSpace: 32 - ShaderVisibility: 7 + ShaderVisibility: Mesh AllowInputAssemblerInputLayout: true DenyGeometryShaderRootAccess: true @@ -36,18 +36,18 @@ Parts: #CHECK-NEXT: StaticSamplersOffset: 24 #CHECK-NEXT: Parameters: [] #CHECK-NEXT: Samplers: -#CHECK-NEXT: - Filter: 85 -#CHECK-NEXT: AddressU: 1 -#CHECK-NEXT: AddressV: 1 -#CHECK-NEXT: AddressW: 1 +#CHECK-NEXT: - Filter: Anisotropic +#CHECK-NEXT: AddressU: Wrap +#CHECK-NEXT: AddressV: Wrap +#CHECK-NEXT: AddressW: Wrap #CHECK-NEXT: 
MipLODBias: 0 #CHECK-NEXT: MaxAnisotropy: 16 -#CHECK-NEXT: ComparisonFunc: 4 -#CHECK-NEXT: BorderColor: 2 +#CHECK-NEXT: ComparisonFunc: LessEqual +#CHECK-NEXT: BorderColor: OpaqueWhite #CHECK-NEXT: MinLOD: 0 #CHECK-NEXT: MaxLOD: 3.40282e+38 #CHECK-NEXT: ShaderRegister: 31 #CHECK-NEXT: RegisterSpace: 32 -#CHECK-NEXT: ShaderVisibility: 7 +#CHECK-NEXT: ShaderVisibility: Mesh #CHECK-NEXT: AllowInputAssemblerInputLayout: true #CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers.yaml index 82d9a4ffdb4f8..745473117c937 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers.yaml @@ -20,19 +20,19 @@ Parts: StaticSamplersOffset: 24 Parameters: [] Samplers: - - Filter: 10 - AddressU: 1 - AddressV: 2 - AddressW: 5 + - Filter: MinLinearMagMipPoint + AddressU: Wrap + AddressV: Mirror + AddressW: MirrorOnce MipLODBias: 1.23 MaxAnisotropy: 20 - ComparisonFunc: 4 - BorderColor: 0 + ComparisonFunc: LessEqual + BorderColor: TransparentBlack MinLOD: 4.56 MaxLOD: 8.90 ShaderRegister: 31 RegisterSpace: 32 - ShaderVisibility: 7 + ShaderVisibility: Mesh AllowInputAssemblerInputLayout: true DenyGeometryShaderRootAccess: true @@ -46,18 +46,18 @@ Parts: #CHECK-NEXT: StaticSamplersOffset: 24 #CHECK-NEXT: Parameters: [] #CHECK-NEXT: Samplers: -#CHECK-NEXT: - Filter: 10 -#CHECK-NEXT: AddressU: 1 -#CHECK-NEXT: AddressV: 2 -#CHECK-NEXT: AddressW: 5 +#CHECK-NEXT: - Filter: MinLinearMagMipPoint +#CHECK-NEXT: AddressU: Wrap +#CHECK-NEXT: AddressV: Mirror +#CHECK-NEXT: AddressW: MirrorOnce #CHECK-NEXT: MipLODBias: 1.23 #CHECK-NEXT: MaxAnisotropy: 20 -#CHECK-NEXT: ComparisonFunc: 4 -#CHECK-NEXT: BorderColor: 0 +#CHECK-NEXT: ComparisonFunc: LessEqual +#CHECK-NEXT: BorderColor: TransparentBlack #CHECK-NEXT: MinLOD: 4.56 #CHECK-NEXT: MaxLOD: 8.9 #CHECK-NEXT: ShaderRegister: 31 
#CHECK-NEXT: RegisterSpace: 32 -#CHECK-NEXT: ShaderVisibility: 7 +#CHECK-NEXT: ShaderVisibility: Mesh #CHECK-NEXT: AllowInputAssemblerInputLayout: true #CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/test/Other/cgscc-devirt-iteration.ll b/llvm/test/Other/cgscc-devirt-iteration.ll index 2ee016a134baa..990d474f8f571 100644 --- a/llvm/test/Other/cgscc-devirt-iteration.ll +++ b/llvm/test/Other/cgscc-devirt-iteration.ll @@ -59,7 +59,7 @@ define void @test2_a(ptr %ignore) { ; AFTER1: Function Attrs: nofree memory(read) ; AFTER2: Function Attrs: nofree nosync memory(none) ; BEFORE: define void @test2_a(ptr %ignore) -; AFTER: define void @test2_a(ptr readnone %ignore) +; AFTER: define void @test2_a(ptr readnone captures(address) %ignore) entry: %f1ptr = alloca ptr store ptr @readnone_with_arg, ptr %f1ptr diff --git a/llvm/test/TableGen/AsmPredicateCombining.td b/llvm/test/TableGen/AsmPredicateCombining.td index c8081a428d7bb..c68647928efce 100644 --- a/llvm/test/TableGen/AsmPredicateCombining.td +++ b/llvm/test/TableGen/AsmPredicateCombining.td @@ -63,19 +63,19 @@ def AsmPred4 : Predicate<"Pred4">, AssemblerPredicate<(all_of AsmCond4, (not (an // MATCHER-NEXT: Features.set(Feature_AsmPred4Bit); def insn1 : TestInsn<1, [AsmPred1]>; -// DISASS: return (Bits[arch::AsmCond1]); +// DISASS: return FB[arch::AsmCond1]; def insn2 : TestInsn<2, [AsmPred2]>; -// DISASS: return (Bits[arch::AsmCond2a] && Bits[arch::AsmCond2b]) +// DISASS: return FB[arch::AsmCond2a] && FB[arch::AsmCond2b]; def insn3 : TestInsn<3, [AsmPred3]>; -// DISASS: return (Bits[arch::AsmCond3a] || Bits[arch::AsmCond3b]) +// DISASS: return FB[arch::AsmCond3a] || FB[arch::AsmCond3b]; def insn4 : TestInsn<4, [AsmPred1, AsmPred2]>; -// DISASS: return (Bits[arch::AsmCond1] && (Bits[arch::AsmCond2a] && Bits[arch::AsmCond2b])) +// DISASS: return FB[arch::AsmCond1] && (FB[arch::AsmCond2a] && FB[arch::AsmCond2b]); def insn5 : TestInsn<5, [AsmPred1, AsmPred3]>; -// DISASS: return (Bits[arch::AsmCond1] && 
(Bits[arch::AsmCond3a] || Bits[arch::AsmCond3b])) +// DISASS: return FB[arch::AsmCond1] && (FB[arch::AsmCond3a] || FB[arch::AsmCond3b]); def insn6 : TestInsn<6, []>; def : InstAlias<"alias1", (insn6 R0)> { let Predicates = [AsmPred1]; } diff --git a/llvm/test/TableGen/BitOffsetDecoder.td b/llvm/test/TableGen/BitOffsetDecoder.td index 04d6e164d0eee..f94e8d4f09789 100644 --- a/llvm/test/TableGen/BitOffsetDecoder.td +++ b/llvm/test/TableGen/BitOffsetDecoder.td @@ -59,6 +59,6 @@ def baz : Instruction { // CHECK: tmp = fieldFromInstruction(insn, 8, 7); // CHECK: tmp = fieldFromInstruction(insn, 8, 8) << 3; -// CHECK: insertBits(tmp, fieldFromInstruction(insn, 8, 4), 7, 4); -// CHECK: insertBits(tmp, fieldFromInstruction(insn, 12, 4), 3, 4); +// CHECK: tmp |= fieldFromInstruction(insn, 8, 4) << 7; +// CHECK: tmp |= fieldFromInstruction(insn, 12, 4) << 3; // CHECK: tmp = fieldFromInstruction(insn, 8, 8) << 4; diff --git a/llvm/test/TableGen/DFAPacketizer.td b/llvm/test/TableGen/DFAPacketizer.td new file mode 100644 index 0000000000000..6237bfbea9a0f --- /dev/null +++ b/llvm/test/TableGen/DFAPacketizer.td @@ -0,0 +1,39 @@ +// RUN: llvm-tblgen -gen-dfa-packetizer -I %p/../../include %s | FileCheck %s + +include "llvm/Target/Target.td" + +def TestTarget : Target; + +def TestSchedModel : SchedMachineModel { + let CompleteModel = 0; +} + +def TestProcessor1 : ProcessorModel<"testprocessor1", TestSchedModel, []>; + +def FU0 : FuncUnit; +def FU1 : FuncUnit; + +def OP0 : InstrItinClass; +def OP1 : InstrItinClass; + +def Itin { + list ItinList = [ + InstrItinData]>, + InstrItinData]>, + ]; +} + +// CHECK: int TestTargetGetResourceIndex(unsigned ProcID) { +// CHECK-NEXT: static const unsigned TestTargetProcIdToProcResourceIdxTable[][2] = { +// CHECK-NEXT: { 2, 1 }, // TestItinerariesModel +// CHECK-NEXT: }; +// CHECK-NEXT: auto It = llvm::lower_bound(TestTargetProcIdToProcResourceIdxTable, ProcID, +// CHECK-NEXT: [](const unsigned LHS[], unsigned Val) { return LHS[0] < Val; }); +// 
CHECK-NEXT: assert(*It[0] == ProcID); +// CHECK-NEXT: return (*It)[1]; +// CHECK-NEXT: } + +// CHECK: unsigned Index = TestTargetGetResourceIndex(IID->SchedModel.ProcID); + +def TestItineraries: ProcessorItineraries<[], [], Itin.ItinList>; +def TestProcessor2 : Processor<"testprocessor2", TestItineraries, []>; diff --git a/llvm/test/TableGen/AsmPredicateCondsEmission.td b/llvm/test/TableGen/DecoderEmitter/AsmPredicateCondsEmission.td similarity index 86% rename from llvm/test/TableGen/AsmPredicateCondsEmission.td rename to llvm/test/TableGen/DecoderEmitter/AsmPredicateCondsEmission.td index 7b2ab2afa5a8d..6178a4f9fe609 100644 --- a/llvm/test/TableGen/AsmPredicateCondsEmission.td +++ b/llvm/test/TableGen/DecoderEmitter/AsmPredicateCondsEmission.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s // Check that we don't generate invalid code of the form "( && Cond2)" when // emitting AssemblerPredicate conditions. 
In the example below, the invalid @@ -29,4 +29,4 @@ def foo : Instruction { let Predicates = [Pred1, Pred2]; } -// CHECK: return (Bits[arch::AssemblerCondition2]); +// CHECK: return FB[arch::AssemblerCondition2]; diff --git a/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td b/llvm/test/TableGen/DecoderEmitter/DecoderEmitterBitwidthSpecialization.td similarity index 95% rename from llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td rename to llvm/test/TableGen/DecoderEmitter/DecoderEmitterBitwidthSpecialization.td index c656616a62451..71b0c99675baa 100644 --- a/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td +++ b/llvm/test/TableGen/DecoderEmitter/DecoderEmitterBitwidthSpecialization.td @@ -1,6 +1,6 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-DEFAULT -// RUN: llvm-tblgen -gen-disassembler -specialize-decoders-per-bitwidth -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-SPECIALIZE-NO-TABLE -// RUN: llvm-tblgen -gen-disassembler -specialize-decoders-per-bitwidth -use-fn-table-in-decode-to-mcinst -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-SPECIALIZE-TABLE +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s --check-prefix=CHECK-DEFAULT +// RUN: llvm-tblgen -gen-disassembler -specialize-decoders-per-bitwidth -I %p/../../../include %s | FileCheck %s --check-prefix=CHECK-SPECIALIZE-NO-TABLE +// RUN: llvm-tblgen -gen-disassembler -specialize-decoders-per-bitwidth -use-fn-table-in-decode-to-mcinst -I %p/../../../include %s | FileCheck %s --check-prefix=CHECK-SPECIALIZE-TABLE include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/DecoderEmitterFnTable.td b/llvm/test/TableGen/DecoderEmitter/DecoderEmitterFnTable.td similarity index 98% rename from llvm/test/TableGen/DecoderEmitterFnTable.td rename to llvm/test/TableGen/DecoderEmitter/DecoderEmitterFnTable.td index 8929e6da716e6..455089588511f 100644 --- 
a/llvm/test/TableGen/DecoderEmitterFnTable.td +++ b/llvm/test/TableGen/DecoderEmitter/DecoderEmitterFnTable.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-disassembler -use-fn-table-in-decode-to-mcinst -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler -use-fn-table-in-decode-to-mcinst -I %p/../../../include %s | FileCheck %s include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/InvalidEncoding.td b/llvm/test/TableGen/DecoderEmitter/InvalidEncoding.td similarity index 100% rename from llvm/test/TableGen/FixedLenDecoderEmitter/InvalidEncoding.td rename to llvm/test/TableGen/DecoderEmitter/InvalidEncoding.td diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/MultiOps.td b/llvm/test/TableGen/DecoderEmitter/MultiOps.td similarity index 100% rename from llvm/test/TableGen/FixedLenDecoderEmitter/MultiOps.td rename to llvm/test/TableGen/DecoderEmitter/MultiOps.td diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/DecoderEmitter/VarLenDecoder.td similarity index 64% rename from llvm/test/TableGen/VarLenDecoder.td rename to llvm/test/TableGen/DecoderEmitter/VarLenDecoder.td index 7eda1e6e47431..d046c1a192111 100644 --- a/llvm/test/TableGen/VarLenDecoder.td +++ b/llvm/test/TableGen/DecoderEmitter/VarLenDecoder.td @@ -1,5 +1,5 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s --check-prefixes=CHECK,CHECK-SMALL -// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../include %s | FileCheck %s --check-prefixes=CHECK,CHECK-LARGE +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s --check-prefixes=CHECK,CHECK-SMALL +// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../../include %s | FileCheck %s --check-prefixes=CHECK,CHECK-LARGE include "llvm/Target/Target.td" @@ -53,18 +53,18 @@ def FOO32 : MyVarInst { // CHECK-NEXT: 43, // CHECK-NEXT: }; -// CHECK-SMALL: /* 0 */ MCD::OPC_ExtractField, 3, 5, // Inst{7-3} 
... -// CHECK-SMALL-NEXT: /* 3 */ MCD::OPC_FilterValueOrSkip, 8, 4, 0, // Skip to: 11 -// CHECK-SMALL-NEXT: /* 7 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16 -// CHECK-SMALL-NEXT: /* 11 */ MCD::OPC_FilterValue, 9, -// CHECK-SMALL-NEXT: /* 13 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32 +// CHECK-SMALL: /* 0 */ OPC_ExtractField, 3, 5, // Field = Inst{7-3} +// CHECK-SMALL-NEXT: /* 3 */ OPC_FilterValueOrSkip, 8, 4, 0, // if Field != 0x8 skip to 11 +// CHECK-SMALL-NEXT: /* 7 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16 +// CHECK-SMALL-NEXT: /* 11 */ OPC_FilterValue, 9, // if Field != 0x9 pop scope +// CHECK-SMALL-NEXT: /* 13 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32 // CHECK-SMALL-NEXT: }; -// CHECK-LARGE: /* 0 */ MCD::OPC_ExtractField, 3, 5, // Inst{7-3} ... -// CHECK-LARGE-NEXT: /* 3 */ MCD::OPC_FilterValueOrSkip, 8, 4, 0, 0, // Skip to: 12 -// CHECK-LARGE-NEXT: /* 8 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16 -// CHECK-LARGE-NEXT: /* 12 */ MCD::OPC_FilterValue, 9, -// CHECK-LARGE-NEXT: /* 14 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32 +// CHECK-LARGE: /* 0 */ OPC_ExtractField, 3, 5, // Field = Inst{7-3} +// CHECK-LARGE-NEXT: /* 3 */ OPC_FilterValueOrSkip, 8, 4, 0, 0, // if Field != 0x8 skip to 12 +// CHECK-LARGE-NEXT: /* 8 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16 +// CHECK-LARGE-NEXT: /* 12 */ OPC_FilterValue, 9, // if Field != 0x9 pop scope +// CHECK-LARGE-NEXT: /* 14 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32 // CHECK-LARGE-NEXT: }; // CHECK: case 0: @@ -81,17 +81,17 @@ def FOO32 : MyVarInst { // CHECK-NEXT: tmp = fieldFromInstruction(insn, 0, 3); // CHECK-NEXT: if (!Check(S, myCustomDecoder(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; } // CHECK-NEXT: tmp = 0x0; -// CHECK-NEXT: insertBits(tmp, fieldFromInstruction(insn, 11, 16), 16, 16); -// CHECK-NEXT: insertBits(tmp, fieldFromInstruction(insn, 27, 16), 0, 
16); +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 11, 16) << 16; +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 27, 16); // CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); // CHECK-NEXT: return S; -// CHECK-LABEL: case MCD::OPC_ExtractField: { +// CHECK-LABEL: case OPC_ExtractField: { // CHECK: makeUp(insn, Start + Len); -// CHECK-LABEL: case MCD::OPC_CheckField: { +// CHECK-LABEL: case OPC_CheckField: { // CHECK: makeUp(insn, Start + Len); -// CHECK-LABEL: case MCD::OPC_Decode: { +// CHECK-LABEL: case OPC_Decode: { // CHECK: Len = InstrLenTable[Opc]; // CHECK-NEXT: makeUp(insn, Len); diff --git a/llvm/test/TableGen/DecoderEmitter/additional-encoding.td b/llvm/test/TableGen/DecoderEmitter/additional-encoding.td new file mode 100644 index 0000000000000..e3ef572bfa7f8 --- /dev/null +++ b/llvm/test/TableGen/DecoderEmitter/additional-encoding.td @@ -0,0 +1,71 @@ +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s + +include "llvm/Target/Target.td" + +class Enc { + int Size = 2; + bits<16> Inst; +} + +class EncSHIFT opc> : Enc { + bits<6> shamt; + let Inst{15...14} = {0, 0}; + let Inst{13...12} = opc; + let Inst{11...6} = shamt; +} + +class EncNOP opc> : Enc { + let Inst{15...14} = {0, 0}; + let Inst{13...12} = opc; + let Inst{11...6} = {0, 0, 0, 0, 0, 0}; +} + +def ShAmtOp : Operand { + let DecoderMethod = "decodeShAmt"; + let hasCompleteDecoder = false; +} + +class I : Instruction { + let InOperandList = in_ops; + let OutOperandList = out_ops; +} + +// CHECK: /* 0 */ OPC_ExtractField, 12, 4, // Field = Inst{15-12} +// CHECK-NEXT: /* 3 */ OPC_FilterValueOrSkip, 0, 15, 0, // if Field != 0x0 skip to 22 +// CHECK-NEXT: /* 7 */ OPC_Scope, 8, 0, // end scope at 18 +// CHECK-NEXT: /* 10 */ OPC_CheckField, 6, 6, 0, // if Inst{11-6} != 0x0 +// CHECK-NEXT: /* 14 */ OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: {{.*}}:NOP, DecodeIdx: 0 +// CHECK-NEXT: /* 18 */ OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: SHIFT0, DecodeIdx: 1 +// CHECK-NEXT: /* 22 
*/ OPC_FilterValueOrSkip, 1, 15, 0, // if Field != 0x1 skip to 41 +// CHECK-NEXT: /* 26 */ OPC_Scope, 8, 0, // end scope at 37 +// CHECK-NEXT: /* 29 */ OPC_CheckField, 6, 6, 0, // if Inst{11-6} != 0x0 +// CHECK-NEXT: /* 33 */ OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: {{.*}}:NOP, DecodeIdx: 0 +// CHECK-NEXT: /* 37 */ OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: SHIFT1, DecodeIdx: 1 +// CHECK-NEXT: /* 41 */ OPC_FilterValueOrSkip, 2, 15, 0, // if Field != 0x2 skip to 60 +// CHECK-NEXT: /* 45 */ OPC_Scope, 8, 0, // end scope at 56 +// CHECK-NEXT: /* 48 */ OPC_CheckField, 6, 6, 0, // if Inst{11-6} != 0x0 +// CHECK-NEXT: /* 52 */ OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: {{.*}}:NOP, DecodeIdx: 0 +// CHECK-NEXT: /* 56 */ OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: SHIFT2, DecodeIdx: 1 +// CHECK-NEXT: /* 60 */ OPC_FilterValue, 3, // if Field != 0x3 +// CHECK-NEXT: /* 62 */ OPC_Scope, 8, 0, // end scope at 73 +// CHECK-NEXT: /* 65 */ OPC_CheckField, 6, 6, 0, // if Inst{11-6} != 0x0 +// CHECK-NEXT: /* 69 */ OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: {{.*}}:NOP, DecodeIdx: 0 +// CHECK-NEXT: /* 73 */ OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: SHIFT3, DecodeIdx: 1 + + +class SHIFT opc> : I<(outs), (ins ShAmtOp:$shamt)>, EncSHIFT; +def SHIFT0 : SHIFT<0>; +def SHIFT1 : SHIFT<1>; +def SHIFT2 : SHIFT<2>; +def SHIFT3 : SHIFT<3>; + +def NOP : I<(outs), (ins)>, EncNOP<0>; +def : AdditionalEncoding, EncNOP<1>; +def : AdditionalEncoding, EncNOP<2>; +def : AdditionalEncoding, EncNOP<3>; + +def II : InstrInfo; + +def MyTarget : Target { + let InstructionSet = II; +} diff --git a/llvm/test/TableGen/DecoderEmitter/big-filter.td b/llvm/test/TableGen/DecoderEmitter/big-filter.td new file mode 100644 index 0000000000000..87aa7f814c3f3 --- /dev/null +++ b/llvm/test/TableGen/DecoderEmitter/big-filter.td @@ -0,0 +1,38 @@ +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s + +include "llvm/Target/Target.td" + +class I : Instruction { + let InOperandList = (ins); + let OutOperandList 
= (outs); + let Size = 16; + bits<128> Inst; +} + +// Check that a 64-bit filter with all bits set does not confuse DecoderEmitter. +// +// CHECK-LABEL: static const uint8_t DecoderTable128[34] = { +// CHECK-NEXT: /* 0 */ OPC_ExtractField, 0, 64, // Field = Inst{63-0} +// CHECK-NEXT: /* 3 */ OPC_FilterValueOrSkip, 1, 8, 0, // if Field != 0x1 skip to 15 +// CHECK-NEXT: /* 7 */ OPC_CheckField, 127, 1, 1, // if Inst{127} != 0x1 +// CHECK-NEXT: /* 11 */ OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: I2, DecodeIdx: 0 +// CHECK-NEXT: /* 15 */ OPC_FilterValue, 255, 255, 255, 255, 255, 255, 255, 255, 255, 1, // if Field != 0xffffffffffffffff +// CHECK-NEXT: /* 26 */ OPC_CheckField, 127, 1, 0, // if Inst{127} != 0x0 +// CHECK-NEXT: /* 30 */ OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: I1, DecodeIdx: 0 +// CHECK-NEXT: }; + +def I1 : I { + let Inst{63...0} = -1; + let Inst{127} = 0; +} + +def I2 : I { + let Inst{63...0} = 1; + let Inst{127} = 1; +} + +def II : InstrInfo; + +def MyTarget : Target { + let InstructionSet = II; +} diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/conflict.td b/llvm/test/TableGen/DecoderEmitter/conflict.td similarity index 100% rename from llvm/test/TableGen/FixedLenDecoderEmitter/conflict.td rename to llvm/test/TableGen/DecoderEmitter/conflict.td diff --git a/llvm/test/TableGen/DecoderEmitter/operand-decoder.td b/llvm/test/TableGen/DecoderEmitter/operand-decoder.td new file mode 100644 index 0000000000000..c6ec2ee1a4db4 --- /dev/null +++ b/llvm/test/TableGen/DecoderEmitter/operand-decoder.td @@ -0,0 +1,66 @@ +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s + +include "llvm/Target/Target.td" + +def R0 : Register<"r0">; +def RC : RegisterClass<"MyTarget", [i32], 32, (add R0)>; + +def MyInstrInfo : InstrInfo; + +def MyTarget : Target { + let InstructionSet = MyInstrInfo; +} + +// CHECK-LABEL: case 0: +// CHECK-NEXT: if (!Check(S, DecodeRCRegisterClass(MI, Decoder))) +// CHECK-NEXT: return MCDisassembler::Fail; +// 
CHECK-NEXT: tmp = fieldFromInstruction(insn, 2, 4); +// CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); +// CHECK-NEXT: tmp = 0x0; +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 0, 2); +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 6, 2) << 2; +// CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); +// CHECK-NEXT: tmp = 0x0; +// CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); +// CHECK-NEXT: tmp = fieldFromInstruction(insn, 13, 2) << 1; +// CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); +// CHECK-NEXT: tmp = 0x0; +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 17, 1) << 1; +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 19, 1) << 3; +// CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); +// CHECK-NEXT: tmp = 0x5; +// CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); +// CHECK-NEXT: tmp = 0x2; +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 26, 2) << 2; +// CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); +// CHECK-NEXT: tmp = 0xa; +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 28, 1); +// CHECK-NEXT: tmp |= fieldFromInstruction(insn, 30, 1) << 2; +// CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp)); +// CHECK-NEXT: return S; + +def I : Instruction { + let OutOperandList = (outs RC:$op0); + let InOperandList = (ins i32imm:$op1, i32imm:$op2, i32imm:$op3, i32imm:$op4, + i32imm:$op5, i32imm:$op6, i32imm:$op7, i32imm:$op8); + let Size = 4; + bits<32> Inst; + bits<0> op0; // no init, no variable parts + bits<4> op1; // no init, 1 variable part + bits<4> op2; // no init, 2 variable parts + bits<4> op3 = 0b0000; // zero init, no variable parts + bits<4> op4 = {0, ?, ?, 0}; // zero init, 1 variable part + bits<4> op5 = {?, 0, ?, 0}; // zero init, 2 variable parts + bits<4> op6 = 0b0101; // non-zero init, no variable parts + bits<4> op7 = {?, ?, 1, 0}; // non-zero init, 1 variable part + bits<4> op8 = {1, ?, 1, ?}; // non-zero init, 2 variable parts + let Inst{5...2} = op1; + let Inst{1...0} = op2{1...0}; + let Inst{7...6} 
= op2{3...2}; + let Inst{11...8} = op3; + let Inst{15...12} = op4; + let Inst{19...16} = op5; + let Inst{23...20} = op6; + let Inst{27...24} = op7; + let Inst{31...28} = op8; +} diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/sub-arg-dag-error-1.td b/llvm/test/TableGen/DecoderEmitter/sub-arg-dag-error-1.td similarity index 100% rename from llvm/test/TableGen/FixedLenDecoderEmitter/sub-arg-dag-error-1.td rename to llvm/test/TableGen/DecoderEmitter/sub-arg-dag-error-1.td diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/sub-arg-dag-error-2.td b/llvm/test/TableGen/DecoderEmitter/sub-arg-dag-error-2.td similarity index 100% rename from llvm/test/TableGen/FixedLenDecoderEmitter/sub-arg-dag-error-2.td rename to llvm/test/TableGen/DecoderEmitter/sub-arg-dag-error-2.td diff --git a/llvm/test/TableGen/trydecode-emission.td b/llvm/test/TableGen/DecoderEmitter/trydecode-emission.td similarity index 56% rename from llvm/test/TableGen/trydecode-emission.td rename to llvm/test/TableGen/DecoderEmitter/trydecode-emission.td index d1cf4bf541835..b04ba2b4a6f5b 100644 --- a/llvm/test/TableGen/trydecode-emission.td +++ b/llvm/test/TableGen/DecoderEmitter/trydecode-emission.td @@ -1,5 +1,5 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s -// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-LARGE +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../../include %s | FileCheck %s --check-prefix=CHECK-LARGE // Check that if decoding of an instruction fails and the instruction does not // have a complete decoder method that can determine if the bitpattern is valid @@ -34,11 +34,11 @@ def InstB : TestInstruction { let hasCompleteDecoder = 0; } -// CHECK: /* 0 */ MCD::OPC_CheckField, 4, 4, 0, -// CHECK-NEXT: /* 4 */ MCD::OPC_Scope, 8, 0, // Skip to: 15 -// CHECK-NEXT: /* 7 */ 
MCD::OPC_CheckField, 2, 2, 0, -// CHECK-NEXT: /* 11 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, -// CHECK-NEXT: /* 15 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 +// CHECK: /* 0 */ OPC_CheckField, 4, 4, 0, // if Inst{7-4} != 0x0 +// CHECK-NEXT: /* 4 */ OPC_Scope, 8, 0, // end scope at 15 +// CHECK-NEXT: /* 7 */ OPC_CheckField, 2, 2, 0, // if Inst{3-2} != 0x0 +// CHECK-NEXT: /* 11 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: InstB, DecodeIdx: 0 +// CHECK-NEXT: /* 15 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 // CHECK-NEXT: }; // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } @@ -47,11 +47,11 @@ def InstB : TestInstruction { // CHECK-NEXT: NumToSkip |= (*Ptr++) << 8; // CHECK-NEXT: return NumToSkip; -// CHECK-LARGE: /* 0 */ MCD::OPC_CheckField, 4, 4, 0, -// CHECK-LARGE-NEXT: /* 4 */ MCD::OPC_Scope, 8, 0, 0, // Skip to: 16 -// CHECK-LARGE-NEXT: /* 8 */ MCD::OPC_CheckField, 2, 2, 0, -// CHECK-LARGE-NEXT: /* 12 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, -// CHECK-LARGE-NEXT: /* 16 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 +// CHECK-LARGE: /* 0 */ OPC_CheckField, 4, 4, 0, // if Inst{7-4} != 0x0 +// CHECK-LARGE-NEXT: /* 4 */ OPC_Scope, 8, 0, 0, // end scope at 16 +// CHECK-LARGE-NEXT: /* 8 */ OPC_CheckField, 2, 2, 0, // if Inst{3-2} != 0x0 +// CHECK-LARGE-NEXT: /* 12 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: InstB, DecodeIdx: 0 +// CHECK-LARGE-NEXT: /* 16 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 // CHECK-LARGE-NEXT: }; // CHECK-LARGE: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission2.td b/llvm/test/TableGen/DecoderEmitter/trydecode-emission2.td similarity index 52% rename from llvm/test/TableGen/trydecode-emission2.td rename 
to llvm/test/TableGen/DecoderEmitter/trydecode-emission2.td index d7a87eb4b8691..7fd26fffd28b7 100644 --- a/llvm/test/TableGen/trydecode-emission2.td +++ b/llvm/test/TableGen/DecoderEmitter/trydecode-emission2.td @@ -1,5 +1,5 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s -// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-LARGE +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../../include %s | FileCheck %s --check-prefix=CHECK-LARGE include "llvm/Target/Target.td" @@ -31,24 +31,24 @@ def InstB : TestInstruction { let hasCompleteDecoder = 0; } -// CHECK: /* 0 */ MCD::OPC_CheckField, 2, 1, 0, -// CHECK-NEXT: /* 4 */ MCD::OPC_CheckField, 5, 3, 0, -// CHECK-NEXT: /* 8 */ MCD::OPC_Scope, 8, 0, // Skip to: 19 -// CHECK-NEXT: /* 11 */ MCD::OPC_CheckField, 0, 2, 3, -// CHECK-NEXT: /* 15 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, -// CHECK-NEXT: /* 19 */ MCD::OPC_CheckField, 3, 2, 0, -// CHECK-NEXT: /* 23 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, +// CHECK: /* 0 */ OPC_CheckField, 2, 1, 0, +// CHECK-NEXT: /* 4 */ OPC_CheckField, 5, 3, 0, +// CHECK-NEXT: /* 8 */ OPC_Scope, 8, 0, // end scope at 19 +// CHECK-NEXT: /* 11 */ OPC_CheckField, 0, 2, 3, +// CHECK-NEXT: /* 15 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, +// CHECK-NEXT: /* 19 */ OPC_CheckField, 3, 2, 0, +// CHECK-NEXT: /* 23 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } // CHECK: if (!Check(S, DecodeInstA(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } -// CHECK-LARGE: /* 0 */ MCD::OPC_CheckField, 2, 1, 0, -// CHECK-LARGE-NEXT: /* 4 */ MCD::OPC_CheckField, 5, 3, 0, -// CHECK-LARGE-NEXT: /* 8 */ MCD::OPC_Scope, 8, 0, 0, // Skip to: 20 -// 
CHECK-LARGE-NEXT: /* 12 */ MCD::OPC_CheckField, 0, 2, 3, -// CHECK-LARGE-NEXT: /* 16 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, -// CHECK-LARGE-NEXT: /* 20 */ MCD::OPC_CheckField, 3, 2, 0, -// CHECK-LARGE-NEXT: /* 24 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, +// CHECK-LARGE: /* 0 */ OPC_CheckField, 2, 1, 0, +// CHECK-LARGE-NEXT: /* 4 */ OPC_CheckField, 5, 3, 0, +// CHECK-LARGE-NEXT: /* 8 */ OPC_Scope, 8, 0, 0, // end scope at 20 +// CHECK-LARGE-NEXT: /* 12 */ OPC_CheckField, 0, 2, 3, +// CHECK-LARGE-NEXT: /* 16 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, +// CHECK-LARGE-NEXT: /* 20 */ OPC_CheckField, 3, 2, 0, +// CHECK-LARGE-NEXT: /* 24 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // CHECK-LARGE-NEXT: }; // CHECK-LARGE: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission3.td b/llvm/test/TableGen/DecoderEmitter/trydecode-emission3.td similarity index 53% rename from llvm/test/TableGen/trydecode-emission3.td rename to llvm/test/TableGen/DecoderEmitter/trydecode-emission3.td index b7d1b8ddc1b6c..c884d6b8a93cc 100644 --- a/llvm/test/TableGen/trydecode-emission3.td +++ b/llvm/test/TableGen/DecoderEmitter/trydecode-emission3.td @@ -1,5 +1,5 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s -// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-LARGE +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../../include %s | FileCheck %s --check-prefix=CHECK-LARGE include "llvm/Target/Target.td" @@ -35,20 +35,20 @@ def InstB : TestInstruction { let AsmString = "InstB"; } -// CHECK: /* 0 */ MCD::OPC_CheckField, 4, 4, 0, -// CHECK-NEXT: /* 4 */ MCD::OPC_Scope, 8, 0, // Skip to: 15 -// CHECK-NEXT: /* 7 */ MCD::OPC_CheckField, 2, 2, 0, -// CHECK-NEXT: /* 11 */ 
MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, -// CHECK-NEXT: /* 15 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 +// CHECK: /* 0 */ OPC_CheckField, 4, 4, 0, +// CHECK-NEXT: /* 4 */ OPC_Scope, 8, 0, // end scope at 15 +// CHECK-NEXT: /* 7 */ OPC_CheckField, 2, 2, 0, +// CHECK-NEXT: /* 11 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, +// CHECK-NEXT: /* 15 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 // CHECK-NEXT: }; // CHECK: if (!Check(S, DecodeInstBOp(MI, tmp, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } -// CHECK-LARGE: /* 0 */ MCD::OPC_CheckField, 4, 4, 0, -// CHECK-LARGE-NEXT: /* 4 */ MCD::OPC_Scope, 8, 0, 0, // Skip to: 16 -// CHECK-LARGE-NEXT: /* 8 */ MCD::OPC_CheckField, 2, 2, 0, -// CHECK-LARGE-NEXT: /* 12 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, -// CHECK-LARGE-NEXT: /* 16 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 +// CHECK-LARGE: /* 0 */ OPC_CheckField, 4, 4, 0, +// CHECK-LARGE-NEXT: /* 4 */ OPC_Scope, 8, 0, 0, // end scope at 16 +// CHECK-LARGE-NEXT: /* 8 */ OPC_CheckField, 2, 2, 0, +// CHECK-LARGE-NEXT: /* 12 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, +// CHECK-LARGE-NEXT: /* 16 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 // CHECK-LARGE-NEXT: }; // CHECK-LARGE: if (!Check(S, DecodeInstBOp(MI, tmp, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission4.td b/llvm/test/TableGen/DecoderEmitter/trydecode-emission4.td similarity index 54% rename from llvm/test/TableGen/trydecode-emission4.td rename to llvm/test/TableGen/DecoderEmitter/trydecode-emission4.td index 439bd9d4ff369..2ff160cda2ec5 100644 --- a/llvm/test/TableGen/trydecode-emission4.td +++ b/llvm/test/TableGen/DecoderEmitter/trydecode-emission4.td @@ -1,5 +1,5 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s -// RUN: llvm-tblgen 
-gen-disassembler --large-decoder-table -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-LARGE +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler --large-decoder-table -I %p/../../../include %s | FileCheck %s --check-prefix=CHECK-LARGE // Test for OPC_ExtractField/OPC_CheckField with start bit > 255. // These large start values may arise for architectures with long instruction @@ -33,21 +33,21 @@ def InstB : TestInstruction { let hasCompleteDecoder = 0; } -// CHECK: /* 0 */ MCD::OPC_CheckField, 250, 3, 4, 0, -// CHECK-NEXT: /* 5 */ MCD::OPC_Scope, 9, 0, // Skip to: 17 -// CHECK-NEXT: /* 8 */ MCD::OPC_CheckField, 248, 3, 2, 0, -// CHECK-NEXT: /* 13 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, -// CHECK-NEXT: /* 17 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 +// CHECK: /* 0 */ OPC_CheckField, 250, 3, 4, 0, +// CHECK-NEXT: /* 5 */ OPC_Scope, 9, 0, // end scope at 17 +// CHECK-NEXT: /* 8 */ OPC_CheckField, 248, 3, 2, 0, +// CHECK-NEXT: /* 13 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, +// CHECK-NEXT: /* 17 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 // CHECK-NEXT: }; // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } -// CHECK-LARGE: /* 0 */ MCD::OPC_CheckField, 250, 3, 4, 0, -// CHECK-LARGE-NEXT: /* 5 */ MCD::OPC_Scope, 9, 0, 0, // Skip to: 18 -// CHECK-LARGE-NEXT: /* 9 */ MCD::OPC_CheckField, 248, 3, 2, 0, -// CHECK-LARGE-NEXT: /* 14 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, -// CHECK-LARGE-NEXT: /* 18 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 +// CHECK-LARGE: /* 0 */ OPC_CheckField, 250, 3, 4, 0, +// CHECK-LARGE-NEXT: /* 5 */ OPC_Scope, 9, 0, 0, // end scope at 18 +// CHECK-LARGE-NEXT: /* 9 */ OPC_CheckField, 248, 3, 2, 0, +// CHECK-LARGE-NEXT: /* 14 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, +// 
CHECK-LARGE-NEXT: /* 18 */ OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: 1 // CHECK-LARGE-NEXT: }; // CHECK-LARGE: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/DecoderEmitter/var-len-conflict-1.td b/llvm/test/TableGen/DecoderEmitter/var-len-conflict-1.td new file mode 100644 index 0000000000000..b18d28a9f5136 --- /dev/null +++ b/llvm/test/TableGen/DecoderEmitter/var-len-conflict-1.td @@ -0,0 +1,45 @@ +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s + +include "llvm/Target/Target.td" + +class I : Instruction { + let InOperandList = (ins i32imm:$op); + let OutOperandList = (outs); +} + +// Check that we don't try to read the second byte without ruling out +// 1-byte encodings first. This should actually be a decoding conflict, +// but DecoderEmitter heuristics decide that I8_0 and I8_1 are more specific +// than the rest and give them priority. 
+ +// _______0 I8_0 +// _______1 I8_1 +// 00000000 ________ I16_0 +// 00000001 ________ I16_1 +// 00000010 ________ I16_2 + +// CHECK: /* 0 */ OPC_Scope, 17, 0, // end scope at 20 +// CHECK-NEXT: /* 3 */ OPC_ExtractField, 0, 1, // Field = Inst{0} +// CHECK-NEXT: /* 6 */ OPC_FilterValueOrSkip, 0, 4, 0, // if Field != 0x0 skip to 14 +// CHECK-NEXT: /* 10 */ OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: I8_0, DecodeIdx: 0 +// CHECK-NEXT: /* 14 */ OPC_FilterValue, 1, // if Field != 0x1 +// CHECK-NEXT: /* 16 */ OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: I8_1, DecodeIdx: 0 +// CHECK-NEXT: /* 20 */ OPC_ExtractField, 8, 8, // Field = Inst{15-8} +// CHECK-NEXT: /* 23 */ OPC_FilterValueOrSkip, 0, 4, 0, // if Field != 0x0 skip to 31 +// CHECK-NEXT: /* 27 */ OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: I16_0, DecodeIdx: 1 +// CHECK-NEXT: /* 31 */ OPC_FilterValueOrSkip, 1, 4, 0, // if Field != 0x1 skip to 39 +// CHECK-NEXT: /* 35 */ OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: I16_1, DecodeIdx: 1 +// CHECK-NEXT: /* 39 */ OPC_FilterValue, 2, // if Field != 0x2 +// CHECK-NEXT: /* 41 */ OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: I16_2, DecodeIdx: 1 + +def I8_0 : I { dag Inst = (descend (operand "$op", 7), 0b0); } +def I8_1 : I { dag Inst = (descend (operand "$op", 7), 0b1); } +def I16_0 : I { dag Inst = (descend 0b00000000, (operand "$op", 8)); } +def I16_1 : I { dag Inst = (descend 0b00000001, (operand "$op", 8)); } +def I16_2 : I { dag Inst = (descend 0b00000010, (operand "$op", 8)); } + +def II : InstrInfo; + +def MyTarget : Target { + let InstructionSet = II; +} diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/var-len-conflict-2.td b/llvm/test/TableGen/DecoderEmitter/var-len-conflict-2.td similarity index 100% rename from llvm/test/TableGen/FixedLenDecoderEmitter/var-len-conflict-2.td rename to llvm/test/TableGen/DecoderEmitter/var-len-conflict-2.td diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/InitValue.td b/llvm/test/TableGen/FixedLenDecoderEmitter/InitValue.td deleted file 
mode 100644 index 03847439ffc2e..0000000000000 --- a/llvm/test/TableGen/FixedLenDecoderEmitter/InitValue.td +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s - -include "llvm/Target/Target.td" - -def archInstrInfo : InstrInfo { } - -def arch : Target { - let InstructionSet = archInstrInfo; -} - -let OutOperandList = (outs), Size = 2 in { - -def foo : Instruction { - let InOperandList = (ins i32imm:$factor); - field bits<16> Inst; - field bits<16> SoftFail = 0; - bits<8> factor; - let factor{0} = 0; // zero initial value - let Inst{15...8} = factor{7...0}; - } - -def bar : Instruction { - let InOperandList = (ins i32imm:$factor); - field bits<16> Inst; - field bits<16> SoftFail = 0; - bits<8> factor; - let factor{0} = 1; // non-zero initial value - let Inst{15...8} = factor{7...0}; - } - -def bax : Instruction { - let InOperandList = (ins i32imm:$factor); - field bits<16> Inst; - field bits<16> SoftFail = 0; - bits<33> factor; - let factor{32} = 1; // non-zero initial value - let Inst{15...8} = factor{32...25}; - } - -} - -// CHECK: tmp = fieldFromInstruction(insn, 9, 7) << 1; -// CHECK: tmp = 0x1; -// CHECK: insertBits(tmp, fieldFromInstruction(insn, 9, 7), 1, 7); -// CHECK: tmp = 0x100000000; -// CHECK: insertBits(tmp, fieldFromInstruction(insn, 8, 7), 25, 7); diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/additional-encoding.td b/llvm/test/TableGen/FixedLenDecoderEmitter/additional-encoding.td deleted file mode 100644 index ec7e35e1ecac7..0000000000000 --- a/llvm/test/TableGen/FixedLenDecoderEmitter/additional-encoding.td +++ /dev/null @@ -1,71 +0,0 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s - -include "llvm/Target/Target.td" - -class Enc { - int Size = 2; - bits<16> Inst; -} - -class EncSHIFT opc> : Enc { - bits<6> shamt; - let Inst{15...14} = {0, 0}; - let Inst{13...12} = opc; - let Inst{11...6} = shamt; -} - -class EncNOP opc> : Enc { - let 
Inst{15...14} = {0, 0}; - let Inst{13...12} = opc; - let Inst{11...6} = {0, 0, 0, 0, 0, 0}; -} - -def ShAmtOp : Operand { - let DecoderMethod = "decodeShAmt"; - let hasCompleteDecoder = false; -} - -class I : Instruction { - let InOperandList = in_ops; - let OutOperandList = out_ops; -} - -// CHECK: /* 0 */ MCD::OPC_ExtractField, 12, 4, // Inst{15-12} ... -// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValueOrSkip, 0, 15, 0, // Skip to: 22 -// CHECK-NEXT: /* 7 */ MCD::OPC_Scope, 8, 0, // Skip to: 18 -// CHECK-NEXT: /* 10 */ MCD::OPC_CheckField, 6, 6, 0, -// CHECK-NEXT: /* 14 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: {{.*}}:NOP, DecodeIdx: 0 -// CHECK-NEXT: /* 18 */ MCD::OPC_TryDecode, 187, 2, 1, -// CHECK-NEXT: /* 22 */ MCD::OPC_FilterValueOrSkip, 1, 15, 0, // Skip to: 41 -// CHECK-NEXT: /* 26 */ MCD::OPC_Scope, 8, 0, // Skip to: 37 -// CHECK-NEXT: /* 29 */ MCD::OPC_CheckField, 6, 6, 0, -// CHECK-NEXT: /* 33 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: {{.*}}:NOP, DecodeIdx: 0 -// CHECK-NEXT: /* 37 */ MCD::OPC_TryDecode, 188, 2, 1, -// CHECK-NEXT: /* 41 */ MCD::OPC_FilterValueOrSkip, 2, 15, 0, // Skip to: 60 -// CHECK-NEXT: /* 45 */ MCD::OPC_Scope, 8, 0, // Skip to: 56 -// CHECK-NEXT: /* 48 */ MCD::OPC_CheckField, 6, 6, 0, -// CHECK-NEXT: /* 52 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: {{.*}}:NOP, DecodeIdx: 0 -// CHECK-NEXT: /* 56 */ MCD::OPC_TryDecode, 189, 2, 1, -// CHECK-NEXT: /* 60 */ MCD::OPC_FilterValue, 3, -// CHECK-NEXT: /* 62 */ MCD::OPC_Scope, 8, 0, // Skip to: 73 -// CHECK-NEXT: /* 65 */ MCD::OPC_CheckField, 6, 6, 0, -// CHECK-NEXT: /* 69 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: {{.*}}:NOP, DecodeIdx: 0 -// CHECK-NEXT: /* 73 */ MCD::OPC_TryDecode, 190, 2, 1, - - -class SHIFT opc> : I<(outs), (ins ShAmtOp:$shamt)>, EncSHIFT; -def SHIFT0 : SHIFT<0>; -def SHIFT1 : SHIFT<1>; -def SHIFT2 : SHIFT<2>; -def SHIFT3 : SHIFT<3>; - -def NOP : I<(outs), (ins)>, EncNOP<0>; -def : AdditionalEncoding, EncNOP<1>; -def : AdditionalEncoding, EncNOP<2>; 
-def : AdditionalEncoding, EncNOP<3>; - -def II : InstrInfo; - -def MyTarget : Target { - let InstructionSet = II; -} diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/big-filter.td b/llvm/test/TableGen/FixedLenDecoderEmitter/big-filter.td deleted file mode 100644 index 28762bfa1ec24..0000000000000 --- a/llvm/test/TableGen/FixedLenDecoderEmitter/big-filter.td +++ /dev/null @@ -1,38 +0,0 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s - -include "llvm/Target/Target.td" - -class I : Instruction { - let InOperandList = (ins); - let OutOperandList = (outs); - let Size = 16; - bits<128> Inst; -} - -// Check that a 64-bit filter with all bits set does not confuse DecoderEmitter. -// -// CHECK-LABEL: static const uint8_t DecoderTable128[34] = { -// CHECK-NEXT: /* 0 */ MCD::OPC_ExtractField, 0, 64, // Inst{63-0} ... -// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValueOrSkip, 1, 8, 0, // Skip to: 15 -// CHECK-NEXT: /* 7 */ MCD::OPC_CheckField, 127, 1, 1, -// CHECK-NEXT: /* 11 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: I2, DecodeIdx: 0 -// CHECK-NEXT: /* 15 */ MCD::OPC_FilterValue, 255, 255, 255, 255, 255, 255, 255, 255, 255, 1, -// CHECK-NEXT: /* 26 */ MCD::OPC_CheckField, 127, 1, 0, -// CHECK-NEXT: /* 30 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: I1, DecodeIdx: 0 -// CHECK-NEXT: }; - -def I1 : I { - let Inst{63...0} = -1; - let Inst{127} = 0; -} - -def I2 : I { - let Inst{63...0} = 1; - let Inst{127} = 1; -} - -def II : InstrInfo; - -def MyTarget : Target { - let InstructionSet = II; -} diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/var-len-conflict-1.td b/llvm/test/TableGen/FixedLenDecoderEmitter/var-len-conflict-1.td deleted file mode 100644 index 8afcf786f9c73..0000000000000 --- a/llvm/test/TableGen/FixedLenDecoderEmitter/var-len-conflict-1.td +++ /dev/null @@ -1,45 +0,0 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s - -include "llvm/Target/Target.td" - -class I : Instruction { - let 
InOperandList = (ins i32imm:$op); - let OutOperandList = (outs); -} - -// Check that we don't try to read the second byte without ruling out -// 1-byte encodings first. This should actually be a decoding conflict, -// but DecoderEmitter heuristics decide that I8_0 and I8_1 are more specific -// than the rest and give them priority. - -// _______0 I8_0 -// _______1 I8_1 -// 00000000 ________ I16_0 -// 00000001 ________ I16_1 -// 00000010 ________ I16_2 - -// CHECK: /* 0 */ MCD::OPC_Scope, 17, 0, // Skip to: 20 -// CHECK-NEXT: /* 3 */ MCD::OPC_ExtractField, 0, 1, // Inst{0} ... -// CHECK-NEXT: /* 6 */ MCD::OPC_FilterValueOrSkip, 0, 4, 0, // Skip to: 14 -// CHECK-NEXT: /* 10 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: I8_0, DecodeIdx: 0 -// CHECK-NEXT: /* 14 */ MCD::OPC_FilterValue, 1, -// CHECK-NEXT: /* 16 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 0, // Opcode: I8_1, DecodeIdx: 0 -// CHECK-NEXT: /* 20 */ MCD::OPC_ExtractField, 8, 8, // Inst{15-8} ... -// CHECK-NEXT: /* 23 */ MCD::OPC_FilterValueOrSkip, 0, 4, 0, // Skip to: 31 -// CHECK-NEXT: /* 27 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: I16_0, DecodeIdx: 1 -// CHECK-NEXT: /* 31 */ MCD::OPC_FilterValueOrSkip, 1, 4, 0, // Skip to: 39 -// CHECK-NEXT: /* 35 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: I16_1, DecodeIdx: 1 -// CHECK-NEXT: /* 39 */ MCD::OPC_FilterValue, 2, -// CHECK-NEXT: /* 41 */ MCD::OPC_Decode, {{[0-9]+}}, 2, 1, // Opcode: I16_2, DecodeIdx: 1 - -def I8_0 : I { dag Inst = (descend (operand "$op", 7), 0b0); } -def I8_1 : I { dag Inst = (descend (operand "$op", 7), 0b1); } -def I16_0 : I { dag Inst = (descend 0b00000000, (operand "$op", 8)); } -def I16_1 : I { dag Inst = (descend 0b00000001, (operand "$op", 8)); } -def I16_2 : I { dag Inst = (descend 0b00000010, (operand "$op", 8)); } - -def II : InstrInfo; - -def MyTarget : Target { - let InstructionSet = II; -} diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td index 6a7d080491c2e..3893216f70e01 
100644 --- a/llvm/test/TableGen/HwModeEncodeDecode3.td +++ b/llvm/test/TableGen/HwModeEncodeDecode3.td @@ -230,28 +230,22 @@ def unrelated: Instruction { // ENCODER: default: llvm_unreachable("Unhandled HwMode"); // ENCODER: case 0: { // ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI); -// ENCODER: op &= UINT64_C(240); -// ENCODER: Value |= op; +// ENCODER: Value |= (op & 0xf0); // ENCODER: break; // ENCODER: } // ENCODER: case 1: { // ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI); -// ENCODER: op &= UINT64_C(240); -// ENCODER: Value |= op; +// ENCODER: Value |= (op & 0xf0); // ENCODER: break; // ENCODER: } // ENCODER: case 2: { // ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI); -// ENCODER: op &= UINT64_C(255); -// ENCODER: op <<= 8; -// ENCODER: Value |= op; +// ENCODER: Value |= (op & 0xff) << 8; // ENCODER: break; // ENCODER: } // ENCODER: case 3: { // ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI); -// ENCODER: op &= UINT64_C(255); -// ENCODER: op <<= 24; -// ENCODER: Value |= op; +// ENCODER: Value |= (op & 0xff) << 24; // ENCODER: break; // ENCODER: } // ENCODER-LABEL: case ::baz: { @@ -265,7 +259,6 @@ def unrelated: Instruction { // ENCODER: default: llvm_unreachable("Unhandled HwMode"); // ENCODER: case 2: { // ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI); -// ENCODER: op &= UINT64_C(240); -// ENCODER: Value |= op; +// ENCODER: Value |= (op & 0xf0); // ENCODER: break; // ENCODER: } diff --git a/llvm/test/TableGen/RegisterEncoder.td b/llvm/test/TableGen/RegisterEncoder.td index 3038eab42411c..02366a8a2e061 100644 --- a/llvm/test/TableGen/RegisterEncoder.td +++ b/llvm/test/TableGen/RegisterEncoder.td @@ -30,8 +30,7 @@ def foo1 : Instruction { // CHECK: case ::foo1: { // CHECK: op = barEncoder -// CHECK: op &= UINT64_C(255); -// CHECK: Value |= op; +// CHECK: Value |= (op & 0xff); // CHECK: break; // CHECK: } @@ -57,10 +56,8 @@ def foo2 : Instruction { // 
CHECK: case ::foo2: { // CHECK: op = barEncoder -// CHECK: op &= UINT64_C(15); -// CHECK: Value |= op; +// CHECK: Value |= (op & 0xf); // CHECK: op = barEncoder -// CHECK: op &= UINT64_C(15); -// CHECK: Value |= op; +// CHECK: Value |= (op & 0xf) << 4; // CHECK: break; // CHECK: } diff --git a/llvm/test/TableGen/intrinsic-attrs.td b/llvm/test/TableGen/intrinsic-attrs.td index bcded0cd2e9f1..ab808445f40a2 100644 --- a/llvm/test/TableGen/intrinsic-attrs.td +++ b/llvm/test/TableGen/intrinsic-attrs.td @@ -25,8 +25,8 @@ def int_deref_ptr_ret : Intrinsic<[llvm_ptr_ty], [], [Dereferenceable Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1",color="mediumorchid1"]; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848{{.*}}\n_Z3foov -\> _Z3bazv}"]; ; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1",color="mediumorchid1"]; -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1",color="brown1"]; -; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN2]] -> 
Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan",color="cyan"]; ; DOT: } ; DOTCLONED: digraph "cloned" { ; DOTCLONED: label="cloned"; -; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; -; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z3barv -\> alloc}"]; +; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 12481870273128938184{{.*}}\n_Z3bazv -\> _Z3barv}"]; ; DOTCLONED: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1",color="brown1"]; -; DOTCLONED: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 2732490490862098848{{.*}}\n_Z3foov -\> _Z3bazv}"]; ; DOTCLONED: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1",fillcolor="brown1",color="brown1"]; -; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1",color="brown1"]; -; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 
2",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN2]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan",color="cyan"]; -; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0{{.*}}\n_Z3foov -\> _Z3bazv}"]; ; DOTCLONED: Node[[FOO2]] -> Node[[BAZ2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan",color="cyan"]; -; DOTCLONED: Node[[BAZ2]] [shape=record,tooltip="N[[BAZ2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ2]] [shape=record,tooltip="N[[BAZ2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0{{.*}}\n_Z3bazv -\> _Z3barv}"]; ; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan",color="cyan"]; -; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; +; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0{{.*}}\n_Z3barv -\> alloc}"]; ; DOTCLONED: } ;; Here we are just ensuring that the post-function assign dot graph includes diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll index a69a5b50f5df1..1a6bf0151489e 100644 --- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll +++ 
b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll @@ -434,50 +434,50 @@ attributes #0 = { noinline optnone } ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"]; -; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z3foov -\> alloc}"]; +; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 8256774051149711748{{.*}}\n_ZN1A1xEv -\> _Z3foov}"]; ; DOT: Node[[AX]] -> Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" -; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13626499562959447861{{.*}}\nnull call (external)}"]; ; DOT: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" ; DOT: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1" -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 748269490701775343{{.*}}\nmain -\> _Z3barP1A}"]; ; DOT: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1" -; DOT: 
Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 12699492813229484831{{.*}}\nmain -\> _Z3barP1A}"]; ; DOT: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan" -; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1" -; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; +; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13614864978754796978{{.*}}\n_ZN1B1xEv -\> _Z3foov}"]; ; DOT: Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1" -; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 6792096022461663180{{.*}}\nmain -\> _Z3barP1A}"]; ; DOT: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 
5",fillcolor="brown1",style="filled",label="{OrigId: 15737101490731057601{{.*}}\nmain -\> _Z3barP1A}"]; ; DOT: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1" -; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan" ; DOT: } ; DOTCLONED: digraph "cloned" { ; DOTCLONED: label="cloned"; -; DOTCLONED: Node[[FOO2:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 2 3 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"]; -; DOTCLONED: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; +; DOTCLONED: Node[[FOO2:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 2 3 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z3foov -\> alloc}"]; +; DOTCLONED: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 8256774051149711748{{.*}}\n_ZN1A1xEv -\> _Z3foov}"]; ; DOTCLONED: Node[[AX]] -> Node[[FOO2]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" -; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13626499562959447861{{.*}}\nnull call (external)}"]; ; DOTCLONED: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 
2",fillcolor="mediumorchid1" ; DOTCLONED: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1" -; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 748269490701775343{{.*}}\nmain -\> _Z3barP1A}"]; ; DOTCLONED: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1" -; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 12699492813229484831{{.*}}\nmain -\> _Z3barP1A}"]; ; DOTCLONED: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan" -; DOTCLONED: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN3]] -> Node[[FOO2]][tooltip="ContextIds: 3",fillcolor="brown1" -; DOTCLONED: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; +; DOTCLONED: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13614864978754796978{{.*}}\n_ZN1B1xEv -\> _Z3foov}"]; ; DOTCLONED: Node[[BX]] -> Node[[FOO2]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1" -; DOTCLONED: Node[[MAIN4:0x[a-z0-9]+]] 
[shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; +; DOTCLONED: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 6792096022461663180{{.*}}\nmain -\> _Z3barP1A}"]; ; DOTCLONED: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOTCLONED: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; +; DOTCLONED: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",label="{OrigId: 15737101490731057601{{.*}}\nmain -\> _Z3barP1A}"]; ; DOTCLONED: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1" -; DOTCLONED: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN6]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 6",fillcolor="cyan" -; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 6",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"]; +; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 6",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0{{.*}}\n_Z3foov -\> alloc}"]; ; DOTCLONED: } diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll index 1bb3e3a6b0162..00082f808d3c5 100644 --- a/llvm/test/ThinLTO/X86/memprof-inlined.ll +++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll @@ -344,36 +344,36 @@ attributes #0 = { noinline optnone 
} ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; -; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"]; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}{{.*}}\n_Z3bazv -\> alloc}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848{{.*}}{{.*}}\nnull call (external)}"]; ; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1" ; DOT: Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 3",fillcolor="brown1" -; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan" ; DOT: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] 
ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc2\n_Z3barv -\> alloc}"]; -; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc2{{.*}}{{.*}}\n_Z3barv -\> alloc}"]; +; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: 0{{.*}}{{.*}}\n_Z3foov -\> _Z3barv}"]; ; DOT: Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1" ; DOT: } ; DOTCLONED: digraph "cloned" { ; DOTCLONED: label="cloned"; -; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"]; -; DOTCLONED: Node[[FOO2:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; +; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}{{.*}}\n_Z3bazv -\> alloc}"]; +; DOTCLONED: Node[[FOO2:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848{{.*}}{{.*}}\nnull call (external)}"]; ; DOTCLONED: Node[[FOO2]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" -; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: 
Node[[MAIN1]] -> Node[[FOO2]][tooltip="ContextIds: 1",fillcolor="brown1" ; DOTCLONED: Node[[MAIN1]] -> Node[[FOO:0x[a-z0-9]+]][tooltip="ContextIds: 3",fillcolor="brown1" -; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 2",fillcolor="cyan" ; DOTCLONED: Node[[MAIN2]] -> Node[[FOO3:0x[a-z0-9]+]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: Alloc2\n_Z3barv -\> alloc}"]; -; DOTCLONED: Node[[FOO]] [shape=record,tooltip="N[[FOO]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: Alloc2{{.*}}\n_Z3barv -\> alloc}"]; +; DOTCLONED: Node[[FOO]] [shape=record,tooltip="N[[FOO]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 0{{.*}}\n_Z3foov -\> _Z3barv}"]; ; DOTCLONED: Node[[FOO]] -> Node[[BAR]][tooltip="ContextIds: 3",fillcolor="brown1" -; DOTCLONED: Node[[FOO3]] [shape=record,tooltip="N[[FOO3]] ContextIds: 4",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOTCLONED: Node[[FOO3]] [shape=record,tooltip="N[[FOO3]] ContextIds: 4",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0{{.*}}\n_Z3foov -\> _Z3barv}"]; ; DOTCLONED: Node[[FOO3]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 
4",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; +; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 4",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0{{.*}}\n_Z3barv -\> alloc}"]; ; DOTCLONED: } diff --git a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll new file mode 100644 index 0000000000000..05d2330fffc7f --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=sroa,instcombine,aggressive-instcombine %s -S -o - | FileCheck %s + +define i64 @quux(ptr %arg) { +; CHECK-LABEL: define i64 @quux( +; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[ARG]], align 1 +; CHECK-NEXT: ret i64 [[LOAD]] +; +bb: + %load = load i8, ptr %arg, align 1 + %getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 1 + %load1 = load i8, ptr %getelementptr, align 1 + %getelementptr2 = getelementptr inbounds nuw i8, ptr %arg, i64 2 + %load3 = load i8, ptr %getelementptr2, align 1 + %getelementptr4 = getelementptr inbounds nuw i8, ptr %arg, i64 3 + %load5 = load i8, ptr %getelementptr4, align 1 + %getelementptr6 = getelementptr inbounds nuw i8, ptr %arg, i64 4 + %load7 = load i8, ptr %getelementptr6, align 1 + %getelementptr8 = getelementptr inbounds nuw i8, ptr %arg, i64 5 + %load9 = load i8, ptr %getelementptr8, align 1 + %getelementptr10 = getelementptr inbounds nuw i8, ptr %arg, i64 6 + %load11 = load i8, ptr %getelementptr10, align 1 + %getelementptr12 = getelementptr inbounds nuw i8, ptr %arg, i64 7 + %load13 = load i8, ptr %getelementptr12, align 1 + %zext = zext i8 %load13 to i64 + 
%shl = shl nuw i64 %zext, 56 + %zext14 = zext i8 %load11 to i64 + %shl15 = shl nuw nsw i64 %zext14, 48 + %or = or disjoint i64 %shl, %shl15 + %zext16 = zext i8 %load9 to i64 + %shl17 = shl nuw nsw i64 %zext16, 40 + %or18 = or disjoint i64 %or, %shl17 + %zext19 = zext i8 %load7 to i64 + %shl20 = shl nuw nsw i64 %zext19, 32 + %or21 = or disjoint i64 %or18, %shl20 + %zext22 = zext i8 %load5 to i64 + %shl23 = shl nuw nsw i64 %zext22, 24 + %or24 = or disjoint i64 %or21, %shl23 + %zext25 = zext i8 %load3 to i64 + %shl26 = shl nuw nsw i64 %zext25, 16 + %zext27 = zext i8 %load1 to i64 + %shl28 = shl nuw nsw i64 %zext27, 8 + %or29 = or disjoint i64 %or24, %shl26 + %zext30 = zext i8 %load to i64 + %or31 = or i64 %or29, %shl28 + %or32 = or i64 %or31, %zext30 + ret i64 %or32 +} + + +; The following test case reduced from a client kernel +define fastcc <16 x float> @hoge(ptr %arg) { +; CHECK-LABEL: define fastcc <16 x float> @hoge( +; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[ARG]], align 8 +; CHECK-NEXT: [[LOAD28:%.*]] = load i64, ptr [[LOAD]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR72:%.*]] = getelementptr i8, ptr [[LOAD]], i64 8 +; CHECK-NEXT: [[LOAD73:%.*]] = load i64, ptr [[GETELEMENTPTR72]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR120:%.*]] = getelementptr i8, ptr [[LOAD]], i64 16 +; CHECK-NEXT: [[LOAD121:%.*]] = load i64, ptr [[GETELEMENTPTR120]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR168:%.*]] = getelementptr i8, ptr [[LOAD]], i64 24 +; CHECK-NEXT: [[LOAD169:%.*]] = load i64, ptr [[GETELEMENTPTR168]], align 1 +; CHECK-NEXT: [[CALL:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL225:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 0, <16 x float> [[CALL]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL230:%.*]] = call <16 x float> 
@llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 0, <16 x float> [[CALL225]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL235:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 0, <16 x float> [[CALL230]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <16 x float> [[CALL235]] +; +bb: + %load = load ptr, ptr %arg, align 8 + %load28 = load i8, ptr %load, align 1 + %getelementptr30 = getelementptr i8, ptr %load, i64 1 + %load31 = load i8, ptr %getelementptr30, align 1 + %getelementptr36 = getelementptr i8, ptr %load, i64 2 + %load37 = load i8, ptr %getelementptr36, align 1 + %getelementptr42 = getelementptr i8, ptr %load, i64 3 + %load43 = load i8, ptr %getelementptr42, align 1 + %getelementptr48 = getelementptr i8, ptr %load, i64 4 + %load49 = load i8, ptr %getelementptr48, align 1 + %getelementptr54 = getelementptr i8, ptr %load, i64 5 + %load55 = load i8, ptr %getelementptr54, align 1 + %getelementptr60 = getelementptr i8, ptr %load, i64 6 + %load61 = load i8, ptr %getelementptr60, align 1 + %getelementptr66 = getelementptr i8, ptr %load, i64 7 + %load67 = load i8, ptr %getelementptr66, align 1 + %getelementptr72 = getelementptr i8, ptr %load, i64 8 + %load73 = load i8, ptr %getelementptr72, align 1 + %getelementptr78 = getelementptr i8, ptr %load, i64 9 + %load79 = load i8, ptr %getelementptr78, align 1 + %getelementptr84 = getelementptr i8, ptr %load, i64 10 + %load85 = load i8, ptr %getelementptr84, align 1 + %getelementptr90 = getelementptr i8, ptr %load, i64 11 + %load91 = load i8, ptr %getelementptr90, align 1 + %getelementptr96 = getelementptr i8, ptr %load, i64 12 + %load97 = load i8, ptr %getelementptr96, align 1 + %getelementptr102 = getelementptr i8, ptr %load, i64 13 + %load103 = load i8, ptr %getelementptr102, align 1 + %getelementptr108 = getelementptr i8, ptr %load, i64 14 + %load109 = load i8, ptr %getelementptr108, align 1 + %getelementptr114 = getelementptr i8, ptr %load, i64 15 + %load115 = load i8, ptr 
%getelementptr114, align 1 + %getelementptr120 = getelementptr i8, ptr %load, i64 16 + %load121 = load i8, ptr %getelementptr120, align 1 + %getelementptr126 = getelementptr i8, ptr %load, i64 17 + %load127 = load i8, ptr %getelementptr126, align 1 + %getelementptr132 = getelementptr i8, ptr %load, i64 18 + %load133 = load i8, ptr %getelementptr132, align 1 + %getelementptr138 = getelementptr i8, ptr %load, i64 19 + %load139 = load i8, ptr %getelementptr138, align 1 + %getelementptr144 = getelementptr i8, ptr %load, i64 20 + %load145 = load i8, ptr %getelementptr144, align 1 + %getelementptr150 = getelementptr i8, ptr %load, i64 21 + %load151 = load i8, ptr %getelementptr150, align 1 + %getelementptr156 = getelementptr i8, ptr %load, i64 22 + %load157 = load i8, ptr %getelementptr156, align 1 + %getelementptr162 = getelementptr i8, ptr %load, i64 23 + %load163 = load i8, ptr %getelementptr162, align 1 + %getelementptr168 = getelementptr i8, ptr %load, i64 24 + %load169 = load i8, ptr %getelementptr168, align 1 + %getelementptr174 = getelementptr i8, ptr %load, i64 25 + %load175 = load i8, ptr %getelementptr174, align 1 + %getelementptr180 = getelementptr i8, ptr %load, i64 26 + %load181 = load i8, ptr %getelementptr180, align 1 + %getelementptr186 = getelementptr i8, ptr %load, i64 27 + %load187 = load i8, ptr %getelementptr186, align 1 + %getelementptr192 = getelementptr i8, ptr %load, i64 28 + %load193 = load i8, ptr %getelementptr192, align 1 + %getelementptr198 = getelementptr i8, ptr %load, i64 29 + %load199 = load i8, ptr %getelementptr198, align 1 + %getelementptr204 = getelementptr i8, ptr %load, i64 30 + %load205 = load i8, ptr %getelementptr204, align 1 + %getelementptr210 = getelementptr i8, ptr %load, i64 31 + %load211 = load i8, ptr %getelementptr210, align 1 + %alloca1.sroa.8.0.insert.ext = zext i8 %load67 to i64 + %alloca1.sroa.8.0.insert.shift = shl i64 %alloca1.sroa.8.0.insert.ext, 56 + %alloca1.sroa.7.0.insert.ext = zext i8 %load61 to i64 + 
%alloca1.sroa.7.0.insert.shift = shl i64 %alloca1.sroa.7.0.insert.ext, 48 + %alloca1.sroa.7.0.insert.insert = or i64 %alloca1.sroa.8.0.insert.shift, %alloca1.sroa.7.0.insert.shift + %alloca1.sroa.6.0.insert.ext = zext i8 %load55 to i64 + %alloca1.sroa.6.0.insert.shift = shl i64 %alloca1.sroa.6.0.insert.ext, 40 + %alloca1.sroa.6.0.insert.insert = or i64 %alloca1.sroa.7.0.insert.insert, %alloca1.sroa.6.0.insert.shift + %alloca1.sroa.5.0.insert.ext = zext i8 %load49 to i64 + %alloca1.sroa.5.0.insert.shift = shl i64 %alloca1.sroa.5.0.insert.ext, 32 + %alloca1.sroa.5.0.insert.insert = or i64 %alloca1.sroa.6.0.insert.insert, %alloca1.sroa.5.0.insert.shift + %alloca1.sroa.4.0.insert.ext = zext i8 %load43 to i64 + %alloca1.sroa.4.0.insert.shift = shl i64 %alloca1.sroa.4.0.insert.ext, 24 + %alloca1.sroa.4.0.insert.insert = or i64 %alloca1.sroa.5.0.insert.insert, %alloca1.sroa.4.0.insert.shift + %alloca1.sroa.3.0.insert.ext = zext i8 %load37 to i64 + %alloca1.sroa.3.0.insert.shift = shl i64 %alloca1.sroa.3.0.insert.ext, 16 + %alloca1.sroa.2.0.insert.ext = zext i8 %load31 to i64 + %alloca1.sroa.2.0.insert.shift = shl i64 %alloca1.sroa.2.0.insert.ext, 8 + %alloca1.sroa.2.0.insert.mask = or i64 %alloca1.sroa.4.0.insert.insert, %alloca1.sroa.3.0.insert.shift + %alloca1.sroa.0.0.insert.ext = zext i8 %load28 to i64 + %alloca1.sroa.0.0.insert.mask = or i64 %alloca1.sroa.2.0.insert.mask, %alloca1.sroa.2.0.insert.shift + %alloca1.sroa.0.0.insert.insert = or i64 %alloca1.sroa.0.0.insert.mask, %alloca1.sroa.0.0.insert.ext + %call = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.0.0.insert.insert, i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + %alloca1.sroa.17.8.insert.ext = zext i8 %load115 to i64 + %alloca1.sroa.17.8.insert.shift = shl i64 %alloca1.sroa.17.8.insert.ext, 56 + %alloca1.sroa.16.8.insert.ext = zext i8 %load109 to i64 + %alloca1.sroa.16.8.insert.shift = shl i64 %alloca1.sroa.16.8.insert.ext, 48 + %alloca1.sroa.16.8.insert.insert = 
or i64 %alloca1.sroa.17.8.insert.shift, %alloca1.sroa.16.8.insert.shift + %alloca1.sroa.15.8.insert.ext = zext i8 %load103 to i64 + %alloca1.sroa.15.8.insert.shift = shl i64 %alloca1.sroa.15.8.insert.ext, 40 + %alloca1.sroa.15.8.insert.insert = or i64 %alloca1.sroa.16.8.insert.insert, %alloca1.sroa.15.8.insert.shift + %alloca1.sroa.14.8.insert.ext = zext i8 %load97 to i64 + %alloca1.sroa.14.8.insert.shift = shl i64 %alloca1.sroa.14.8.insert.ext, 32 + %alloca1.sroa.14.8.insert.insert = or i64 %alloca1.sroa.15.8.insert.insert, %alloca1.sroa.14.8.insert.shift + %alloca1.sroa.13.8.insert.ext = zext i8 %load91 to i64 + %alloca1.sroa.13.8.insert.shift = shl i64 %alloca1.sroa.13.8.insert.ext, 24 + %alloca1.sroa.13.8.insert.insert = or i64 %alloca1.sroa.14.8.insert.insert, %alloca1.sroa.13.8.insert.shift + %alloca1.sroa.12.8.insert.ext = zext i8 %load85 to i64 + %alloca1.sroa.12.8.insert.shift = shl i64 %alloca1.sroa.12.8.insert.ext, 16 + %alloca1.sroa.11.8.insert.ext = zext i8 %load79 to i64 + %alloca1.sroa.11.8.insert.shift = shl i64 %alloca1.sroa.11.8.insert.ext, 8 + %alloca1.sroa.11.8.insert.mask = or i64 %alloca1.sroa.13.8.insert.insert, %alloca1.sroa.12.8.insert.shift + %alloca1.sroa.9.8.insert.ext = zext i8 %load73 to i64 + %alloca1.sroa.9.8.insert.mask = or i64 %alloca1.sroa.11.8.insert.mask, %alloca1.sroa.11.8.insert.shift + %alloca1.sroa.9.8.insert.insert = or i64 %alloca1.sroa.9.8.insert.mask, %alloca1.sroa.9.8.insert.ext + %call225 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.9.8.insert.insert, i64 0, <16 x float> %call, i32 0, i32 0, i32 0) + %alloca1.sroa.26.16.insert.ext = zext i8 %load163 to i64 + %alloca1.sroa.26.16.insert.shift = shl i64 %alloca1.sroa.26.16.insert.ext, 56 + %alloca1.sroa.25.16.insert.ext = zext i8 %load157 to i64 + %alloca1.sroa.25.16.insert.shift = shl i64 %alloca1.sroa.25.16.insert.ext, 48 + %alloca1.sroa.25.16.insert.insert = or i64 %alloca1.sroa.26.16.insert.shift, %alloca1.sroa.25.16.insert.shift + 
%alloca1.sroa.24.16.insert.ext = zext i8 %load151 to i64 + %alloca1.sroa.24.16.insert.shift = shl i64 %alloca1.sroa.24.16.insert.ext, 40 + %alloca1.sroa.24.16.insert.insert = or i64 %alloca1.sroa.25.16.insert.insert, %alloca1.sroa.24.16.insert.shift + %alloca1.sroa.23.16.insert.ext = zext i8 %load145 to i64 + %alloca1.sroa.23.16.insert.shift = shl i64 %alloca1.sroa.23.16.insert.ext, 32 + %alloca1.sroa.23.16.insert.insert = or i64 %alloca1.sroa.24.16.insert.insert, %alloca1.sroa.23.16.insert.shift + %alloca1.sroa.22.16.insert.ext = zext i8 %load139 to i64 + %alloca1.sroa.22.16.insert.shift = shl i64 %alloca1.sroa.22.16.insert.ext, 24 + %alloca1.sroa.22.16.insert.insert = or i64 %alloca1.sroa.23.16.insert.insert, %alloca1.sroa.22.16.insert.shift + %alloca1.sroa.21.16.insert.ext = zext i8 %load133 to i64 + %alloca1.sroa.21.16.insert.shift = shl i64 %alloca1.sroa.21.16.insert.ext, 16 + %alloca1.sroa.20.16.insert.ext = zext i8 %load127 to i64 + %alloca1.sroa.20.16.insert.shift = shl i64 %alloca1.sroa.20.16.insert.ext, 8 + %alloca1.sroa.20.16.insert.mask = or i64 %alloca1.sroa.22.16.insert.insert, %alloca1.sroa.21.16.insert.shift + %alloca1.sroa.18.16.insert.ext = zext i8 %load121 to i64 + %alloca1.sroa.18.16.insert.mask = or i64 %alloca1.sroa.20.16.insert.mask, %alloca1.sroa.20.16.insert.shift + %alloca1.sroa.18.16.insert.insert = or i64 %alloca1.sroa.18.16.insert.mask, %alloca1.sroa.18.16.insert.ext + %call230 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.18.16.insert.insert, i64 0, <16 x float> %call225, i32 0, i32 0, i32 0) + %alloca1.sroa.35.24.insert.ext = zext i8 %load211 to i64 + %alloca1.sroa.35.24.insert.shift = shl i64 %alloca1.sroa.35.24.insert.ext, 56 + %alloca1.sroa.34.24.insert.ext = zext i8 %load205 to i64 + %alloca1.sroa.34.24.insert.shift = shl i64 %alloca1.sroa.34.24.insert.ext, 48 + %alloca1.sroa.34.24.insert.insert = or i64 %alloca1.sroa.35.24.insert.shift, %alloca1.sroa.34.24.insert.shift + 
%alloca1.sroa.33.24.insert.ext = zext i8 %load199 to i64 + %alloca1.sroa.33.24.insert.shift = shl i64 %alloca1.sroa.33.24.insert.ext, 40 + %alloca1.sroa.33.24.insert.insert = or i64 %alloca1.sroa.34.24.insert.insert, %alloca1.sroa.33.24.insert.shift + %alloca1.sroa.32.24.insert.ext = zext i8 %load193 to i64 + %alloca1.sroa.32.24.insert.shift = shl i64 %alloca1.sroa.32.24.insert.ext, 32 + %alloca1.sroa.32.24.insert.insert = or i64 %alloca1.sroa.33.24.insert.insert, %alloca1.sroa.32.24.insert.shift + %alloca1.sroa.31.24.insert.ext = zext i8 %load187 to i64 + %alloca1.sroa.31.24.insert.shift = shl i64 %alloca1.sroa.31.24.insert.ext, 24 + %alloca1.sroa.31.24.insert.insert = or i64 %alloca1.sroa.32.24.insert.insert, %alloca1.sroa.31.24.insert.shift + %alloca1.sroa.30.24.insert.ext = zext i8 %load181 to i64 + %alloca1.sroa.30.24.insert.shift = shl i64 %alloca1.sroa.30.24.insert.ext, 16 + %alloca1.sroa.29.24.insert.ext = zext i8 %load175 to i64 + %alloca1.sroa.29.24.insert.shift = shl i64 %alloca1.sroa.29.24.insert.ext, 8 + %alloca1.sroa.29.24.insert.mask = or i64 %alloca1.sroa.31.24.insert.insert, %alloca1.sroa.30.24.insert.shift + %alloca1.sroa.27.24.insert.ext = zext i8 %load169 to i64 + %alloca1.sroa.27.24.insert.mask = or i64 %alloca1.sroa.29.24.insert.mask, %alloca1.sroa.29.24.insert.shift + %alloca1.sroa.27.24.insert.insert = or i64 %alloca1.sroa.27.24.insert.mask, %alloca1.sroa.27.24.insert.ext + %call235 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.27.24.insert.insert, i64 0, <16 x float> %call230, i32 0, i32 0, i32 0) + ret <16 x float> %call235 +} + +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0 + +attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg new file mode 100644 index 
0000000000000..7c492428aec76 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AMDGPU" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll b/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll index f60dd48a464d2..a18c3bad12fcf 100644 --- a/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll +++ b/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 6 ; RUN: opt < %s -passes=argpromotion -S | FileCheck %s ; PR17906 @@ -14,12 +14,12 @@ @d = global i8 0, align 1 define internal fastcc void @fn(ptr nocapture readonly %p1, ptr nocapture readonly %p2) { -; CHECK-LABEL: define {{[^@]+}}@fn -; CHECK-SAME: (i32 [[P1_0_VAL:%.*]], i64 [[P2_0_VAL:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define internal fastcc void @fn( +; CHECK-SAME: i32 [[P1_0_VAL:%.*]], i64 [[P2_0_VAL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[P2_0_VAL]] to i32 ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[P1_0_VAL]] to i8 -; CHECK-NEXT: store i8 [[CONV1]], ptr @d, align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: store i8 [[CONV1]], ptr @d, align 1, !tbaa [[CHAR_TBAA0:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -32,14 +32,14 @@ entry: } define i32 @main() { -; CHECK-LABEL: define {{[^@]+}}@main() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @e, align 8, !tbaa [[TBAA3:![0-9]+]] -; CHECK-NEXT: store ptr @g, ptr [[TMP0]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @a, align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 1, ptr [[TMP1]], align 4, !tbaa [[TBAA5:![0-9]+]] -; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr @g, align 4, !tbaa [[TBAA5]] -; 
CHECK-NEXT: [[C_VAL:%.*]] = load i64, ptr @c, align 8, !tbaa [[TBAA7:![0-9]+]] +; CHECK-LABEL: define i32 @main() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @e, align 8, !tbaa [[ANYPTR_TBAA3:![0-9]+]] +; CHECK-NEXT: store ptr @g, ptr [[TMP0]], align 8, !tbaa [[ANYPTR_TBAA3]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @a, align 8, !tbaa [[ANYPTR_TBAA3]] +; CHECK-NEXT: store i32 1, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] +; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr @g, align 4, !tbaa [[INT_TBAA5]] +; CHECK-NEXT: [[C_VAL:%.*]] = load i64, ptr @c, align 8, !tbaa [[LONG_TBAA7:![0-9]+]] ; CHECK-NEXT: call fastcc void @fn(i32 [[G_VAL]], i64 [[C_VAL]]) ; CHECK-NEXT: ret i32 0 ; @@ -63,3 +63,14 @@ entry: !8 = !{!9, !9, i64 0} !9 = !{!"any pointer", !3, i64 0} +;. +; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[ANYPTR_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"any pointer", [[META1]], i64 0} +; CHECK: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[META6]] = !{!"int", [[META1]], i64 0} +; CHECK: [[LONG_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; CHECK: [[META8]] = !{!"long", [[META1]], i64 0} +;. 
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll index bed038968a527..c27f827fc941e 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6 ; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,TUNIT ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,CGSCC @@ -23,12 +23,12 @@ ;. define internal fastcc void @fn(ptr nocapture readonly %p1, ptr nocapture readonly %p2) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none) -; CHECK-LABEL: define {{[^@]+}}@fn -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @g, align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-LABEL: define internal fastcc void @fn( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @g, align 4, !tbaa [[INT_TBAA0:![0-9]+]] ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[TMP0]] to i8 -; CHECK-NEXT: store i8 [[CONV1]], ptr @d, align 1, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: store i8 [[CONV1]], ptr @d, align 1, !tbaa [[CHAR_TBAA4:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -42,24 +42,24 @@ entry: define i32 @main() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@main -; TUNIT-SAME: () 
#[[ATTR1:[0-9]+]] { -; TUNIT-NEXT: entry: -; TUNIT-NEXT: [[TMP0:%.*]] = load ptr, ptr @e, align 8, !tbaa [[TBAA5:![0-9]+]] -; TUNIT-NEXT: store ptr @g, ptr [[TMP0]], align 8, !tbaa [[TBAA5]] -; TUNIT-NEXT: [[TMP1:%.*]] = load ptr, ptr @a, align 8, !tbaa [[TBAA5]] -; TUNIT-NEXT: store i32 1, ptr [[TMP1]], align 4, !tbaa [[TBAA0]] +; TUNIT-LABEL: define noundef i32 @main( +; TUNIT-SAME: ) #[[ATTR1:[0-9]+]] { +; TUNIT-NEXT: [[ENTRY:.*:]] +; TUNIT-NEXT: [[TMP0:%.*]] = load ptr, ptr @e, align 8, !tbaa [[ANYPTR_TBAA5:![0-9]+]] +; TUNIT-NEXT: store ptr @g, ptr [[TMP0]], align 8, !tbaa [[ANYPTR_TBAA5]] +; TUNIT-NEXT: [[TMP1:%.*]] = load ptr, ptr @a, align 8, !tbaa [[ANYPTR_TBAA5]] +; TUNIT-NEXT: store i32 1, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA0]] ; TUNIT-NEXT: call fastcc void @fn() #[[ATTR2:[0-9]+]] ; TUNIT-NEXT: ret i32 0 ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@main -; CGSCC-SAME: () #[[ATTR1:[0-9]+]] { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: [[TMP0:%.*]] = load ptr, ptr @e, align 8, !tbaa [[TBAA5:![0-9]+]] -; CGSCC-NEXT: store ptr @g, ptr [[TMP0]], align 8, !tbaa [[TBAA5]] -; CGSCC-NEXT: [[TMP1:%.*]] = load ptr, ptr @a, align 8, !tbaa [[TBAA5]] -; CGSCC-NEXT: store i32 1, ptr [[TMP1]], align 4, !tbaa [[TBAA0]] +; CGSCC-LABEL: define noundef i32 @main( +; CGSCC-SAME: ) #[[ATTR1:[0-9]+]] { +; CGSCC-NEXT: [[ENTRY:.*:]] +; CGSCC-NEXT: [[TMP0:%.*]] = load ptr, ptr @e, align 8, !tbaa [[ANYPTR_TBAA5:![0-9]+]] +; CGSCC-NEXT: store ptr @g, ptr [[TMP0]], align 8, !tbaa [[ANYPTR_TBAA5]] +; CGSCC-NEXT: [[TMP1:%.*]] = load ptr, ptr @a, align 8, !tbaa [[ANYPTR_TBAA5]] +; CGSCC-NEXT: store i32 1, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA0]] ; CGSCC-NEXT: call fastcc void @fn() #[[ATTR2:[0-9]+]] ; CGSCC-NEXT: ret i32 0 ; @@ -92,19 +92,19 @@ entry: ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree nosync nounwind willreturn } ; CGSCC: attributes #[[ATTR2]] = { nofree nounwind willreturn } ;. 
-; TUNIT: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; TUNIT: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; TUNIT: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} ; TUNIT: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; TUNIT: [[META3]] = !{!"Simple C/C++ TBAA"} -; TUNIT: [[TBAA4]] = !{[[META2]], [[META2]], i64 0} -; TUNIT: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; TUNIT: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +; TUNIT: [[ANYPTR_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} ; TUNIT: [[META6]] = !{!"any pointer", [[META2]], i64 0} ;. -; CGSCC: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CGSCC: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CGSCC: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} ; CGSCC: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CGSCC: [[META3]] = !{!"Simple C/C++ TBAA"} -; CGSCC: [[TBAA4]] = !{[[META2]], [[META2]], i64 0} -; CGSCC: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CGSCC: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +; CGSCC: [[ANYPTR_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} ; CGSCC: [[META6]] = !{!"any pointer", [[META2]], i64 0} ;. 
diff --git a/llvm/test/Transforms/Attributor/nocapture-1.ll b/llvm/test/Transforms/Attributor/nocapture-1.ll index 1f1c442fbc5f7..034b5ef397f0a 100644 --- a/llvm/test/Transforms/Attributor/nocapture-1.ll +++ b/llvm/test/Transforms/Attributor/nocapture-1.ll @@ -350,7 +350,7 @@ define void @callsite_readonly_nounwind_not_willreturn(ptr %f, ptr %p) { define void @callsite_readonly_nounwind_willreturn(ptr %f, ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@callsite_readonly_nounwind_willreturn -; CHECK-SAME: (ptr nofree noundef nonnull captures(none) [[F:%.*]], ptr captures(none) [[P:%.*]]) { +; CHECK-SAME: (ptr nofree noundef nonnull captures(none) [[F:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: call void [[F]](ptr captures(none) [[P]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll index 6491f01a6d2a3..a9ebdaa397015 100644 --- a/llvm/test/Transforms/Attributor/nofpclass.ll +++ b/llvm/test/Transforms/Attributor/nofpclass.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 2 -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,TUNIT -; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,CGSCC +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -use-constant-fp-for-scalable-splat=false -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,TUNIT,TUNIT-CV +; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -use-constant-fp-for-scalable-splat=false -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,CGSCC,CGSCC-CV + +; RUN: opt -aa-pipeline=basic-aa -passes=attributor 
-attributor-manifest-internal -attributor-annotate-decl-cs -use-constant-fp-for-scalable-splat -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,TUNIT,TUNIT-CI +; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -use-constant-fp-for-scalable-splat -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,CGSCC,CGSCC-CI declare nofpclass(nan) float @ret_nofpclass_nan() declare [2 x [3 x float]] @ret_array() @@ -2468,7 +2471,7 @@ define internal float @through_memory0(ptr %ptr.arg) { ; CGSCC-SAME: (float [[TMP0:%.*]]) #[[ATTR3]] { ; CGSCC-NEXT: [[PTR_ARG_PRIV:%.*]] = alloca float, align 4 ; CGSCC-NEXT: store float [[TMP0]], ptr [[PTR_ARG_PRIV]], align 4 -; CGSCC-NEXT: [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4 +; CGSCC-NEXT: [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4, !invariant.load [[META0:![0-9]+]] ; CGSCC-NEXT: ret float [[LOAD]] ; %load = load float, ptr %ptr.arg @@ -2491,7 +2494,7 @@ define internal float @through_memory1(ptr %ptr.arg) { ; CGSCC-SAME: (float [[TMP0:%.*]]) #[[ATTR3]] { ; CGSCC-NEXT: [[PTR_ARG_PRIV:%.*]] = alloca float, align 4 ; CGSCC-NEXT: store float [[TMP0]], ptr [[PTR_ARG_PRIV]], align 4 -; CGSCC-NEXT: [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4 +; CGSCC-NEXT: [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4, !invariant.load [[META0]] ; CGSCC-NEXT: [[CALL:%.*]] = call float @llvm.arithmetic.fence.f32(float [[LOAD]]) #[[ATTR19]] ; CGSCC-NEXT: ret float [[CALL]] ; @@ -2507,7 +2510,7 @@ define internal float @through_memory2(ptr %ptr.arg) { ; CHECK-SAME: (float [[TMP0:%.*]]) #[[ATTR15:[0-9]+]] { ; CHECK-NEXT: [[PTR_ARG_PRIV:%.*]] = alloca float, align 4 ; CHECK-NEXT: store float [[TMP0]], ptr [[PTR_ARG_PRIV]], align 4 -; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4, !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: [[CALL:%.*]] = call 
float @extern.f32(float [[LOAD]]) ; CHECK-NEXT: ret float [[CALL]] ; @@ -2664,10 +2667,15 @@ define [4 x float] @constant_aggregate_zero() { } define @scalable_splat_pnorm() { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define noundef @scalable_splat_pnorm -; CHECK-SAME: () #[[ATTR3]] { -; CHECK-NEXT: ret splat (float 1.000000e+00) +; CHECK-CV: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-CV-LABEL: define noundef @scalable_splat_pnorm +; CHECK-CV-SAME: () #[[ATTR3]] { +; CHECK-CV-NEXT: ret splat (float 1.000000e+00) +; +; CHECK-CI: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-CI-LABEL: define noundef nofpclass(nan inf zero sub nnorm) @scalable_splat_pnorm +; CHECK-CI-SAME: () #[[ATTR3]] { +; CHECK-CI-NEXT: ret splat (float 1.000000e+00) ; ret splat (float 1.0) } @@ -2978,3 +2986,8 @@ attributes #2 = { "denormal-fp-math"="ieee,preserve-sign" } attributes #3 = { "denormal-fp-math"="positive-zero,positive-zero" } attributes #4 = { "denormal-fp-math"="positive-zero,ieee" } attributes #5 = { "denormal-fp-math"="ieee,positive-zero" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CGSCC-CI: {{.*}} +; CGSCC-CV: {{.*}} +; TUNIT-CI: {{.*}} +; TUNIT-CV: {{.*}} diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll index 82bed0f27c046..3e07fe42261e9 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6 ; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,TUNIT ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,CGSCC ; @@ -66,10 +66,10 @@ ;. 
define void @write_arg(ptr %p, i32 %v) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) -; CHECK-LABEL: define {{[^@]+}}@write_arg -; CHECK-SAME: (ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-LABEL: define void @write_arg( +; CHECK-SAME: ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -78,11 +78,11 @@ entry: } define void @write_random(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@write_random -; CHECK-SAME: (ptr nofree writeonly captures(none) [[P:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @write_random( +; CHECK-SAME: ptr nofree writeonly captures(none) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL:%.*]] = call i32 (...) @random() -; CHECK-NEXT: store i32 [[CALL]], ptr [[P]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: store i32 [[CALL]], ptr [[P]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: ret void ; entry: @@ -112,9 +112,9 @@ declare i32 @random(...) 
; } define void @local_alloca_simplifiable_1(ptr noalias sret(%struct.S) align 4 %agg.result) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -; TUNIT-LABEL: define {{[^@]+}}@local_alloca_simplifiable_1 -; TUNIT-SAME: (ptr noalias nofree writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable_or_null(24) [[AGG_RESULT:%.*]]) #[[ATTR1:[0-9]+]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define void @local_alloca_simplifiable_1( +; TUNIT-SAME: ptr noalias nofree writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable_or_null(24) [[AGG_RESULT:%.*]]) #[[ATTR1:[0-9]+]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 4 ; TUNIT-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(24) [[S]]) #[[ATTR17:[0-9]+]] ; TUNIT-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 @@ -126,65 +126,65 @@ define void @local_alloca_simplifiable_1(ptr noalias sret(%struct.S) align 4 %ag ; TUNIT-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR18]] ; TUNIT-NEXT: [[F12:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 3 -; TUNIT-NEXT: store float 0x3FF19999A0000000, ptr [[F12]], align 4, !tbaa [[TBAA7:![0-9]+]] +; TUNIT-NEXT: store float 0x3FF19999A0000000, ptr [[F12]], align 4, !tbaa [[FLOAT_TBAA7:![0-9]+]] ; TUNIT-NEXT: [[F24:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 4 -; TUNIT-NEXT: store float 0x40119999A0000000, ptr [[F24]], align 4, !tbaa [[TBAA10:![0-9]+]] +; TUNIT-NEXT: store float 0x40119999A0000000, ptr [[F24]], align 4, !tbaa [[FLOAT_TBAA10:![0-9]+]] ; TUNIT-NEXT: [[F37:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 
0, i32 5 -; TUNIT-NEXT: store float 0x40119999A0000000, ptr [[F37]], align 4, !tbaa [[TBAA11:![0-9]+]] -; TUNIT-NEXT: store i32 1, ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA12:![0-9]+]] +; TUNIT-NEXT: store float 0x40119999A0000000, ptr [[F37]], align 4, !tbaa [[FLOAT_TBAA11:![0-9]+]] +; TUNIT-NEXT: store i32 1, ptr [[AGG_RESULT]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; TUNIT-NEXT: [[I212:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 1 -; TUNIT-NEXT: store i32 4, ptr [[I212]], align 4, !tbaa [[TBAA13:![0-9]+]] +; TUNIT-NEXT: store i32 4, ptr [[I212]], align 4, !tbaa [[INT_TBAA13:![0-9]+]] ; TUNIT-NEXT: [[I316:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 2 -; TUNIT-NEXT: store i32 4, ptr [[I316]], align 4, !tbaa [[TBAA14:![0-9]+]] +; TUNIT-NEXT: store i32 4, ptr [[I316]], align 4, !tbaa [[INT_TBAA14:![0-9]+]] ; TUNIT-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(24) [[S]]) #[[ATTR17]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) -; CGSCC-LABEL: define {{[^@]+}}@local_alloca_simplifiable_1 -; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable(24) [[AGG_RESULT:%.*]]) #[[ATTR1:[0-9]+]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define void @local_alloca_simplifiable_1( +; CGSCC-SAME: ptr noalias nofree noundef nonnull writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable(24) [[AGG_RESULT:%.*]]) #[[ATTR1:[0-9]+]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 4 ; CGSCC-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(24) [[S]]) #[[ATTR20:[0-9]+]] ; CGSCC-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; CGSCC-NEXT: store float 0x3FF19999A0000000, ptr 
[[F1]], align 4, !tbaa [[TBAA7:![0-9]+]] +; CGSCC-NEXT: store float 0x3FF19999A0000000, ptr [[F1]], align 4, !tbaa [[FLOAT_TBAA7:![0-9]+]] ; CGSCC-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 4 -; CGSCC-NEXT: store float 0x40019999A0000000, ptr [[F2]], align 4, !tbaa [[TBAA10:![0-9]+]] +; CGSCC-NEXT: store float 0x40019999A0000000, ptr [[F2]], align 4, !tbaa [[FLOAT_TBAA10:![0-9]+]] ; CGSCC-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 5 -; CGSCC-NEXT: store float 0x400A666660000000, ptr [[F3]], align 4, !tbaa [[TBAA11:![0-9]+]] +; CGSCC-NEXT: store float 0x400A666660000000, ptr [[F3]], align 4, !tbaa [[FLOAT_TBAA11:![0-9]+]] ; CGSCC-NEXT: call void @write_arg(ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(24) [[S]], i32 noundef 1) #[[ATTR21:[0-9]+]] ; CGSCC-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 1 ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR21]] ; CGSCC-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR21]] ; CGSCC-NEXT: [[F11:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; CGSCC-NEXT: [[I4:%.*]] = load float, ptr [[F11]], align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: [[I4:%.*]] = load float, ptr [[F11]], align 4, !tbaa [[FLOAT_TBAA7]] ; CGSCC-NEXT: [[F12:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 3 -; CGSCC-NEXT: store float [[I4]], ptr [[F12]], align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: store float [[I4]], ptr [[F12]], align 4, !tbaa [[FLOAT_TBAA7]] ; CGSCC-NEXT: [[F23:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 4 -; CGSCC-NEXT: [[I5:%.*]] = load float, ptr [[F23]], align 4, !tbaa 
[[TBAA10]] +; CGSCC-NEXT: [[I5:%.*]] = load float, ptr [[F23]], align 4, !tbaa [[FLOAT_TBAA10]] ; CGSCC-NEXT: [[MUL:%.*]] = fmul float [[I5]], 2.000000e+00 ; CGSCC-NEXT: [[F24:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 4 -; CGSCC-NEXT: store float [[MUL]], ptr [[F24]], align 4, !tbaa [[TBAA10]] +; CGSCC-NEXT: store float [[MUL]], ptr [[F24]], align 4, !tbaa [[FLOAT_TBAA10]] ; CGSCC-NEXT: [[F35:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 5 -; CGSCC-NEXT: [[I6:%.*]] = load float, ptr [[F35]], align 4, !tbaa [[TBAA11]] +; CGSCC-NEXT: [[I6:%.*]] = load float, ptr [[F35]], align 4, !tbaa [[FLOAT_TBAA11]] ; CGSCC-NEXT: [[F16:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; CGSCC-NEXT: [[I7:%.*]] = load float, ptr [[F16]], align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: [[I7:%.*]] = load float, ptr [[F16]], align 4, !tbaa [[FLOAT_TBAA7]] ; CGSCC-NEXT: [[ADD:%.*]] = fadd float [[I6]], [[I7]] ; CGSCC-NEXT: [[F37:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 5 -; CGSCC-NEXT: store float [[ADD]], ptr [[F37]], align 4, !tbaa [[TBAA11]] -; CGSCC-NEXT: [[I8:%.*]] = load i32, ptr [[S]], align 4, !tbaa [[TBAA12:![0-9]+]] -; CGSCC-NEXT: store i32 [[I8]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA12]] +; CGSCC-NEXT: store float [[ADD]], ptr [[F37]], align 4, !tbaa [[FLOAT_TBAA11]] +; CGSCC-NEXT: [[I8:%.*]] = load i32, ptr [[S]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] +; CGSCC-NEXT: store i32 [[I8]], ptr [[AGG_RESULT]], align 4, !tbaa [[INT_TBAA12]] ; CGSCC-NEXT: [[I210:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 1 -; CGSCC-NEXT: [[I9:%.*]] = load i32, ptr [[I210]], align 4, !tbaa [[TBAA13:![0-9]+]] +; CGSCC-NEXT: [[I9:%.*]] = load i32, ptr [[I210]], align 4, !tbaa [[INT_TBAA13:![0-9]+]] ; CGSCC-NEXT: [[MUL11:%.*]] = shl nsw i32 [[I9]], 1 ; CGSCC-NEXT: [[I212:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 1 -; CGSCC-NEXT: store i32 
[[MUL11]], ptr [[I212]], align 4, !tbaa [[TBAA13]] +; CGSCC-NEXT: store i32 [[MUL11]], ptr [[I212]], align 4, !tbaa [[INT_TBAA13]] ; CGSCC-NEXT: [[I313:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 -; CGSCC-NEXT: [[I10:%.*]] = load i32, ptr [[I313]], align 4, !tbaa [[TBAA14:![0-9]+]] -; CGSCC-NEXT: [[I11:%.*]] = load i32, ptr [[S]], align 4, !tbaa [[TBAA12]] +; CGSCC-NEXT: [[I10:%.*]] = load i32, ptr [[I313]], align 4, !tbaa [[INT_TBAA14:![0-9]+]] +; CGSCC-NEXT: [[I11:%.*]] = load i32, ptr [[S]], align 4, !tbaa [[INT_TBAA12]] ; CGSCC-NEXT: [[ADD15:%.*]] = add nsw i32 [[I10]], [[I11]] ; CGSCC-NEXT: [[I316:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 2 -; CGSCC-NEXT: store i32 [[ADD15]], ptr [[I316]], align 4, !tbaa [[TBAA14]] +; CGSCC-NEXT: store i32 [[ADD15]], ptr [[I316]], align 4, !tbaa [[INT_TBAA14]] ; CGSCC-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(24) [[S]]) #[[ATTR20]] ; CGSCC-NEXT: ret void ; @@ -256,156 +256,156 @@ declare void @llvm.lifetime.end.p0(ptr nocapture) ; define void @local_alloca_simplifiable_2() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@local_alloca_simplifiable_2 -; TUNIT-SAME: () #[[ATTR3:[0-9]+]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define void @local_alloca_simplifiable_2( +; TUNIT-SAME: ) #[[ATTR3:[0-9]+]] { +; TUNIT-NEXT: [[ENTRY:.*]]: ; TUNIT-NEXT: [[BYTES:%.*]] = alloca [1024 x i8], align 16 ; TUNIT-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 16 captures(none) dereferenceable(1024) [[BYTES]]) #[[ATTR17]] -; TUNIT-NEXT: br label [[FOR_COND:%.*]] -; TUNIT: for.cond: -; TUNIT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; TUNIT-NEXT: br label %[[FOR_COND:.*]] +; TUNIT: [[FOR_COND]]: +; TUNIT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 
[[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[ENTRY]] ] ; TUNIT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], 100 -; TUNIT-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; TUNIT: for.cond.cleanup: -; TUNIT-NEXT: br label [[FOR_END:%.*]] -; TUNIT: for.body: +; TUNIT-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +; TUNIT: [[FOR_COND_CLEANUP]]: +; TUNIT-NEXT: br label %[[FOR_END:.*]] +; TUNIT: [[FOR_BODY]]: ; TUNIT-NEXT: [[I15:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], 10 ; TUNIT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 [[I15]] -; TUNIT-NEXT: br label [[FOR_INC]] -; TUNIT: for.inc: +; TUNIT-NEXT: br label %[[FOR_INC]] +; TUNIT: [[FOR_INC]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; TUNIT-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -; TUNIT: for.end: -; TUNIT-NEXT: br label [[FOR_COND2:%.*]] -; TUNIT: for.cond2: -; TUNIT-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_INC9:%.*]] ], [ 0, [[FOR_END]] ] +; TUNIT-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; TUNIT: [[FOR_END]]: +; TUNIT-NEXT: br label %[[FOR_COND2:.*]] +; TUNIT: [[FOR_COND2]]: +; TUNIT-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], %[[FOR_INC9:.*]] ], [ 0, %[[FOR_END]] ] ; TUNIT-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV2]], 10 -; TUNIT-NEXT: br i1 [[EXITCOND6]], label [[FOR_BODY5:%.*]], label [[FOR_COND_CLEANUP4:%.*]] -; TUNIT: for.cond.cleanup4: -; TUNIT-NEXT: br label [[FOR_END11:%.*]] -; TUNIT: for.body5: +; TUNIT-NEXT: br i1 [[EXITCOND6]], label %[[FOR_BODY5:.*]], label %[[FOR_COND_CLEANUP4:.*]] +; TUNIT: [[FOR_COND_CLEANUP4]]: +; TUNIT-NEXT: br label %[[FOR_END11:.*]] +; TUNIT: [[FOR_BODY5]]: ; TUNIT-NEXT: [[I17:%.*]] = mul nuw nsw i64 [[INDVARS_IV2]], 10 ; TUNIT-NEXT: [[I18:%.*]] = or i64 [[I17]], 1 ; TUNIT-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr 
[[BYTES]], i64 [[I18]] -; TUNIT-NEXT: br label [[FOR_INC9]] -; TUNIT: for.inc9: +; TUNIT-NEXT: br label %[[FOR_INC9]] +; TUNIT: [[FOR_INC9]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 -; TUNIT-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP17:![0-9]+]] -; TUNIT: for.end11: -; TUNIT-NEXT: br label [[FOR_COND13:%.*]] -; TUNIT: for.cond13: -; TUNIT-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC22:%.*]] ], [ 0, [[FOR_END11]] ] +; TUNIT-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP17:![0-9]+]] +; TUNIT: [[FOR_END11]]: +; TUNIT-NEXT: br label %[[FOR_COND13:.*]] +; TUNIT: [[FOR_COND13]]: +; TUNIT-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], %[[FOR_INC22:.*]] ], [ 0, %[[FOR_END11]] ] ; TUNIT-NEXT: [[EXITCOND11:%.*]] = icmp ne i64 [[INDVARS_IV7]], 20 -; TUNIT-NEXT: br i1 [[EXITCOND11]], label [[FOR_BODY16:%.*]], label [[FOR_COND_CLEANUP15:%.*]] -; TUNIT: for.cond.cleanup15: -; TUNIT-NEXT: br label [[FOR_END24:%.*]] -; TUNIT: for.body16: +; TUNIT-NEXT: br i1 [[EXITCOND11]], label %[[FOR_BODY16:.*]], label %[[FOR_COND_CLEANUP15:.*]] +; TUNIT: [[FOR_COND_CLEANUP15]]: +; TUNIT-NEXT: br label %[[FOR_END24:.*]] +; TUNIT: [[FOR_BODY16]]: ; TUNIT-NEXT: [[I20:%.*]] = mul nuw nsw i64 [[INDVARS_IV7]], 10 ; TUNIT-NEXT: [[I21:%.*]] = add nuw nsw i64 [[I20]], 2 ; TUNIT-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i64, ptr [[BYTES]], i64 [[I21]] -; TUNIT-NEXT: br label [[FOR_INC22]] -; TUNIT: for.inc22: +; TUNIT-NEXT: br label %[[FOR_INC22]] +; TUNIT: [[FOR_INC22]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1 -; TUNIT-NEXT: br label [[FOR_COND13]], !llvm.loop [[LOOP18:![0-9]+]] -; TUNIT: for.end24: +; TUNIT-NEXT: br label %[[FOR_COND13]], !llvm.loop [[LOOP18:![0-9]+]] +; TUNIT: [[FOR_END24]]: ; TUNIT-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 1023 ; TUNIT-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], 
i64 0, i64 500 ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(524) [[ARRAYIDX26]], i32 noundef 0) #[[ATTR18]] -; TUNIT-NEXT: br label [[FOR_COND28:%.*]] -; TUNIT: for.cond28: -; TUNIT-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC36:%.*]] ], [ 0, [[FOR_END24]] ] +; TUNIT-NEXT: br label %[[FOR_COND28:.*]] +; TUNIT: [[FOR_COND28]]: +; TUNIT-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], %[[FOR_INC36:.*]] ], [ 0, %[[FOR_END24]] ] ; TUNIT-NEXT: [[EXITCOND14:%.*]] = icmp ne i64 [[INDVARS_IV12]], 1024 -; TUNIT-NEXT: br i1 [[EXITCOND14]], label [[FOR_BODY31:%.*]], label [[FOR_COND_CLEANUP30:%.*]] -; TUNIT: for.cond.cleanup30: -; TUNIT-NEXT: br label [[FOR_END38:%.*]] -; TUNIT: for.body31: +; TUNIT-NEXT: br i1 [[EXITCOND14]], label %[[FOR_BODY31:.*]], label %[[FOR_COND_CLEANUP30:.*]] +; TUNIT: [[FOR_COND_CLEANUP30]]: +; TUNIT-NEXT: br label %[[FOR_END38:.*]] +; TUNIT: [[FOR_BODY31]]: ; TUNIT-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds [1024 x i8], ptr @globalBytes, i64 0, i64 [[INDVARS_IV12]] -; TUNIT-NEXT: store i8 0, ptr [[ARRAYIDX35]], align 1, !tbaa [[TBAA19:![0-9]+]] -; TUNIT-NEXT: br label [[FOR_INC36]] -; TUNIT: for.inc36: +; TUNIT-NEXT: store i8 0, ptr [[ARRAYIDX35]], align 1, !tbaa [[CHAR_TBAA19:![0-9]+]] +; TUNIT-NEXT: br label %[[FOR_INC36]] +; TUNIT: [[FOR_INC36]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT13]] = add nuw nsw i64 [[INDVARS_IV12]], 1 -; TUNIT-NEXT: br label [[FOR_COND28]], !llvm.loop [[LOOP20:![0-9]+]] -; TUNIT: for.end38: +; TUNIT-NEXT: br label %[[FOR_COND28]], !llvm.loop [[LOOP20:![0-9]+]] +; TUNIT: [[FOR_END38]]: ; TUNIT-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 16 captures(none) dereferenceable(1024) [[BYTES]]) #[[ATTR17]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@local_alloca_simplifiable_2 -; CGSCC-SAME: () 
#[[ATTR3:[0-9]+]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define void @local_alloca_simplifiable_2( +; CGSCC-SAME: ) #[[ATTR3:[0-9]+]] { +; CGSCC-NEXT: [[ENTRY:.*]]: ; CGSCC-NEXT: [[BYTES:%.*]] = alloca [1024 x i8], align 16 ; CGSCC-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 16 captures(none) dereferenceable(1024) [[BYTES]]) #[[ATTR20]] -; CGSCC-NEXT: br label [[FOR_COND:%.*]] -; CGSCC: for.cond: -; CGSCC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CGSCC-NEXT: br label %[[FOR_COND:.*]] +; CGSCC: [[FOR_COND]]: +; CGSCC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[ENTRY]] ] ; CGSCC-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], 100 -; CGSCC-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CGSCC: for.cond.cleanup: -; CGSCC-NEXT: br label [[FOR_END:%.*]] -; CGSCC: for.body: +; CGSCC-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CGSCC: [[FOR_COND_CLEANUP]]: +; CGSCC-NEXT: br label %[[FOR_END:.*]] +; CGSCC: [[FOR_BODY]]: ; CGSCC-NEXT: [[I15:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], 10 ; CGSCC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 [[I15]] -; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX]], align 2, !tbaa [[TBAA15:![0-9]+]] -; CGSCC-NEXT: br label [[FOR_INC]] -; CGSCC: for.inc: +; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX]], align 2, !tbaa [[CHAR_TBAA15:![0-9]+]] +; CGSCC-NEXT: br label %[[FOR_INC]] +; CGSCC: [[FOR_INC]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CGSCC-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -; CGSCC: for.end: -; CGSCC-NEXT: br label [[FOR_COND2:%.*]] -; CGSCC: for.cond2: -; CGSCC-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_INC9:%.*]] ], [ 0, [[FOR_END]] ] +; CGSCC-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; 
CGSCC: [[FOR_END]]: +; CGSCC-NEXT: br label %[[FOR_COND2:.*]] +; CGSCC: [[FOR_COND2]]: +; CGSCC-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], %[[FOR_INC9:.*]] ], [ 0, %[[FOR_END]] ] ; CGSCC-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV2]], 10 -; CGSCC-NEXT: br i1 [[EXITCOND6]], label [[FOR_BODY5:%.*]], label [[FOR_COND_CLEANUP4:%.*]] -; CGSCC: for.cond.cleanup4: -; CGSCC-NEXT: br label [[FOR_END11:%.*]] -; CGSCC: for.body5: +; CGSCC-NEXT: br i1 [[EXITCOND6]], label %[[FOR_BODY5:.*]], label %[[FOR_COND_CLEANUP4:.*]] +; CGSCC: [[FOR_COND_CLEANUP4]]: +; CGSCC-NEXT: br label %[[FOR_END11:.*]] +; CGSCC: [[FOR_BODY5]]: ; CGSCC-NEXT: [[I17:%.*]] = mul nuw nsw i64 [[INDVARS_IV2]], 10 ; CGSCC-NEXT: [[I18:%.*]] = or i64 [[I17]], 1 ; CGSCC-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[BYTES]], i64 [[I18]] -; CGSCC-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA18:![0-9]+]] -; CGSCC-NEXT: br label [[FOR_INC9]] -; CGSCC: for.inc9: +; CGSCC-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX8]], align 4, !tbaa [[FLOAT_TBAA18:![0-9]+]] +; CGSCC-NEXT: br label %[[FOR_INC9]] +; CGSCC: [[FOR_INC9]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 -; CGSCC-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP19:![0-9]+]] -; CGSCC: for.end11: -; CGSCC-NEXT: br label [[FOR_COND13:%.*]] -; CGSCC: for.cond13: -; CGSCC-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC22:%.*]] ], [ 0, [[FOR_END11]] ] +; CGSCC-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP19:![0-9]+]] +; CGSCC: [[FOR_END11]]: +; CGSCC-NEXT: br label %[[FOR_COND13:.*]] +; CGSCC: [[FOR_COND13]]: +; CGSCC-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], %[[FOR_INC22:.*]] ], [ 0, %[[FOR_END11]] ] ; CGSCC-NEXT: [[EXITCOND11:%.*]] = icmp ne i64 [[INDVARS_IV7]], 20 -; CGSCC-NEXT: br i1 [[EXITCOND11]], label [[FOR_BODY16:%.*]], label [[FOR_COND_CLEANUP15:%.*]] -; CGSCC: for.cond.cleanup15: -; CGSCC-NEXT: br label 
[[FOR_END24:%.*]] -; CGSCC: for.body16: +; CGSCC-NEXT: br i1 [[EXITCOND11]], label %[[FOR_BODY16:.*]], label %[[FOR_COND_CLEANUP15:.*]] +; CGSCC: [[FOR_COND_CLEANUP15]]: +; CGSCC-NEXT: br label %[[FOR_END24:.*]] +; CGSCC: [[FOR_BODY16]]: ; CGSCC-NEXT: [[I20:%.*]] = mul nuw nsw i64 [[INDVARS_IV7]], 10 ; CGSCC-NEXT: [[I21:%.*]] = add nuw nsw i64 [[I20]], 2 ; CGSCC-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i64, ptr [[BYTES]], i64 [[I21]] -; CGSCC-NEXT: store i64 0, ptr [[ARRAYIDX21]], align 16, !tbaa [[TBAA20:![0-9]+]] -; CGSCC-NEXT: br label [[FOR_INC22]] -; CGSCC: for.inc22: +; CGSCC-NEXT: store i64 0, ptr [[ARRAYIDX21]], align 16, !tbaa [[LONG_LONG_TBAA20:![0-9]+]] +; CGSCC-NEXT: br label %[[FOR_INC22]] +; CGSCC: [[FOR_INC22]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1 -; CGSCC-NEXT: br label [[FOR_COND13]], !llvm.loop [[LOOP22:![0-9]+]] -; CGSCC: for.end24: +; CGSCC-NEXT: br label %[[FOR_COND13]], !llvm.loop [[LOOP22:![0-9]+]] +; CGSCC: [[FOR_END24]]: ; CGSCC-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 1023 -; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX25]], align 1, !tbaa [[TBAA15]] +; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX25]], align 1, !tbaa [[CHAR_TBAA15]] ; CGSCC-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 500 ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(524) [[ARRAYIDX26]], i32 noundef 0) #[[ATTR21]] -; CGSCC-NEXT: br label [[FOR_COND28:%.*]] -; CGSCC: for.cond28: -; CGSCC-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC36:%.*]] ], [ 0, [[FOR_END24]] ] +; CGSCC-NEXT: br label %[[FOR_COND28:.*]] +; CGSCC: [[FOR_COND28]]: +; CGSCC-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], %[[FOR_INC36:.*]] ], [ 0, %[[FOR_END24]] ] ; CGSCC-NEXT: [[EXITCOND14:%.*]] = icmp ne i64 [[INDVARS_IV12]], 1024 -; CGSCC-NEXT: br i1 [[EXITCOND14]], label 
[[FOR_BODY31:%.*]], label [[FOR_COND_CLEANUP30:%.*]] -; CGSCC: for.cond.cleanup30: -; CGSCC-NEXT: br label [[FOR_END38:%.*]] -; CGSCC: for.body31: +; CGSCC-NEXT: br i1 [[EXITCOND14]], label %[[FOR_BODY31:.*]], label %[[FOR_COND_CLEANUP30:.*]] +; CGSCC: [[FOR_COND_CLEANUP30]]: +; CGSCC-NEXT: br label %[[FOR_END38:.*]] +; CGSCC: [[FOR_BODY31]]: ; CGSCC-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 [[INDVARS_IV12]] -; CGSCC-NEXT: [[I23:%.*]] = load i8, ptr [[ARRAYIDX33]], align 1, !tbaa [[TBAA15]] +; CGSCC-NEXT: [[I23:%.*]] = load i8, ptr [[ARRAYIDX33]], align 1, !tbaa [[CHAR_TBAA15]] ; CGSCC-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds [1024 x i8], ptr @globalBytes, i64 0, i64 [[INDVARS_IV12]] -; CGSCC-NEXT: store i8 [[I23]], ptr [[ARRAYIDX35]], align 1, !tbaa [[TBAA15]] -; CGSCC-NEXT: br label [[FOR_INC36]] -; CGSCC: for.inc36: +; CGSCC-NEXT: store i8 [[I23]], ptr [[ARRAYIDX35]], align 1, !tbaa [[CHAR_TBAA15]] +; CGSCC-NEXT: br label %[[FOR_INC36]] +; CGSCC: [[FOR_INC36]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT13]] = add nuw nsw i64 [[INDVARS_IV12]], 1 -; CGSCC-NEXT: br label [[FOR_COND28]], !llvm.loop [[LOOP23:![0-9]+]] -; CGSCC: for.end38: +; CGSCC-NEXT: br label %[[FOR_COND28]], !llvm.loop [[LOOP23:![0-9]+]] +; CGSCC: [[FOR_END38]]: ; CGSCC-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 16 captures(none) dereferenceable(1024) [[BYTES]]) #[[ATTR20]] ; CGSCC-NEXT: ret void ; @@ -516,10 +516,10 @@ for.end38: ; preds = %for.cond.cleanup30 ; define i32 @local_alloca_simplifiable_3() { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@local_alloca_simplifiable_3 -; CHECK-SAME: () #[[ATTR4:[0-9]+]] { -; CHECK-NEXT: br label [[SPLIT:%.*]] -; CHECK: split: +; CHECK-LABEL: define noundef i32 @local_alloca_simplifiable_3( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: br label %[[SPLIT:.*]] +; CHECK: [[SPLIT]]: ; 
CHECK-NEXT: ret i32 2 ; %A = alloca i32, align 4 @@ -537,8 +537,8 @@ split: ; define i32 @local_alloca_simplifiable_4() { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@local_alloca_simplifiable_4 -; CHECK-SAME: () #[[ATTR4]] { +; CHECK-LABEL: define i32 @local_alloca_simplifiable_4( +; CHECK-SAME: ) #[[ATTR4]] { ; CHECK-NEXT: ret i32 undef ; %A = alloca i32, align 4 @@ -554,34 +554,34 @@ define i32 @local_alloca_simplifiable_4() { ; } define i32 @multi_obj_simplifiable_1(i32 %cnd) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@multi_obj_simplifiable_1 -; TUNIT-SAME: (i32 [[CND:%.*]]) #[[ATTR3]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define noundef i32 @multi_obj_simplifiable_1( +; TUNIT-SAME: i32 [[CND:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[L:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[L]]) #[[ATTR17]] ; TUNIT-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 -; TUNIT-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] -; TUNIT: cond.true: -; TUNIT-NEXT: br label [[COND_END:%.*]] -; TUNIT: cond.false: -; TUNIT-NEXT: br label [[COND_END]] -; TUNIT: cond.end: +; TUNIT-NEXT: br i1 [[TOBOOL_NOT]], label %[[COND_FALSE:.*]], label %[[COND_TRUE:.*]] +; TUNIT: [[COND_TRUE]]: +; TUNIT-NEXT: br label %[[COND_END:.*]] +; TUNIT: [[COND_FALSE]]: +; TUNIT-NEXT: br label %[[COND_END]] +; TUNIT: [[COND_END]]: ; TUNIT-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[L]]) #[[ATTR17]] ; TUNIT-NEXT: ret i32 5 ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@multi_obj_simplifiable_1 -; CGSCC-SAME: (i32 [[CND:%.*]]) #[[ATTR5:[0-9]+]] { -; 
CGSCC-NEXT: entry: +; CGSCC-LABEL: define noundef i32 @multi_obj_simplifiable_1( +; CGSCC-SAME: i32 [[CND:%.*]]) #[[ATTR5:[0-9]+]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[L:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[L]]) #[[ATTR20]] ; CGSCC-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 -; CGSCC-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] -; CGSCC: cond.true: -; CGSCC-NEXT: br label [[COND_END:%.*]] -; CGSCC: cond.false: -; CGSCC-NEXT: br label [[COND_END]] -; CGSCC: cond.end: +; CGSCC-NEXT: br i1 [[TOBOOL_NOT]], label %[[COND_FALSE:.*]], label %[[COND_TRUE:.*]] +; CGSCC: [[COND_TRUE]]: +; CGSCC-NEXT: br label %[[COND_END:.*]] +; CGSCC: [[COND_FALSE]]: +; CGSCC-NEXT: br label %[[COND_END]] +; CGSCC: [[COND_END]]: ; CGSCC-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[L]]) #[[ATTR20]] ; CGSCC-NEXT: ret i32 5 ; @@ -616,34 +616,34 @@ cond.end: ; preds = %cond.false, %cond.t ; define i32 @multi_obj_simplifiable_2(i32 %cnd) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@multi_obj_simplifiable_2 -; TUNIT-SAME: (i32 [[CND:%.*]]) #[[ATTR3]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define i32 @multi_obj_simplifiable_2( +; TUNIT-SAME: i32 [[CND:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[L:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[L]]) #[[ATTR17]] ; TUNIT-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 -; TUNIT-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] -; TUNIT: cond.true: -; TUNIT-NEXT: br label [[COND_END:%.*]] -; TUNIT: cond.false: -; TUNIT-NEXT: br label [[COND_END]] -; TUNIT: cond.end: +; TUNIT-NEXT: br i1 
[[TOBOOL_NOT]], label %[[COND_FALSE:.*]], label %[[COND_TRUE:.*]] +; TUNIT: [[COND_TRUE]]: +; TUNIT-NEXT: br label %[[COND_END:.*]] +; TUNIT: [[COND_FALSE]]: +; TUNIT-NEXT: br label %[[COND_END]] +; TUNIT: [[COND_END]]: ; TUNIT-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[L]]) #[[ATTR17]] ; TUNIT-NEXT: ret i32 5 ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@multi_obj_simplifiable_2 -; CGSCC-SAME: (i32 [[CND:%.*]]) #[[ATTR5]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define i32 @multi_obj_simplifiable_2( +; CGSCC-SAME: i32 [[CND:%.*]]) #[[ATTR5]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[L:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[L]]) #[[ATTR20]] ; CGSCC-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 -; CGSCC-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] -; CGSCC: cond.true: -; CGSCC-NEXT: br label [[COND_END:%.*]] -; CGSCC: cond.false: -; CGSCC-NEXT: br label [[COND_END]] -; CGSCC: cond.end: +; CGSCC-NEXT: br i1 [[TOBOOL_NOT]], label %[[COND_FALSE:.*]], label %[[COND_TRUE:.*]] +; CGSCC: [[COND_TRUE]]: +; CGSCC-NEXT: br label %[[COND_END:.*]] +; CGSCC: [[COND_FALSE]]: +; CGSCC-NEXT: br label %[[COND_END]] +; CGSCC: [[COND_END]]: ; CGSCC-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[L]]) #[[ATTR20]] ; CGSCC-NEXT: ret i32 5 ; @@ -687,58 +687,58 @@ cond.end: ; preds = %cond.false, %cond.t ; define void @static_global_simplifiable_1(ptr noalias sret(%struct.S) align 4 %agg.result) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@static_global_simplifiable_1 -; TUNIT-SAME: (ptr noalias nofree writeonly sret([[STRUCT_S:%.*]]) align 
4 captures(none) dereferenceable_or_null(24) [[AGG_RESULT:%.*]]) #[[ATTR5:[0-9]+]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define void @static_global_simplifiable_1( +; TUNIT-SAME: ptr noalias nofree writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable_or_null(24) [[AGG_RESULT:%.*]]) #[[ATTR5:[0-9]+]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(24) @Gs1, i32 noundef 1) #[[ATTR18]] ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(20) getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 1), i32 noundef 2) #[[ATTR18]] ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(16) getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 2), i32 noundef 3) #[[ATTR18]] ; TUNIT-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 3 -; TUNIT-NEXT: store float 0x3FF19999A0000000, ptr [[F1]], align 4, !tbaa [[TBAA7]] +; TUNIT-NEXT: store float 0x3FF19999A0000000, ptr [[F1]], align 4, !tbaa [[FLOAT_TBAA7]] ; TUNIT-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 4 -; TUNIT-NEXT: store float 0x40119999A0000000, ptr [[F2]], align 4, !tbaa [[TBAA10]] +; TUNIT-NEXT: store float 0x40119999A0000000, ptr [[F2]], align 4, !tbaa [[FLOAT_TBAA10]] ; TUNIT-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 5 -; TUNIT-NEXT: store float 0x40119999A0000000, ptr [[F3]], align 4, !tbaa [[TBAA11]] -; TUNIT-NEXT: store i32 1, ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA12]] +; TUNIT-NEXT: store float 0x40119999A0000000, ptr [[F3]], align 4, !tbaa [[FLOAT_TBAA11]] +; TUNIT-NEXT: store i32 1, ptr [[AGG_RESULT]], align 4, !tbaa [[INT_TBAA12]] ; TUNIT-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 1 -; TUNIT-NEXT: store 
i32 4, ptr [[I2]], align 4, !tbaa [[TBAA13]] +; TUNIT-NEXT: store i32 4, ptr [[I2]], align 4, !tbaa [[INT_TBAA13]] ; TUNIT-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 2 -; TUNIT-NEXT: store i32 4, ptr [[I3]], align 4, !tbaa [[TBAA14]] +; TUNIT-NEXT: store i32 4, ptr [[I3]], align 4, !tbaa [[INT_TBAA14]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@static_global_simplifiable_1 -; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable(24) [[AGG_RESULT:%.*]]) #[[ATTR3]] { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: store float 0x3FF19999A0000000, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 3), align 4, !tbaa [[TBAA7]] -; CGSCC-NEXT: store float 0x40019999A0000000, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 4), align 4, !tbaa [[TBAA10]] -; CGSCC-NEXT: store float 0x400A666660000000, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 5), align 4, !tbaa [[TBAA11]] +; CGSCC-LABEL: define void @static_global_simplifiable_1( +; CGSCC-SAME: ptr noalias nofree noundef nonnull writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable(24) [[AGG_RESULT:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: [[ENTRY:.*:]] +; CGSCC-NEXT: store float 0x3FF19999A0000000, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 3), align 4, !tbaa [[FLOAT_TBAA7]] +; CGSCC-NEXT: store float 0x40019999A0000000, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 4), align 4, !tbaa [[FLOAT_TBAA10]] +; CGSCC-NEXT: store float 0x400A666660000000, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 5), align 4, !tbaa [[FLOAT_TBAA11]] ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(24) @Gs1, i32 noundef 1) #[[ATTR21]] ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef 
nonnull writeonly align 4 captures(none) dereferenceable(20) getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 1), i32 noundef 2) #[[ATTR21]] ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(16) getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 2), i32 noundef 3) #[[ATTR21]] -; CGSCC-NEXT: [[I:%.*]] = load float, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 3), align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: [[I:%.*]] = load float, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 3), align 4, !tbaa [[FLOAT_TBAA7]] ; CGSCC-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 3 -; CGSCC-NEXT: store float [[I]], ptr [[F1]], align 4, !tbaa [[TBAA7]] -; CGSCC-NEXT: [[I4:%.*]] = load float, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 4), align 4, !tbaa [[TBAA10]] +; CGSCC-NEXT: store float [[I]], ptr [[F1]], align 4, !tbaa [[FLOAT_TBAA7]] +; CGSCC-NEXT: [[I4:%.*]] = load float, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 4), align 4, !tbaa [[FLOAT_TBAA10]] ; CGSCC-NEXT: [[MUL:%.*]] = fmul float [[I4]], 2.000000e+00 ; CGSCC-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 4 -; CGSCC-NEXT: store float [[MUL]], ptr [[F2]], align 4, !tbaa [[TBAA10]] -; CGSCC-NEXT: [[I5:%.*]] = load float, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 5), align 4, !tbaa [[TBAA11]] -; CGSCC-NEXT: [[I6:%.*]] = load float, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 3), align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: store float [[MUL]], ptr [[F2]], align 4, !tbaa [[FLOAT_TBAA10]] +; CGSCC-NEXT: [[I5:%.*]] = load float, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 5), align 4, !tbaa [[FLOAT_TBAA11]] +; CGSCC-NEXT: [[I6:%.*]] = load float, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 3), align 4, !tbaa [[FLOAT_TBAA7]] ; 
CGSCC-NEXT: [[ADD:%.*]] = fadd float [[I5]], [[I6]] ; CGSCC-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 5 -; CGSCC-NEXT: store float [[ADD]], ptr [[F3]], align 4, !tbaa [[TBAA11]] -; CGSCC-NEXT: [[I7:%.*]] = load i32, ptr @Gs1, align 4, !tbaa [[TBAA12]] -; CGSCC-NEXT: store i32 [[I7]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA12]] -; CGSCC-NEXT: [[I8:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 1), align 4, !tbaa [[TBAA13]] +; CGSCC-NEXT: store float [[ADD]], ptr [[F3]], align 4, !tbaa [[FLOAT_TBAA11]] +; CGSCC-NEXT: [[I7:%.*]] = load i32, ptr @Gs1, align 4, !tbaa [[INT_TBAA12]] +; CGSCC-NEXT: store i32 [[I7]], ptr [[AGG_RESULT]], align 4, !tbaa [[INT_TBAA12]] +; CGSCC-NEXT: [[I8:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 1), align 4, !tbaa [[INT_TBAA13]] ; CGSCC-NEXT: [[MUL1:%.*]] = shl nsw i32 [[I8]], 1 ; CGSCC-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 1 -; CGSCC-NEXT: store i32 [[MUL1]], ptr [[I2]], align 4, !tbaa [[TBAA13]] -; CGSCC-NEXT: [[I9:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 2), align 4, !tbaa [[TBAA14]] -; CGSCC-NEXT: [[I10:%.*]] = load i32, ptr @Gs1, align 4, !tbaa [[TBAA12]] +; CGSCC-NEXT: store i32 [[MUL1]], ptr [[I2]], align 4, !tbaa [[INT_TBAA13]] +; CGSCC-NEXT: [[I9:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_S]], ptr @Gs1, i64 0, i32 2), align 4, !tbaa [[INT_TBAA14]] +; CGSCC-NEXT: [[I10:%.*]] = load i32, ptr @Gs1, align 4, !tbaa [[INT_TBAA12]] ; CGSCC-NEXT: [[ADD2:%.*]] = add nsw i32 [[I9]], [[I10]] ; CGSCC-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 2 -; CGSCC-NEXT: store i32 [[ADD2]], ptr [[I3]], align 4, !tbaa [[TBAA14]] +; CGSCC-NEXT: store i32 [[ADD2]], ptr [[I3]], align 4, !tbaa [[INT_TBAA14]] ; CGSCC-NEXT: ret void ; entry: @@ -776,13 +776,13 @@ entry: define i32 @test_range_merge1() { 
; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@test_range_merge1 -; TUNIT-SAME: () #[[ATTR5]] { +; TUNIT-LABEL: define noundef i32 @test_range_merge1( +; TUNIT-SAME: ) #[[ATTR5]] { ; TUNIT-NEXT: ret i32 2 ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; CGSCC-LABEL: define {{[^@]+}}@test_range_merge1 -; CGSCC-SAME: () #[[ATTR6:[0-9]+]] { +; CGSCC-LABEL: define noundef i32 @test_range_merge1( +; CGSCC-SAME: ) #[[ATTR6:[0-9]+]] { ; CGSCC-NEXT: ret i32 2 ; store <2 x i32> , ptr @Vs1 @@ -795,8 +795,8 @@ define i32 @test_range_merge1() { define i32 @test_range_merge2() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@test_range_merge2 -; TUNIT-SAME: () #[[ATTR3]] { +; TUNIT-LABEL: define i32 @test_range_merge2( +; TUNIT-SAME: ) #[[ATTR3]] { ; TUNIT-NEXT: store <2 x i32> , ptr @Vs2, align 8 ; TUNIT-NEXT: [[L0:%.*]] = load i32, ptr @Vs2, align 4 ; TUNIT-NEXT: [[L1:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_S:%.*]], ptr @Vs2, i64 0, i32 1), align 4 @@ -804,8 +804,8 @@ define i32 @test_range_merge2() { ; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@test_range_merge2 -; CGSCC-SAME: () #[[ATTR5]] { +; CGSCC-LABEL: define i32 @test_range_merge2( +; CGSCC-SAME: ) #[[ATTR5]] { ; CGSCC-NEXT: store <2 x i32> , ptr @Vs2, align 8 ; CGSCC-NEXT: [[L0:%.*]] = load i32, ptr @Vs2, align 4 ; CGSCC-NEXT: [[L1:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_S:%.*]], ptr @Vs2, i64 0, i32 1), align 4 @@ -837,147 +837,147 @@ define i32 @test_range_merge2() { ; define void @static_global_simplifiable_2() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@static_global_simplifiable_2 -; TUNIT-SAME: () 
#[[ATTR5]] { -; TUNIT-NEXT: entry: -; TUNIT-NEXT: br label [[FOR_COND:%.*]] -; TUNIT: for.cond: -; TUNIT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; TUNIT-LABEL: define void @static_global_simplifiable_2( +; TUNIT-SAME: ) #[[ATTR5]] { +; TUNIT-NEXT: [[ENTRY:.*]]: +; TUNIT-NEXT: br label %[[FOR_COND:.*]] +; TUNIT: [[FOR_COND]]: +; TUNIT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[ENTRY]] ] ; TUNIT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], 100 -; TUNIT-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; TUNIT: for.cond.cleanup: -; TUNIT-NEXT: br label [[FOR_END:%.*]] -; TUNIT: for.body: +; TUNIT-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +; TUNIT: [[FOR_COND_CLEANUP]]: +; TUNIT-NEXT: br label %[[FOR_END:.*]] +; TUNIT: [[FOR_BODY]]: ; TUNIT-NEXT: [[I:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], 10 ; TUNIT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr @GBytes, i64 0, i64 [[I]] -; TUNIT-NEXT: br label [[FOR_INC]] -; TUNIT: for.inc: +; TUNIT-NEXT: br label %[[FOR_INC]] +; TUNIT: [[FOR_INC]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; TUNIT-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]] -; TUNIT: for.end: -; TUNIT-NEXT: br label [[FOR_COND2:%.*]] -; TUNIT: for.cond2: -; TUNIT-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_INC9:%.*]] ], [ 0, [[FOR_END]] ] +; TUNIT-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]] +; TUNIT: [[FOR_END]]: +; TUNIT-NEXT: br label %[[FOR_COND2:.*]] +; TUNIT: [[FOR_COND2]]: +; TUNIT-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], %[[FOR_INC9:.*]] ], [ 0, %[[FOR_END]] ] ; TUNIT-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV2]], 10 -; TUNIT-NEXT: br i1 [[EXITCOND6]], label [[FOR_BODY5:%.*]], label [[FOR_COND_CLEANUP4:%.*]] -; TUNIT: for.cond.cleanup4: -; 
TUNIT-NEXT: br label [[FOR_END11:%.*]] -; TUNIT: for.body5: +; TUNIT-NEXT: br i1 [[EXITCOND6]], label %[[FOR_BODY5:.*]], label %[[FOR_COND_CLEANUP4:.*]] +; TUNIT: [[FOR_COND_CLEANUP4]]: +; TUNIT-NEXT: br label %[[FOR_END11:.*]] +; TUNIT: [[FOR_BODY5]]: ; TUNIT-NEXT: [[I15:%.*]] = mul nuw nsw i64 [[INDVARS_IV2]], 10 ; TUNIT-NEXT: [[I16:%.*]] = or i64 [[I15]], 1 ; TUNIT-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr @GBytes, i64 [[I16]] -; TUNIT-NEXT: br label [[FOR_INC9]] -; TUNIT: for.inc9: +; TUNIT-NEXT: br label %[[FOR_INC9]] +; TUNIT: [[FOR_INC9]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 -; TUNIT-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP22:![0-9]+]] -; TUNIT: for.end11: -; TUNIT-NEXT: br label [[FOR_COND13:%.*]] -; TUNIT: for.cond13: -; TUNIT-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC21:%.*]] ], [ 0, [[FOR_END11]] ] +; TUNIT-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP22:![0-9]+]] +; TUNIT: [[FOR_END11]]: +; TUNIT-NEXT: br label %[[FOR_COND13:.*]] +; TUNIT: [[FOR_COND13]]: +; TUNIT-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], %[[FOR_INC21:.*]] ], [ 0, %[[FOR_END11]] ] ; TUNIT-NEXT: [[EXITCOND11:%.*]] = icmp ne i64 [[INDVARS_IV7]], 20 -; TUNIT-NEXT: br i1 [[EXITCOND11]], label [[FOR_BODY16:%.*]], label [[FOR_COND_CLEANUP15:%.*]] -; TUNIT: for.cond.cleanup15: -; TUNIT-NEXT: br label [[FOR_END23:%.*]] -; TUNIT: for.body16: +; TUNIT-NEXT: br i1 [[EXITCOND11]], label %[[FOR_BODY16:.*]], label %[[FOR_COND_CLEANUP15:.*]] +; TUNIT: [[FOR_COND_CLEANUP15]]: +; TUNIT-NEXT: br label %[[FOR_END23:.*]] +; TUNIT: [[FOR_BODY16]]: ; TUNIT-NEXT: [[I17:%.*]] = mul nuw nsw i64 [[INDVARS_IV7]], 10 ; TUNIT-NEXT: [[I18:%.*]] = add nuw nsw i64 [[I17]], 2 ; TUNIT-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i64, ptr @GBytes, i64 [[I18]] -; TUNIT-NEXT: br label [[FOR_INC21]] -; TUNIT: for.inc21: +; TUNIT-NEXT: br label %[[FOR_INC21]] +; TUNIT: [[FOR_INC21]]: ; TUNIT-NEXT: 
[[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1 -; TUNIT-NEXT: br label [[FOR_COND13]], !llvm.loop [[LOOP23:![0-9]+]] -; TUNIT: for.end23: +; TUNIT-NEXT: br label %[[FOR_COND13]], !llvm.loop [[LOOP23:![0-9]+]] +; TUNIT: [[FOR_END23]]: ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(524) getelementptr inbounds ([1024 x i8], ptr @GBytes, i64 0, i64 500), i32 noundef 0) #[[ATTR18]] -; TUNIT-NEXT: br label [[FOR_COND25:%.*]] -; TUNIT: for.cond25: -; TUNIT-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC33:%.*]] ], [ 0, [[FOR_END23]] ] +; TUNIT-NEXT: br label %[[FOR_COND25:.*]] +; TUNIT: [[FOR_COND25]]: +; TUNIT-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], %[[FOR_INC33:.*]] ], [ 0, %[[FOR_END23]] ] ; TUNIT-NEXT: [[EXITCOND14:%.*]] = icmp ne i64 [[INDVARS_IV12]], 1024 -; TUNIT-NEXT: br i1 [[EXITCOND14]], label [[FOR_BODY28:%.*]], label [[FOR_COND_CLEANUP27:%.*]] -; TUNIT: for.cond.cleanup27: -; TUNIT-NEXT: br label [[FOR_END35:%.*]] -; TUNIT: for.body28: +; TUNIT-NEXT: br i1 [[EXITCOND14]], label %[[FOR_BODY28:.*]], label %[[FOR_COND_CLEANUP27:.*]] +; TUNIT: [[FOR_COND_CLEANUP27]]: +; TUNIT-NEXT: br label %[[FOR_END35:.*]] +; TUNIT: [[FOR_BODY28]]: ; TUNIT-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [1024 x i8], ptr @globalBytes, i64 0, i64 [[INDVARS_IV12]] -; TUNIT-NEXT: store i8 0, ptr [[ARRAYIDX32]], align 1, !tbaa [[TBAA19]] -; TUNIT-NEXT: br label [[FOR_INC33]] -; TUNIT: for.inc33: +; TUNIT-NEXT: store i8 0, ptr [[ARRAYIDX32]], align 1, !tbaa [[CHAR_TBAA19]] +; TUNIT-NEXT: br label %[[FOR_INC33]] +; TUNIT: [[FOR_INC33]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT13]] = add nuw nsw i64 [[INDVARS_IV12]], 1 -; TUNIT-NEXT: br label [[FOR_COND25]], !llvm.loop [[LOOP24:![0-9]+]] -; TUNIT: for.end35: +; TUNIT-NEXT: br label %[[FOR_COND25]], !llvm.loop [[LOOP24:![0-9]+]] +; TUNIT: [[FOR_END35]]: ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress 
nofree nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@static_global_simplifiable_2 -; CGSCC-SAME: () #[[ATTR3]] { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: br label [[FOR_COND:%.*]] -; CGSCC: for.cond: -; CGSCC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CGSCC-LABEL: define void @static_global_simplifiable_2( +; CGSCC-SAME: ) #[[ATTR3]] { +; CGSCC-NEXT: [[ENTRY:.*]]: +; CGSCC-NEXT: br label %[[FOR_COND:.*]] +; CGSCC: [[FOR_COND]]: +; CGSCC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[ENTRY]] ] ; CGSCC-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], 100 -; CGSCC-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CGSCC: for.cond.cleanup: -; CGSCC-NEXT: br label [[FOR_END:%.*]] -; CGSCC: for.body: +; CGSCC-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CGSCC: [[FOR_COND_CLEANUP]]: +; CGSCC-NEXT: br label %[[FOR_END:.*]] +; CGSCC: [[FOR_BODY]]: ; CGSCC-NEXT: [[I:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], 10 ; CGSCC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr @GBytes, i64 0, i64 [[I]] -; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX]], align 2, !tbaa [[TBAA15]] -; CGSCC-NEXT: br label [[FOR_INC]] -; CGSCC: for.inc: +; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX]], align 2, !tbaa [[CHAR_TBAA15]] +; CGSCC-NEXT: br label %[[FOR_INC]] +; CGSCC: [[FOR_INC]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CGSCC-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]] -; CGSCC: for.end: -; CGSCC-NEXT: br label [[FOR_COND2:%.*]] -; CGSCC: for.cond2: -; CGSCC-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_INC9:%.*]] ], [ 0, [[FOR_END]] ] +; CGSCC-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]] +; CGSCC: [[FOR_END]]: +; CGSCC-NEXT: br label %[[FOR_COND2:.*]] +; CGSCC: [[FOR_COND2]]: +; CGSCC-NEXT: [[INDVARS_IV2:%.*]] = phi i64 
[ [[INDVARS_IV_NEXT3:%.*]], %[[FOR_INC9:.*]] ], [ 0, %[[FOR_END]] ] ; CGSCC-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV2]], 10 -; CGSCC-NEXT: br i1 [[EXITCOND6]], label [[FOR_BODY5:%.*]], label [[FOR_COND_CLEANUP4:%.*]] -; CGSCC: for.cond.cleanup4: -; CGSCC-NEXT: br label [[FOR_END11:%.*]] -; CGSCC: for.body5: +; CGSCC-NEXT: br i1 [[EXITCOND6]], label %[[FOR_BODY5:.*]], label %[[FOR_COND_CLEANUP4:.*]] +; CGSCC: [[FOR_COND_CLEANUP4]]: +; CGSCC-NEXT: br label %[[FOR_END11:.*]] +; CGSCC: [[FOR_BODY5]]: ; CGSCC-NEXT: [[I15:%.*]] = mul nuw nsw i64 [[INDVARS_IV2]], 10 ; CGSCC-NEXT: [[I16:%.*]] = or i64 [[I15]], 1 ; CGSCC-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr @GBytes, i64 [[I16]] -; CGSCC-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA18]] -; CGSCC-NEXT: br label [[FOR_INC9]] -; CGSCC: for.inc9: +; CGSCC-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX8]], align 4, !tbaa [[FLOAT_TBAA18]] +; CGSCC-NEXT: br label %[[FOR_INC9]] +; CGSCC: [[FOR_INC9]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 -; CGSCC-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP25:![0-9]+]] -; CGSCC: for.end11: -; CGSCC-NEXT: br label [[FOR_COND13:%.*]] -; CGSCC: for.cond13: -; CGSCC-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC21:%.*]] ], [ 0, [[FOR_END11]] ] +; CGSCC-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP25:![0-9]+]] +; CGSCC: [[FOR_END11]]: +; CGSCC-NEXT: br label %[[FOR_COND13:.*]] +; CGSCC: [[FOR_COND13]]: +; CGSCC-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], %[[FOR_INC21:.*]] ], [ 0, %[[FOR_END11]] ] ; CGSCC-NEXT: [[EXITCOND11:%.*]] = icmp ne i64 [[INDVARS_IV7]], 20 -; CGSCC-NEXT: br i1 [[EXITCOND11]], label [[FOR_BODY16:%.*]], label [[FOR_COND_CLEANUP15:%.*]] -; CGSCC: for.cond.cleanup15: -; CGSCC-NEXT: br label [[FOR_END23:%.*]] -; CGSCC: for.body16: +; CGSCC-NEXT: br i1 [[EXITCOND11]], label %[[FOR_BODY16:.*]], label %[[FOR_COND_CLEANUP15:.*]] +; CGSCC: 
[[FOR_COND_CLEANUP15]]: +; CGSCC-NEXT: br label %[[FOR_END23:.*]] +; CGSCC: [[FOR_BODY16]]: ; CGSCC-NEXT: [[I17:%.*]] = mul nuw nsw i64 [[INDVARS_IV7]], 10 ; CGSCC-NEXT: [[I18:%.*]] = add nuw nsw i64 [[I17]], 2 ; CGSCC-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i64, ptr @GBytes, i64 [[I18]] -; CGSCC-NEXT: store i64 0, ptr [[ARRAYIDX20]], align 16, !tbaa [[TBAA20]] -; CGSCC-NEXT: br label [[FOR_INC21]] -; CGSCC: for.inc21: +; CGSCC-NEXT: store i64 0, ptr [[ARRAYIDX20]], align 16, !tbaa [[LONG_LONG_TBAA20]] +; CGSCC-NEXT: br label %[[FOR_INC21]] +; CGSCC: [[FOR_INC21]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1 -; CGSCC-NEXT: br label [[FOR_COND13]], !llvm.loop [[LOOP26:![0-9]+]] -; CGSCC: for.end23: -; CGSCC-NEXT: store i8 0, ptr getelementptr inbounds ([1024 x i8], ptr @GBytes, i64 0, i64 1023), align 1, !tbaa [[TBAA15]] +; CGSCC-NEXT: br label %[[FOR_COND13]], !llvm.loop [[LOOP26:![0-9]+]] +; CGSCC: [[FOR_END23]]: +; CGSCC-NEXT: store i8 0, ptr getelementptr inbounds ([1024 x i8], ptr @GBytes, i64 0, i64 1023), align 1, !tbaa [[CHAR_TBAA15]] ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(524) getelementptr inbounds ([1024 x i8], ptr @GBytes, i64 0, i64 500), i32 noundef 0) #[[ATTR21]] -; CGSCC-NEXT: br label [[FOR_COND25:%.*]] -; CGSCC: for.cond25: -; CGSCC-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC33:%.*]] ], [ 0, [[FOR_END23]] ] +; CGSCC-NEXT: br label %[[FOR_COND25:.*]] +; CGSCC: [[FOR_COND25]]: +; CGSCC-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], %[[FOR_INC33:.*]] ], [ 0, %[[FOR_END23]] ] ; CGSCC-NEXT: [[EXITCOND14:%.*]] = icmp ne i64 [[INDVARS_IV12]], 1024 -; CGSCC-NEXT: br i1 [[EXITCOND14]], label [[FOR_BODY28:%.*]], label [[FOR_COND_CLEANUP27:%.*]] -; CGSCC: for.cond.cleanup27: -; CGSCC-NEXT: br label [[FOR_END35:%.*]] -; CGSCC: for.body28: +; CGSCC-NEXT: br i1 [[EXITCOND14]], label %[[FOR_BODY28:.*]], 
label %[[FOR_COND_CLEANUP27:.*]] +; CGSCC: [[FOR_COND_CLEANUP27]]: +; CGSCC-NEXT: br label %[[FOR_END35:.*]] +; CGSCC: [[FOR_BODY28]]: ; CGSCC-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [1024 x i8], ptr @GBytes, i64 0, i64 [[INDVARS_IV12]] -; CGSCC-NEXT: [[I19:%.*]] = load i8, ptr [[ARRAYIDX30]], align 1, !tbaa [[TBAA15]] +; CGSCC-NEXT: [[I19:%.*]] = load i8, ptr [[ARRAYIDX30]], align 1, !tbaa [[CHAR_TBAA15]] ; CGSCC-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [1024 x i8], ptr @globalBytes, i64 0, i64 [[INDVARS_IV12]] -; CGSCC-NEXT: store i8 [[I19]], ptr [[ARRAYIDX32]], align 1, !tbaa [[TBAA15]] -; CGSCC-NEXT: br label [[FOR_INC33]] -; CGSCC: for.inc33: +; CGSCC-NEXT: store i8 [[I19]], ptr [[ARRAYIDX32]], align 1, !tbaa [[CHAR_TBAA15]] +; CGSCC-NEXT: br label %[[FOR_INC33]] +; CGSCC: [[FOR_INC33]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT13]] = add nuw nsw i64 [[INDVARS_IV12]], 1 -; CGSCC-NEXT: br label [[FOR_COND25]], !llvm.loop [[LOOP27:![0-9]+]] -; CGSCC: for.end35: +; CGSCC-NEXT: br label %[[FOR_COND25]], !llvm.loop [[LOOP27:![0-9]+]] +; CGSCC: [[FOR_END35]]: ; CGSCC-NEXT: ret void ; entry: @@ -1080,15 +1080,15 @@ for.end35: ; preds = %for.cond.cleanup27 ; } define i32 @static_global_simplifiable_3() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@static_global_simplifiable_3 -; TUNIT-SAME: () #[[ATTR5]] { -; TUNIT-NEXT: store i32 1, ptr @Flag3, align 4, !tbaa [[TBAA3]] +; TUNIT-LABEL: define noundef i32 @static_global_simplifiable_3( +; TUNIT-SAME: ) #[[ATTR5]] { +; TUNIT-NEXT: store i32 1, ptr @Flag3, align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: ret i32 1 ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; CGSCC-LABEL: define {{[^@]+}}@static_global_simplifiable_3 -; CGSCC-SAME: () #[[ATTR6]] { -; CGSCC-NEXT: store i32 1, ptr @Flag3, align 4, !tbaa [[TBAA3]] +; CGSCC-LABEL: define noundef i32 @static_global_simplifiable_3( +; 
CGSCC-SAME: ) #[[ATTR6]] { +; CGSCC-NEXT: store i32 1, ptr @Flag3, align 4, !tbaa [[INT_TBAA3]] ; CGSCC-NEXT: ret i32 1 ; store i32 1, ptr @Flag3, align 4, !tbaa !3 @@ -1115,95 +1115,95 @@ define i32 @static_global_simplifiable_3() { ; define void @noalias_arg_simplifiable_1(ptr noalias sret(%struct.S) align 4 %agg.result, ptr byval(%struct.S) align 8 %s) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -; TUNIT-LABEL: define {{[^@]+}}@noalias_arg_simplifiable_1 -; TUNIT-SAME: (ptr noalias nofree writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable_or_null(24) [[AGG_RESULT:%.*]], ptr noalias nofree nonnull byval([[STRUCT_S]]) align 8 captures(none) dereferenceable(24) [[S:%.*]]) #[[ATTR1]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define void @noalias_arg_simplifiable_1( +; TUNIT-SAME: ptr noalias nofree writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable_or_null(24) [[AGG_RESULT:%.*]], ptr noalias nofree nonnull byval([[STRUCT_S]]) align 8 captures(none) dereferenceable(24) [[S:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; TUNIT-NEXT: store float 0x3FF19999A0000000, ptr [[F1]], align 4, !tbaa [[TBAA7]] +; TUNIT-NEXT: store float 0x3FF19999A0000000, ptr [[F1]], align 4, !tbaa [[FLOAT_TBAA7]] ; TUNIT-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 4 -; TUNIT-NEXT: store float 0x40019999A0000000, ptr [[F2]], align 8, !tbaa [[TBAA10]] +; TUNIT-NEXT: store float 0x40019999A0000000, ptr [[F2]], align 8, !tbaa [[FLOAT_TBAA10]] ; TUNIT-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 5 -; TUNIT-NEXT: store float 0x400A666660000000, ptr [[F3]], align 4, !tbaa [[TBAA11]] +; TUNIT-NEXT: store float 0x400A666660000000, ptr [[F3]], align 4, !tbaa [[FLOAT_TBAA11]] ; TUNIT-NEXT: call void @write_arg(ptr noalias nofree noundef 
nonnull writeonly align 8 captures(none) dereferenceable(24) [[S]], i32 noundef 1) #[[ATTR18]] ; TUNIT-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 1 ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR18]] ; TUNIT-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR18]] ; TUNIT-NEXT: [[F11:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; TUNIT-NEXT: [[I:%.*]] = load float, ptr [[F11]], align 4, !tbaa [[TBAA7]] +; TUNIT-NEXT: [[I:%.*]] = load float, ptr [[F11]], align 4, !tbaa [[FLOAT_TBAA7]] ; TUNIT-NEXT: [[F12:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 3 -; TUNIT-NEXT: store float [[I]], ptr [[F12]], align 4, !tbaa [[TBAA7]] +; TUNIT-NEXT: store float [[I]], ptr [[F12]], align 4, !tbaa [[FLOAT_TBAA7]] ; TUNIT-NEXT: [[F23:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 4 -; TUNIT-NEXT: [[I4:%.*]] = load float, ptr [[F23]], align 8, !tbaa [[TBAA10]] +; TUNIT-NEXT: [[I4:%.*]] = load float, ptr [[F23]], align 8, !tbaa [[FLOAT_TBAA10]] ; TUNIT-NEXT: [[MUL:%.*]] = fmul float [[I4]], 2.000000e+00 ; TUNIT-NEXT: [[F24:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 4 -; TUNIT-NEXT: store float [[MUL]], ptr [[F24]], align 4, !tbaa [[TBAA10]] +; TUNIT-NEXT: store float [[MUL]], ptr [[F24]], align 4, !tbaa [[FLOAT_TBAA10]] ; TUNIT-NEXT: [[F35:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 5 -; TUNIT-NEXT: [[I5:%.*]] = load float, ptr [[F35]], align 4, !tbaa [[TBAA11]] +; TUNIT-NEXT: [[I5:%.*]] = load float, ptr [[F35]], align 4, !tbaa [[FLOAT_TBAA11]] ; TUNIT-NEXT: [[F16:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; TUNIT-NEXT: 
[[I6:%.*]] = load float, ptr [[F16]], align 4, !tbaa [[TBAA7]] +; TUNIT-NEXT: [[I6:%.*]] = load float, ptr [[F16]], align 4, !tbaa [[FLOAT_TBAA7]] ; TUNIT-NEXT: [[ADD:%.*]] = fadd float [[I5]], [[I6]] ; TUNIT-NEXT: [[F37:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 5 -; TUNIT-NEXT: store float [[ADD]], ptr [[F37]], align 4, !tbaa [[TBAA11]] -; TUNIT-NEXT: [[I7:%.*]] = load i32, ptr [[S]], align 8, !tbaa [[TBAA12]] -; TUNIT-NEXT: store i32 [[I7]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA12]] +; TUNIT-NEXT: store float [[ADD]], ptr [[F37]], align 4, !tbaa [[FLOAT_TBAA11]] +; TUNIT-NEXT: [[I7:%.*]] = load i32, ptr [[S]], align 8, !tbaa [[INT_TBAA12]] +; TUNIT-NEXT: store i32 [[I7]], ptr [[AGG_RESULT]], align 4, !tbaa [[INT_TBAA12]] ; TUNIT-NEXT: [[I210:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 1 -; TUNIT-NEXT: [[I8:%.*]] = load i32, ptr [[I210]], align 4, !tbaa [[TBAA13]] +; TUNIT-NEXT: [[I8:%.*]] = load i32, ptr [[I210]], align 4, !tbaa [[INT_TBAA13]] ; TUNIT-NEXT: [[MUL11:%.*]] = shl nsw i32 [[I8]], 1 ; TUNIT-NEXT: [[I212:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 1 -; TUNIT-NEXT: store i32 [[MUL11]], ptr [[I212]], align 4, !tbaa [[TBAA13]] +; TUNIT-NEXT: store i32 [[MUL11]], ptr [[I212]], align 4, !tbaa [[INT_TBAA13]] ; TUNIT-NEXT: [[I313:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 -; TUNIT-NEXT: [[I9:%.*]] = load i32, ptr [[I313]], align 8, !tbaa [[TBAA14]] -; TUNIT-NEXT: [[I10:%.*]] = load i32, ptr [[S]], align 8, !tbaa [[TBAA12]] +; TUNIT-NEXT: [[I9:%.*]] = load i32, ptr [[I313]], align 8, !tbaa [[INT_TBAA14]] +; TUNIT-NEXT: [[I10:%.*]] = load i32, ptr [[S]], align 8, !tbaa [[INT_TBAA12]] ; TUNIT-NEXT: [[ADD15:%.*]] = add nsw i32 [[I9]], [[I10]] ; TUNIT-NEXT: [[I316:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 2 -; TUNIT-NEXT: store i32 [[ADD15]], ptr [[I316]], align 4, !tbaa [[TBAA14]] +; TUNIT-NEXT: store i32 
[[ADD15]], ptr [[I316]], align 4, !tbaa [[INT_TBAA14]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) -; CGSCC-LABEL: define {{[^@]+}}@noalias_arg_simplifiable_1 -; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable(24) [[AGG_RESULT:%.*]], ptr noalias nofree noundef nonnull byval([[STRUCT_S]]) align 8 captures(none) dereferenceable(24) [[S:%.*]]) #[[ATTR1]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define void @noalias_arg_simplifiable_1( +; CGSCC-SAME: ptr noalias nofree noundef nonnull writeonly sret([[STRUCT_S:%.*]]) align 4 captures(none) dereferenceable(24) [[AGG_RESULT:%.*]], ptr noalias nofree noundef nonnull byval([[STRUCT_S]]) align 8 captures(none) dereferenceable(24) [[S:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; CGSCC-NEXT: store float 0x3FF19999A0000000, ptr [[F1]], align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: store float 0x3FF19999A0000000, ptr [[F1]], align 4, !tbaa [[FLOAT_TBAA7]] ; CGSCC-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 4 -; CGSCC-NEXT: store float 0x40019999A0000000, ptr [[F2]], align 8, !tbaa [[TBAA10]] +; CGSCC-NEXT: store float 0x40019999A0000000, ptr [[F2]], align 8, !tbaa [[FLOAT_TBAA10]] ; CGSCC-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 5 -; CGSCC-NEXT: store float 0x400A666660000000, ptr [[F3]], align 4, !tbaa [[TBAA11]] +; CGSCC-NEXT: store float 0x400A666660000000, ptr [[F3]], align 4, !tbaa [[FLOAT_TBAA11]] ; CGSCC-NEXT: call void @write_arg(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(24) [[S]], i32 noundef 1) #[[ATTR21]] ; CGSCC-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 1 ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 
captures(none) dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR21]] ; CGSCC-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR21]] ; CGSCC-NEXT: [[F11:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; CGSCC-NEXT: [[I:%.*]] = load float, ptr [[F11]], align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: [[I:%.*]] = load float, ptr [[F11]], align 4, !tbaa [[FLOAT_TBAA7]] ; CGSCC-NEXT: [[F12:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 3 -; CGSCC-NEXT: store float [[I]], ptr [[F12]], align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: store float [[I]], ptr [[F12]], align 4, !tbaa [[FLOAT_TBAA7]] ; CGSCC-NEXT: [[F23:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 4 -; CGSCC-NEXT: [[I4:%.*]] = load float, ptr [[F23]], align 8, !tbaa [[TBAA10]] +; CGSCC-NEXT: [[I4:%.*]] = load float, ptr [[F23]], align 8, !tbaa [[FLOAT_TBAA10]] ; CGSCC-NEXT: [[MUL:%.*]] = fmul float [[I4]], 2.000000e+00 ; CGSCC-NEXT: [[F24:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 4 -; CGSCC-NEXT: store float [[MUL]], ptr [[F24]], align 4, !tbaa [[TBAA10]] +; CGSCC-NEXT: store float [[MUL]], ptr [[F24]], align 4, !tbaa [[FLOAT_TBAA10]] ; CGSCC-NEXT: [[F35:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 5 -; CGSCC-NEXT: [[I5:%.*]] = load float, ptr [[F35]], align 4, !tbaa [[TBAA11]] +; CGSCC-NEXT: [[I5:%.*]] = load float, ptr [[F35]], align 4, !tbaa [[FLOAT_TBAA11]] ; CGSCC-NEXT: [[F16:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 3 -; CGSCC-NEXT: [[I6:%.*]] = load float, ptr [[F16]], align 4, !tbaa [[TBAA7]] +; CGSCC-NEXT: [[I6:%.*]] = load float, ptr [[F16]], align 4, !tbaa [[FLOAT_TBAA7]] ; CGSCC-NEXT: [[ADD:%.*]] = fadd float [[I5]], [[I6]] ; CGSCC-NEXT: [[F37:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr 
[[AGG_RESULT]], i64 0, i32 5 -; CGSCC-NEXT: store float [[ADD]], ptr [[F37]], align 4, !tbaa [[TBAA11]] -; CGSCC-NEXT: [[I7:%.*]] = load i32, ptr [[S]], align 8, !tbaa [[TBAA12]] -; CGSCC-NEXT: store i32 [[I7]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA12]] +; CGSCC-NEXT: store float [[ADD]], ptr [[F37]], align 4, !tbaa [[FLOAT_TBAA11]] +; CGSCC-NEXT: [[I7:%.*]] = load i32, ptr [[S]], align 8, !tbaa [[INT_TBAA12]] +; CGSCC-NEXT: store i32 [[I7]], ptr [[AGG_RESULT]], align 4, !tbaa [[INT_TBAA12]] ; CGSCC-NEXT: [[I210:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 1 -; CGSCC-NEXT: [[I8:%.*]] = load i32, ptr [[I210]], align 4, !tbaa [[TBAA13]] +; CGSCC-NEXT: [[I8:%.*]] = load i32, ptr [[I210]], align 4, !tbaa [[INT_TBAA13]] ; CGSCC-NEXT: [[MUL11:%.*]] = shl nsw i32 [[I8]], 1 ; CGSCC-NEXT: [[I212:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 1 -; CGSCC-NEXT: store i32 [[MUL11]], ptr [[I212]], align 4, !tbaa [[TBAA13]] +; CGSCC-NEXT: store i32 [[MUL11]], ptr [[I212]], align 4, !tbaa [[INT_TBAA13]] ; CGSCC-NEXT: [[I313:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 -; CGSCC-NEXT: [[I9:%.*]] = load i32, ptr [[I313]], align 8, !tbaa [[TBAA14]] -; CGSCC-NEXT: [[I10:%.*]] = load i32, ptr [[S]], align 8, !tbaa [[TBAA12]] +; CGSCC-NEXT: [[I9:%.*]] = load i32, ptr [[I313]], align 8, !tbaa [[INT_TBAA14]] +; CGSCC-NEXT: [[I10:%.*]] = load i32, ptr [[S]], align 8, !tbaa [[INT_TBAA12]] ; CGSCC-NEXT: [[ADD15:%.*]] = add nsw i32 [[I9]], [[I10]] ; CGSCC-NEXT: [[I316:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_RESULT]], i64 0, i32 2 -; CGSCC-NEXT: store i32 [[ADD15]], ptr [[I316]], align 4, !tbaa [[TBAA14]] +; CGSCC-NEXT: store i32 [[ADD15]], ptr [[I316]], align 4, !tbaa [[INT_TBAA14]] ; CGSCC-NEXT: ret void ; entry: @@ -1266,157 +1266,157 @@ entry: ; define void @noalias_arg_simplifiable_2(ptr %Bytes) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: 
define {{[^@]+}}@noalias_arg_simplifiable_2 -; TUNIT-SAME: (ptr nofree captures(none) [[BYTES:%.*]]) #[[ATTR3]] { -; TUNIT-NEXT: entry: -; TUNIT-NEXT: br label [[FOR_COND:%.*]] -; TUNIT: for.cond: -; TUNIT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; TUNIT-LABEL: define void @noalias_arg_simplifiable_2( +; TUNIT-SAME: ptr nofree captures(none) [[BYTES:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: [[ENTRY:.*]]: +; TUNIT-NEXT: br label %[[FOR_COND:.*]] +; TUNIT: [[FOR_COND]]: +; TUNIT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[ENTRY]] ] ; TUNIT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], 100 -; TUNIT-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; TUNIT: for.cond.cleanup: -; TUNIT-NEXT: br label [[FOR_END:%.*]] -; TUNIT: for.body: +; TUNIT-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +; TUNIT: [[FOR_COND_CLEANUP]]: +; TUNIT-NEXT: br label %[[FOR_END:.*]] +; TUNIT: [[FOR_BODY]]: ; TUNIT-NEXT: [[I:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], 10 ; TUNIT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[I]] -; TUNIT-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA19]] -; TUNIT-NEXT: br label [[FOR_INC]] -; TUNIT: for.inc: +; TUNIT-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA19]] +; TUNIT-NEXT: br label %[[FOR_INC]] +; TUNIT: [[FOR_INC]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; TUNIT-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] -; TUNIT: for.end: -; TUNIT-NEXT: br label [[FOR_COND2:%.*]] -; TUNIT: for.cond2: -; TUNIT-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_INC9:%.*]] ], [ 0, [[FOR_END]] ] +; TUNIT-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; TUNIT: [[FOR_END]]: +; TUNIT-NEXT: br label %[[FOR_COND2:.*]] +; TUNIT: [[FOR_COND2]]: +; TUNIT-NEXT: [[INDVARS_IV2:%.*]] 
= phi i64 [ [[INDVARS_IV_NEXT3:%.*]], %[[FOR_INC9:.*]] ], [ 0, %[[FOR_END]] ] ; TUNIT-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV2]], 10 -; TUNIT-NEXT: br i1 [[EXITCOND6]], label [[FOR_BODY5:%.*]], label [[FOR_COND_CLEANUP4:%.*]] -; TUNIT: for.cond.cleanup4: -; TUNIT-NEXT: br label [[FOR_END11:%.*]] -; TUNIT: for.body5: +; TUNIT-NEXT: br i1 [[EXITCOND6]], label %[[FOR_BODY5:.*]], label %[[FOR_COND_CLEANUP4:.*]] +; TUNIT: [[FOR_COND_CLEANUP4]]: +; TUNIT-NEXT: br label %[[FOR_END11:.*]] +; TUNIT: [[FOR_BODY5]]: ; TUNIT-NEXT: [[I16:%.*]] = mul nuw nsw i64 [[INDVARS_IV2]], 10 ; TUNIT-NEXT: [[I17:%.*]] = or i64 [[I16]], 1 ; TUNIT-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[BYTES]], i64 [[I17]] -; TUNIT-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA26:![0-9]+]] -; TUNIT-NEXT: br label [[FOR_INC9]] -; TUNIT: for.inc9: +; TUNIT-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX8]], align 4, !tbaa [[FLOAT_TBAA26:![0-9]+]] +; TUNIT-NEXT: br label %[[FOR_INC9]] +; TUNIT: [[FOR_INC9]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 -; TUNIT-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP27:![0-9]+]] -; TUNIT: for.end11: -; TUNIT-NEXT: br label [[FOR_COND13:%.*]] -; TUNIT: for.cond13: -; TUNIT-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC21:%.*]] ], [ 0, [[FOR_END11]] ] +; TUNIT-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP27:![0-9]+]] +; TUNIT: [[FOR_END11]]: +; TUNIT-NEXT: br label %[[FOR_COND13:.*]] +; TUNIT: [[FOR_COND13]]: +; TUNIT-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], %[[FOR_INC21:.*]] ], [ 0, %[[FOR_END11]] ] ; TUNIT-NEXT: [[EXITCOND11:%.*]] = icmp ne i64 [[INDVARS_IV7]], 20 -; TUNIT-NEXT: br i1 [[EXITCOND11]], label [[FOR_BODY16:%.*]], label [[FOR_COND_CLEANUP15:%.*]] -; TUNIT: for.cond.cleanup15: -; TUNIT-NEXT: br label [[FOR_END23:%.*]] -; TUNIT: for.body16: +; TUNIT-NEXT: br i1 [[EXITCOND11]], label %[[FOR_BODY16:.*]], label 
%[[FOR_COND_CLEANUP15:.*]] +; TUNIT: [[FOR_COND_CLEANUP15]]: +; TUNIT-NEXT: br label %[[FOR_END23:.*]] +; TUNIT: [[FOR_BODY16]]: ; TUNIT-NEXT: [[I19:%.*]] = mul nuw nsw i64 [[INDVARS_IV7]], 10 ; TUNIT-NEXT: [[I20:%.*]] = add nuw nsw i64 [[I19]], 2 ; TUNIT-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i64, ptr [[BYTES]], i64 [[I20]] -; TUNIT-NEXT: store i64 0, ptr [[ARRAYIDX20]], align 8, !tbaa [[TBAA28:![0-9]+]] -; TUNIT-NEXT: br label [[FOR_INC21]] -; TUNIT: for.inc21: +; TUNIT-NEXT: store i64 0, ptr [[ARRAYIDX20]], align 8, !tbaa [[LONG_LONG_TBAA28:![0-9]+]] +; TUNIT-NEXT: br label %[[FOR_INC21]] +; TUNIT: [[FOR_INC21]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1 -; TUNIT-NEXT: br label [[FOR_COND13]], !llvm.loop [[LOOP30:![0-9]+]] -; TUNIT: for.end23: +; TUNIT-NEXT: br label %[[FOR_COND13]], !llvm.loop [[LOOP30:![0-9]+]] +; TUNIT: [[FOR_END23]]: ; TUNIT-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 1023 -; TUNIT-NEXT: store i8 0, ptr [[ARRAYIDX24]], align 1, !tbaa [[TBAA19]] +; TUNIT-NEXT: store i8 0, ptr [[ARRAYIDX24]], align 1, !tbaa [[CHAR_TBAA19]] ; TUNIT-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 500 ; TUNIT-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) [[ARRAYIDX25]], i32 noundef 0) #[[ATTR18]] -; TUNIT-NEXT: br label [[FOR_COND27:%.*]] -; TUNIT: for.cond27: -; TUNIT-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC35:%.*]] ], [ 0, [[FOR_END23]] ] +; TUNIT-NEXT: br label %[[FOR_COND27:.*]] +; TUNIT: [[FOR_COND27]]: +; TUNIT-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], %[[FOR_INC35:.*]] ], [ 0, %[[FOR_END23]] ] ; TUNIT-NEXT: [[EXITCOND14:%.*]] = icmp ne i64 [[INDVARS_IV12]], 1024 -; TUNIT-NEXT: br i1 [[EXITCOND14]], label [[FOR_BODY30:%.*]], label [[FOR_COND_CLEANUP29:%.*]] -; TUNIT: for.cond.cleanup29: -; TUNIT-NEXT: br label [[FOR_END37:%.*]] -; TUNIT: for.body30: +; TUNIT-NEXT: 
br i1 [[EXITCOND14]], label %[[FOR_BODY30:.*]], label %[[FOR_COND_CLEANUP29:.*]] +; TUNIT: [[FOR_COND_CLEANUP29]]: +; TUNIT-NEXT: br label %[[FOR_END37:.*]] +; TUNIT: [[FOR_BODY30]]: ; TUNIT-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[INDVARS_IV12]] -; TUNIT-NEXT: [[I22:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1, !tbaa [[TBAA19]] +; TUNIT-NEXT: [[I22:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1, !tbaa [[CHAR_TBAA19]] ; TUNIT-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [1024 x i8], ptr @globalBytes, i64 0, i64 [[INDVARS_IV12]] -; TUNIT-NEXT: store i8 [[I22]], ptr [[ARRAYIDX34]], align 1, !tbaa [[TBAA19]] -; TUNIT-NEXT: br label [[FOR_INC35]] -; TUNIT: for.inc35: +; TUNIT-NEXT: store i8 [[I22]], ptr [[ARRAYIDX34]], align 1, !tbaa [[CHAR_TBAA19]] +; TUNIT-NEXT: br label %[[FOR_INC35]] +; TUNIT: [[FOR_INC35]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT13]] = add nuw nsw i64 [[INDVARS_IV12]], 1 -; TUNIT-NEXT: br label [[FOR_COND27]], !llvm.loop [[LOOP31:![0-9]+]] -; TUNIT: for.end37: +; TUNIT-NEXT: br label %[[FOR_COND27]], !llvm.loop [[LOOP31:![0-9]+]] +; TUNIT: [[FOR_END37]]: ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@noalias_arg_simplifiable_2 -; CGSCC-SAME: (ptr nofree captures(none) [[BYTES:%.*]]) #[[ATTR3]] { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: br label [[FOR_COND:%.*]] -; CGSCC: for.cond: -; CGSCC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CGSCC-LABEL: define void @noalias_arg_simplifiable_2( +; CGSCC-SAME: ptr nofree captures(none) [[BYTES:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: [[ENTRY:.*]]: +; CGSCC-NEXT: br label %[[FOR_COND:.*]] +; CGSCC: [[FOR_COND]]: +; CGSCC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[ENTRY]] ] ; CGSCC-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], 100 -; CGSCC-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY:%.*]], label 
[[FOR_COND_CLEANUP:%.*]] -; CGSCC: for.cond.cleanup: -; CGSCC-NEXT: br label [[FOR_END:%.*]] -; CGSCC: for.body: +; CGSCC-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CGSCC: [[FOR_COND_CLEANUP]]: +; CGSCC-NEXT: br label %[[FOR_END:.*]] +; CGSCC: [[FOR_BODY]]: ; CGSCC-NEXT: [[I:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], 10 ; CGSCC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[I]] -; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA15]] -; CGSCC-NEXT: br label [[FOR_INC]] -; CGSCC: for.inc: +; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA15]] +; CGSCC-NEXT: br label %[[FOR_INC]] +; CGSCC: [[FOR_INC]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CGSCC-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] -; CGSCC: for.end: -; CGSCC-NEXT: br label [[FOR_COND2:%.*]] -; CGSCC: for.cond2: -; CGSCC-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_INC9:%.*]] ], [ 0, [[FOR_END]] ] +; CGSCC-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; CGSCC: [[FOR_END]]: +; CGSCC-NEXT: br label %[[FOR_COND2:.*]] +; CGSCC: [[FOR_COND2]]: +; CGSCC-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], %[[FOR_INC9:.*]] ], [ 0, %[[FOR_END]] ] ; CGSCC-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV2]], 10 -; CGSCC-NEXT: br i1 [[EXITCOND6]], label [[FOR_BODY5:%.*]], label [[FOR_COND_CLEANUP4:%.*]] -; CGSCC: for.cond.cleanup4: -; CGSCC-NEXT: br label [[FOR_END11:%.*]] -; CGSCC: for.body5: +; CGSCC-NEXT: br i1 [[EXITCOND6]], label %[[FOR_BODY5:.*]], label %[[FOR_COND_CLEANUP4:.*]] +; CGSCC: [[FOR_COND_CLEANUP4]]: +; CGSCC-NEXT: br label %[[FOR_END11:.*]] +; CGSCC: [[FOR_BODY5]]: ; CGSCC-NEXT: [[I16:%.*]] = mul nuw nsw i64 [[INDVARS_IV2]], 10 ; CGSCC-NEXT: [[I17:%.*]] = or i64 [[I16]], 1 ; CGSCC-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[BYTES]], i64 [[I17]] -; CGSCC-NEXT: store float 0.000000e+00, 
ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA18]] -; CGSCC-NEXT: br label [[FOR_INC9]] -; CGSCC: for.inc9: +; CGSCC-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX8]], align 4, !tbaa [[FLOAT_TBAA18]] +; CGSCC-NEXT: br label %[[FOR_INC9]] +; CGSCC: [[FOR_INC9]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 -; CGSCC-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP29:![0-9]+]] -; CGSCC: for.end11: -; CGSCC-NEXT: br label [[FOR_COND13:%.*]] -; CGSCC: for.cond13: -; CGSCC-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC21:%.*]] ], [ 0, [[FOR_END11]] ] +; CGSCC-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP29:![0-9]+]] +; CGSCC: [[FOR_END11]]: +; CGSCC-NEXT: br label %[[FOR_COND13:.*]] +; CGSCC: [[FOR_COND13]]: +; CGSCC-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], %[[FOR_INC21:.*]] ], [ 0, %[[FOR_END11]] ] ; CGSCC-NEXT: [[EXITCOND11:%.*]] = icmp ne i64 [[INDVARS_IV7]], 20 -; CGSCC-NEXT: br i1 [[EXITCOND11]], label [[FOR_BODY16:%.*]], label [[FOR_COND_CLEANUP15:%.*]] -; CGSCC: for.cond.cleanup15: -; CGSCC-NEXT: br label [[FOR_END23:%.*]] -; CGSCC: for.body16: +; CGSCC-NEXT: br i1 [[EXITCOND11]], label %[[FOR_BODY16:.*]], label %[[FOR_COND_CLEANUP15:.*]] +; CGSCC: [[FOR_COND_CLEANUP15]]: +; CGSCC-NEXT: br label %[[FOR_END23:.*]] +; CGSCC: [[FOR_BODY16]]: ; CGSCC-NEXT: [[I19:%.*]] = mul nuw nsw i64 [[INDVARS_IV7]], 10 ; CGSCC-NEXT: [[I20:%.*]] = add nuw nsw i64 [[I19]], 2 ; CGSCC-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i64, ptr [[BYTES]], i64 [[I20]] -; CGSCC-NEXT: store i64 0, ptr [[ARRAYIDX20]], align 8, !tbaa [[TBAA20]] -; CGSCC-NEXT: br label [[FOR_INC21]] -; CGSCC: for.inc21: +; CGSCC-NEXT: store i64 0, ptr [[ARRAYIDX20]], align 8, !tbaa [[LONG_LONG_TBAA20]] +; CGSCC-NEXT: br label %[[FOR_INC21]] +; CGSCC: [[FOR_INC21]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1 -; CGSCC-NEXT: br label [[FOR_COND13]], !llvm.loop [[LOOP30:![0-9]+]] -; CGSCC: for.end23: +; 
CGSCC-NEXT: br label %[[FOR_COND13]], !llvm.loop [[LOOP30:![0-9]+]] +; CGSCC: [[FOR_END23]]: ; CGSCC-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 1023 -; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX24]], align 1, !tbaa [[TBAA15]] +; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX24]], align 1, !tbaa [[CHAR_TBAA15]] ; CGSCC-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 500 ; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[ARRAYIDX25]], i32 noundef 0) #[[ATTR21]] -; CGSCC-NEXT: br label [[FOR_COND27:%.*]] -; CGSCC: for.cond27: -; CGSCC-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC35:%.*]] ], [ 0, [[FOR_END23]] ] +; CGSCC-NEXT: br label %[[FOR_COND27:.*]] +; CGSCC: [[FOR_COND27]]: +; CGSCC-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], %[[FOR_INC35:.*]] ], [ 0, %[[FOR_END23]] ] ; CGSCC-NEXT: [[EXITCOND14:%.*]] = icmp ne i64 [[INDVARS_IV12]], 1024 -; CGSCC-NEXT: br i1 [[EXITCOND14]], label [[FOR_BODY30:%.*]], label [[FOR_COND_CLEANUP29:%.*]] -; CGSCC: for.cond.cleanup29: -; CGSCC-NEXT: br label [[FOR_END37:%.*]] -; CGSCC: for.body30: +; CGSCC-NEXT: br i1 [[EXITCOND14]], label %[[FOR_BODY30:.*]], label %[[FOR_COND_CLEANUP29:.*]] +; CGSCC: [[FOR_COND_CLEANUP29]]: +; CGSCC-NEXT: br label %[[FOR_END37:.*]] +; CGSCC: [[FOR_BODY30]]: ; CGSCC-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 [[INDVARS_IV12]] -; CGSCC-NEXT: [[I22:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1, !tbaa [[TBAA15]] +; CGSCC-NEXT: [[I22:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1, !tbaa [[CHAR_TBAA15]] ; CGSCC-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [1024 x i8], ptr @globalBytes, i64 0, i64 [[INDVARS_IV12]] -; CGSCC-NEXT: store i8 [[I22]], ptr [[ARRAYIDX34]], align 1, !tbaa [[TBAA15]] -; CGSCC-NEXT: br label [[FOR_INC35]] -; CGSCC: for.inc35: +; CGSCC-NEXT: store i8 [[I22]], ptr [[ARRAYIDX34]], align 1, !tbaa 
[[CHAR_TBAA15]] +; CGSCC-NEXT: br label %[[FOR_INC35]] +; CGSCC: [[FOR_INC35]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT13]] = add nuw nsw i64 [[INDVARS_IV12]], 1 -; CGSCC-NEXT: br label [[FOR_COND27]], !llvm.loop [[LOOP31:![0-9]+]] -; CGSCC: for.end37: +; CGSCC-NEXT: br label %[[FOR_COND27]], !llvm.loop [[LOOP31:![0-9]+]] +; CGSCC: [[FOR_END37]]: ; CGSCC-NEXT: ret void ; entry: @@ -1524,40 +1524,40 @@ for.end37: ; preds = %for.cond.cleanup29 ; } ; define i32 @local_alloca_not_simplifiable_1() { -; TUNIT-LABEL: define {{[^@]+}}@local_alloca_not_simplifiable_1() { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define i32 @local_alloca_not_simplifiable_1() { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[X:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[Y:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @llvm.lifetime.start.p0(ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[X]]) #[[ATTR17]] ; TUNIT-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[Y]]) #[[ATTR17]] -; TUNIT-NEXT: store i32 1, ptr [[Y]], align 4, !tbaa [[TBAA3]] -; TUNIT-NEXT: store i32 1, ptr [[X]], align 4, !tbaa [[TBAA3]] +; TUNIT-NEXT: store i32 1, ptr [[Y]], align 4, !tbaa [[INT_TBAA3]] +; TUNIT-NEXT: store i32 1, ptr [[X]], align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: call void @escape(ptr noundef nonnull align 4 dereferenceable(4) [[X]]) ; TUNIT-NEXT: call void @write_random(ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[Y]]) -; TUNIT-NEXT: [[I3:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA3]] +; TUNIT-NEXT: [[I3:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[I3]], 0 ; TUNIT-NEXT: [[COND:%.*]] = select i1 [[TOBOOL_NOT]], i32 2, i32 1 -; TUNIT-NEXT: [[I4:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA3]] +; TUNIT-NEXT: [[I4:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 
[[I3]], [[I4]] ; TUNIT-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[COND]] ; TUNIT-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[Y]]) ; TUNIT-NEXT: call void @llvm.lifetime.end.p0(ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[X]]) ; TUNIT-NEXT: ret i32 [[ADD1]] ; -; CGSCC-LABEL: define {{[^@]+}}@local_alloca_not_simplifiable_1() { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define i32 @local_alloca_not_simplifiable_1() { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[X:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: [[Y:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: call void @llvm.lifetime.start.p0(ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[X]]) #[[ATTR20]] ; CGSCC-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[Y]]) #[[ATTR20]] -; CGSCC-NEXT: store i32 1, ptr [[Y]], align 4, !tbaa [[TBAA3]] -; CGSCC-NEXT: store i32 1, ptr [[X]], align 4, !tbaa [[TBAA3]] +; CGSCC-NEXT: store i32 1, ptr [[Y]], align 4, !tbaa [[INT_TBAA3]] +; CGSCC-NEXT: store i32 1, ptr [[X]], align 4, !tbaa [[INT_TBAA3]] ; CGSCC-NEXT: call void @escape(ptr noundef nonnull align 4 dereferenceable(4) [[X]]) ; CGSCC-NEXT: call void @write_random(ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[Y]]) -; CGSCC-NEXT: [[I3:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA3]] +; CGSCC-NEXT: [[I3:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA3]] ; CGSCC-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[I3]], 0 ; CGSCC-NEXT: [[COND:%.*]] = select i1 [[TOBOOL_NOT]], i32 2, i32 1 -; CGSCC-NEXT: [[I4:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[TBAA3]] +; CGSCC-NEXT: [[I4:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA3]] ; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[I3]], [[I4]] ; CGSCC-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[COND]] ; CGSCC-NEXT: call void @llvm.lifetime.end.p0(ptr noalias 
nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[Y]]) @@ -1586,20 +1586,20 @@ entry: define i8 @local_alloca_not_simplifiable_2(i64 %index1, i64 %index2, i1 %cnd) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@local_alloca_not_simplifiable_2 -; CHECK-SAME: (i64 [[INDEX1:%.*]], i64 [[INDEX2:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define i8 @local_alloca_not_simplifiable_2( +; CHECK-SAME: i64 [[INDEX1:%.*]], i64 [[INDEX2:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[BYTES:%.*]] = alloca [1024 x i8], align 16 ; CHECK-NEXT: store i8 7, ptr [[BYTES]], align 16 -; CHECK-NEXT: br i1 [[CND]], label [[LEFT:%.*]], label [[RIGHT:%.*]] -; CHECK: left: +; CHECK-NEXT: br i1 [[CND]], label %[[LEFT:.*]], label %[[RIGHT:.*]] +; CHECK: [[LEFT]]: ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 [[INDEX1]] -; CHECK-NEXT: br label [[JOIN:%.*]] -; CHECK: right: +; CHECK-NEXT: br label %[[JOIN:.*]] +; CHECK: [[RIGHT]]: ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 [[INDEX2]] -; CHECK-NEXT: br label [[JOIN]] -; CHECK: join: -; CHECK-NEXT: [[GEP_JOIN:%.*]] = phi ptr [ [[GEP1]], [[LEFT]] ], [ [[GEP2]], [[RIGHT]] ] +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[GEP_JOIN:%.*]] = phi ptr [ [[GEP1]], %[[LEFT]] ], [ [[GEP2]], %[[RIGHT]] ] ; CHECK-NEXT: store i8 9, ptr [[GEP_JOIN]], align 4 ; CHECK-NEXT: [[I:%.*]] = load i8, ptr [[BYTES]], align 16 ; CHECK-NEXT: ret i8 [[I]] @@ -1630,9 +1630,9 @@ join: ; preds = %right, %left ; We could simplify these if we separate accessed bins wrt. alignment (here mod 4). 
define i32 @unknown_access_mixed_simplifiable(i32 %arg1, i32 %arg2) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@unknown_access_mixed_simplifiable -; CHECK-SAME: (i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @unknown_access_mixed_simplifiable( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[S]], i32 [[ARG1]] @@ -1666,9 +1666,9 @@ entry: ; The access to bc4b could go anywhere, nothing is simplifiable. define i32 @unknown_access_mixed_not_simplifiable(i32 %arg1, i32 %arg2, i32 %arg3) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@unknown_access_mixed_not_simplifiable -; CHECK-SAME: (i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @unknown_access_mixed_not_simplifiable( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S]], i64 0, i32 2 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[S]], i32 [[ARG1]] @@ -1716,17 +1716,17 @@ declare void @escape(ptr) ; define i32 @global_not_simplifiable_1(i32 %cnd) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) -; TUNIT-LABEL: define {{[^@]+}}@global_not_simplifiable_1 -; TUNIT-SAME: (i32 [[CND:%.*]]) #[[ATTR6:[0-9]+]] { -; TUNIT-NEXT: entry: -; TUNIT-NEXT: [[I:%.*]] = load i32, ptr @Flag0, align 4, !tbaa [[TBAA3]] +; TUNIT-LABEL: define i32 
@global_not_simplifiable_1( +; TUNIT-SAME: i32 [[CND:%.*]]) #[[ATTR6:[0-9]+]] { +; TUNIT-NEXT: [[ENTRY:.*:]] +; TUNIT-NEXT: [[I:%.*]] = load i32, ptr @Flag0, align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: ret i32 [[I]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) -; CGSCC-LABEL: define {{[^@]+}}@global_not_simplifiable_1 -; CGSCC-SAME: (i32 [[CND:%.*]]) #[[ATTR7:[0-9]+]] { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: [[I:%.*]] = load i32, ptr @Flag0, align 4, !tbaa [[TBAA3]] +; CGSCC-LABEL: define i32 @global_not_simplifiable_1( +; CGSCC-SAME: i32 [[CND:%.*]]) #[[ATTR7:[0-9]+]] { +; CGSCC-NEXT: [[ENTRY:.*:]] +; CGSCC-NEXT: [[I:%.*]] = load i32, ptr @Flag0, align 4, !tbaa [[INT_TBAA3]] ; CGSCC-NEXT: ret i32 [[I]] ; entry: @@ -1744,15 +1744,15 @@ entry: ; } ; define i32 @static_global_not_simplifiable_1(i32 %cnd) { -; CHECK-LABEL: define {{[^@]+}}@static_global_not_simplifiable_1 -; CHECK-SAME: (i32 [[CND:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @static_global_not_simplifiable_1( +; CHECK-SAME: i32 [[CND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @sync() ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: ; CHECK-NEXT: ret i32 1 ; entry: @@ -1780,13 +1780,13 @@ declare void @sync() ; return v; ; } define i32 @static_global_simplifiable_4(i32 %cnd) { -; CHECK-LABEL: define {{[^@]+}}@static_global_simplifiable_4 -; CHECK-SAME: (i32 [[CND:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 1, ptr @Flag2, align 4, !tbaa [[TBAA3]] +; CHECK-LABEL: define noundef i32 @static_global_simplifiable_4( +; CHECK-SAME: i32 [[CND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store 
i32 1, ptr @Flag2, align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: call void @sync() -; CHECK-NEXT: [[I:%.*]] = load i32, ptr @Flag2, align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 2, ptr @Flag2, align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[I:%.*]] = load i32, ptr @Flag2, align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: store i32 2, ptr @Flag2, align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: ret i32 [[I]] ; entry: @@ -1806,22 +1806,22 @@ entry: ; return v; ; } define i32 @static_global_not_simplifiable_2(i32 %cnd) { -; TUNIT-LABEL: define {{[^@]+}}@static_global_not_simplifiable_2 -; TUNIT-SAME: (i32 [[CND:%.*]]) { -; TUNIT-NEXT: entry: -; TUNIT-NEXT: store i32 1, ptr @Flag4, align 4, !tbaa [[TBAA3]] +; TUNIT-LABEL: define noundef i32 @static_global_not_simplifiable_2( +; TUNIT-SAME: i32 [[CND:%.*]]) { +; TUNIT-NEXT: [[ENTRY:.*:]] +; TUNIT-NEXT: store i32 1, ptr @Flag4, align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: call void @sync() #[[ATTR19:[0-9]+]] -; TUNIT-NEXT: [[I:%.*]] = load i32, ptr @Flag4, align 4, !tbaa [[TBAA3]] -; TUNIT-NEXT: store i32 2, ptr @Flag4, align 4, !tbaa [[TBAA3]] +; TUNIT-NEXT: [[I:%.*]] = load i32, ptr @Flag4, align 4, !tbaa [[INT_TBAA3]] +; TUNIT-NEXT: store i32 2, ptr @Flag4, align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: ret i32 [[I]] ; -; CGSCC-LABEL: define {{[^@]+}}@static_global_not_simplifiable_2 -; CGSCC-SAME: (i32 [[CND:%.*]]) { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: store i32 1, ptr @Flag4, align 4, !tbaa [[TBAA3]] +; CGSCC-LABEL: define noundef i32 @static_global_not_simplifiable_2( +; CGSCC-SAME: i32 [[CND:%.*]]) { +; CGSCC-NEXT: [[ENTRY:.*:]] +; CGSCC-NEXT: store i32 1, ptr @Flag4, align 4, !tbaa [[INT_TBAA3]] ; CGSCC-NEXT: call void @sync() #[[ATTR22:[0-9]+]] -; CGSCC-NEXT: [[I:%.*]] = load i32, ptr @Flag4, align 4, !tbaa [[TBAA3]] -; CGSCC-NEXT: store i32 2, ptr @Flag4, align 4, !tbaa [[TBAA3]] +; CGSCC-NEXT: [[I:%.*]] = load i32, ptr @Flag4, align 4, !tbaa [[INT_TBAA3]] +; CGSCC-NEXT: store i32 2, ptr @Flag4, align 4, !tbaa [[INT_TBAA3]] ; 
CGSCC-NEXT: ret i32 [[I]] ; entry: @@ -1833,15 +1833,15 @@ entry: } define void @static_global_not_simplifiable_2_helper() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@static_global_not_simplifiable_2_helper -; TUNIT-SAME: () #[[ATTR5]] { -; TUNIT-NEXT: store i32 2, ptr @Flag4, align 4, !tbaa [[TBAA3]] +; TUNIT-LABEL: define void @static_global_not_simplifiable_2_helper( +; TUNIT-SAME: ) #[[ATTR5]] { +; TUNIT-NEXT: store i32 2, ptr @Flag4, align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; CGSCC-LABEL: define {{[^@]+}}@static_global_not_simplifiable_2_helper -; CGSCC-SAME: () #[[ATTR6]] { -; CGSCC-NEXT: store i32 2, ptr @Flag4, align 4, !tbaa [[TBAA3]] +; CGSCC-LABEL: define void @static_global_not_simplifiable_2_helper( +; CGSCC-SAME: ) #[[ATTR6]] { +; CGSCC-NEXT: store i32 2, ptr @Flag4, align 4, !tbaa [[INT_TBAA3]] ; CGSCC-NEXT: ret void ; store i32 2, ptr @Flag4, align 4, !tbaa !3 @@ -1851,19 +1851,19 @@ define void @static_global_not_simplifiable_2_helper() { ; Similiar to static_global_simplifiable_3 but with a may-store. 
define i32 @static_global_not_simplifiable_3(i1 %c, ptr %p) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@static_global_not_simplifiable_3 -; TUNIT-SAME: (i1 [[C:%.*]], ptr nofree writeonly captures(none) [[P:%.*]]) #[[ATTR3]] { +; TUNIT-LABEL: define noundef i32 @static_global_not_simplifiable_3( +; TUNIT-SAME: i1 [[C:%.*]], ptr nofree writeonly captures(none) [[P:%.*]]) #[[ATTR3]] { ; TUNIT-NEXT: [[SEL:%.*]] = select i1 [[C]], ptr @Flag3, ptr [[P]] -; TUNIT-NEXT: store i32 1, ptr [[SEL]], align 4, !tbaa [[TBAA3]] -; TUNIT-NEXT: [[I:%.*]] = load i32, ptr @Flag3, align 4, !tbaa [[TBAA3]] +; TUNIT-NEXT: store i32 1, ptr [[SEL]], align 4, !tbaa [[INT_TBAA3]] +; TUNIT-NEXT: [[I:%.*]] = load i32, ptr @Flag3, align 4, !tbaa [[INT_TBAA3]] ; TUNIT-NEXT: ret i32 [[I]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@static_global_not_simplifiable_3 -; CGSCC-SAME: (i1 [[C:%.*]], ptr nofree writeonly captures(none) [[P:%.*]]) #[[ATTR5]] { +; CGSCC-LABEL: define noundef i32 @static_global_not_simplifiable_3( +; CGSCC-SAME: i1 [[C:%.*]], ptr nofree writeonly captures(none) [[P:%.*]]) #[[ATTR5]] { ; CGSCC-NEXT: [[SEL:%.*]] = select i1 [[C]], ptr @Flag3, ptr [[P]] -; CGSCC-NEXT: store i32 1, ptr [[SEL]], align 4, !tbaa [[TBAA3]] -; CGSCC-NEXT: [[I:%.*]] = load i32, ptr @Flag3, align 4, !tbaa [[TBAA3]] +; CGSCC-NEXT: store i32 1, ptr [[SEL]], align 4, !tbaa [[INT_TBAA3]] +; CGSCC-NEXT: [[I:%.*]] = load i32, ptr @Flag3, align 4, !tbaa [[INT_TBAA3]] ; CGSCC-NEXT: ret i32 [[I]] ; %sel = select i1 %c, ptr @Flag3, ptr %p @@ -1887,15 +1887,15 @@ define i32 @static_global_not_simplifiable_3(i1 %c, ptr %p) { ; FIXME: We could replace these loads. 
define i32 @write_read_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@write_read_global -; TUNIT-SAME: () #[[ATTR3]] { +; TUNIT-LABEL: define i32 @write_read_global( +; TUNIT-SAME: ) #[[ATTR3]] { ; TUNIT-NEXT: store i32 7, ptr @Gint1, align 4 ; TUNIT-NEXT: [[L:%.*]] = load i32, ptr @Gint1, align 4 ; TUNIT-NEXT: ret i32 [[L]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@write_read_global -; CGSCC-SAME: () #[[ATTR5]] { +; CGSCC-LABEL: define i32 @write_read_global( +; CGSCC-SAME: ) #[[ATTR5]] { ; CGSCC-NEXT: store i32 7, ptr @Gint1, align 4 ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr @Gint1, align 4 ; CGSCC-NEXT: ret i32 [[L]] @@ -1906,14 +1906,14 @@ define i32 @write_read_global() { } define void @write_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@write_global -; TUNIT-SAME: () #[[ATTR5]] { +; TUNIT-LABEL: define void @write_global( +; TUNIT-SAME: ) #[[ATTR5]] { ; TUNIT-NEXT: store i32 7, ptr @Gint2, align 4 ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; CGSCC-LABEL: define {{[^@]+}}@write_global -; CGSCC-SAME: () #[[ATTR6]] { +; CGSCC-LABEL: define void @write_global( +; CGSCC-SAME: ) #[[ATTR6]] { ; CGSCC-NEXT: store i32 7, ptr @Gint2, align 4 ; CGSCC-NEXT: ret void ; @@ -1922,14 +1922,14 @@ define void @write_global() { } define i32 @read_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) -; TUNIT-LABEL: define {{[^@]+}}@read_global -; TUNIT-SAME: () #[[ATTR6]] { +; TUNIT-LABEL: define i32 @read_global( +; TUNIT-SAME: ) #[[ATTR6]] { ; TUNIT-NEXT: [[L:%.*]] = load i32, ptr @Gint2, align 4 ; TUNIT-NEXT: ret i32 [[L]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn 
memory(read) -; CGSCC-LABEL: define {{[^@]+}}@read_global -; CGSCC-SAME: () #[[ATTR7]] { +; CGSCC-LABEL: define i32 @read_global( +; CGSCC-SAME: ) #[[ATTR7]] { ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr @Gint2, align 4 ; CGSCC-NEXT: ret i32 [[L]] ; @@ -1938,13 +1938,13 @@ define i32 @read_global() { } define i32 @write_read_static_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@write_read_static_global -; TUNIT-SAME: () #[[ATTR5]] { +; TUNIT-LABEL: define noundef i32 @write_read_static_global( +; TUNIT-SAME: ) #[[ATTR5]] { ; TUNIT-NEXT: ret i32 7 ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; CGSCC-LABEL: define {{[^@]+}}@write_read_static_global -; CGSCC-SAME: () #[[ATTR6]] { +; CGSCC-LABEL: define noundef i32 @write_read_static_global( +; CGSCC-SAME: ) #[[ATTR6]] { ; CGSCC-NEXT: ret i32 7 ; store i32 7, ptr @Gstatic_int1 @@ -1953,14 +1953,14 @@ define i32 @write_read_static_global() { } define void @write_static_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@write_static_global -; TUNIT-SAME: () #[[ATTR5]] { +; TUNIT-LABEL: define void @write_static_global( +; TUNIT-SAME: ) #[[ATTR5]] { ; TUNIT-NEXT: store i32 7, ptr @Gstatic_int2, align 4 ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; CGSCC-LABEL: define {{[^@]+}}@write_static_global -; CGSCC-SAME: () #[[ATTR6]] { +; CGSCC-LABEL: define void @write_static_global( +; CGSCC-SAME: ) #[[ATTR6]] { ; CGSCC-NEXT: store i32 7, ptr @Gstatic_int2, align 4 ; CGSCC-NEXT: ret void ; @@ -1969,14 +1969,14 @@ define void @write_static_global() { } define i32 @read_static_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) -; TUNIT-LABEL: define 
{{[^@]+}}@read_static_global -; TUNIT-SAME: () #[[ATTR6]] { +; TUNIT-LABEL: define noundef i32 @read_static_global( +; TUNIT-SAME: ) #[[ATTR6]] { ; TUNIT-NEXT: [[L:%.*]] = load i32, ptr @Gstatic_int2, align 4 ; TUNIT-NEXT: ret i32 [[L]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) -; CGSCC-LABEL: define {{[^@]+}}@read_static_global -; CGSCC-SAME: () #[[ATTR7]] { +; CGSCC-LABEL: define noundef i32 @read_static_global( +; CGSCC-SAME: ) #[[ATTR7]] { ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr @Gstatic_int2, align 4 ; CGSCC-NEXT: ret i32 [[L]] ; @@ -1985,13 +1985,13 @@ define i32 @read_static_global() { } define i32 @write_read_static_undef_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@write_read_static_undef_global -; TUNIT-SAME: () #[[ATTR5]] { +; TUNIT-LABEL: define noundef i32 @write_read_static_undef_global( +; TUNIT-SAME: ) #[[ATTR5]] { ; TUNIT-NEXT: ret i32 7 ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; CGSCC-LABEL: define {{[^@]+}}@write_read_static_undef_global -; CGSCC-SAME: () #[[ATTR6]] { +; CGSCC-LABEL: define noundef i32 @write_read_static_undef_global( +; CGSCC-SAME: ) #[[ATTR6]] { ; CGSCC-NEXT: ret i32 7 ; store i32 7, ptr @Gstatic_undef_int1 @@ -2000,13 +2000,13 @@ define i32 @write_read_static_undef_global() { } define void @write_static_undef_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; TUNIT-LABEL: define {{[^@]+}}@write_static_undef_global -; TUNIT-SAME: () #[[ATTR5]] { +; TUNIT-LABEL: define void @write_static_undef_global( +; TUNIT-SAME: ) #[[ATTR5]] { ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) -; CGSCC-LABEL: define {{[^@]+}}@write_static_undef_global -; CGSCC-SAME: () #[[ATTR6]] { +; CGSCC-LABEL: define void 
@write_static_undef_global( +; CGSCC-SAME: ) #[[ATTR6]] { ; CGSCC-NEXT: store i32 7, ptr @Gstatic_undef_int2, align 4 ; CGSCC-NEXT: ret void ; @@ -2015,8 +2015,8 @@ define void @write_static_undef_global() { } define i32 @read_static_undef_global() { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@read_static_undef_global -; CHECK-SAME: () #[[ATTR4]] { +; CHECK-LABEL: define i32 @read_static_undef_global( +; CHECK-SAME: ) #[[ATTR4]] { ; CHECK-NEXT: ret i32 7 ; %l = load i32, ptr @Gstatic_undef_int2 @@ -2025,8 +2025,8 @@ define i32 @read_static_undef_global() { define i32 @single_read_of_static_global() { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@single_read_of_static_global -; CHECK-SAME: () #[[ATTR4]] { +; CHECK-LABEL: define noundef i32 @single_read_of_static_global( +; CHECK-SAME: ) #[[ATTR4]] { ; CHECK-NEXT: ret i32 0 ; %l = load i32, ptr @Gstatic_int3 @@ -2035,20 +2035,20 @@ define i32 @single_read_of_static_global() { define i8 @phi_store() { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@phi_store -; CHECK-SAME: () #[[ATTR4]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define i8 @phi_store( +; CHECK-SAME: ) #[[ATTR4]] { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[A:%.*]] = alloca i16, align 2 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[G:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[O:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[A]], %[[ENTRY]] ], [ [[G:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[O:%.*]], %[[LOOP]] ] ; CHECK-NEXT: store i8 1, ptr [[P]], align 1 ; CHECK-NEXT: [[G]] = getelementptr i8, ptr [[P]], i64 1 ; 
CHECK-NEXT: [[O]] = add nsw i8 [[I]], 1 ; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[O]], 2 -; CHECK-NEXT: br i1 [[C]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[C]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: [[S:%.*]] = getelementptr i8, ptr [[A]], i64 1 ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[S]], align 1 ; CHECK-NEXT: ret i8 [[L]] @@ -2074,19 +2074,19 @@ end: define i8 @phi_no_store_1() { ; ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@phi_no_store_1 -; TUNIT-SAME: () #[[ATTR3]] { -; TUNIT-NEXT: entry: -; TUNIT-NEXT: br label [[LOOP:%.*]] -; TUNIT: loop: -; TUNIT-NEXT: [[P:%.*]] = phi ptr [ @a1, [[ENTRY:%.*]] ], [ [[G:%.*]], [[LOOP]] ] -; TUNIT-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[O:%.*]], [[LOOP]] ] +; TUNIT-LABEL: define i8 @phi_no_store_1( +; TUNIT-SAME: ) #[[ATTR3]] { +; TUNIT-NEXT: [[ENTRY:.*]]: +; TUNIT-NEXT: br label %[[LOOP:.*]] +; TUNIT: [[LOOP]]: +; TUNIT-NEXT: [[P:%.*]] = phi ptr [ @a1, %[[ENTRY]] ], [ [[G:%.*]], %[[LOOP]] ] +; TUNIT-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[O:%.*]], %[[LOOP]] ] ; TUNIT-NEXT: store i8 1, ptr [[P]], align 1 ; TUNIT-NEXT: [[G]] = getelementptr i8, ptr [[P]], i64 1 ; TUNIT-NEXT: [[O]] = add nsw i8 [[I]], 1 ; TUNIT-NEXT: [[C:%.*]] = icmp eq i8 [[O]], 3 -; TUNIT-NEXT: br i1 [[C]], label [[END:%.*]], label [[LOOP]] -; TUNIT: end: +; TUNIT-NEXT: br i1 [[C]], label %[[END:.*]], label %[[LOOP]] +; TUNIT: [[END]]: ; TUNIT-NEXT: [[S11:%.*]] = getelementptr i8, ptr @a1, i64 2 ; TUNIT-NEXT: [[L11:%.*]] = load i8, ptr [[S11]], align 2 ; TUNIT-NEXT: [[S12:%.*]] = getelementptr i8, ptr @a1, i64 3 @@ -2095,19 +2095,19 @@ define i8 @phi_no_store_1() { ; TUNIT-NEXT: ret i8 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@phi_no_store_1 -; CGSCC-SAME: () #[[ATTR5]] { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: br label [[LOOP:%.*]] -; 
CGSCC: loop: -; CGSCC-NEXT: [[P:%.*]] = phi ptr [ @a1, [[ENTRY:%.*]] ], [ [[G:%.*]], [[LOOP]] ] -; CGSCC-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[O:%.*]], [[LOOP]] ] +; CGSCC-LABEL: define i8 @phi_no_store_1( +; CGSCC-SAME: ) #[[ATTR5]] { +; CGSCC-NEXT: [[ENTRY:.*]]: +; CGSCC-NEXT: br label %[[LOOP:.*]] +; CGSCC: [[LOOP]]: +; CGSCC-NEXT: [[P:%.*]] = phi ptr [ @a1, %[[ENTRY]] ], [ [[G:%.*]], %[[LOOP]] ] +; CGSCC-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[O:%.*]], %[[LOOP]] ] ; CGSCC-NEXT: store i8 1, ptr [[P]], align 1 ; CGSCC-NEXT: [[G]] = getelementptr i8, ptr [[P]], i64 1 ; CGSCC-NEXT: [[O]] = add nsw i8 [[I]], 1 ; CGSCC-NEXT: [[C:%.*]] = icmp eq i8 [[O]], 3 -; CGSCC-NEXT: br i1 [[C]], label [[END:%.*]], label [[LOOP]] -; CGSCC: end: +; CGSCC-NEXT: br i1 [[C]], label %[[END:.*]], label %[[LOOP]] +; CGSCC: [[END]]: ; CGSCC-NEXT: [[S11:%.*]] = getelementptr i8, ptr @a1, i64 2 ; CGSCC-NEXT: [[L11:%.*]] = load i8, ptr [[S11]], align 2 ; CGSCC-NEXT: [[S12:%.*]] = getelementptr i8, ptr @a1, i64 3 @@ -2138,19 +2138,19 @@ end: define i8 @phi_no_store_2() { ; ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@phi_no_store_2 -; TUNIT-SAME: () #[[ATTR3]] { -; TUNIT-NEXT: entry: -; TUNIT-NEXT: br label [[LOOP:%.*]] -; TUNIT: loop: -; TUNIT-NEXT: [[P:%.*]] = phi ptr [ @a2, [[ENTRY:%.*]] ], [ [[G:%.*]], [[LOOP]] ] -; TUNIT-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[O:%.*]], [[LOOP]] ] +; TUNIT-LABEL: define i8 @phi_no_store_2( +; TUNIT-SAME: ) #[[ATTR3]] { +; TUNIT-NEXT: [[ENTRY:.*]]: +; TUNIT-NEXT: br label %[[LOOP:.*]] +; TUNIT: [[LOOP]]: +; TUNIT-NEXT: [[P:%.*]] = phi ptr [ @a2, %[[ENTRY]] ], [ [[G:%.*]], %[[LOOP]] ] +; TUNIT-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[O:%.*]], %[[LOOP]] ] ; TUNIT-NEXT: store i8 1, ptr [[P]], align 1 ; TUNIT-NEXT: [[G]] = getelementptr i8, ptr @a2, i64 2 ; TUNIT-NEXT: [[O]] = add nsw i8 [[I]], 1 ; TUNIT-NEXT: [[C:%.*]] = icmp eq i8 [[O]], 7 -; TUNIT-NEXT: br 
i1 [[C]], label [[END:%.*]], label [[LOOP]] -; TUNIT: end: +; TUNIT-NEXT: br i1 [[C]], label %[[END:.*]], label %[[LOOP]] +; TUNIT: [[END]]: ; TUNIT-NEXT: [[S21:%.*]] = getelementptr i8, ptr @a2, i64 2 ; TUNIT-NEXT: [[L21:%.*]] = load i8, ptr [[S21]], align 2 ; TUNIT-NEXT: [[S22:%.*]] = getelementptr i8, ptr @a2, i64 3 @@ -2159,19 +2159,19 @@ define i8 @phi_no_store_2() { ; TUNIT-NEXT: ret i8 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@phi_no_store_2 -; CGSCC-SAME: () #[[ATTR5]] { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: br label [[LOOP:%.*]] -; CGSCC: loop: -; CGSCC-NEXT: [[P:%.*]] = phi ptr [ @a2, [[ENTRY:%.*]] ], [ [[G:%.*]], [[LOOP]] ] -; CGSCC-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[O:%.*]], [[LOOP]] ] +; CGSCC-LABEL: define i8 @phi_no_store_2( +; CGSCC-SAME: ) #[[ATTR5]] { +; CGSCC-NEXT: [[ENTRY:.*]]: +; CGSCC-NEXT: br label %[[LOOP:.*]] +; CGSCC: [[LOOP]]: +; CGSCC-NEXT: [[P:%.*]] = phi ptr [ @a2, %[[ENTRY]] ], [ [[G:%.*]], %[[LOOP]] ] +; CGSCC-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[O:%.*]], %[[LOOP]] ] ; CGSCC-NEXT: store i8 1, ptr [[P]], align 1 ; CGSCC-NEXT: [[G]] = getelementptr i8, ptr @a2, i64 2 ; CGSCC-NEXT: [[O]] = add nsw i8 [[I]], 1 ; CGSCC-NEXT: [[C:%.*]] = icmp eq i8 [[O]], 7 -; CGSCC-NEXT: br i1 [[C]], label [[END:%.*]], label [[LOOP]] -; CGSCC: end: +; CGSCC-NEXT: br i1 [[C]], label %[[END:.*]], label %[[LOOP]] +; CGSCC: [[END]]: ; CGSCC-NEXT: [[S21:%.*]] = getelementptr i8, ptr @a2, i64 2 ; CGSCC-NEXT: [[L21:%.*]] = load i8, ptr [[S21]], align 2 ; CGSCC-NEXT: [[S22:%.*]] = getelementptr i8, ptr @a2, i64 3 @@ -2200,21 +2200,21 @@ end: define i8 @phi_no_store_3() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@phi_no_store_3 -; TUNIT-SAME: () #[[ATTR3]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define i8 @phi_no_store_3( +; TUNIT-SAME: ) #[[ATTR3]] { +; TUNIT-NEXT: [[ENTRY:.*]]: ; 
TUNIT-NEXT: [[S30:%.*]] = getelementptr i8, ptr @a3, i64 3 ; TUNIT-NEXT: store i8 0, ptr [[S30]], align 1 -; TUNIT-NEXT: br label [[LOOP:%.*]] -; TUNIT: loop: -; TUNIT-NEXT: [[P:%.*]] = phi ptr [ @a3, [[ENTRY:%.*]] ], [ [[G:%.*]], [[LOOP]] ] -; TUNIT-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[O:%.*]], [[LOOP]] ] +; TUNIT-NEXT: br label %[[LOOP:.*]] +; TUNIT: [[LOOP]]: +; TUNIT-NEXT: [[P:%.*]] = phi ptr [ @a3, %[[ENTRY]] ], [ [[G:%.*]], %[[LOOP]] ] +; TUNIT-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[O:%.*]], %[[LOOP]] ] ; TUNIT-NEXT: store i8 1, ptr [[P]], align 1 ; TUNIT-NEXT: [[G]] = getelementptr i8, ptr @a3, i64 2 ; TUNIT-NEXT: [[O]] = add nsw i8 [[I]], 1 ; TUNIT-NEXT: [[C:%.*]] = icmp eq i8 [[O]], 7 -; TUNIT-NEXT: br i1 [[C]], label [[END:%.*]], label [[LOOP]] -; TUNIT: end: +; TUNIT-NEXT: br i1 [[C]], label %[[END:.*]], label %[[LOOP]] +; TUNIT: [[END]]: ; TUNIT-NEXT: [[S31:%.*]] = getelementptr i8, ptr @a3, i64 2 ; TUNIT-NEXT: [[L31:%.*]] = load i8, ptr [[S31]], align 2 ; TUNIT-NEXT: [[S32:%.*]] = getelementptr i8, ptr @a3, i64 3 @@ -2226,21 +2226,21 @@ define i8 @phi_no_store_3() { ; TUNIT-NEXT: ret i8 [[ADD2]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@phi_no_store_3 -; CGSCC-SAME: () #[[ATTR5]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define i8 @phi_no_store_3( +; CGSCC-SAME: ) #[[ATTR5]] { +; CGSCC-NEXT: [[ENTRY:.*]]: ; CGSCC-NEXT: [[S30:%.*]] = getelementptr i8, ptr @a3, i64 3 ; CGSCC-NEXT: store i8 0, ptr [[S30]], align 1 -; CGSCC-NEXT: br label [[LOOP:%.*]] -; CGSCC: loop: -; CGSCC-NEXT: [[P:%.*]] = phi ptr [ @a3, [[ENTRY:%.*]] ], [ [[G:%.*]], [[LOOP]] ] -; CGSCC-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[O:%.*]], [[LOOP]] ] +; CGSCC-NEXT: br label %[[LOOP:.*]] +; CGSCC: [[LOOP]]: +; CGSCC-NEXT: [[P:%.*]] = phi ptr [ @a3, %[[ENTRY]] ], [ [[G:%.*]], %[[LOOP]] ] +; CGSCC-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[O:%.*]], %[[LOOP]] ] ; CGSCC-NEXT: store i8 1, ptr 
[[P]], align 1 ; CGSCC-NEXT: [[G]] = getelementptr i8, ptr @a3, i64 2 ; CGSCC-NEXT: [[O]] = add nsw i8 [[I]], 1 ; CGSCC-NEXT: [[C:%.*]] = icmp eq i8 [[O]], 7 -; CGSCC-NEXT: br i1 [[C]], label [[END:%.*]], label [[LOOP]] -; CGSCC: end: +; CGSCC-NEXT: br i1 [[C]], label %[[END:.*]], label %[[LOOP]] +; CGSCC: [[END]]: ; CGSCC-NEXT: [[S31:%.*]] = getelementptr i8, ptr @a3, i64 2 ; CGSCC-NEXT: [[L31:%.*]] = load i8, ptr [[S31]], align 2 ; CGSCC-NEXT: [[S32:%.*]] = getelementptr i8, ptr @a3, i64 3 @@ -2277,15 +2277,15 @@ end: define i8 @cast_and_load_1() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@cast_and_load_1 -; TUNIT-SAME: () #[[ATTR3]] { +; TUNIT-LABEL: define i8 @cast_and_load_1( +; TUNIT-SAME: ) #[[ATTR3]] { ; TUNIT-NEXT: store i32 42, ptr @bytes1, align 4 ; TUNIT-NEXT: [[L:%.*]] = load i8, ptr @bytes1, align 4 ; TUNIT-NEXT: ret i8 [[L]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@cast_and_load_1 -; CGSCC-SAME: () #[[ATTR5]] { +; CGSCC-LABEL: define i8 @cast_and_load_1( +; CGSCC-SAME: ) #[[ATTR5]] { ; CGSCC-NEXT: store i32 42, ptr @bytes1, align 4 ; CGSCC-NEXT: [[L:%.*]] = load i8, ptr @bytes1, align 4 ; CGSCC-NEXT: ret i8 [[L]] @@ -2297,15 +2297,15 @@ define i8 @cast_and_load_1() { define i64 @cast_and_load_2() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@cast_and_load_2 -; TUNIT-SAME: () #[[ATTR3]] { +; TUNIT-LABEL: define i64 @cast_and_load_2( +; TUNIT-SAME: ) #[[ATTR3]] { ; TUNIT-NEXT: store i32 42, ptr @bytes2, align 4 ; TUNIT-NEXT: [[L:%.*]] = load i64, ptr @bytes2, align 4 ; TUNIT-NEXT: ret i64 [[L]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@cast_and_load_2 -; CGSCC-SAME: () #[[ATTR5]] { +; CGSCC-LABEL: define i64 @cast_and_load_2( +; CGSCC-SAME: ) #[[ATTR5]] { ; 
CGSCC-NEXT: store i32 42, ptr @bytes2, align 4 ; CGSCC-NEXT: [[L:%.*]] = load i64, ptr @bytes2, align 4 ; CGSCC-NEXT: ret i64 [[L]] @@ -2318,33 +2318,33 @@ define i64 @cast_and_load_2() { define void @recursive_load_store(i64 %N, i32 %v) { ; ; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(write) -; TUNIT-LABEL: define {{[^@]+}}@recursive_load_store -; TUNIT-SAME: (i64 [[N:%.*]], i32 [[V:%.*]]) #[[ATTR7:[0-9]+]] { -; TUNIT-NEXT: entry: -; TUNIT-NEXT: br label [[FOR_COND:%.*]] -; TUNIT: for.cond: -; TUNIT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; TUNIT-LABEL: define void @recursive_load_store( +; TUNIT-SAME: i64 [[N:%.*]], i32 [[V:%.*]]) #[[ATTR7:[0-9]+]] { +; TUNIT-NEXT: [[ENTRY:.*]]: +; TUNIT-NEXT: br label %[[FOR_COND:.*]] +; TUNIT: [[FOR_COND]]: +; TUNIT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY:.*]] ], [ 0, %[[ENTRY]] ] ; TUNIT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[N]] -; TUNIT-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; TUNIT: for.body: +; TUNIT-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; TUNIT: [[FOR_BODY]]: ; TUNIT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; TUNIT-NEXT: br label [[FOR_COND]] -; TUNIT: for.end: +; TUNIT-NEXT: br label %[[FOR_COND]] +; TUNIT: [[FOR_END]]: ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: nofree norecurse nosync nounwind memory(write) -; CGSCC-LABEL: define {{[^@]+}}@recursive_load_store -; CGSCC-SAME: (i64 [[N:%.*]], i32 [[V:%.*]]) #[[ATTR8:[0-9]+]] { -; CGSCC-NEXT: entry: -; CGSCC-NEXT: br label [[FOR_COND:%.*]] -; CGSCC: for.cond: -; CGSCC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CGSCC-LABEL: define void @recursive_load_store( +; CGSCC-SAME: i64 [[N:%.*]], i32 [[V:%.*]]) #[[ATTR8:[0-9]+]] { +; CGSCC-NEXT: [[ENTRY:.*]]: +; CGSCC-NEXT: br label %[[FOR_COND:.*]] 
+; CGSCC: [[FOR_COND]]: +; CGSCC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY:.*]] ], [ 0, %[[ENTRY]] ] ; CGSCC-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[N]] -; CGSCC-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; CGSCC: for.body: +; CGSCC-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; CGSCC: [[FOR_BODY]]: ; CGSCC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CGSCC-NEXT: br label [[FOR_COND]] -; CGSCC: for.end: +; CGSCC-NEXT: br label %[[FOR_COND]] +; CGSCC: [[FOR_END]]: ; CGSCC-NEXT: ret void ; entry: @@ -2369,9 +2369,9 @@ for.end: } define dso_local i32 @round_trip_malloc(i32 %x) { -; CHECK-LABEL: define {{[^@]+}}@round_trip_malloc -; CHECK-SAME: (i32 returned [[X:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define dso_local i32 @round_trip_malloc( +; CHECK-SAME: i32 returned [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL_H2S:%.*]] = alloca i8, i64 4, align 1 ; CHECK-NEXT: store i32 [[X]], ptr [[CALL_H2S]], align 4 ; CHECK-NEXT: ret i32 [[X]] @@ -2385,8 +2385,8 @@ entry: } define dso_local i32 @round_trip_malloc_constant() { -; CHECK-LABEL: define {{[^@]+}}@round_trip_malloc_constant() { -; CHECK-NEXT: entry: +; CHECK-LABEL: define dso_local noundef i32 @round_trip_malloc_constant() { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: ret i32 7 ; entry: @@ -2402,16 +2402,16 @@ declare noalias ptr @malloc(i64) allockind("alloc,uninitialized") allocsize(0) " declare void @free(ptr) allockind("free") "alloc-family"="malloc" define dso_local i32 @conditional_malloc(i32 %x) { -; CHECK-LABEL: define {{[^@]+}}@conditional_malloc -; CHECK-SAME: (i32 returned [[X:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define dso_local i32 @conditional_malloc( +; CHECK-SAME: i32 returned [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL_H2S:%.*]] = alloca i8, i64 4, align 1 ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 -; CHECK-NEXT: br i1 
[[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: +; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: ; CHECK-NEXT: store i32 [[X]], ptr [[CALL_H2S]], align 4 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: ; CHECK-NEXT: ret i32 [[X]] ; entry: @@ -2429,9 +2429,9 @@ if.end: ; preds = %if.then, %entry } define dso_local i32 @round_trip_calloc(i32 %x) { -; CHECK-LABEL: define {{[^@]+}}@round_trip_calloc -; CHECK-SAME: (i32 returned [[X:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define dso_local i32 @round_trip_calloc( +; CHECK-SAME: i32 returned [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL_H2S:%.*]] = alloca i8, i64 4, align 1 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[CALL_H2S]], i8 0, i64 4, i1 false) ; CHECK-NEXT: store i32 [[X]], ptr [[CALL_H2S]], align 4 @@ -2445,8 +2445,8 @@ entry: } define dso_local i32 @round_trip_calloc_constant() { -; CHECK-LABEL: define {{[^@]+}}@round_trip_calloc_constant() { -; CHECK-NEXT: entry: +; CHECK-LABEL: define dso_local noundef i32 @round_trip_calloc_constant() { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL_H2S:%.*]] = alloca i8, i64 4, align 1 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[CALL_H2S]], i8 0, i64 4, i1 false) ; CHECK-NEXT: ret i32 11 @@ -2461,17 +2461,17 @@ entry: declare noalias ptr @calloc(i64, i64) allockind("alloc,zeroed") allocsize(0, 1) "alloc-family"="malloc" define dso_local i32 @conditional_calloc(i32 %x) { -; CHECK-LABEL: define {{[^@]+}}@conditional_calloc -; CHECK-SAME: (i32 [[X:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define dso_local i32 @conditional_calloc( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL_H2S:%.*]] = alloca i8, i64 4, align 1 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[CALL_H2S]], i8 0, i64 4, i1 false) ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 -; CHECK-NEXT: 
br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: +; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] +; CHECK: [[IF_THEN]]: ; CHECK-NEXT: store i32 [[X]], ptr [[CALL_H2S]], align 4 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CALL_H2S]], align 4 ; CHECK-NEXT: ret i32 [[TMP0]] ; @@ -2491,15 +2491,15 @@ if.end: ; preds = %if.then, %entry } define dso_local i32 @conditional_calloc_zero(i1 %c) { -; CHECK-LABEL: define {{[^@]+}}@conditional_calloc_zero -; CHECK-SAME: (i1 [[C:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define dso_local noundef i32 @conditional_calloc_zero( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL_H2S:%.*]] = alloca i8, i64 4, align 1 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[CALL_H2S]], i8 0, i64 4, i1 false) -; CHECK-NEXT: br i1 [[C]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: br i1 [[C]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: ; CHECK-NEXT: ret i32 0 ; entry: @@ -2517,16 +2517,16 @@ if.end: ; preds = %if.then, %entry } define dso_local ptr @malloc_like(i32 %s) { -; TUNIT-LABEL: define {{[^@]+}}@malloc_like -; TUNIT-SAME: (i32 [[S:%.*]]) { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define dso_local noalias ptr @malloc_like( +; TUNIT-SAME: i32 [[S:%.*]]) { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[CONV:%.*]] = sext i32 [[S]] to i64 ; TUNIT-NEXT: [[CALL:%.*]] = call noalias ptr @malloc(i64 [[CONV]]) #[[ATTR20:[0-9]+]] ; TUNIT-NEXT: ret ptr [[CALL]] ; -; CGSCC-LABEL: define {{[^@]+}}@malloc_like -; CGSCC-SAME: (i32 [[S:%.*]]) { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define dso_local noalias ptr @malloc_like( +; CGSCC-SAME: i32 [[S:%.*]]) { +; CGSCC-NEXT: [[ENTRY:.*:]] ; 
CGSCC-NEXT: [[CONV:%.*]] = sext i32 [[S]] to i64 ; CGSCC-NEXT: [[CALL:%.*]] = call noalias ptr @malloc(i64 [[CONV]]) #[[ATTR23:[0-9]+]] ; CGSCC-NEXT: ret ptr [[CALL]] @@ -2538,18 +2538,18 @@ entry: } define dso_local i32 @round_trip_malloc_like(i32 %x) { -; TUNIT-LABEL: define {{[^@]+}}@round_trip_malloc_like -; TUNIT-SAME: (i32 [[X:%.*]]) { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define dso_local i32 @round_trip_malloc_like( +; TUNIT-SAME: i32 [[X:%.*]]) { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[CALL:%.*]] = call noalias ptr @malloc_like(i32 noundef 4) #[[ATTR20]] ; TUNIT-NEXT: store i32 [[X]], ptr [[CALL]], align 4 ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[CALL]], align 4 ; TUNIT-NEXT: call void @free(ptr noundef nonnull align 4 dereferenceable(4) [[CALL]]) #[[ATTR20]] ; TUNIT-NEXT: ret i32 [[TMP0]] ; -; CGSCC-LABEL: define {{[^@]+}}@round_trip_malloc_like -; CGSCC-SAME: (i32 [[X:%.*]]) { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define dso_local i32 @round_trip_malloc_like( +; CGSCC-SAME: i32 [[X:%.*]]) { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[CALL:%.*]] = call noalias ptr @malloc_like(i32 noundef 4) #[[ATTR23]] ; CGSCC-NEXT: store i32 [[X]], ptr [[CALL]], align 4 ; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[CALL]], align 4 @@ -2565,18 +2565,18 @@ entry: } define dso_local i32 @round_trip_unknown_alloc(i32 %x) { -; TUNIT-LABEL: define {{[^@]+}}@round_trip_unknown_alloc -; TUNIT-SAME: (i32 [[X:%.*]]) { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define dso_local i32 @round_trip_unknown_alloc( +; TUNIT-SAME: i32 [[X:%.*]]) { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[CALL:%.*]] = call noalias ptr @unknown_alloc(i32 noundef 4) #[[ATTR20]] ; TUNIT-NEXT: store i32 [[X]], ptr [[CALL]], align 4 ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[CALL]], align 4 ; TUNIT-NEXT: call void @free(ptr noundef nonnull align 4 dereferenceable(4) [[CALL]]) #[[ATTR20]] ; TUNIT-NEXT: ret i32 [[TMP0]] ; -; CGSCC-LABEL: define {{[^@]+}}@round_trip_unknown_alloc -; CGSCC-SAME: (i32 
[[X:%.*]]) { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define dso_local i32 @round_trip_unknown_alloc( +; CGSCC-SAME: i32 [[X:%.*]]) { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[CALL:%.*]] = call noalias ptr @unknown_alloc(i32 noundef 4) #[[ATTR23]] ; CGSCC-NEXT: store i32 [[X]], ptr [[CALL]], align 4 ; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[CALL]], align 4 @@ -2594,30 +2594,30 @@ entry: declare noalias ptr @unknown_alloc(i32) define dso_local i32 @conditional_unknown_alloc(i32 %x) { -; TUNIT-LABEL: define {{[^@]+}}@conditional_unknown_alloc -; TUNIT-SAME: (i32 [[X:%.*]]) { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define dso_local i32 @conditional_unknown_alloc( +; TUNIT-SAME: i32 [[X:%.*]]) { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[CALL:%.*]] = call noalias ptr @unknown_alloc(i32 noundef 4) #[[ATTR20]] ; TUNIT-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 -; TUNIT-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; TUNIT: if.then: +; TUNIT-NEXT: br i1 [[TOBOOL]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] +; TUNIT: [[IF_THEN]]: ; TUNIT-NEXT: store i32 [[X]], ptr [[CALL]], align 4 -; TUNIT-NEXT: br label [[IF_END]] -; TUNIT: if.end: +; TUNIT-NEXT: br label %[[IF_END]] +; TUNIT: [[IF_END]]: ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[CALL]], align 4 ; TUNIT-NEXT: call void @free(ptr nonnull align 4 dereferenceable(4) [[CALL]]) #[[ATTR20]] ; TUNIT-NEXT: ret i32 [[TMP0]] ; -; CGSCC-LABEL: define {{[^@]+}}@conditional_unknown_alloc -; CGSCC-SAME: (i32 [[X:%.*]]) { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define dso_local i32 @conditional_unknown_alloc( +; CGSCC-SAME: i32 [[X:%.*]]) { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[CALL:%.*]] = call noalias ptr @unknown_alloc(i32 noundef 4) #[[ATTR23]] ; CGSCC-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 -; CGSCC-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; CGSCC: if.then: +; CGSCC-NEXT: br i1 [[TOBOOL]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] +; CGSCC: [[IF_THEN]]: ; 
CGSCC-NEXT: store i32 [[X]], ptr [[CALL]], align 4 -; CGSCC-NEXT: br label [[IF_END]] -; CGSCC: if.end: +; CGSCC-NEXT: br label %[[IF_END]] +; CGSCC: [[IF_END]]: ; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[CALL]], align 4 ; CGSCC-NEXT: call void @free(ptr nonnull align 4 dereferenceable(4) [[CALL]]) #[[ATTR23]] ; CGSCC-NEXT: ret i32 [[TMP0]] @@ -2643,9 +2643,9 @@ if.end: ; preds = %if.then, %entry ; We mark %dst as writeonly and %src as readonly, that is (for now) all we can expect. define dso_local void @test_nested_memory(ptr %dst, ptr %src) { -; TUNIT-LABEL: define {{[^@]+}}@test_nested_memory -; TUNIT-SAME: (ptr nofree writeonly captures(none) [[DST:%.*]], ptr nofree readonly captures(none) [[SRC:%.*]]) { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define dso_local void @test_nested_memory( +; TUNIT-SAME: ptr nofree writeonly captures(none) [[DST:%.*]], ptr nofree readonly captures(none) [[SRC:%.*]]) { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[CALL_H2S:%.*]] = alloca i8, i64 24, align 1 ; TUNIT-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 ; TUNIT-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[LOCAL]], i64 0, i32 2 @@ -2662,9 +2662,9 @@ define dso_local void @test_nested_memory(ptr %dst, ptr %src) { ; TUNIT-NEXT: call fastcc void @nested_memory_callee(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]]) #[[ATTR21:[0-9]+]] ; TUNIT-NEXT: ret void ; -; CGSCC-LABEL: define {{[^@]+}}@test_nested_memory -; CGSCC-SAME: (ptr nofree [[DST:%.*]], ptr nofree [[SRC:%.*]]) { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define dso_local void @test_nested_memory( +; CGSCC-SAME: ptr nofree [[DST:%.*]], ptr nofree [[SRC:%.*]]) { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 ; CGSCC-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[LOCAL]], i64 0, i32 2 ; CGSCC-NEXT: [[CALL:%.*]] = call noalias dereferenceable_or_null(24) ptr @malloc(i64 noundef 24) @@ -2690,9 +2690,9 @@ entry: define internal fastcc 
void @nested_memory_callee(ptr nocapture readonly %S) nofree norecurse nounwind uwtable { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn uwtable -; TUNIT-LABEL: define {{[^@]+}}@nested_memory_callee -; TUNIT-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]]) #[[ATTR11:[0-9]+]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define internal fastcc void @nested_memory_callee( +; TUNIT-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]]) #[[ATTR11:[0-9]+]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[S_PRIV:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 ; TUNIT-NEXT: store ptr [[TMP0]], ptr [[S_PRIV]], align 8 ; TUNIT-NEXT: [[S_PRIV_B8:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 8 @@ -2700,21 +2700,21 @@ define internal fastcc void @nested_memory_callee(ptr nocapture readonly %S) nof ; TUNIT-NEXT: [[S_PRIV_B16:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 16 ; TUNIT-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_B16]], align 8 ; TUNIT-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2 -; TUNIT-NEXT: [[TMP3:%.*]] = load ptr, ptr [[INNER]], align 8 +; TUNIT-NEXT: [[TMP3:%.*]] = load ptr, ptr [[INNER]], align 8, !invariant.load [[META32:![0-9]+]] ; TUNIT-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[TMP3]], i64 0, i32 2 -; TUNIT-NEXT: [[TMP4:%.*]] = load ptr, ptr [[INNER1]], align 8 +; TUNIT-NEXT: [[TMP4:%.*]] = load ptr, ptr [[INNER1]], align 8, !invariant.load [[META32]] ; TUNIT-NEXT: [[SRC:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[TMP4]], i64 0, i32 1 -; TUNIT-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SRC]], align 8 -; TUNIT-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP5]], align 8 +; TUNIT-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SRC]], align 8, !invariant.load [[META32]] +; TUNIT-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP5]], align 8, !invariant.load [[META32]] ; TUNIT-NEXT: [[CONV:%.*]] = fptrunc double [[TMP6]] to float -; TUNIT-NEXT: [[TMP7:%.*]] = load ptr, ptr 
[[TMP4]], align 8 +; TUNIT-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP4]], align 8, !invariant.load [[META32]] ; TUNIT-NEXT: store float [[CONV]], ptr [[TMP7]], align 4 ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn uwtable -; CGSCC-LABEL: define {{[^@]+}}@nested_memory_callee -; CGSCC-SAME: (ptr nofree [[TMP0:%.*]], ptr nofree [[TMP1:%.*]], ptr nofree [[TMP2:%.*]]) #[[ATTR12:[0-9]+]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define internal fastcc void @nested_memory_callee( +; CGSCC-SAME: ptr nofree [[TMP0:%.*]], ptr nofree [[TMP1:%.*]], ptr nofree [[TMP2:%.*]]) #[[ATTR12:[0-9]+]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[S_PRIV:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 ; CGSCC-NEXT: store ptr [[TMP0]], ptr [[S_PRIV]], align 8 ; CGSCC-NEXT: [[S_PRIV_B8:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 8 @@ -2722,14 +2722,14 @@ define internal fastcc void @nested_memory_callee(ptr nocapture readonly %S) nof ; CGSCC-NEXT: [[S_PRIV_B16:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 16 ; CGSCC-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_B16]], align 8 ; CGSCC-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2 -; CGSCC-NEXT: [[TMP3:%.*]] = load ptr, ptr [[INNER]], align 8 +; CGSCC-NEXT: [[TMP3:%.*]] = load ptr, ptr [[INNER]], align 8, !invariant.load [[META32:![0-9]+]] ; CGSCC-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[TMP3]], i64 0, i32 2 -; CGSCC-NEXT: [[TMP4:%.*]] = load ptr, ptr [[INNER1]], align 8 +; CGSCC-NEXT: [[TMP4:%.*]] = load ptr, ptr [[INNER1]], align 8, !invariant.load [[META32]] ; CGSCC-NEXT: [[SRC:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[TMP4]], i64 0, i32 1 -; CGSCC-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SRC]], align 8 -; CGSCC-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP5]], align 8 +; CGSCC-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SRC]], align 8, !invariant.load [[META32]] +; CGSCC-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP5]], 
align 8, !invariant.load [[META32]] ; CGSCC-NEXT: [[CONV:%.*]] = fptrunc double [[TMP6]] to float -; CGSCC-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CGSCC-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP4]], align 8, !invariant.load [[META32]] ; CGSCC-NEXT: store float [[CONV]], ptr [[TMP7]], align 4 ; CGSCC-NEXT: ret void ; @@ -2751,34 +2751,34 @@ entry: ; varying and the accesses thus not "exact". This used to simplify %cmp12 to true. define hidden void @no_propagation_of_unknown_index_access(ptr %in, ptr %out, i32 %idx) #0 { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -; TUNIT-LABEL: define {{[^@]+}}@no_propagation_of_unknown_index_access -; TUNIT-SAME: (ptr nofree readonly captures(none) [[IN:%.*]], ptr nofree writeonly captures(none) [[OUT:%.*]], i32 [[IDX:%.*]]) #[[ATTR1]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define hidden void @no_propagation_of_unknown_index_access( +; TUNIT-SAME: ptr nofree readonly captures(none) [[IN:%.*]], ptr nofree writeonly captures(none) [[OUT:%.*]], i32 [[IDX:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: [[ENTRY:.*]]: ; TUNIT-NEXT: [[BUF:%.*]] = alloca [128 x i32], align 16 ; TUNIT-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 16 captures(none) dereferenceable(512) [[BUF]]) #[[ATTR17]] -; TUNIT-NEXT: br label [[FOR_COND:%.*]] -; TUNIT: for.cond: -; TUNIT-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; TUNIT-NEXT: br label %[[FOR_COND:.*]] +; TUNIT: [[FOR_COND]]: +; TUNIT-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; TUNIT-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 128 -; TUNIT-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; TUNIT: for.cond.cleanup: -; TUNIT-NEXT: br label [[FOR_COND4:%.*]] -; TUNIT: for.body: +; TUNIT-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; TUNIT: [[FOR_COND_CLEANUP]]: +; TUNIT-NEXT: 
br label %[[FOR_COND4:.*]] +; TUNIT: [[FOR_BODY]]: ; TUNIT-NEXT: [[IDXPROM:%.*]] = sext i32 [[I_0]] to i64 ; TUNIT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[IDXPROM]] -; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !invariant.load [[META32]] ; TUNIT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [128 x i32], ptr [[BUF]], i64 0, i64 [[IDXPROM]] ; TUNIT-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX2]], align 4 ; TUNIT-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; TUNIT-NEXT: br label [[FOR_COND]], !llvm.loop [[TBAA10]] -; TUNIT: for.cond4: -; TUNIT-NEXT: [[I3_0:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP]] ], [ [[INC16:%.*]], [[FOR_BODY7:%.*]] ] +; TUNIT-NEXT: br label %[[FOR_COND]], !llvm.loop [[FLOAT_TBAA10]] +; TUNIT: [[FOR_COND4]]: +; TUNIT-NEXT: [[I3_0:%.*]] = phi i32 [ 0, %[[FOR_COND_CLEANUP]] ], [ [[INC16:%.*]], %[[FOR_BODY7:.*]] ] ; TUNIT-NEXT: [[CMP5:%.*]] = icmp slt i32 [[I3_0]], 128 -; TUNIT-NEXT: br i1 [[CMP5]], label [[FOR_BODY7]], label [[FOR_COND_CLEANUP6:%.*]] -; TUNIT: for.cond.cleanup6: +; TUNIT-NEXT: br i1 [[CMP5]], label %[[FOR_BODY7]], label %[[FOR_COND_CLEANUP6:.*]] +; TUNIT: [[FOR_COND_CLEANUP6]]: ; TUNIT-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 16 captures(none) dereferenceable(512) [[BUF]]) #[[ATTR17]] ; TUNIT-NEXT: ret void -; TUNIT: for.body7: +; TUNIT: [[FOR_BODY7]]: ; TUNIT-NEXT: [[IDXPROM8:%.*]] = sext i32 [[I3_0]] to i64 ; TUNIT-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [128 x i32], ptr [[BUF]], i64 0, i64 [[IDXPROM8]] ; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX9]], align 4 @@ -2790,37 +2790,37 @@ define hidden void @no_propagation_of_unknown_index_access(ptr %in, ptr %out, i3 ; TUNIT-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IDXPROM8]] ; TUNIT-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX14]], align 4 ; TUNIT-NEXT: [[INC16]] = add nsw i32 [[I3_0]], 1 -; 
TUNIT-NEXT: br label [[FOR_COND4]], !llvm.loop [[TBAA12]] +; TUNIT-NEXT: br label %[[FOR_COND4]], !llvm.loop [[INT_TBAA12]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -; CGSCC-LABEL: define {{[^@]+}}@no_propagation_of_unknown_index_access -; CGSCC-SAME: (ptr nofree readonly captures(none) [[IN:%.*]], ptr nofree writeonly captures(none) [[OUT:%.*]], i32 [[IDX:%.*]]) #[[ATTR13:[0-9]+]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define hidden void @no_propagation_of_unknown_index_access( +; CGSCC-SAME: ptr nofree readonly captures(none) [[IN:%.*]], ptr nofree writeonly captures(none) [[OUT:%.*]], i32 [[IDX:%.*]]) #[[ATTR13:[0-9]+]] { +; CGSCC-NEXT: [[ENTRY:.*]]: ; CGSCC-NEXT: [[BUF:%.*]] = alloca [128 x i32], align 16 ; CGSCC-NEXT: call void @llvm.lifetime.start.p0(ptr noalias nofree noundef nonnull align 16 captures(none) dereferenceable(512) [[BUF]]) #[[ATTR20]] -; CGSCC-NEXT: br label [[FOR_COND:%.*]] -; CGSCC: for.cond: -; CGSCC-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; CGSCC-NEXT: br label %[[FOR_COND:.*]] +; CGSCC: [[FOR_COND]]: +; CGSCC-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; CGSCC-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 128 -; CGSCC-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; CGSCC: for.cond.cleanup: -; CGSCC-NEXT: br label [[FOR_COND4:%.*]] -; CGSCC: for.body: +; CGSCC-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; CGSCC: [[FOR_COND_CLEANUP]]: +; CGSCC-NEXT: br label %[[FOR_COND4:.*]] +; CGSCC: [[FOR_BODY]]: ; CGSCC-NEXT: [[IDXPROM:%.*]] = sext i32 [[I_0]] to i64 ; CGSCC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[IDXPROM]] -; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !invariant.load [[META32]] ; CGSCC-NEXT: [[ARRAYIDX2:%.*]] = 
getelementptr inbounds [128 x i32], ptr [[BUF]], i64 0, i64 [[IDXPROM]] ; CGSCC-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX2]], align 4 ; CGSCC-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; CGSCC-NEXT: br label [[FOR_COND]], !llvm.loop [[TBAA10]] -; CGSCC: for.cond4: -; CGSCC-NEXT: [[I3_0:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP]] ], [ [[INC16:%.*]], [[FOR_BODY7:%.*]] ] +; CGSCC-NEXT: br label %[[FOR_COND]], !llvm.loop [[FLOAT_TBAA10]] +; CGSCC: [[FOR_COND4]]: +; CGSCC-NEXT: [[I3_0:%.*]] = phi i32 [ 0, %[[FOR_COND_CLEANUP]] ], [ [[INC16:%.*]], %[[FOR_BODY7:.*]] ] ; CGSCC-NEXT: [[CMP5:%.*]] = icmp slt i32 [[I3_0]], 128 -; CGSCC-NEXT: br i1 [[CMP5]], label [[FOR_BODY7]], label [[FOR_COND_CLEANUP6:%.*]] -; CGSCC: for.cond.cleanup6: +; CGSCC-NEXT: br i1 [[CMP5]], label %[[FOR_BODY7]], label %[[FOR_COND_CLEANUP6:.*]] +; CGSCC: [[FOR_COND_CLEANUP6]]: ; CGSCC-NEXT: call void @llvm.lifetime.end.p0(ptr noalias nofree noundef nonnull align 16 captures(none) dereferenceable(512) [[BUF]]) #[[ATTR20]] ; CGSCC-NEXT: ret void -; CGSCC: for.body7: +; CGSCC: [[FOR_BODY7]]: ; CGSCC-NEXT: [[IDXPROM8:%.*]] = sext i32 [[I3_0]] to i64 ; CGSCC-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [128 x i32], ptr [[BUF]], i64 0, i64 [[IDXPROM8]] ; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX9]], align 4 @@ -2832,7 +2832,7 @@ define hidden void @no_propagation_of_unknown_index_access(ptr %in, ptr %out, i3 ; CGSCC-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IDXPROM8]] ; CGSCC-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX14]], align 4 ; CGSCC-NEXT: [[INC16]] = add nsw i32 [[I3_0]], 1 -; CGSCC-NEXT: br label [[FOR_COND4]], !llvm.loop [[TBAA12]] +; CGSCC-NEXT: br label %[[FOR_COND4]], !llvm.loop [[INT_TBAA12]] ; entry: %buf = alloca [128 x i32], align 16 @@ -2883,30 +2883,30 @@ for.body7: ; preds = %for.cond4 ; Ensure we do not return true. 
define internal i1 @alloca_non_unique(ptr %p, i32 %in, i1 %c) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) -; TUNIT-LABEL: define {{[^@]+}}@alloca_non_unique -; TUNIT-SAME: (ptr noalias nofree readonly align 4 captures(none) [[P:%.*]], i32 [[IN:%.*]], i1 noundef [[C:%.*]]) #[[ATTR12:[0-9]+]] { +; TUNIT-LABEL: define internal i1 @alloca_non_unique( +; TUNIT-SAME: ptr noalias nofree readonly align 4 captures(none) [[P:%.*]], i32 [[IN:%.*]], i1 noundef [[C:%.*]]) #[[ATTR12:[0-9]+]] { ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: store i32 [[IN]], ptr [[A]], align 4 -; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] -; TUNIT: t: +; TUNIT-NEXT: br i1 [[C]], label %[[T:.*]], label %[[F:.*]] +; TUNIT: [[T]]: ; TUNIT-NEXT: [[R:%.*]] = call i1 @alloca_non_unique(ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(4) [[A]], i32 noundef 42, i1 noundef false) #[[ATTR14:[0-9]+]] ; TUNIT-NEXT: ret i1 [[R]] -; TUNIT: f: -; TUNIT-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 4 +; TUNIT: [[F]]: +; TUNIT-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 4, !invariant.load [[META32]] ; TUNIT-NEXT: [[CMP:%.*]] = icmp eq i32 [[IN]], [[L]] ; TUNIT-NEXT: ret i1 [[CMP]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) -; CGSCC-LABEL: define {{[^@]+}}@alloca_non_unique -; CGSCC-SAME: (ptr noalias nofree readonly align 4 captures(none) [[P:%.*]], i32 [[IN:%.*]], i1 noundef [[C:%.*]]) #[[ATTR14:[0-9]+]] { +; CGSCC-LABEL: define internal i1 @alloca_non_unique( +; CGSCC-SAME: ptr noalias nofree readonly align 4 captures(none) [[P:%.*]], i32 [[IN:%.*]], i1 noundef [[C:%.*]]) #[[ATTR14:[0-9]+]] { ; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: store i32 [[IN]], ptr [[A]], align 4 -; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] -; CGSCC: t: +; CGSCC-NEXT: br i1 [[C]], label %[[T:.*]], label %[[F:.*]] +; CGSCC: [[T]]: ; CGSCC-NEXT: [[R:%.*]] = call i1 
@alloca_non_unique(ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(4) [[A]], i32 noundef 42, i1 noundef false) #[[ATTR17:[0-9]+]] ; CGSCC-NEXT: ret i1 [[R]] -; CGSCC: f: -; CGSCC-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 4 +; CGSCC: [[F]]: +; CGSCC-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 4, !invariant.load [[META32]] ; CGSCC-NEXT: [[CMP:%.*]] = icmp eq i32 [[IN]], [[L]] ; CGSCC-NEXT: ret i1 [[CMP]] ; @@ -2925,14 +2925,14 @@ f: ; Ensure we do not return true. define i1 @alloca_non_unique_caller(i32 %in, i1 %c) { ; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none) -; TUNIT-LABEL: define {{[^@]+}}@alloca_non_unique_caller -; TUNIT-SAME: (i32 [[IN:%.*]], i1 [[C:%.*]]) #[[ATTR13:[0-9]+]] { +; TUNIT-LABEL: define i1 @alloca_non_unique_caller( +; TUNIT-SAME: i32 [[IN:%.*]], i1 [[C:%.*]]) #[[ATTR13:[0-9]+]] { ; TUNIT-NEXT: [[R:%.*]] = call i1 @alloca_non_unique(ptr undef, i32 [[IN]], i1 noundef [[C]]) #[[ATTR14]] ; TUNIT-NEXT: ret i1 [[R]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(none) -; CGSCC-LABEL: define {{[^@]+}}@alloca_non_unique_caller -; CGSCC-SAME: (i32 [[IN:%.*]], i1 noundef [[C:%.*]]) #[[ATTR15:[0-9]+]] { +; CGSCC-LABEL: define i1 @alloca_non_unique_caller( +; CGSCC-SAME: i32 [[IN:%.*]], i1 noundef [[C:%.*]]) #[[ATTR15:[0-9]+]] { ; CGSCC-NEXT: [[R:%.*]] = call i1 @alloca_non_unique(ptr nofree undef, i32 [[IN]], i1 noundef [[C]]) #[[ATTR25:[0-9]+]] ; CGSCC-NEXT: ret i1 [[R]] ; @@ -2943,8 +2943,8 @@ define i1 @alloca_non_unique_caller(i32 %in, i1 %c) { ; Ensure we do not return %bad or %l, but %sel define i32 @scope_value_traversal(i32 %bad, i1 %c, i1 %c2) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; TUNIT-LABEL: define {{[^@]+}}@scope_value_traversal -; TUNIT-SAME: (i32 [[BAD:%.*]], i1 [[C:%.*]], i1 [[C2:%.*]]) #[[ATTR4]] { +; TUNIT-LABEL: define i32 @scope_value_traversal( +; TUNIT-SAME: i32 [[BAD:%.*]], i1 [[C:%.*]], i1 
[[C2:%.*]]) #[[ATTR4]] { ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: store i32 [[BAD]], ptr [[A]], align 4 ; TUNIT-NEXT: call void @scope_value_traversal_helper(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A]], i1 [[C2]]) #[[ATTR22:[0-9]+]] @@ -2953,8 +2953,8 @@ define i32 @scope_value_traversal(i32 %bad, i1 %c, i1 %c2) { ; TUNIT-NEXT: ret i32 [[SEL]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) -; CGSCC-LABEL: define {{[^@]+}}@scope_value_traversal -; CGSCC-SAME: (i32 [[BAD:%.*]], i1 [[C:%.*]], i1 [[C2:%.*]]) #[[ATTR16:[0-9]+]] { +; CGSCC-LABEL: define i32 @scope_value_traversal( +; CGSCC-SAME: i32 [[BAD:%.*]], i1 [[C:%.*]], i1 [[C2:%.*]]) #[[ATTR16:[0-9]+]] { ; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: store i32 [[BAD]], ptr [[A]], align 4 ; CGSCC-NEXT: call void @scope_value_traversal_helper(ptr noalias nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A]], i1 [[C2]]) #[[ATTR26:[0-9]+]] @@ -2972,16 +2972,16 @@ define i32 @scope_value_traversal(i32 %bad, i1 %c, i1 %c2) { define void @scope_value_traversal_helper(ptr %a, i1 %c) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -; TUNIT-LABEL: define {{[^@]+}}@scope_value_traversal_helper -; TUNIT-SAME: (ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]], i1 [[C:%.*]]) #[[ATTR1]] { +; TUNIT-LABEL: define void @scope_value_traversal_helper( +; TUNIT-SAME: ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]], i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: [[L:%.*]] = load i32, ptr [[A]], align 4 ; TUNIT-NEXT: [[SEL:%.*]] = select i1 [[C]], i32 [[L]], i32 42 ; TUNIT-NEXT: store i32 [[SEL]], ptr [[A]], align 4 ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -; CGSCC-LABEL: define 
{{[^@]+}}@scope_value_traversal_helper -; CGSCC-SAME: (ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]], i1 [[C:%.*]]) #[[ATTR13]] { +; CGSCC-LABEL: define void @scope_value_traversal_helper( +; CGSCC-SAME: ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]], i1 [[C:%.*]]) #[[ATTR13]] { ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr [[A]], align 4 ; CGSCC-NEXT: [[SEL:%.*]] = select i1 [[C]], i32 [[L]], i32 42 ; CGSCC-NEXT: store i32 [[SEL]], ptr [[A]], align 4 @@ -2995,9 +2995,9 @@ define void @scope_value_traversal_helper(ptr %a, i1 %c) { define i8 @gep_index_from_binary_operator(i1 %cnd1, i1 %cnd2) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@gep_index_from_binary_operator -; CHECK-SAME: (i1 [[CND1:%.*]], i1 [[CND2:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define noundef i8 @gep_index_from_binary_operator( +; CHECK-SAME: i1 [[CND1:%.*]], i1 [[CND2:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[BYTES:%.*]] = alloca [1024 x i8], align 16 ; CHECK-NEXT: [[GEP_FIXED:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 12 ; CHECK-NEXT: ret i8 100 @@ -3014,9 +3014,9 @@ entry: define i8 @gep_index_from_memory(i1 %cnd1, i1 %cnd2) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@gep_index_from_memory -; CHECK-SAME: (i1 [[CND1:%.*]], i1 [[CND2:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define i8 @gep_index_from_memory( +; CHECK-SAME: i1 [[CND1:%.*]], i1 [[CND2:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[BYTES:%.*]] = alloca [1024 x i8], align 16 ; CHECK-NEXT: [[GEP_LOADED:%.*]] = getelementptr inbounds [1024 x i8], ptr [[BYTES]], i64 0, i64 12 ; CHECK-NEXT: ret i8 100 @@ -3040,27 +3040,27 @@ entry: ; Ensure this is not flattened to return 3 define i32 @a(i1 %c) { ; TUNIT: 
Function Attrs: nofree nosync nounwind -; TUNIT-LABEL: define {{[^@]+}}@a -; TUNIT-SAME: (i1 noundef [[C:%.*]]) #[[ATTR14]] { +; TUNIT-LABEL: define noundef i32 @a( +; TUNIT-SAME: i1 noundef [[C:%.*]]) #[[ATTR14]] { ; TUNIT-NEXT: store i32 3, ptr @G, align 4 -; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] -; TUNIT: t: +; TUNIT-NEXT: br i1 [[C]], label %[[T:.*]], label %[[F:.*]] +; TUNIT: [[T]]: ; TUNIT-NEXT: [[REC:%.*]] = call i32 @a(i1 noundef false) #[[ATTR14]] -; TUNIT-NEXT: br label [[F]] -; TUNIT: f: +; TUNIT-NEXT: br label %[[F]] +; TUNIT: [[F]]: ; TUNIT-NEXT: [[R:%.*]] = load i32, ptr @G, align 4 ; TUNIT-NEXT: store i32 5, ptr @G, align 4 ; TUNIT-NEXT: ret i32 [[R]] ; ; CGSCC: Function Attrs: nofree nosync nounwind -; CGSCC-LABEL: define {{[^@]+}}@a -; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR17]] { +; CGSCC-LABEL: define noundef i32 @a( +; CGSCC-SAME: i1 noundef [[C:%.*]]) #[[ATTR17]] { ; CGSCC-NEXT: store i32 3, ptr @G, align 4 -; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] -; CGSCC: t: +; CGSCC-NEXT: br i1 [[C]], label %[[T:.*]], label %[[F:.*]] +; CGSCC: [[T]]: ; CGSCC-NEXT: [[REC:%.*]] = call i32 @a(i1 noundef false) #[[ATTR17]] -; CGSCC-NEXT: br label [[F]] -; CGSCC: f: +; CGSCC-NEXT: br label %[[F]] +; CGSCC: [[F]]: ; CGSCC-NEXT: [[R:%.*]] = load i32, ptr @G, align 4 ; CGSCC-NEXT: store i32 5, ptr @G, align 4 ; CGSCC-NEXT: ret i32 [[R]] @@ -3081,22 +3081,22 @@ f: @GC = internal global i32 undef, align 4 define void @atomicrmw(ptr %p, i32 %i, i1 %cnd) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; TUNIT-LABEL: define {{[^@]+}}@atomicrmw -; TUNIT-SAME: (ptr nofree [[P:%.*]], i32 [[I:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR3]] { -; TUNIT-NEXT: br i1 [[CND]], label [[T:%.*]], label [[M:%.*]] -; TUNIT: t: -; TUNIT-NEXT: br label [[M]] -; TUNIT: m: +; TUNIT-LABEL: define void @atomicrmw( +; TUNIT-SAME: ptr nofree [[P:%.*]], i32 [[I:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: br 
i1 [[CND]], label %[[T:.*]], label %[[M:.*]] +; TUNIT: [[T]]: +; TUNIT-NEXT: br label %[[M]] +; TUNIT: [[M]]: ; TUNIT-NEXT: [[ARMW:%.*]] = atomicrmw add ptr @GC, i32 [[I]] monotonic, align 4 ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@atomicrmw -; CGSCC-SAME: (ptr nofree [[P:%.*]], i32 [[I:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR5]] { -; CGSCC-NEXT: br i1 [[CND]], label [[T:%.*]], label [[M:%.*]] -; CGSCC: t: -; CGSCC-NEXT: br label [[M]] -; CGSCC: m: +; CGSCC-LABEL: define void @atomicrmw( +; CGSCC-SAME: ptr nofree [[P:%.*]], i32 [[I:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR5]] { +; CGSCC-NEXT: br i1 [[CND]], label %[[T:.*]], label %[[M:.*]] +; CGSCC: [[T]]: +; CGSCC-NEXT: br label %[[M]] +; CGSCC: [[M]]: ; CGSCC-NEXT: [[ARMW:%.*]] = atomicrmw add ptr @GC, i32 [[I]] monotonic, align 4 ; CGSCC-NEXT: ret void ; @@ -3123,24 +3123,24 @@ m: define i32 @recSimplify(i32 %v, i1 %cond) { ; TUNIT: Function Attrs: nofree nosync nounwind -; TUNIT-LABEL: define {{[^@]+}}@recSimplify -; TUNIT-SAME: (i32 [[V:%.*]], i1 noundef [[COND:%.*]]) #[[ATTR14]] { -; TUNIT-NEXT: br i1 [[COND]], label [[REC:%.*]], label [[COMP:%.*]] -; TUNIT: rec: +; TUNIT-LABEL: define i32 @recSimplify( +; TUNIT-SAME: i32 [[V:%.*]], i1 noundef [[COND:%.*]]) #[[ATTR14]] { +; TUNIT-NEXT: br i1 [[COND]], label %[[REC:.*]], label %[[COMP:.*]] +; TUNIT: [[REC]]: ; TUNIT-NEXT: [[RV:%.*]] = call i32 @recSimplify(i32 undef, i1 noundef false) #[[ATTR14]] ; TUNIT-NEXT: ret i32 1 -; TUNIT: comp: +; TUNIT: [[COMP]]: ; TUNIT-NEXT: store i32 1, ptr @GRS2, align 4 ; TUNIT-NEXT: ret i32 1 ; ; CGSCC: Function Attrs: nofree nosync nounwind -; CGSCC-LABEL: define {{[^@]+}}@recSimplify -; CGSCC-SAME: (i32 [[V:%.*]], i1 noundef [[COND:%.*]]) #[[ATTR17]] { -; CGSCC-NEXT: br i1 [[COND]], label [[REC:%.*]], label [[COMP:%.*]] -; CGSCC: rec: +; CGSCC-LABEL: define i32 @recSimplify( +; CGSCC-SAME: i32 [[V:%.*]], i1 noundef [[COND:%.*]]) 
#[[ATTR17]] { +; CGSCC-NEXT: br i1 [[COND]], label %[[REC:.*]], label %[[COMP:.*]] +; CGSCC: [[REC]]: ; CGSCC-NEXT: [[RV:%.*]] = call i32 @recSimplify(i32 [[V]], i1 noundef false) #[[ATTR17]] ; CGSCC-NEXT: ret i32 [[RV]] -; CGSCC: comp: +; CGSCC: [[COMP]]: ; CGSCC-NEXT: store i32 [[V]], ptr @GRS, align 4 ; CGSCC-NEXT: store i32 1, ptr @GRS2, align 4 ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr @GRS, align 4 @@ -3167,8 +3167,8 @@ comp: define internal i32 @recSimplify2() { ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) -; CGSCC-LABEL: define {{[^@]+}}@recSimplify2 -; CGSCC-SAME: () #[[ATTR7]] { +; CGSCC-LABEL: define internal i32 @recSimplify2( +; CGSCC-SAME: ) #[[ATTR7]] { ; CGSCC-NEXT: [[R:%.*]] = load i32, ptr @GRS, align 4 ; CGSCC-NEXT: ret i32 [[R]] ; @@ -3179,18 +3179,18 @@ define internal i32 @recSimplify2() { ; Verify we do not return 10. define i32 @may_access_after_return(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return -; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define noundef i32 @may_access_after_return( +; TUNIT-SAME: i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[A]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[B]]) #[[ATTR18]] ; TUNIT-NEXT: ret i32 8 ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) -; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return -; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define i32 
@may_access_after_return( +; CGSCC-SAME: i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: call void @write_both(ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[A]], ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[B]]) #[[ATTR21]] @@ -3213,9 +3213,9 @@ entry: define internal void @write_both(ptr noundef %Q, ptr noundef %R) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) -; CHECK-LABEL: define {{[^@]+}}@write_both -; CHECK-SAME: (ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[Q:%.*]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[R:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define internal void @write_both( +; CHECK-SAME: ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[Q:%.*]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: store i32 3, ptr [[Q]], align 4 ; CHECK-NEXT: store i32 5, ptr [[R]], align 4 ; CHECK-NEXT: ret void @@ -3228,9 +3228,9 @@ entry: define internal ptr @passthrough(ptr noundef %P) { ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CGSCC-LABEL: define {{[^@]+}}@passthrough -; CGSCC-SAME: (ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define internal noundef nonnull align 4 dereferenceable(4) ptr @passthrough( +; CGSCC-SAME: ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; 
CGSCC-NEXT: ret ptr [[P]] ; entry: @@ -3240,9 +3240,9 @@ entry: ; Verify we do not return 10. define i32 @may_access_after_return_choice(i32 noundef %N, i32 noundef %M, i1 %c) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_choice -; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i1 [[C:%.*]]) #[[ATTR4]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define noundef i32 @may_access_after_return_choice( +; TUNIT-SAME: i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i1 [[C:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]]) #[[ATTR23:[0-9]+]] @@ -3254,9 +3254,9 @@ define i32 @may_access_after_return_choice(i32 noundef %N, i32 noundef %M, i1 %c ; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn -; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_choice -; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i1 [[C:%.*]]) #[[ATTR3]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define i32 @may_access_after_return_choice( +; CGSCC-SAME: i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i1 [[C:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[A]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[B]]) #[[ATTR28:[0-9]+]] @@ -3281,9 
+3281,9 @@ entry: define internal ptr @passthrough_choice(i1 %c, ptr noundef %P, ptr noundef %Q) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@passthrough_choice -; CHECK-SAME: (i1 [[C:%.*]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define internal noundef nonnull align 4 dereferenceable(4) ptr @passthrough_choice( +; CHECK-SAME: i1 [[C:%.*]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], ptr [[P]], ptr [[Q]] ; CHECK-NEXT: ret ptr [[R]] ; @@ -3295,18 +3295,18 @@ entry: ; Verify we do not return 10. 
define i32 @may_access_after_return_no_choice1(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 -; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define noundef i32 @may_access_after_return_no_choice1( +; TUNIT-SAME: i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[A]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[B]]) #[[ATTR18]] ; TUNIT-NEXT: ret i32 8 ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) -; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 -; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define i32 @may_access_after_return_no_choice1( +; CGSCC-SAME: i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: call void @write_both(ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[A]], ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[B]]) #[[ATTR21]] @@ -3330,18 +3330,18 @@ entry: ; Verify we do not return 10. 
define i32 @may_access_after_return_no_choice2(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 -; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { -; TUNIT-NEXT: entry: +; TUNIT-LABEL: define noundef i32 @may_access_after_return_no_choice2( +; TUNIT-SAME: i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: [[ENTRY:.*:]] ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[B]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[A]]) #[[ATTR18]] ; TUNIT-NEXT: ret i32 8 ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) -; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 -; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define i32 @may_access_after_return_no_choice2( +; CGSCC-SAME: i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: call void @write_both(ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[B]], ptr noalias nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[A]]) #[[ATTR21]] @@ -3364,9 +3364,9 @@ entry: define internal ptr @passthrough_no_choice_true(i1 %c, ptr noundef %P, ptr noundef %Q) { ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CGSCC-LABEL: define {{[^@]+}}@passthrough_no_choice_true -; CGSCC-SAME: (ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" 
[[P:%.*]], i32 [[TMP0:%.*]]) #[[ATTR4]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define internal noundef nonnull align 4 dereferenceable(4) ptr @passthrough_no_choice_true( +; CGSCC-SAME: ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]], i32 [[TMP0:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[Q_PRIV:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: store i32 [[TMP0]], ptr [[Q_PRIV]], align 4 ; CGSCC-NEXT: ret ptr [[P]] @@ -3377,9 +3377,9 @@ entry: } define internal ptr @passthrough_no_choice_false(i1 %c, ptr noundef %P, ptr noundef %Q) { ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CGSCC-LABEL: define {{[^@]+}}@passthrough_no_choice_false -; CGSCC-SAME: (i32 [[TMP0:%.*]], ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q:%.*]]) #[[ATTR4]] { -; CGSCC-NEXT: entry: +; CGSCC-LABEL: define internal noundef nonnull align 4 dereferenceable(4) ptr @passthrough_no_choice_false( +; CGSCC-SAME: i32 [[TMP0:%.*]], ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: [[ENTRY:.*:]] ; CGSCC-NEXT: [[P_PRIV:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: store i32 [[TMP0]], ptr [[P_PRIV]], align 4 ; CGSCC-NEXT: ret ptr [[Q]] @@ -3391,8 +3391,8 @@ entry: define ptr @move2(ptr %p) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@move2 -; CHECK-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { +; CHECK-LABEL: define ptr @move2( +; CHECK-SAME: ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: [[G:%.*]] = getelementptr i8, ptr [[P]], i32 2 ; CHECK-NEXT: ret ptr [[G]] ; @@ -3401,8 +3401,8 @@ define ptr @move2(ptr %p) { } define internal ptr @move4(ptr %p) { ; CHECK: 
Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@move4 -; CHECK-SAME: (ptr noalias nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { +; CHECK-LABEL: define internal ptr @move4( +; CHECK-SAME: ptr noalias nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: [[G:%.*]] = getelementptr i8, ptr [[P]], i32 4 ; CHECK-NEXT: ret ptr [[G]] ; @@ -3412,20 +3412,20 @@ define internal ptr @move4(ptr %p) { define ptr @move246(i32 %i, ptr %p) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@move246 -; CHECK-SAME: (i32 [[I:%.*]], ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { +; CHECK-LABEL: define ptr @move246( +; CHECK-SAME: i32 [[I:%.*]], ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: [[C0:%.*]] = icmp eq i32 [[I]], 0 -; CHECK-NEXT: br i1 [[C0]], label [[BG2:%.*]], label [[BG46:%.*]] -; CHECK: bg2: +; CHECK-NEXT: br i1 [[C0]], label %[[BG2:.*]], label %[[BG46:.*]] +; CHECK: [[BG2]]: ; CHECK-NEXT: [[G2:%.*]] = getelementptr i8, ptr [[P]], i32 2 ; CHECK-NEXT: ret ptr [[G2]] -; CHECK: bg46: +; CHECK: [[BG46]]: ; CHECK-NEXT: [[C1:%.*]] = icmp eq i32 [[I]], 1 -; CHECK-NEXT: br i1 [[C1]], label [[BG4:%.*]], label [[BG6:%.*]] -; CHECK: bg4: +; CHECK-NEXT: br i1 [[C1]], label %[[BG4:.*]], label %[[BG6:.*]] +; CHECK: [[BG4]]: ; CHECK-NEXT: [[G4:%.*]] = getelementptr i8, ptr [[P]], i32 4 ; CHECK-NEXT: ret ptr [[G4]] -; CHECK: bg6: +; CHECK: [[BG6]]: ; CHECK-NEXT: [[G6:%.*]] = getelementptr i8, ptr [[P]], i32 6 ; CHECK-NEXT: ret ptr [[G6]] ; @@ -3448,7 +3448,7 @@ bg6: declare void @use3i8(i8, i8, i8) define void @returnedPtrAccesses() { -; TUNIT-LABEL: define {{[^@]+}}@returnedPtrAccesses() { +; TUNIT-LABEL: define void @returnedPtrAccesses() { ; TUNIT-NEXT: [[A:%.*]] = alloca i64, align 8 ; TUNIT-NEXT: [[A2:%.*]] = call ptr 
@move2(ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(8) "no-capture-maybe-returned" [[A]]) #[[ATTR23]] ; TUNIT-NEXT: [[A4:%.*]] = call ptr @move4(ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(8) "no-capture-maybe-returned" [[A]]) #[[ATTR23]] @@ -3459,7 +3459,7 @@ define void @returnedPtrAccesses() { ; TUNIT-NEXT: call void @use3i8(i8 2, i8 4, i8 6) ; TUNIT-NEXT: ret void ; -; CGSCC-LABEL: define {{[^@]+}}@returnedPtrAccesses() { +; CGSCC-LABEL: define void @returnedPtrAccesses() { ; CGSCC-NEXT: [[A:%.*]] = alloca i64, align 8 ; CGSCC-NEXT: [[A2:%.*]] = call nonnull dereferenceable(1) ptr @move2(ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(8) [[A]]) #[[ATTR20]] ; CGSCC-NEXT: [[A4:%.*]] = call ptr @move4(ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(8) [[A]]) #[[ATTR20]] @@ -3494,16 +3494,16 @@ define void @returnedPtrAccesses() { } define void @returnedPtrAccessesMultiple(i32 %i) { -; TUNIT-LABEL: define {{[^@]+}}@returnedPtrAccessesMultiple -; TUNIT-SAME: (i32 [[I:%.*]]) { +; TUNIT-LABEL: define void @returnedPtrAccessesMultiple( +; TUNIT-SAME: i32 [[I:%.*]]) { ; TUNIT-NEXT: [[A:%.*]] = alloca i64, align 8 ; TUNIT-NEXT: [[AP:%.*]] = call ptr @move246(i32 [[I]], ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(8) "no-capture-maybe-returned" [[A]]) #[[ATTR23]] ; TUNIT-NEXT: store i8 2, ptr [[AP]], align 1 ; TUNIT-NEXT: call void @use3i8(i8 2, i8 2, i8 2) ; TUNIT-NEXT: ret void ; -; CGSCC-LABEL: define {{[^@]+}}@returnedPtrAccessesMultiple -; CGSCC-SAME: (i32 [[I:%.*]]) { +; CGSCC-LABEL: define void @returnedPtrAccessesMultiple( +; CGSCC-SAME: i32 [[I:%.*]]) { ; CGSCC-NEXT: [[A:%.*]] = alloca i64, align 8 ; CGSCC-NEXT: [[AP:%.*]] = call ptr @move246(i32 [[I]], ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(8) [[A]]) #[[ATTR20]] ; CGSCC-NEXT: [[G2:%.*]] = getelementptr i8, ptr [[A]], i32 2 @@ -3530,8 +3530,8 @@ define void 
@returnedPtrAccessesMultiple(i32 %i) { } define void @returnedPtrAccessesMultiple2(i32 %i) { -; TUNIT-LABEL: define {{[^@]+}}@returnedPtrAccessesMultiple2 -; TUNIT-SAME: (i32 [[I:%.*]]) { +; TUNIT-LABEL: define void @returnedPtrAccessesMultiple2( +; TUNIT-SAME: i32 [[I:%.*]]) { ; TUNIT-NEXT: [[A:%.*]] = alloca i64, align 8 ; TUNIT-NEXT: [[G2:%.*]] = getelementptr i8, ptr [[A]], i32 2 ; TUNIT-NEXT: [[G4:%.*]] = getelementptr i8, ptr [[A]], i32 4 @@ -3547,8 +3547,8 @@ define void @returnedPtrAccessesMultiple2(i32 %i) { ; TUNIT-NEXT: call void @use3i8(i8 noundef [[L2]], i8 noundef [[L4]], i8 noundef [[L6]]) ; TUNIT-NEXT: ret void ; -; CGSCC-LABEL: define {{[^@]+}}@returnedPtrAccessesMultiple2 -; CGSCC-SAME: (i32 [[I:%.*]]) { +; CGSCC-LABEL: define void @returnedPtrAccessesMultiple2( +; CGSCC-SAME: i32 [[I:%.*]]) { ; CGSCC-NEXT: [[A:%.*]] = alloca i64, align 8 ; CGSCC-NEXT: [[G2:%.*]] = getelementptr i8, ptr [[A]], i32 2 ; CGSCC-NEXT: [[G4:%.*]] = getelementptr i8, ptr [[A]], i32 4 @@ -3677,57 +3677,58 @@ declare void @llvm.assume(i1 noundef) ; TUNIT: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; TUNIT: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1} ; TUNIT: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; TUNIT: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; TUNIT: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} ; TUNIT: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} ; TUNIT: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} ; TUNIT: [[META6]] = !{!"Simple C/C++ TBAA"} -; TUNIT: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 12} +; TUNIT: [[FLOAT_TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 12} ; TUNIT: [[META8]] = !{!"S", [[META4]], i64 0, [[META4]], i64 4, [[META4]], i64 8, [[META9]], i64 12, [[META9]], i64 16, [[META9]], i64 20} ; TUNIT: [[META9]] = !{!"float", [[META5]], i64 0} -; TUNIT: [[TBAA10]] = !{[[META8]], [[META9]], i64 16} -; TUNIT: [[TBAA11]] = !{[[META8]], [[META9]], i64 20} -; TUNIT: [[TBAA12]] 
= !{[[META8]], [[META4]], i64 0} -; TUNIT: [[TBAA13]] = !{[[META8]], [[META4]], i64 4} -; TUNIT: [[TBAA14]] = !{[[META8]], [[META4]], i64 8} +; TUNIT: [[FLOAT_TBAA10]] = !{[[META8]], [[META9]], i64 16} +; TUNIT: [[FLOAT_TBAA11]] = !{[[META8]], [[META9]], i64 20} +; TUNIT: [[INT_TBAA12]] = !{[[META8]], [[META4]], i64 0} +; TUNIT: [[INT_TBAA13]] = !{[[META8]], [[META4]], i64 4} +; TUNIT: [[INT_TBAA14]] = !{[[META8]], [[META4]], i64 8} ; TUNIT: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]]} ; TUNIT: [[META16]] = !{!"llvm.loop.mustprogress"} ; TUNIT: [[LOOP17]] = distinct !{[[LOOP17]], [[META16]]} ; TUNIT: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]]} -; TUNIT: [[TBAA19]] = !{[[META5]], [[META5]], i64 0} +; TUNIT: [[CHAR_TBAA19]] = !{[[META5]], [[META5]], i64 0} ; TUNIT: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]]} ; TUNIT: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]} ; TUNIT: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]]} ; TUNIT: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]} ; TUNIT: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]]} ; TUNIT: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]]} -; TUNIT: [[TBAA26]] = !{[[META9]], [[META9]], i64 0} +; TUNIT: [[FLOAT_TBAA26]] = !{[[META9]], [[META9]], i64 0} ; TUNIT: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]]} -; TUNIT: [[TBAA28]] = !{[[META29:![0-9]+]], [[META29]], i64 0} +; TUNIT: [[LONG_LONG_TBAA28]] = !{[[META29:![0-9]+]], [[META29]], i64 0} ; TUNIT: [[META29]] = !{!"long long", [[META5]], i64 0} ; TUNIT: [[LOOP30]] = distinct !{[[LOOP30]], [[META16]]} ; TUNIT: [[LOOP31]] = distinct !{[[LOOP31]], [[META16]]} +; TUNIT: [[META32]] = !{} ;. 
; CGSCC: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1} ; CGSCC: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; CGSCC: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CGSCC: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} ; CGSCC: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} ; CGSCC: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} ; CGSCC: [[META6]] = !{!"Simple C/C++ TBAA"} -; CGSCC: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 12} +; CGSCC: [[FLOAT_TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 12} ; CGSCC: [[META8]] = !{!"S", [[META4]], i64 0, [[META4]], i64 4, [[META4]], i64 8, [[META9]], i64 12, [[META9]], i64 16, [[META9]], i64 20} ; CGSCC: [[META9]] = !{!"float", [[META5]], i64 0} -; CGSCC: [[TBAA10]] = !{[[META8]], [[META9]], i64 16} -; CGSCC: [[TBAA11]] = !{[[META8]], [[META9]], i64 20} -; CGSCC: [[TBAA12]] = !{[[META8]], [[META4]], i64 0} -; CGSCC: [[TBAA13]] = !{[[META8]], [[META4]], i64 4} -; CGSCC: [[TBAA14]] = !{[[META8]], [[META4]], i64 8} -; CGSCC: [[TBAA15]] = !{[[META5]], [[META5]], i64 0} +; CGSCC: [[FLOAT_TBAA10]] = !{[[META8]], [[META9]], i64 16} +; CGSCC: [[FLOAT_TBAA11]] = !{[[META8]], [[META9]], i64 20} +; CGSCC: [[INT_TBAA12]] = !{[[META8]], [[META4]], i64 0} +; CGSCC: [[INT_TBAA13]] = !{[[META8]], [[META4]], i64 4} +; CGSCC: [[INT_TBAA14]] = !{[[META8]], [[META4]], i64 8} +; CGSCC: [[CHAR_TBAA15]] = !{[[META5]], [[META5]], i64 0} ; CGSCC: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]]} ; CGSCC: [[META17]] = !{!"llvm.loop.mustprogress"} -; CGSCC: [[TBAA18]] = !{[[META9]], [[META9]], i64 0} +; CGSCC: [[FLOAT_TBAA18]] = !{[[META9]], [[META9]], i64 0} ; CGSCC: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]]} -; CGSCC: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; CGSCC: [[LONG_LONG_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; CGSCC: [[META21]] = !{!"long long", [[META5]], i64 0} ; CGSCC: [[LOOP22]] = 
distinct !{[[LOOP22]], [[META17]]} ; CGSCC: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]]} @@ -3739,4 +3740,5 @@ declare void @llvm.assume(i1 noundef) ; CGSCC: [[LOOP29]] = distinct !{[[LOOP29]], [[META17]]} ; CGSCC: [[LOOP30]] = distinct !{[[LOOP30]], [[META17]]} ; CGSCC: [[LOOP31]] = distinct !{[[LOOP31]], [[META17]]} +; CGSCC: [[META32]] = !{} ;. diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll index 9e4a10d9eb864..9c5df5f70fc15 100644 --- a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll +++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -p 'require,function(codegenprepare)' -S %s \ ; RUN: | FileCheck %s --check-prefix=SLOW -; RUN: opt -p 'require,function(codegenprepare)' -S --mattr=+zvbb %s \ +; RUN: opt -p 'require,function(codegenprepare)' -S --mattr=+v,+zvbb %s \ ; RUN: | FileCheck %s --check-prefix=FAST ; REQUIRES: riscv-registered-target diff --git a/llvm/test/Transforms/Coroutines/coro-split-invalid.ll b/llvm/test/Transforms/Coroutines/coro-split-invalid.ll new file mode 100644 index 0000000000000..94fe539697214 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/coro-split-invalid.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; Tests that coro-split correctly invalidate bad coroutines +; RUN: opt < %s -passes='cgscc(coro-split)' -S | FileCheck %s + +define void @pr156444() presplitcoroutine { +; CHECK-LABEL: define void @pr156444( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret void +; +entry: + %0 = call i8 @llvm.coro.suspend(token none, i1 false) + ret void +} diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index 26b5dc2dc7760..60a4214548a72 100644 
--- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -46,7 +46,7 @@ define void @c3(ptr %q) { ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; ATTRIBUTOR-LABEL: define void @c3 ; ATTRIBUTOR-SAME: (ptr nofree writeonly [[Q:%.*]]) #[[ATTR1]] { -; ATTRIBUTOR-NEXT: call void @c2(ptr nofree writeonly [[Q]]) #[[ATTR19:[0-9]+]] +; ATTRIBUTOR-NEXT: call void @c2(ptr nofree writeonly [[Q]]) #[[ATTR21:[0-9]+]] ; ATTRIBUTOR-NEXT: ret void ; call void @c2(ptr %q) @@ -161,7 +161,7 @@ declare void @throw_if_bit_set(ptr, i8) readonly define i1 @c6(ptr %q, i8 %bit) personality ptr @__gxx_personality_v0 { ; FNATTRS: Function Attrs: nofree memory(read) ; FNATTRS-LABEL: define noundef i1 @c6 -; FNATTRS-SAME: (ptr readonly [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR5:[0-9]+]] personality ptr @__gxx_personality_v0 { +; FNATTRS-SAME: (ptr readonly captures(address) [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR5:[0-9]+]] personality ptr @__gxx_personality_v0 { ; FNATTRS-NEXT: invoke void @throw_if_bit_set(ptr [[Q]], i8 [[BIT]]) ; FNATTRS-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]] ; FNATTRS: ret0: @@ -232,7 +232,7 @@ define i1 @c7(ptr %q, i32 %bitno) { ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) ; ATTRIBUTOR-LABEL: define i1 @c7 ; ATTRIBUTOR-SAME: (ptr nofree readonly [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR-NEXT: [[PTR:%.*]] = call ptr @lookup_bit(ptr nofree readnone [[Q]], i32 [[BITNO]]) #[[ATTR20:[0-9]+]] +; ATTRIBUTOR-NEXT: [[PTR:%.*]] = call ptr @lookup_bit(ptr nofree readnone [[Q]], i32 [[BITNO]]) #[[ATTR22:[0-9]+]] ; ATTRIBUTOR-NEXT: [[VAL:%.*]] = load i1, ptr [[PTR]], align 1 ; ATTRIBUTOR-NEXT: ret i1 [[VAL]] ; @@ -337,7 +337,7 @@ define void @nc2(ptr %p, ptr %q) { ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn ; ATTRIBUTOR-LABEL: define void @nc2 ; ATTRIBUTOR-SAME: 
(ptr nofree captures(none) [[P:%.*]], ptr nofree [[Q:%.*]]) #[[ATTR5]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call i32 @nc1(ptr nofree [[Q]], ptr nofree captures(none) [[P]], i1 false) #[[ATTR21:[0-9]+]] +; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call i32 @nc1(ptr nofree [[Q]], ptr nofree captures(none) [[P]], i1 false) #[[ATTR23:[0-9]+]] ; ATTRIBUTOR-NEXT: ret void ; %1 = call i32 @nc1(ptr %q, ptr %p, i1 0) ; [#uses=0] @@ -364,7 +364,7 @@ declare void @external_not_willreturn(ptr) readonly nounwind define void @readonly_nounwind_not_willreturn(ptr %p) { ; FNATTRS: Function Attrs: nofree nounwind memory(read) ; FNATTRS-LABEL: define void @readonly_nounwind_not_willreturn -; FNATTRS-SAME: (ptr readonly [[P:%.*]]) #[[ATTR9:[0-9]+]] { +; FNATTRS-SAME: (ptr readonly captures(address) [[P:%.*]]) #[[ATTR9:[0-9]+]] { ; FNATTRS-NEXT: call void @external_not_willreturn(ptr [[P]]) ; FNATTRS-NEXT: ret void ; @@ -382,14 +382,14 @@ declare void @external_willreturn(ptr) readonly nounwind willreturn define void @readonly_nounwind_willreturn(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree nounwind willreturn memory(read) ; FNATTRS-LABEL: define void @readonly_nounwind_willreturn -; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]]) #[[ATTR11:[0-9]+]] { +; FNATTRS-SAME: (ptr readonly captures(address) [[P:%.*]]) #[[ATTR11:[0-9]+]] { ; FNATTRS-NEXT: call void @external_willreturn(ptr [[P]]) ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR: Function Attrs: mustprogress nosync nounwind willreturn memory(read) ; ATTRIBUTOR-LABEL: define void @readonly_nounwind_willreturn ; ATTRIBUTOR-SAME: (ptr readonly captures(none) [[P:%.*]]) #[[ATTR9:[0-9]+]] { -; ATTRIBUTOR-NEXT: call void @external_willreturn(ptr readonly captures(none) [[P]]) #[[ATTR22:[0-9]+]] +; ATTRIBUTOR-NEXT: call void @external_willreturn(ptr readonly captures(none) [[P]]) #[[ATTR24:[0-9]+]] ; ATTRIBUTOR-NEXT: ret void ; call void @external_willreturn(ptr %p) @@ -398,7 +398,7 @@ define void @readonly_nounwind_willreturn(ptr %p) 
{ define void @callsite_readonly_nounwind_not_willreturn(ptr %f, ptr %p) { ; FNATTRS-LABEL: define void @callsite_readonly_nounwind_not_willreturn -; FNATTRS-SAME: (ptr readonly captures(none) [[F:%.*]], ptr [[P:%.*]]) { +; FNATTRS-SAME: (ptr readonly captures(none) [[F:%.*]], ptr captures(address) [[P:%.*]]) { ; FNATTRS-NEXT: call void [[F]](ptr [[P]]) #[[ATTR8:[0-9]+]] ; FNATTRS-NEXT: call void [[F]](ptr captures(none) [[P]]) ; FNATTRS-NEXT: ret void @@ -416,13 +416,13 @@ define void @callsite_readonly_nounwind_not_willreturn(ptr %f, ptr %p) { define void @callsite_readonly_nounwind_willreturn(ptr %f, ptr %p) { ; FNATTRS-LABEL: define void @callsite_readonly_nounwind_willreturn -; FNATTRS-SAME: (ptr readonly captures(none) [[F:%.*]], ptr captures(none) [[P:%.*]]) { +; FNATTRS-SAME: (ptr readonly captures(none) [[F:%.*]], ptr captures(address) [[P:%.*]]) { ; FNATTRS-NEXT: call void [[F]](ptr [[P]]) #[[ATTR10:[0-9]+]] ; FNATTRS-NEXT: call void [[F]](ptr captures(none) [[P]]) ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR-LABEL: define void @callsite_readonly_nounwind_willreturn -; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[F:%.*]], ptr captures(none) [[P:%.*]]) { +; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[F:%.*]], ptr [[P:%.*]]) { ; ATTRIBUTOR-NEXT: call void [[F]](ptr [[P]]) #[[ATTR8:[0-9]+]] ; ATTRIBUTOR-NEXT: call void [[F]](ptr captures(none) [[P]]) ; ATTRIBUTOR-NEXT: ret void @@ -732,7 +732,7 @@ define void @nocaptureLaunder(ptr %p) { ; ATTRIBUTOR-LABEL: define void @nocaptureLaunder ; ATTRIBUTOR-SAME: (ptr nofree captures(none) [[P:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: [[B:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[P]]) #[[ATTR23:[0-9]+]] +; ATTRIBUTOR-NEXT: [[B:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[P]]) #[[ATTR25:[0-9]+]] ; ATTRIBUTOR-NEXT: store i8 42, ptr [[B]], align 1 ; ATTRIBUTOR-NEXT: ret void ; @@ -754,7 +754,7 @@ define void @captureLaunder(ptr %p) { ; ATTRIBUTOR: 
Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn ; ATTRIBUTOR-LABEL: define void @captureLaunder ; ATTRIBUTOR-SAME: (ptr nofree [[P:%.*]]) #[[ATTR5]] { -; ATTRIBUTOR-NEXT: [[B:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[P]]) #[[ATTR23]] +; ATTRIBUTOR-NEXT: [[B:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[P]]) #[[ATTR25]] ; ATTRIBUTOR-NEXT: store ptr [[B]], ptr @g2, align 8 ; ATTRIBUTOR-NEXT: ret void ; @@ -776,7 +776,7 @@ define void @nocaptureStrip(ptr %p) { ; ATTRIBUTOR-LABEL: define void @nocaptureStrip ; ATTRIBUTOR-SAME: (ptr nofree writeonly captures(none) [[P:%.*]]) #[[ATTR13:[0-9]+]] { ; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: [[B:%.*]] = call ptr @llvm.strip.invariant.group.p0(ptr [[P]]) #[[ATTR20]] +; ATTRIBUTOR-NEXT: [[B:%.*]] = call ptr @llvm.strip.invariant.group.p0(ptr [[P]]) #[[ATTR22]] ; ATTRIBUTOR-NEXT: store i8 42, ptr [[B]], align 1 ; ATTRIBUTOR-NEXT: ret void ; @@ -798,7 +798,7 @@ define void @captureStrip(ptr %p) { ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; ATTRIBUTOR-LABEL: define void @captureStrip ; ATTRIBUTOR-SAME: (ptr nofree writeonly [[P:%.*]]) #[[ATTR1]] { -; ATTRIBUTOR-NEXT: [[B:%.*]] = call ptr @llvm.strip.invariant.group.p0(ptr [[P]]) #[[ATTR20]] +; ATTRIBUTOR-NEXT: [[B:%.*]] = call ptr @llvm.strip.invariant.group.p0(ptr [[P]]) #[[ATTR22]] ; ATTRIBUTOR-NEXT: store ptr [[B]], ptr @g3, align 8 ; ATTRIBUTOR-NEXT: ret void ; @@ -1032,14 +1032,14 @@ define void @recurse_fptr(ptr %f, ptr %p) { define void @readnone_indirec(ptr %f, ptr %p) { ; FNATTRS: Function Attrs: nofree nosync memory(none) ; FNATTRS-LABEL: define void @readnone_indirec -; FNATTRS-SAME: (ptr readonly captures(none) [[F:%.*]], ptr readnone [[P:%.*]]) #[[ATTR19:[0-9]+]] { -; FNATTRS-NEXT: call void [[F]](ptr [[P]]) #[[ATTR23:[0-9]+]] +; FNATTRS-SAME: (ptr readonly captures(none) [[F:%.*]], ptr readnone captures(address) [[P:%.*]]) #[[ATTR19:[0-9]+]] { +; 
FNATTRS-NEXT: call void [[F]](ptr [[P]]) #[[ATTR25:[0-9]+]] ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR: Function Attrs: nosync memory(none) ; ATTRIBUTOR-LABEL: define void @readnone_indirec ; ATTRIBUTOR-SAME: (ptr nofree nonnull readnone captures(none) [[F:%.*]], ptr readnone [[P:%.*]]) #[[ATTR15:[0-9]+]] { -; ATTRIBUTOR-NEXT: call void [[F]](ptr [[P]]) #[[ATTR24:[0-9]+]] +; ATTRIBUTOR-NEXT: call void [[F]](ptr [[P]]) #[[ATTR26:[0-9]+]] ; ATTRIBUTOR-NEXT: ret void ; call void %f(ptr %p) readnone @@ -1347,5 +1347,56 @@ exit: ret void } +define void @assume_align(ptr %p) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; FNATTRS-LABEL: define void @assume_align +; FNATTRS-SAME: (ptr readnone captures(none) [[P:%.*]]) #[[ATTR21:[0-9]+]] { +; FNATTRS-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 8) ] +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; ATTRIBUTOR-LABEL: define void @assume_align +; ATTRIBUTOR-SAME: (ptr nofree readnone captures(none) [[P:%.*]]) #[[ATTR17:[0-9]+]] { +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) #[[ATTR27:[0-9]+]] [ "align"(ptr [[P]], i64 8) ] +; ATTRIBUTOR-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr %p, i64 8)] + ret void +} + +define void @assume_dereferenceable(ptr %p) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; FNATTRS-LABEL: define void @assume_dereferenceable +; FNATTRS-SAME: (ptr readnone captures(none) [[P:%.*]]) #[[ATTR21]] { +; FNATTRS-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 8) ] +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; ATTRIBUTOR-LABEL: define void @assume_dereferenceable +; ATTRIBUTOR-SAME: (ptr nofree 
nonnull readnone captures(none) [[P:%.*]]) #[[ATTR17]] { +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) #[[ATTR27]] [ "dereferenceable"(ptr [[P]], i64 8) ] +; ATTRIBUTOR-NEXT: ret void +; + call void @llvm.assume(i1 true) ["dereferenceable"(ptr %p, i64 8)] + ret void +} + +define void @assume_nonnull(ptr %p) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; FNATTRS-LABEL: define void @assume_nonnull +; FNATTRS-SAME: (ptr readnone captures(none) [[P:%.*]]) #[[ATTR21]] { +; FNATTRS-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(ptr [[P]]) ] +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; ATTRIBUTOR-LABEL: define void @assume_nonnull +; ATTRIBUTOR-SAME: (ptr nofree nonnull readnone captures(none) [[P:%.*]]) #[[ATTR17]] { +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) #[[ATTR27]] [ "nonnull"(ptr [[P]]) ] +; ATTRIBUTOR-NEXT: ret void +; + call void @llvm.assume(i1 true) ["nonnull"(ptr %p)] + ret void +} + declare ptr @llvm.launder.invariant.group.p0(ptr) declare ptr @llvm.strip.invariant.group.p0(ptr) diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 8df242fb023af..9d5ae1606f2e3 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -1032,7 +1032,7 @@ define ptr @g1() { declare void @use_i32_ptr(ptr) readnone nounwind willreturn define internal void @called_by_weak(ptr %a) { ; FNATTRS-LABEL: define internal void @called_by_weak( -; FNATTRS-SAME: ptr readnone captures(none) [[A:%.*]]) #[[ATTR10:[0-9]+]] { +; FNATTRS-SAME: ptr readnone captures(address) [[A:%.*]]) #[[ATTR10:[0-9]+]] { ; FNATTRS-NEXT: call void @use_i32_ptr(ptr [[A]]) ; FNATTRS-NEXT: ret void ; @@ -1064,7 +1064,7 @@ define weak_odr void @weak_caller(ptr nonnull %a) { ; Expect nonnull define internal 
void @control(ptr dereferenceable(4) %a) { ; FNATTRS-LABEL: define internal void @control( -; FNATTRS-SAME: ptr readnone captures(none) dereferenceable(4) [[A:%.*]]) #[[ATTR10]] { +; FNATTRS-SAME: ptr readnone captures(address) dereferenceable(4) [[A:%.*]]) #[[ATTR10]] { ; FNATTRS-NEXT: call void @use_i32_ptr(ptr [[A]]) ; FNATTRS-NEXT: ret void ; diff --git a/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll b/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll index e2e3603e9cb43..6acb0af13772a 100644 --- a/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll +++ b/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll @@ -17,14 +17,14 @@ define void @va_func(ptr readonly %b, ...) readonly nounwind willreturn { } define i32 @caller(ptr %x) { -; CHECK-LABEL: define noundef i32 @caller(ptr readonly captures(none) %x) +; CHECK-LABEL: define noundef i32 @caller(ptr readonly captures(address) %x) entry: call void(ptr,...) @va_func(ptr null, i32 0, i32 0, i32 0, ptr %x) ret i32 42 } define void @va_func2(ptr readonly %b, ...) { -; CHECK-LABEL: define void @va_func2(ptr readonly captures(none) %b, ...) +; CHECK-LABEL: define void @va_func2(ptr readonly captures(address) %b, ...) entry: %valist = alloca i8 call void @llvm.va_start(ptr %valist) @@ -34,7 +34,7 @@ define void @va_func2(ptr readonly %b, ...) { } define i32 @caller2(ptr %x, ptr %y) { -; CHECK-LABEL: define noundef i32 @caller2(ptr readonly captures(none) %x, ptr %y) +; CHECK-LABEL: define noundef i32 @caller2(ptr readonly captures(address) %x, ptr %y) entry: call void(ptr,...) 
@va_func2(ptr %x, i32 0, i32 0, i32 0, ptr %y) ret i32 42 diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index 5fc88d623c0ec..d0aec184f49c3 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -483,7 +483,7 @@ define void @fptr_test1b(ptr %p, ptr %f) { define void @fptr_test1c(ptr %p, ptr %f) { ; FNATTRS: Function Attrs: nofree memory(read) ; FNATTRS-LABEL: define {{[^@]+}}@fptr_test1c -; FNATTRS-SAME: (ptr readnone [[P:%.*]], ptr readonly captures(none) [[F:%.*]]) #[[ATTR3]] { +; FNATTRS-SAME: (ptr readnone captures(address) [[P:%.*]], ptr readonly captures(none) [[F:%.*]]) #[[ATTR3]] { ; FNATTRS-NEXT: call void [[F]](ptr readnone [[P]]) #[[ATTR2:[0-9]+]] ; FNATTRS-NEXT: ret void ; @@ -547,7 +547,7 @@ define void @fptr_test2b(ptr %p, ptr %f) { define void @fptr_test2c(ptr %p, ptr %f) { ; FNATTRS: Function Attrs: nofree memory(read) ; FNATTRS-LABEL: define {{[^@]+}}@fptr_test2c -; FNATTRS-SAME: (ptr readonly [[P:%.*]], ptr readonly captures(none) [[F:%.*]]) #[[ATTR3]] { +; FNATTRS-SAME: (ptr readonly captures(address) [[P:%.*]], ptr readonly captures(none) [[F:%.*]]) #[[ATTR3]] { ; FNATTRS-NEXT: call void [[F]](ptr readonly [[P]]) #[[ATTR2]] ; FNATTRS-NEXT: ret void ; diff --git a/llvm/test/Transforms/GVN/PRE/load-pre-nonlocal.ll b/llvm/test/Transforms/GVN/PRE/load-pre-nonlocal.ll index 9dba73a1beb77..7348df38d4de8 100644 --- a/llvm/test/Transforms/GVN/PRE/load-pre-nonlocal.ll +++ b/llvm/test/Transforms/GVN/PRE/load-pre-nonlocal.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -o - -passes=gvn %s | FileCheck %s --check-prefixes=CHECK,MDEP ; RUN: opt -S -o - -passes='gvn' %s | FileCheck %s --check-prefixes=CHECK,MSSA @@ -13,32 +13,33 @@ target datalayout = 
"e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Check that GVN doesn't determine %2 is partially redundant. define i32 @volatile_load(i32 %n) { -; CHECK-LABEL: @volatile_load( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body.lr.ph: -; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a2, align 8, !tbaa [[TBAA5:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @a, align 8, !tbaa [[TBAA5]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[S_09:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[P_08:%.*]] = phi ptr [ [[TMP0]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[P_08]], align 4, !tbaa [[TBAA9:![0-9]+]] +; CHECK-LABEL: define i32 @volatile_load( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP6]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a2, align 8, !tbaa [[ANYPTR_TBAA5:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @a, align 8, !tbaa [[ANYPTR_TBAA5]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[S_09:%.*]] = phi i32 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[P_08:%.*]] = phi ptr [ [[TMP0]], %[[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[P_08]], align 4, !tbaa [[INT_TBAA9:![0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INDVARS_IV]] -; 
CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA9]] -; CHECK-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[P_08]], align 4, !tbaa [[TBAA9]] +; CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA9]] +; CHECK-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[P_08]], align 4, !tbaa [[INT_TBAA9]] ; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP3]], [[S_09]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[P_08]], i64 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]] -; CHECK: for.cond.for.end_crit_edge: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_COND_FOR_END_CRIT_EDGE:.*]] +; CHECK: [[FOR_COND_FOR_END_CRIT_EDGE]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: ret i32 [[S_0_LCSSA]] ; entry: @@ -78,22 +79,23 @@ for.end: ; But we should not widen %0 to 64-bit load. 
define i32 @overaligned_load(i32 %a, ptr nocapture %b) !dbg !13 { -; CHECK-LABEL: @overaligned_load( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 0, !dbg [[DBG14:![0-9]+]] -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !dbg [[DBG14]] -; CHECK: if.then: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @s1, align 8, !dbg [[DBG15:![0-9]+]], !tbaa [[TBAA9]] -; CHECK-NEXT: br label [[IF_END:%.*]], !dbg [[DBG15]] -; CHECK: if.else: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 2, !dbg [[DBG16:![0-9]+]] -; CHECK-NEXT: store i32 10, ptr [[ARRAYIDX]], align 4, !dbg [[DBG16]], !tbaa [[TBAA9]] -; CHECK-NEXT: br label [[IF_END]], !dbg [[DBG16]] -; CHECK: if.end: -; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ [[TMP0]], [[IF_THEN]] ], [ 0, [[IF_ELSE]] ] -; CHECK-NEXT: [[P_0:%.*]] = phi ptr [ @s1, [[IF_THEN]] ], [ [[B]], [[IF_ELSE]] ] +; CHECK-LABEL: define i32 @overaligned_load( +; CHECK-SAME: i32 [[A:%.*]], ptr captures(none) [[B:%.*]]) !dbg [[DBG11:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A]], 0, !dbg [[DBG14:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]], !dbg [[DBG14]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @s1, align 8, !dbg [[DBG15:![0-9]+]], !tbaa [[INT_TBAA9]] +; CHECK-NEXT: br label %[[IF_END:.*]], !dbg [[DBG15]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 2, !dbg [[DBG16:![0-9]+]] +; CHECK-NEXT: store i32 10, ptr [[ARRAYIDX]], align 4, !dbg [[DBG16]], !tbaa [[INT_TBAA9]] +; CHECK-NEXT: br label %[[IF_END]], !dbg [[DBG16]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ [[TMP0]], %[[IF_THEN]] ], [ 0, %[[IF_ELSE]] ] +; CHECK-NEXT: [[P_0:%.*]] = phi ptr [ @s1, %[[IF_THEN]] ], [ [[B]], %[[IF_ELSE]] ] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 1, !dbg [[DBG17:![0-9]+]] -; CHECK-NEXT: 
[[TMP1:%.*]] = load i32, ptr [[ADD_PTR]], align 4, !dbg [[DBG17]], !tbaa [[TBAA9]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ADD_PTR]], align 4, !dbg [[DBG17]], !tbaa [[INT_TBAA9]] ; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], [[I_0]], !dbg [[DBG17]] ; CHECK-NEXT: ret i32 [[ADD1]], !dbg [[DBG17]] ; @@ -144,6 +146,23 @@ if.end: file: !12, isOptimized: true, flags: "-O2", splitDebugFilename: "abc.debug", emissionKind: 2) +;. +; CHECK: [[META3:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META4:![0-9]+]], producer: "clang", isOptimized: true, flags: "-O2", runtimeVersion: 0, splitDebugFilename: "abc.debug", emissionKind: LineTablesOnly) +; CHECK: [[META4]] = !DIFile(filename: "{{.*}}test.cpp", directory: {{.*}}) +; CHECK: [[ANYPTR_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[META6]] = !{!"any pointer", [[META7:![0-9]+]], i64 0} +; CHECK: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +; CHECK: [[META8]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[INT_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +; CHECK: [[META10]] = !{!"int", [[META7]], i64 0} +; CHECK: [[DBG11]] = distinct !DISubprogram(name: "test", scope: [[META4]], file: [[META4]], line: 99, type: [[META12:![0-9]+]], scopeLine: 100, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META3]], retainedNodes: [[META13:![0-9]+]]) +; CHECK: [[META12]] = !DISubroutineType(types: [[META13]]) +; CHECK: [[META13]] = !{} +; CHECK: [[DBG14]] = !DILocation(line: 100, column: 1, scope: [[DBG11]]) +; CHECK: [[DBG15]] = !DILocation(line: 101, column: 1, scope: [[DBG11]]) +; CHECK: [[DBG16]] = !DILocation(line: 102, column: 1, scope: [[DBG11]]) +; CHECK: [[DBG17]] = !DILocation(line: 103, column: 1, scope: [[DBG11]]) +;. ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; MDEP: {{.*}} ; MSSA: {{.*}} diff --git a/llvm/test/Transforms/GVN/PRE/preserve-tbaa.ll b/llvm/test/Transforms/GVN/PRE/preserve-tbaa.ll index abbb17f11f436..49ee089fed393 100644 --- a/llvm/test/Transforms/GVN/PRE/preserve-tbaa.ll +++ b/llvm/test/Transforms/GVN/PRE/preserve-tbaa.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,MDEP ; RUN: opt -passes='gvn' -S < %s | FileCheck %s --check-prefixes=CHECK,MSSA @@ -12,12 +12,12 @@ define void @test(ptr %P, ptr %Q, i1 %arg) nounwind { ; MDEP-NEXT: [[ENTRY:.*:]] ; MDEP-NEXT: br i1 [[ARG]], label %[[BB_NPH:.*]], label %[[FOR_END:.*]] ; MDEP: [[BB_NPH]]: -; MDEP-NEXT: [[TMP33_PRE:%.*]] = load i16, ptr [[P]], align 2, !tbaa [[TBAA0:![0-9]+]] +; MDEP-NEXT: [[TMP33_PRE:%.*]] = load i16, ptr [[P]], align 2, !tbaa [[SHORT_TBAA0:![0-9]+]] ; MDEP-NEXT: br label %[[FOR_BODY:.*]] ; MDEP: [[FOR_BODY]]: ; MDEP-NEXT: [[TMP33:%.*]] = phi i16 [ 0, %[[FOR_BODY]] ], [ [[TMP33_PRE]], %[[BB_NPH]] ] ; MDEP-NEXT: store i16 [[TMP33]], ptr [[Q]], align 2 -; MDEP-NEXT: store i16 0, ptr [[P]], align 2, !tbaa [[TBAA0]] +; MDEP-NEXT: store i16 0, ptr [[P]], align 2, !tbaa [[SHORT_TBAA0]] ; MDEP-NEXT: br i1 false, label %[[FOR_BODY_FOR_END_CRIT_EDGE:.*]], label %[[FOR_BODY]] ; MDEP: [[FOR_BODY_FOR_END_CRIT_EDGE]]: ; MDEP-NEXT: br label %[[FOR_END]] @@ -31,9 +31,9 @@ define void @test(ptr %P, ptr %Q, i1 %arg) nounwind { ; MSSA: [[BB_NPH]]: ; MSSA-NEXT: br label %[[FOR_BODY:.*]] ; MSSA: [[FOR_BODY]]: -; MSSA-NEXT: [[TMP33:%.*]] = load i16, ptr [[P]], align 2, !tbaa [[TBAA0:![0-9]+]] +; MSSA-NEXT: [[TMP33:%.*]] = load i16, ptr [[P]], align 2, !tbaa [[SHORT_TBAA0:![0-9]+]] ; MSSA-NEXT: store i16 [[TMP33]], ptr [[Q]], align 2 -; MSSA-NEXT: store i16 0, ptr [[P]], align 2, !tbaa [[TBAA0]] +; 
MSSA-NEXT: store i16 0, ptr [[P]], align 2, !tbaa [[SHORT_TBAA0]] ; MSSA-NEXT: br i1 false, label %[[FOR_BODY_FOR_END_CRIT_EDGE:.*]], label %[[FOR_BODY]] ; MSSA: [[FOR_BODY_FOR_END_CRIT_EDGE]]: ; MSSA-NEXT: br label %[[FOR_END]] @@ -62,12 +62,12 @@ for.end: ; preds = %for.body, %entry !2 = !{!"Simple C/C++ TBAA"} !3 = !{!"short", !1} ;. -; MDEP: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; MDEP: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; MDEP: [[META1]] = !{!"short", [[META2:![0-9]+]]} ; MDEP: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} ; MDEP: [[META3]] = !{!"Simple C/C++ TBAA"} ;. -; MSSA: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; MSSA: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; MSSA: [[META1]] = !{!"short", [[META2:![0-9]+]]} ; MSSA: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} ; MSSA: [[META3]] = !{!"Simple C/C++ TBAA"} diff --git a/llvm/test/Transforms/GVN/pr33549.ll b/llvm/test/Transforms/GVN/pr33549.ll index e0d7712c6f5cc..a8ce37c4f86a6 100644 --- a/llvm/test/Transforms/GVN/pr33549.ll +++ b/llvm/test/Transforms/GVN/pr33549.ll @@ -1,41 +1,42 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=gvn -S < %s | FileCheck %s @Data = common local_unnamed_addr global [32 x i32] zeroinitializer, align 4 ; Function Attrs: norecurse nounwind define void @testshl() local_unnamed_addr #0 { -; CHECK-LABEL: @testshl( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[K_031:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC9:%.*]], [[FOR_INC8:%.*]] ] +; CHECK-LABEL: define void @testshl( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[K_031:%.*]] = phi i32 [ 1, %[[ENTRY]] ], [ [[INC9:%.*]], %[[FOR_INC8:.*]] 
] ; CHECK-NEXT: [[SHL:%.*]] = shl i32 1, [[K_031]] ; CHECK-NEXT: [[SHR:%.*]] = ashr exact i32 [[SHL]], 1 ; CHECK-NEXT: [[CMP229:%.*]] = icmp slt i32 [[SHL]], 64 -; CHECK-NEXT: br i1 [[CMP229]], label [[FOR_BODY3_PREHEADER:%.*]], label [[FOR_INC8]] -; CHECK: for.body3.preheader: +; CHECK-NEXT: br i1 [[CMP229]], label %[[FOR_BODY3_PREHEADER:.*]], label %[[FOR_INC8]] +; CHECK: [[FOR_BODY3_PREHEADER]]: ; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SHR]], 2 -; CHECK-NEXT: br label [[FOR_BODY3:%.*]] -; CHECK: for.body3: -; CHECK-NEXT: [[I_030:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY3]] ], [ [[DIV]], [[FOR_BODY3_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY3:.*]] +; CHECK: [[FOR_BODY3]]: +; CHECK-NEXT: [[I_030:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY3]] ], [ [[DIV]], %[[FOR_BODY3_PREHEADER]] ] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I_030]], [[SHR]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i32], ptr @Data, i32 0, i32 [[ADD]] ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [32 x i32], ptr @Data, i32 0, i32 [[I_030]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, [[TBAA3:!tbaa !.*]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, [[TBAA3]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: store i32 [[SUB]], ptr [[ARRAYIDX]], align 4, [[TBAA3]] +; CHECK-NEXT: store i32 [[SUB]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX4]], align 4, [[TBAA3]] +; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX4]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_030]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I_030]], 15 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label 
[[FOR_INC8]] -; CHECK: for.inc8: +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3]], label %[[FOR_INC8]] +; CHECK: [[FOR_INC8]]: ; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[K_031]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC9]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END10:%.*]], label [[FOR_BODY]] -; CHECK: for.end10: +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END10:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END10]]: ; CHECK-NEXT: ret void ; entry: @@ -89,3 +90,9 @@ attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="fa !4 = !{!"int", !5, i64 0} !5 = !{!"omnipotent char", !6, i64 0} !6 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +; CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +;. diff --git a/llvm/test/Transforms/GVN/pr64598.ll b/llvm/test/Transforms/GVN/pr64598.ll index 902af984bce2b..80a9198b41c50 100644 --- a/llvm/test/Transforms/GVN/pr64598.ll +++ b/llvm/test/Transforms/GVN/pr64598.ll @@ -1,61 +1,61 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=gvn < %s | FileCheck %s define i32 @main(i64 %x, ptr %d, ptr noalias %p) { -; CHECK-LABEL: define i32 @main -; CHECK-SAME: (i64 [[X:%.*]], ptr [[D:%.*]], ptr noalias [[P:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @main( +; CHECK-SAME: i64 [[X:%.*]], ptr [[D:%.*]], ptr noalias [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[T1_PRE_PRE_PRE:%.*]] = load ptr, ptr [[P]], align 8 -; CHECK-NEXT: [[T2_PRE_PRE_PRE:%.*]] = load ptr, ptr [[T1_PRE_PRE_PRE]], align 8, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[T2_PRE_PRE_PRE:%.*]] = load ptr, ptr [[T1_PRE_PRE_PRE]], align 8, !tbaa [[ANYPTR_TBAA0:![0-9]+]] ; 
CHECK-NEXT: [[T3_PRE_PRE_PRE:%.*]] = load ptr, ptr [[T2_PRE_PRE_PRE]], align 8 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[T2_PRE_PRE:%.*]] = phi ptr [ [[T2_PRE_PRE23:%.*]], [[LOOP_LATCH:%.*]] ], [ [[T2_PRE_PRE_PRE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[T1_PRE_PRE:%.*]] = phi ptr [ [[T1_PRE_PRE19:%.*]], [[LOOP_LATCH]] ], [ [[T1_PRE_PRE_PRE]], [[ENTRY]] ] -; CHECK-NEXT: br label [[LOOP2:%.*]] -; CHECK: loop2: -; CHECK-NEXT: [[T2_PRE_PRE25:%.*]] = phi ptr [ [[T2_PRE_PRE23]], [[LOOP2_LATCH_LOOP2_CRIT_EDGE:%.*]] ], [ [[T2_PRE_PRE]], [[LOOP]] ] -; CHECK-NEXT: [[T1_PRE_PRE21:%.*]] = phi ptr [ [[T1_PRE_PRE19]], [[LOOP2_LATCH_LOOP2_CRIT_EDGE]] ], [ [[T1_PRE_PRE]], [[LOOP]] ] -; CHECK-NEXT: [[T3_PRE:%.*]] = phi ptr [ [[T3_PRE16:%.*]], [[LOOP2_LATCH_LOOP2_CRIT_EDGE]] ], [ [[T3_PRE_PRE_PRE]], [[LOOP]] ] -; CHECK-NEXT: [[T2_PRE:%.*]] = phi ptr [ [[T2_PRE13:%.*]], [[LOOP2_LATCH_LOOP2_CRIT_EDGE]] ], [ [[T2_PRE_PRE]], [[LOOP]] ] -; CHECK-NEXT: [[T1_PRE:%.*]] = phi ptr [ [[T1_PRE10:%.*]], [[LOOP2_LATCH_LOOP2_CRIT_EDGE]] ], [ [[T1_PRE_PRE]], [[LOOP]] ] -; CHECK-NEXT: br label [[LOOP3:%.*]] -; CHECK: loop3: -; CHECK-NEXT: [[T2_PRE_PRE24:%.*]] = phi ptr [ [[T2_PRE_PRE23]], [[LOOP3_LATCH:%.*]] ], [ [[T2_PRE_PRE25]], [[LOOP2]] ] -; CHECK-NEXT: [[T1_PRE_PRE20:%.*]] = phi ptr [ [[T1_PRE_PRE19]], [[LOOP3_LATCH]] ], [ [[T1_PRE_PRE21]], [[LOOP2]] ] -; CHECK-NEXT: [[T3_PRE17:%.*]] = phi ptr [ [[T3_PRE16]], [[LOOP3_LATCH]] ], [ [[T3_PRE]], [[LOOP2]] ] -; CHECK-NEXT: [[T2_PRE14:%.*]] = phi ptr [ [[T2_PRE13]], [[LOOP3_LATCH]] ], [ [[T2_PRE]], [[LOOP2]] ] -; CHECK-NEXT: [[T1_PRE11:%.*]] = phi ptr [ [[T1_PRE10]], [[LOOP3_LATCH]] ], [ [[T1_PRE]], [[LOOP2]] ] -; CHECK-NEXT: [[T78:%.*]] = phi ptr [ [[T7:%.*]], [[LOOP3_LATCH]] ], [ [[T3_PRE]], [[LOOP2]] ] -; CHECK-NEXT: [[T66:%.*]] = phi ptr [ [[T6:%.*]], [[LOOP3_LATCH]] ], [ [[T2_PRE]], [[LOOP2]] ] -; CHECK-NEXT: [[T54:%.*]] = phi ptr [ [[T5:%.*]], [[LOOP3_LATCH]] ], [ [[T1_PRE]], [[LOOP2]] ] +; CHECK-NEXT: br label 
%[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[T2_PRE_PRE:%.*]] = phi ptr [ [[T2_PRE_PRE23:%.*]], %[[LOOP_LATCH:.*]] ], [ [[T2_PRE_PRE_PRE]], %[[ENTRY]] ] +; CHECK-NEXT: [[T1_PRE_PRE:%.*]] = phi ptr [ [[T1_PRE_PRE19:%.*]], %[[LOOP_LATCH]] ], [ [[T1_PRE_PRE_PRE]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: [[T2_PRE_PRE25:%.*]] = phi ptr [ [[T2_PRE_PRE23]], %[[LOOP2_LATCH_LOOP2_CRIT_EDGE:.*]] ], [ [[T2_PRE_PRE]], %[[LOOP]] ] +; CHECK-NEXT: [[T1_PRE_PRE21:%.*]] = phi ptr [ [[T1_PRE_PRE19]], %[[LOOP2_LATCH_LOOP2_CRIT_EDGE]] ], [ [[T1_PRE_PRE]], %[[LOOP]] ] +; CHECK-NEXT: [[T3_PRE:%.*]] = phi ptr [ [[T3_PRE16:%.*]], %[[LOOP2_LATCH_LOOP2_CRIT_EDGE]] ], [ [[T3_PRE_PRE_PRE]], %[[LOOP]] ] +; CHECK-NEXT: [[T2_PRE:%.*]] = phi ptr [ [[T2_PRE13:%.*]], %[[LOOP2_LATCH_LOOP2_CRIT_EDGE]] ], [ [[T2_PRE_PRE]], %[[LOOP]] ] +; CHECK-NEXT: [[T1_PRE:%.*]] = phi ptr [ [[T1_PRE10:%.*]], %[[LOOP2_LATCH_LOOP2_CRIT_EDGE]] ], [ [[T1_PRE_PRE]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP3:.*]] +; CHECK: [[LOOP3]]: +; CHECK-NEXT: [[T2_PRE_PRE24:%.*]] = phi ptr [ [[T2_PRE_PRE23]], %[[LOOP3_LATCH:.*]] ], [ [[T2_PRE_PRE25]], %[[LOOP2]] ] +; CHECK-NEXT: [[T1_PRE_PRE20:%.*]] = phi ptr [ [[T1_PRE_PRE19]], %[[LOOP3_LATCH]] ], [ [[T1_PRE_PRE21]], %[[LOOP2]] ] +; CHECK-NEXT: [[T3_PRE17:%.*]] = phi ptr [ [[T3_PRE16]], %[[LOOP3_LATCH]] ], [ [[T3_PRE]], %[[LOOP2]] ] +; CHECK-NEXT: [[T2_PRE14:%.*]] = phi ptr [ [[T2_PRE13]], %[[LOOP3_LATCH]] ], [ [[T2_PRE]], %[[LOOP2]] ] +; CHECK-NEXT: [[T1_PRE11:%.*]] = phi ptr [ [[T1_PRE10]], %[[LOOP3_LATCH]] ], [ [[T1_PRE]], %[[LOOP2]] ] +; CHECK-NEXT: [[T78:%.*]] = phi ptr [ [[T7:%.*]], %[[LOOP3_LATCH]] ], [ [[T3_PRE]], %[[LOOP2]] ] +; CHECK-NEXT: [[T66:%.*]] = phi ptr [ [[T6:%.*]], %[[LOOP3_LATCH]] ], [ [[T2_PRE]], %[[LOOP2]] ] +; CHECK-NEXT: [[T54:%.*]] = phi ptr [ [[T5:%.*]], %[[LOOP3_LATCH]] ], [ [[T1_PRE]], %[[LOOP2]] ] ; CHECK-NEXT: [[TOBOOL_NOT2_I:%.*]] = icmp eq i64 [[X]], 0 -; CHECK-NEXT: br i1 false, label 
[[LOOP3_LOOP3_LATCH_CRIT_EDGE:%.*]], label [[FOR_BODY_LR_PH_I:%.*]] -; CHECK: loop3.loop3.latch_crit_edge: -; CHECK-NEXT: br label [[LOOP3_LATCH]] -; CHECK: for.body.lr.ph.i: +; CHECK-NEXT: br i1 false, label %[[LOOP3_LOOP3_LATCH_CRIT_EDGE:.*]], label %[[FOR_BODY_LR_PH_I:.*]] +; CHECK: [[LOOP3_LOOP3_LATCH_CRIT_EDGE]]: +; CHECK-NEXT: br label %[[LOOP3_LATCH]] +; CHECK: [[FOR_BODY_LR_PH_I]]: ; CHECK-NEXT: store i32 0, ptr [[P]], align 4 ; CHECK-NEXT: [[T5_PRE:%.*]] = load ptr, ptr [[P]], align 8 -; CHECK-NEXT: [[T6_PRE:%.*]] = load ptr, ptr [[T5_PRE]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[T6_PRE:%.*]] = load ptr, ptr [[T5_PRE]], align 8, !tbaa [[ANYPTR_TBAA0]] ; CHECK-NEXT: [[T7_PRE:%.*]] = load ptr, ptr [[T6_PRE]], align 8 -; CHECK-NEXT: br label [[LOOP3_LATCH]] -; CHECK: loop3.latch: -; CHECK-NEXT: [[T2_PRE_PRE23]] = phi ptr [ [[T2_PRE_PRE24]], [[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T6_PRE]], [[FOR_BODY_LR_PH_I]] ] -; CHECK-NEXT: [[T1_PRE_PRE19]] = phi ptr [ [[T1_PRE_PRE20]], [[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T5_PRE]], [[FOR_BODY_LR_PH_I]] ] -; CHECK-NEXT: [[T3_PRE16]] = phi ptr [ [[T3_PRE17]], [[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T7_PRE]], [[FOR_BODY_LR_PH_I]] ] -; CHECK-NEXT: [[T2_PRE13]] = phi ptr [ [[T2_PRE14]], [[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T6_PRE]], [[FOR_BODY_LR_PH_I]] ] -; CHECK-NEXT: [[T1_PRE10]] = phi ptr [ [[T1_PRE11]], [[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T5_PRE]], [[FOR_BODY_LR_PH_I]] ] -; CHECK-NEXT: [[T7]] = phi ptr [ [[T78]], [[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T7_PRE]], [[FOR_BODY_LR_PH_I]] ] -; CHECK-NEXT: [[T6]] = phi ptr [ [[T66]], [[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T6_PRE]], [[FOR_BODY_LR_PH_I]] ] -; CHECK-NEXT: [[T5]] = phi ptr [ [[T54]], [[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T5_PRE]], [[FOR_BODY_LR_PH_I]] ] -; CHECK-NEXT: br i1 false, label [[LOOP2_LATCH:%.*]], label [[LOOP3]] -; CHECK: loop2.latch: -; CHECK-NEXT: br i1 false, label [[LOOP2_LATCH_LOOP2_CRIT_EDGE]], label [[LOOP_LATCH]] -; CHECK: 
loop2.latch.loop2_crit_edge: -; CHECK-NEXT: br label [[LOOP2]] -; CHECK: loop.latch: -; CHECK-NEXT: store i32 0, ptr [[D]], align 4, !tbaa [[TBAA4:![0-9]+]] -; CHECK-NEXT: br label [[LOOP]] +; CHECK-NEXT: br label %[[LOOP3_LATCH]] +; CHECK: [[LOOP3_LATCH]]: +; CHECK-NEXT: [[T2_PRE_PRE23]] = phi ptr [ [[T2_PRE_PRE24]], %[[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T6_PRE]], %[[FOR_BODY_LR_PH_I]] ] +; CHECK-NEXT: [[T1_PRE_PRE19]] = phi ptr [ [[T1_PRE_PRE20]], %[[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T5_PRE]], %[[FOR_BODY_LR_PH_I]] ] +; CHECK-NEXT: [[T3_PRE16]] = phi ptr [ [[T3_PRE17]], %[[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T7_PRE]], %[[FOR_BODY_LR_PH_I]] ] +; CHECK-NEXT: [[T2_PRE13]] = phi ptr [ [[T2_PRE14]], %[[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T6_PRE]], %[[FOR_BODY_LR_PH_I]] ] +; CHECK-NEXT: [[T1_PRE10]] = phi ptr [ [[T1_PRE11]], %[[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T5_PRE]], %[[FOR_BODY_LR_PH_I]] ] +; CHECK-NEXT: [[T7]] = phi ptr [ [[T78]], %[[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T7_PRE]], %[[FOR_BODY_LR_PH_I]] ] +; CHECK-NEXT: [[T6]] = phi ptr [ [[T66]], %[[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T6_PRE]], %[[FOR_BODY_LR_PH_I]] ] +; CHECK-NEXT: [[T5]] = phi ptr [ [[T54]], %[[LOOP3_LOOP3_LATCH_CRIT_EDGE]] ], [ [[T5_PRE]], %[[FOR_BODY_LR_PH_I]] ] +; CHECK-NEXT: br i1 false, label %[[LOOP2_LATCH:.*]], label %[[LOOP3]] +; CHECK: [[LOOP2_LATCH]]: +; CHECK-NEXT: br i1 false, label %[[LOOP2_LATCH_LOOP2_CRIT_EDGE]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP2_LATCH_LOOP2_CRIT_EDGE]]: +; CHECK-NEXT: br label %[[LOOP2]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: store i32 0, ptr [[D]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] +; CHECK-NEXT: br label %[[LOOP]] ; entry: br label %loop @@ -101,3 +101,11 @@ loop.latch: !3 = !{!"Simple C/C++ TBAA"} !4 = !{!5, !5, i64 0} !5 = !{!"int", !2, i64 0} +;. 
+; CHECK: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"any pointer", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"int", [[META2]], i64 0} +;. diff --git a/llvm/test/Transforms/GVN/tbaa.ll b/llvm/test/Transforms/GVN/tbaa.ll index 59ace145b5657..bb9b0dea73ab1 100644 --- a/llvm/test/Transforms/GVN/tbaa.ll +++ b/llvm/test/Transforms/GVN/tbaa.ll @@ -1,17 +1,17 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s ; RUN: opt -passes='gvn' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s define i32 @test1(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test1( ; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]] +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0:![0-9]+]] ; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; MDEP-NEXT: ret i32 [[C]] ; ; MSSA-LABEL: define i32 @test1( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]] +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0:![0-9]+]] ; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]) ; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] @@ -25,14 +25,14 @@ define i32 @test1(ptr %p, ptr %q) { define i32 @test2(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test2( ; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] ; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; MDEP-NEXT: 
ret i32 [[C]] ; ; MSSA-LABEL: define i32 @test2( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] -; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] ; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -45,14 +45,14 @@ define i32 @test2(ptr %p, ptr %q) { define i32 @test3(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test3( ; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]] +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[B_TBAA4:![0-9]+]] ; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; MDEP-NEXT: ret i32 [[C]] ; ; MSSA-LABEL: define i32 @test3( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]] -; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4]] +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[B_TBAA4:![0-9]+]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[B_TBAA4]] ; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -65,14 +65,14 @@ define i32 @test3(ptr %p, ptr %q) { define i32 @test4(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test4( ; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]] +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[A_TBAA6:![0-9]+]] ; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; MDEP-NEXT: ret i32 [[C]] ; ; MSSA-LABEL: define i32 @test4( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]] -; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[A_TBAA6:![0-9]+]] +; 
MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] ; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -85,14 +85,14 @@ define i32 @test4(ptr %p, ptr %q) { define i32 @test5(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test5( ; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] ; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; MDEP-NEXT: ret i32 [[C]] ; ; MSSA-LABEL: define i32 @test5( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] -; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6]] +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[A_TBAA6]] ; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -105,14 +105,14 @@ define i32 @test5(ptr %p, ptr %q) { define i32 @test6(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test6( ; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] ; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; MDEP-NEXT: ret i32 [[C]] ; ; MSSA-LABEL: define i32 @test6( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] -; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4]] +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[B_TBAA4]] ; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -125,14 +125,14 @@ define i32 @test6(ptr %p, ptr %q) { define i32 @test7(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test7( ; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa 
[[TBAA7:![0-9]+]] +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[SCALAR_TYPE_TBAA7:![0-9]+]] ; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; MDEP-NEXT: ret i32 [[C]] ; ; MSSA-LABEL: define i32 @test7( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]] -; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4]] +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[SCALAR_TYPE_TBAA7:![0-9]+]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[B_TBAA4]] ; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -150,9 +150,9 @@ define i32 @test8(ptr %p, ptr %q) { ; ; MSSA-LABEL: define i32 @test8( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[TBAA10:![0-9]+]] +; MSSA-NEXT: [[A:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[NODE_TBAA10:![0-9]+]] ; MSSA-NEXT: store i32 15, ptr [[P]], align 4 -; MSSA-NEXT: [[B:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[TBAA10]] +; MSSA-NEXT: [[B:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[NODE_TBAA10]] ; MSSA-NEXT: [[C:%.*]] = sub i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -174,9 +174,9 @@ define i32 @test9(ptr %p, ptr %q) { ; ; MSSA-LABEL: define i32 @test9( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[TBAA10]] +; MSSA-NEXT: [[A:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[NODE_TBAA10]] ; MSSA-NEXT: call void @clobber() -; MSSA-NEXT: [[B:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[TBAA10]] +; MSSA-NEXT: [[B:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[NODE_TBAA10]] ; MSSA-NEXT: [[C:%.*]] = sub i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -195,14 +195,14 @@ define i32 @test10(ptr %p, ptr %q) { ; and not just the common final access type. 
; MDEP-LABEL: define i32 @test10( ; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA10:![0-9]+]] +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[INT_TBAA10:![0-9]+]] ; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; MDEP-NEXT: ret i32 [[C]] ; ; MSSA-LABEL: define i32 @test10( ; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA13:![0-9]+]] -; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA17:![0-9]+]] +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[INT_TBAA13:![0-9]+]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[INT_TBAA17:![0-9]+]] ; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; MSSA-NEXT: ret i32 [[C]] ; @@ -238,39 +238,39 @@ declare i32 @foo(ptr) readonly !9 = !{!"yet another root"} !10 = !{!"node", !9, i64 1} ;. -; MDEP: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; MDEP: [[C_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; MDEP: [[META1]] = !{!"C", [[META2:![0-9]+]]} ; MDEP: [[META2]] = !{!"A", [[META3:![0-9]+]]} ; MDEP: [[META3]] = !{!"tbaa root"} -; MDEP: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; MDEP: [[B_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} ; MDEP: [[META5]] = !{!"B", [[META2]]} -; MDEP: [[TBAA6]] = !{[[META2]], [[META2]], i64 0} -; MDEP: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; MDEP: [[A_TBAA6]] = !{[[META2]], [[META2]], i64 0} +; MDEP: [[SCALAR_TYPE_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} ; MDEP: [[META8]] = !{!"scalar type", [[META9:![0-9]+]]} ; MDEP: [[META9]] = !{!"another root"} -; MDEP: [[TBAA10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], i64 0} +; MDEP: [[INT_TBAA10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], i64 0} ; MDEP: [[META11]] = !{!"struct X", [[META12]], i64 0} ; MDEP: [[META12]] = !{!"int", [[META13:![0-9]+]], i64 0} ; MDEP: [[META13]] = !{!"char", [[META3]], i64 0} ;. 
-; MSSA: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; MSSA: [[C_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; MSSA: [[META1]] = !{!"C", [[META2:![0-9]+]]} ; MSSA: [[META2]] = !{!"A", [[META3:![0-9]+]]} ; MSSA: [[META3]] = !{!"tbaa root"} -; MSSA: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; MSSA: [[B_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} ; MSSA: [[META5]] = !{!"B", [[META2]]} -; MSSA: [[TBAA6]] = !{[[META2]], [[META2]], i64 0} -; MSSA: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; MSSA: [[A_TBAA6]] = !{[[META2]], [[META2]], i64 0} +; MSSA: [[SCALAR_TYPE_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} ; MSSA: [[META8]] = !{!"scalar type", [[META9:![0-9]+]]} ; MSSA: [[META9]] = !{!"another root"} -; MSSA: [[TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0, i64 1} +; MSSA: [[NODE_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0, i64 1} ; MSSA: [[META11]] = !{!"node", [[META12:![0-9]+]]} ; MSSA: [[META12]] = !{!"yet another root"} -; MSSA: [[TBAA13]] = !{[[META14:![0-9]+]], [[META15:![0-9]+]], i64 0} +; MSSA: [[INT_TBAA13]] = !{[[META14:![0-9]+]], [[META15:![0-9]+]], i64 0} ; MSSA: [[META14]] = !{!"struct X", [[META15]], i64 0} ; MSSA: [[META15]] = !{!"int", [[META16:![0-9]+]], i64 0} ; MSSA: [[META16]] = !{!"char", [[META3]], i64 0} -; MSSA: [[TBAA17]] = !{[[META18:![0-9]+]], [[META15]], i64 0} +; MSSA: [[INT_TBAA17]] = !{[[META18:![0-9]+]], [[META15]], i64 0} ; MSSA: [[META18]] = !{!"struct Y", [[META14]], i64 0} ;. ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: diff --git a/llvm/test/Transforms/GVNHoist/hoist-md.ll b/llvm/test/Transforms/GVNHoist/hoist-md.ll index 26fe475535add..2ef9bc30433c3 100644 --- a/llvm/test/Transforms/GVNHoist/hoist-md.ll +++ b/llvm/test/Transforms/GVNHoist/hoist-md.ll @@ -1,19 +1,19 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -passes=gvn-hoist < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define void @test1(i1 %b, ptr %x) { -; CHECK-LABEL: define void @test1 -; CHECK-SAME: (i1 [[B:%.*]], ptr [[X:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 2, ptr [[X]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: br i1 [[B]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[IF_END:%.*]] -; CHECK: if.else: -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-LABEL: define void @test1( +; CHECK-SAME: i1 [[B:%.*]], ptr [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i32 2, ptr [[X]], align 4, !tbaa [[CHAR_TBAA0:![0-9]+]] +; CHECK-NEXT: br i1 [[B]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: ; CHECK-NEXT: ret void ; entry: @@ -32,19 +32,19 @@ if.end: ; preds = %if.else, %if.then } define void @test2(i1 %b, ptr %x) { -; CHECK-LABEL: define void @test2 -; CHECK-SAME: (i1 [[B:%.*]], ptr [[X:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test2( +; CHECK-SAME: i1 [[B:%.*]], ptr [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 -; CHECK-NEXT: store i32 2, ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br i1 
[[B]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: +; CHECK-NEXT: store i32 2, ptr [[TMP0]], align 4, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: br i1 [[B]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 -; CHECK-NEXT: br label [[IF_END:%.*]] -; CHECK: if.else: +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_ELSE]]: ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: ; CHECK-NEXT: ret void ; entry: @@ -65,19 +65,19 @@ if.end: ; preds = %if.else, %if.then } define void @test3(i1 %b, ptr %x) { -; CHECK-LABEL: define void @test3 -; CHECK-SAME: (i1 [[B:%.*]], ptr [[X:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test3( +; CHECK-SAME: i1 [[B:%.*]], ptr [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[X]], i64 1 -; CHECK-NEXT: store i32 2, ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br i1 [[B]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: +; CHECK-NEXT: store i32 2, ptr [[TMP0]], align 4, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: br i1 [[B]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 -; CHECK-NEXT: br label [[IF_END:%.*]] -; CHECK: if.else: +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_ELSE]]: ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[X]], i64 1 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: ; CHECK-NEXT: ret void ; entry: @@ -98,17 +98,17 @@ if.end: ; preds = %if.else, %if.then } define i32 @test4(i1 %b, ptr %y) { -; CHECK-LABEL: define i32 @test4 -; CHECK-SAME: (i1 [[B:%.*]], ptr [[Y:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test4( +; 
CHECK-SAME: i1 [[B:%.*]], ptr [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !range [[RNG3:![0-9]+]] -; CHECK-NEXT: br i1 [[B]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[RETURN:%.*]] -; CHECK: if.end: -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[TMP0]], [[IF_THEN]] ], [ [[TMP0]], [[IF_END]] ] +; CHECK-NEXT: br i1 [[B]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: br label %[[RETURN:.*]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: br label %[[RETURN]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[TMP0]], %[[IF_THEN]] ], [ [[TMP0]], %[[IF_END]] ] ; CHECK-NEXT: ret i32 [[RETVAL_0]] ; entry: @@ -128,17 +128,17 @@ return: ; preds = %if.end, %if.then } define ptr @test5(i1 %b, ptr %y) { -; CHECK-LABEL: define ptr @test5 -; CHECK-SAME: (i1 [[B:%.*]], ptr [[Y:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-LABEL: define ptr @test5( +; CHECK-SAME: i1 [[B:%.*]], ptr [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[Y]], align 4 -; CHECK-NEXT: br i1 [[B]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[RETURN:%.*]] -; CHECK: if.end: -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[TMP0]], [[IF_THEN]] ], [ [[TMP0]], [[IF_END]] ] +; CHECK-NEXT: br i1 [[B]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: br label %[[RETURN:.*]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: br label %[[RETURN]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[TMP0]], %[[IF_THEN]] ], [ [[TMP0]], %[[IF_END]] ] ; CHECK-NEXT: ret ptr [[RETVAL_0]] ; entry: @@ -167,8 +167,8 @@ return: ; preds = %if.end, %if.then !8 = !{i32 3, i32 4} !9 = !{} ;. 
-; CHECK: [[TBAA0]] = !{!1, !1, i64 0} -; CHECK: [[META1:![0-9]+]] = !{!"omnipotent char", !2, i64 0} -; CHECK: [[META2:![0-9]+]] = !{!"Simple C++ TBAA"} +; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"Simple C++ TBAA"} ; CHECK: [[RNG3]] = !{i32 0, i32 2, i32 3, i32 4} ;. diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll index 1c317786d1c20..ebc5c0d717c6d 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -1,10 +1,10 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s define amdgpu_kernel void @memset_group_to_flat(ptr addrspace(3) %group.ptr, i32 %y) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memset_group_to_flat( ; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], i8 4, i64 32, i1 false), !tbaa [[TBAA0:![0-9]+]], !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]] +; CHECK-NEXT: call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], i8 4, i64 32, i1 false), !tbaa [[A_TBAA0:![0-9]+]], !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]] ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr @@ -15,7 +15,7 @@ define amdgpu_kernel void @memset_group_to_flat(ptr addrspace(3) %group.ptr, i32 define amdgpu_kernel void @memset_global_to_flat(ptr addrspace(1) %global.ptr, i32 %y) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memset_global_to_flat( ; CHECK-SAME: 
ptr addrspace(1) [[GLOBAL_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 [[GLOBAL_PTR]], i8 4, i64 32, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 [[GLOBAL_PTR]], i8 4, i64 32, i1 false), !tbaa [[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr @@ -49,7 +49,7 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest, ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) -; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -61,7 +61,7 @@ define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr ; CHECK-LABEL: define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) -; CHECK-NEXT: call void @llvm.memcpy.inline.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memcpy.inline.p1.p3.i64(ptr 
addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -73,7 +73,7 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(ptr addrs ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group( ; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr [[SRC_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[SRC_PTR]] to ptr addrspace(1) -; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[TMP1]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[TMP1]], i64 [[SIZE]], i1 false), !tbaa [[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.dest = addrspacecast ptr addrspace(3) %dest.group.ptr to ptr @@ -84,7 +84,7 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(ptr addrs define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(ptr addrspace(3) %dest.group.ptr, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group( ; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa 
[[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -96,7 +96,7 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(ptr a define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(ptr addrspace(3) %dest.group.ptr, ptr addrspace(1) %src.global.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global( ; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr addrspace(1) [[SRC_GLOBAL_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[SRC_GLOBAL_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[SRC_GLOBAL_PTR]], i64 [[SIZE]], i1 false), !tbaa [[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(1) %src.global.ptr to ptr @@ -108,7 +108,7 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(ptr define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(ptr addrspace(1) %dest.global.ptr, ptr addrspace(3) %src.group.ptr, i32 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global( ; CHECK-SAME: ptr addrspace(1) [[DEST_GLOBAL_PTR:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i32 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 [[DEST_GLOBAL_PTR]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i32 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 [[DEST_GLOBAL_PTR]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i32 
[[SIZE]], i1 false), !tbaa [[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.dest = addrspacecast ptr addrspace(1) %dest.global.ptr to ptr @@ -159,7 +159,7 @@ define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_n define amdgpu_kernel void @memcpy_group_flat_to_flat_self(ptr addrspace(3) %group.ptr) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_group_flat_to_flat_self( ; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], ptr addrspace(3) align 4 [[GROUP_PTR]], i64 32, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], ptr addrspace(3) align 4 [[GROUP_PTR]], i64 32, i1 false), !tbaa [[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr @@ -170,7 +170,7 @@ define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(ptr %dest ; CHECK-LABEL: define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) -; CHECK-NEXT: call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[A_TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -236,7 +236,7 @@ attributes #1 = { argmemonly nounwind } !7 = distinct !{!7, !5, !"some scope 2"} !8 = 
!{i64 0, i64 8, null} ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[A_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"A", [[META2:![0-9]+]]} ; CHECK: [[META2]] = !{!"tbaa root"} ; CHECK: [[META3]] = !{[[META4:![0-9]+]]} diff --git a/llvm/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll b/llvm/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll index f558e35ebe015..1d89dd6195032 100644 --- a/llvm/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll +++ b/llvm/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll @@ -1,25 +1,35 @@ -; RUN: opt < %s -passes=instcombine,mem2reg,simplifycfg -simplifycfg-require-and-preserve-domtree=1 | \ -; RUN: llvm-dis | grep -v store | not grep "i32 1" +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt %s -passes=instcombine,mem2reg,simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -o - | FileCheck %s ; Test to make sure that instcombine does not accidentally propagate the load ; into the PHI, which would break the program. 
define i32 @test(i1 %C) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[X:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[X2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, ptr [[X]], align 4 +; CHECK-NEXT: store i32 2, ptr [[X2]], align 4 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C]], ptr [[X]], ptr [[X2]] +; CHECK-NEXT: store i32 3, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP_3:%.*]] = load i32, ptr [[SPEC_SELECT]], align 4 +; CHECK-NEXT: ret i32 [[TMP_3]] +; entry: - %X = alloca i32 ; [#uses=3] - %X2 = alloca i32 ; [#uses=2] - store i32 1, ptr %X - store i32 2, ptr %X2 - br i1 %C, label %cond_true.i, label %cond_continue.i + %X = alloca i32 ; [#uses=3] + %X2 = alloca i32 ; [#uses=2] + store i32 1, ptr %X + store i32 2, ptr %X2 + br i1 %C, label %cond_true.i, label %cond_continue.i cond_true.i: ; preds = %entry - br label %cond_continue.i + br label %cond_continue.i cond_continue.i: ; preds = %cond_true.i, %entry - %mem_tmp.i.0 = phi ptr [ %X, %cond_true.i ], [ %X2, %entry ] ; [#uses=1] - store i32 3, ptr %X - %tmp.3 = load i32, ptr %mem_tmp.i.0 ; [#uses=1] - ret i32 %tmp.3 + %mem_tmp.i.0 = phi ptr [ %X, %cond_true.i ], [ %X2, %entry ] ; [#uses=1] + store i32 3, ptr %X + %tmp.3 = load i32, ptr %mem_tmp.i.0 ; [#uses=1] + ret i32 %tmp.3 } - - diff --git a/llvm/test/Transforms/InstCombine/AArch64/sme-intrinsic-opts-counting-elems.ll b/llvm/test/Transforms/InstCombine/AArch64/sme-intrinsic-opts-counting-elems.ll index f213c0b53f6ef..c1d12b825b72c 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sme-intrinsic-opts-counting-elems.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sme-intrinsic-opts-counting-elems.ll @@ -5,48 +5,6 @@ target triple = "aarch64-unknown-linux-gnu" -define i64 @cntsb() { -; CHECK-LABEL: @cntsb( -; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sme.cntsb() -; CHECK-NEXT: ret i64 [[OUT]] -; -; CHECK-STREAMING-LABEL: @cntsb( -; CHECK-STREAMING-NEXT: [[TMP1:%.*]] = 
call i64 @llvm.vscale.i64() -; CHECK-STREAMING-NEXT: [[OUT:%.*]] = shl nuw i64 [[TMP1]], 4 -; CHECK-STREAMING-NEXT: ret i64 [[OUT]] -; - %out = call i64 @llvm.aarch64.sme.cntsb() - ret i64 %out -} - -define i64 @cntsh() { -; CHECK-LABEL: @cntsh( -; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sme.cntsh() -; CHECK-NEXT: ret i64 [[OUT]] -; -; CHECK-STREAMING-LABEL: @cntsh( -; CHECK-STREAMING-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-STREAMING-NEXT: [[OUT:%.*]] = shl nuw i64 [[TMP1]], 3 -; CHECK-STREAMING-NEXT: ret i64 [[OUT]] -; - %out = call i64 @llvm.aarch64.sme.cntsh() - ret i64 %out -} - -define i64 @cntsw() { -; CHECK-LABEL: @cntsw( -; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sme.cntsw() -; CHECK-NEXT: ret i64 [[OUT]] -; -; CHECK-STREAMING-LABEL: @cntsw( -; CHECK-STREAMING-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-STREAMING-NEXT: [[OUT:%.*]] = shl nuw i64 [[TMP1]], 2 -; CHECK-STREAMING-NEXT: ret i64 [[OUT]] -; - %out = call i64 @llvm.aarch64.sme.cntsw() - ret i64 %out -} - define i64 @cntsd() { ; CHECK-LABEL: @cntsd( ; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sme.cntsd() @@ -61,8 +19,5 @@ define i64 @cntsd() { ret i64 %out } -declare i64 @llvm.aarch64.sve.cntsb() -declare i64 @llvm.aarch64.sve.cntsh() -declare i64 @llvm.aarch64.sve.cntsw() declare i64 @llvm.aarch64.sve.cntsd() diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-whilelo.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-whilelo.ll new file mode 100644 index 0000000000000..181a41786418f --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-whilelo.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define @whilelo_nxv2i1.i32(i32 %a, i32 %b) { +; CHECK-LABEL: define @whilelo_nxv2i1.i32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { 
+; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv2i1.i32(i32 %a, i32 %b) + ret %mask +} + +define @whilelo_nxv4i1.i32(i32 %a, i32 %b) { +; CHECK-LABEL: define @whilelo_nxv4i1.i32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv4i1.i32(i32 %a, i32 %b) + ret %mask +} + +define @whilelo_nxv8i1.i32(i32 %a, i32 %b) { +; CHECK-LABEL: define @whilelo_nxv8i1.i32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv8i1.i32(i32 %a, i32 %b) + ret %mask +} + +define @whilelo_nxv16i1.i32(i32 %a, i32 %b) { +; CHECK-LABEL: define @whilelo_nxv16i1.i32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv16i1.i32(i32 %a, i32 %b) + ret %mask +} + +define @whilelo_nxv2i1.i64(i64 %a, i64 %b) { +; CHECK-LABEL: define @whilelo_nxv2i1.i64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %a, i64 %b) + ret %mask +} + +define @whilelo_nxv4i1.i64(i64 %a, i64 %b) { +; CHECK-LABEL: define @whilelo_nxv4i1.i64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64 %a, i64 %b) + ret %mask +} + +define 
@whilelo_nxv8i1.i64(i64 %a, i64 %b) { +; CHECK-LABEL: define @whilelo_nxv8i1.i64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 %a, i64 %b) + ret %mask +} + +define @whilelo_nxv16i1.i64(i64 %a, i64 %b) { +; CHECK-LABEL: define @whilelo_nxv16i1.i64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 %a, i64 %b) + ret %mask +} + +define @whilelo_nxv16i1.i64_const() { +; CHECK-LABEL: define @whilelo_nxv16i1.i64_const() { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 0, i64 16) + ret %mask +} + +define @whilelo_nxv16i1.i32_const() { +; CHECK-LABEL: define @whilelo_nxv16i1.i32_const() { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 16) +; CHECK-NEXT: ret [[MASK]] +; + %mask = tail call @llvm.aarch64.sve.whilelo.nxv16i1.i32(i32 0, i32 16) + ret %mask +} diff --git a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll index 346111d892975..763d82652dd5d 100644 --- a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll +++ b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll @@ -859,4 +859,141 @@ define i32 @abs_range_metadata(i32 %x) { %b = and i32 %a, 15 ret i32 %b } + !1 = !{i32 0, i32 16} + +define i32 @abs_diff(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 false) +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub nsw i32 %x, %y + %cmp = icmp sgt i32 %x, %y + 
%sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 %cond +} + +define i32 @abs_diff_neg_no_nsw_neg(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff_neg_no_nsw_neg( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], [[Y]] +; CHECK-NEXT: [[SUB1:%.*]] = sub i32 0, [[SUB]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[SUB1]] +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub i32 %x, %y + %cmp = icmp sgt i32 %x, %y + %sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 %cond +} + +define i32 @abs_diff_neg(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff_neg( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], [[Y]] +; CHECK-NEXT: [[SUB1:%.*]] = sub i32 0, [[SUB]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[SUB1]] +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub nsw i32 %y, %x + %cmp = icmp sgt i32 %x, %y + %sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 %cond +} + +define i32 @abs_diff_neg_no_nsw(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff_neg_no_nsw( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], [[Y]] +; CHECK-NEXT: [[SUB1:%.*]] = sub i32 0, [[SUB]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[SUB1]] +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub i32 %y, %x + %cmp = icmp sgt i32 %x, %y + %sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 %cond +} + +define i32 @abs_diff_ge(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff_ge( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 false) +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub nsw i32 %x, %y + %cmp = icmp sge i32 %x, %y + %sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 
%cond +} + +define i32 @abs_diff_slt_commute(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff_slt_commute( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 false) +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub nsw i32 %x, %y + %cmp = icmp slt i32 %y, %x + %sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 %cond +} + +define i32 @abs_diff_sge_same(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff_sge_same( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 false) +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub nsw i32 %x, %y + %cmp = icmp sge i32 %x, %y + %sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 %cond +} + +define i32 @abs_diff_sle_inverted(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff_sle_inverted( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 false) +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub nsw i32 %x, %y + %cmp = icmp sle i32 %x, %y + %sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub1, i32 %sub + ret i32 %cond +} + +define i32 @abs_diff_sle_commute(i32 %x, i32 %y) { +; CHECK-LABEL: @abs_diff_sle_commute( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 false) +; CHECK-NEXT: ret i32 [[COND]] +; + %sub = sub nsw i32 %x, %y + %cmp = icmp sle i32 %y, %x + %sub1 = sub i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 %cond +} + +define i8 @abs_diff_sle_y_x(i8 %x, i8 %y) { +; CHECK-LABEL: @abs_diff_sle_y_x( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = call i8 @llvm.abs.i8(i8 [[SUB]], i1 false) +; CHECK-NEXT: ret i8 [[COND]] +; + %sub = sub nsw i8 %x, %y + %cmp = icmp sle i8 %y, %x + %sub1 = sub i8 0, %sub + %cond 
= select i1 %cmp, i8 %sub, i8 %sub1 + ret i8 %cond +} diff --git a/llvm/test/Transforms/InstCombine/alloca-cast-debuginfo.ll b/llvm/test/Transforms/InstCombine/alloca-cast-debuginfo.ll index 08ce83b389786..ee3f2305f1a2c 100644 --- a/llvm/test/Transforms/InstCombine/alloca-cast-debuginfo.ll +++ b/llvm/test/Transforms/InstCombine/alloca-cast-debuginfo.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes=instcombine -instcombine-lower-dbg-declare=0 | FileCheck %s ; In this example, instcombine wants to turn "local" into an i64, since that's @@ -24,12 +24,13 @@ target triple = "x86_64-pc-windows-msvc19.11.25508" %struct.Foo = type { i32, i32 } define void @f(ptr %p) !dbg !11 { -; CHECK-LABEL: @f( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @f( +; CHECK-SAME: ptr [[P:%.*]]) !dbg [[DBG11:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 ; CHECK-NEXT: #dbg_declare(ptr [[LOCAL]], [[META22:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[P:%.*]], align 8, !dbg [[DBG24:![0-9]+]], !tbaa [[TBAA25:![0-9]+]] -; CHECK-NEXT: store i64 [[TMP0]], ptr [[LOCAL]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA25]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[P]], align 8, !dbg [[DBG24:![0-9]+]], !tbaa [[LONG_LONG_TBAA25:![0-9]+]] +; CHECK-NEXT: store i64 [[TMP0]], ptr [[LOCAL]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[LONG_LONG_TBAA25]] ; CHECK-NEXT: call void @escape(ptr nonnull [[LOCAL]]), !dbg [[DBG30:![0-9]+]] ; CHECK-NEXT: ret void, !dbg [[DBG31:![0-9]+]] ; diff --git a/llvm/test/Transforms/InstCombine/assume-align.ll b/llvm/test/Transforms/InstCombine/assume-align.ll index f0e0257433086..274632658496b 100644 --- a/llvm/test/Transforms/InstCombine/assume-align.ll +++ 
b/llvm/test/Transforms/InstCombine/assume-align.ll @@ -175,7 +175,6 @@ define ptr @dont_fold_assume_align_zero_of_loaded_pointer_into_align_metadata(pt define ptr @redundant_assume_align_1(ptr %p) { ; CHECK-LABEL: @redundant_assume_align_1( ; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P:%.*]], align 8 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i32 1) ] ; CHECK-NEXT: call void @foo(ptr [[P2]]) ; CHECK-NEXT: ret ptr [[P2]] ; @@ -189,7 +188,6 @@ define ptr @redundant_assume_align_1(ptr %p) { define ptr @redundant_assume_align_8_via_align_metadata(ptr %p) { ; CHECK-LABEL: @redundant_assume_align_8_via_align_metadata( ; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align [[META0:![0-9]+]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i32 8) ] ; CHECK-NEXT: call void @foo(ptr [[P2]]) ; CHECK-NEXT: ret ptr [[P2]] ; @@ -250,6 +248,19 @@ define ptr @redundant_assume_align_8_via_asume(ptr %p) { } declare void @foo(ptr) + +; !align must have a constant integer alignment. +define ptr @assume_load_pointer_result(ptr %p, i64 %align) { +; CHECK-LABEL: @assume_load_pointer_result( +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P:%.*]], align 8 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 [[ALIGN:%.*]]) ] +; CHECK-NEXT: ret ptr [[P2]] +; + %p2 = load ptr, ptr %p + call void @llvm.assume(i1 true) [ "align"(ptr %p2, i64 %align) ] + ret ptr %p2 +} + ;. ; CHECK: [[META0]] = !{i64 8} ;. 
diff --git a/llvm/test/Transforms/InstCombine/exact.ll b/llvm/test/Transforms/InstCombine/exact.ll index c7377ab17d540..819e8fbb89b5f 100644 --- a/llvm/test/Transforms/InstCombine/exact.ll +++ b/llvm/test/Transforms/InstCombine/exact.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s +; RUN: opt < %s -passes=instcombine -use-constant-int-for-fixed-length-splat=false -S | FileCheck %s --check-prefixes=CHECK,CHECK-CV +; RUN: opt < %s -passes=instcombine -use-constant-int-for-fixed-length-splat -S | FileCheck %s --check-prefixes=CHECK,CHECK-CI define i32 @sdiv1(i32 %x) { ; CHECK-LABEL: @sdiv1( @@ -162,12 +163,16 @@ define i1 @pr9998(i32 %V) { ; FIXME: Vectors should fold the same way. define <2 x i1> @pr9998vec(<2 x i32> %V) { -; CHECK-LABEL: @pr9998vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[V:%.*]], splat (i32 1) -; CHECK-NEXT: [[X:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[Y:%.*]] = sext <2 x i32> [[X]] to <2 x i64> -; CHECK-NEXT: [[Z:%.*]] = icmp ugt <2 x i64> [[Y]], splat (i64 7297771788697658747) -; CHECK-NEXT: ret <2 x i1> [[Z]] +; CHECK-CV-LABEL: @pr9998vec( +; CHECK-CV-NEXT: [[TMP1:%.*]] = and <2 x i32> [[V:%.*]], splat (i32 1) +; CHECK-CV-NEXT: [[X:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP1]] +; CHECK-CV-NEXT: [[Y:%.*]] = sext <2 x i32> [[X]] to <2 x i64> +; CHECK-CV-NEXT: [[Z:%.*]] = icmp ugt <2 x i64> [[Y]], splat (i64 7297771788697658747) +; CHECK-CV-NEXT: ret <2 x i1> [[Z]] +; +; CHECK-CI-LABEL: @pr9998vec( +; CHECK-CI-NEXT: [[Z:%.*]] = trunc <2 x i32> [[V:%.*]] to <2 x i1> +; CHECK-CI-NEXT: ret <2 x i1> [[Z]] ; %W = shl <2 x i32> %V, %X = ashr exact <2 x i32> %W, diff --git a/llvm/test/Transforms/InstCombine/fold-fadd-with-zero-gh154238.ll b/llvm/test/Transforms/InstCombine/fold-fadd-with-zero-gh154238.ll new file mode 100644 index 0000000000000..f9f0ca8a08bcb --- /dev/null +++ 
b/llvm/test/Transforms/InstCombine/fold-fadd-with-zero-gh154238.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s +define float @src(float %arg1) { +; CHECK-LABEL: define float @src( +; CHECK-SAME: float [[ARG1:%.*]]) { +; CHECK-NEXT: [[V3:%.*]] = call float @llvm.fabs.f32(float [[ARG1]]) +; CHECK-NEXT: ret float [[V3]] +; + %v2 = fadd float %arg1, 0.000000e+00 + %v3 = call float @llvm.fabs.f32(float %v2) + ret float %v3 +} + +define float @src2(float %arg1) { +; CHECK-LABEL: define float @src2( +; CHECK-SAME: float [[ARG1:%.*]]) { +; CHECK-NEXT: [[V2:%.*]] = fadd float [[ARG1]], 0.000000e+00 +; CHECK-NEXT: [[V3:%.*]] = call float @llvm.fabs.f32(float [[V2]]) +; CHECK-NEXT: [[V4:%.*]] = fsub float [[V2]], [[V3]] +; CHECK-NEXT: ret float [[V4]] +; + %v2 = fadd float %arg1, 0.000000e+00 + %v3 = call float @llvm.fabs.f32(float %v2) + %v4 = fsub float %v2, %v3 + ret float %v4 +} diff --git a/llvm/test/Transforms/InstCombine/get_active_lane_mask.ll b/llvm/test/Transforms/InstCombine/get_active_lane_mask.ll new file mode 100644 index 0000000000000..c642904cc275b --- /dev/null +++ b/llvm/test/Transforms/InstCombine/get_active_lane_mask.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define @rewrite_range_nxv4i1() { +; CHECK-LABEL: define @rewrite_range_nxv4i1() { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 3) +; CHECK-NEXT: ret [[MASK]] +; + %mask = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4) + ret %mask +} + +define @rewrite_range_nxv16i1() { +; CHECK-LABEL: define @rewrite_range_nxv16i1() { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 7) +; CHECK-NEXT: ret [[MASK]] +; + %mask = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 123123, 
i64 123130) + ret %mask +} + +define @rewrite_range_nxv16i1_i128() { +; CHECK-LABEL: define @rewrite_range_nxv16i1_i128() { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i128(i128 0, i128 10) +; CHECK-NEXT: ret [[MASK]] +; + %mask = call @llvm.get.active.lane.mask.nxv16i1.i128(i128 18446744073709551616, i128 18446744073709551626) + ret %mask +} + +define @bail_lhs_is_zero() { +; CHECK-LABEL: define @bail_lhs_is_zero() { +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: ret [[MASK]] +; + %mask = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) + ret %mask +} diff --git a/llvm/test/Transforms/InstCombine/is_fpclass.ll b/llvm/test/Transforms/InstCombine/is_fpclass.ll index c1809b8bec61c..b86b307e4c7fd 100644 --- a/llvm/test/Transforms/InstCombine/is_fpclass.ll +++ b/llvm/test/Transforms/InstCombine/is_fpclass.ll @@ -3922,6 +3922,38 @@ define i1 @test_class_is_not_psub_pnorm_pinf__dynamic(float %arg) #3 { ret i1 %class } +; Make sure we don't take sign bit from NaN operands. 
+ +define i1 @minnum_qnan(i32 %x) { +; CHECK-LABEL: @minnum_qnan( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i1 true +; +entry: + %qnan_bits = or i32 %x, -5938 + %qnan = bitcast i32 %qnan_bits to float + %min = call float @llvm.minnum.f32(float %qnan, float 0.000000e+00) + %test = call i1 @llvm.is.fpclass.f32(float %min, i32 64) + ret i1 %test +} + +define i1 @minnum_qnan_commuted(i32 %x, float nofpclass(nnorm nsub nzero ninf nan) %y) { +; CHECK-LABEL: @minnum_qnan_commuted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[QNAN_BITS:%.*]] = or i32 [[X:%.*]], -5938 +; CHECK-NEXT: [[QNAN:%.*]] = bitcast i32 [[QNAN_BITS]] to float +; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.minnum.f32(float [[Y:%.*]], float [[QNAN]]) +; CHECK-NEXT: [[TEST:%.*]] = call i1 @llvm.is.fpclass.f32(float [[MIN]], i32 64) +; CHECK-NEXT: ret i1 [[TEST]] +; +entry: + %qnan_bits = or i32 %x, -5938 + %qnan = bitcast i32 %qnan_bits to float + %min = call float @llvm.minnum.f32(float %y, float %qnan) + %test = call i1 @llvm.is.fpclass.f32(float %min, i32 64) + ret i1 %test +} + declare i1 @llvm.is.fpclass.f32(float, i32 immarg) declare i1 @llvm.is.fpclass.f64(double, i32 immarg) declare <2 x i1> @llvm.is.fpclass.v2f32(<2 x float>, i32 immarg) diff --git a/llvm/test/Transforms/InstCombine/load-no-aliasing.ll b/llvm/test/Transforms/InstCombine/load-no-aliasing.ll index 67dfe9d6da265..a93892119056c 100644 --- a/llvm/test/Transforms/InstCombine/load-no-aliasing.ll +++ b/llvm/test/Transforms/InstCombine/load-no-aliasing.ll @@ -1,12 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=instcombine -S < %s | FileCheck %s ; Check that load to load forwarding works with non aliasing store inbetween. 
define i32 @test_load_store_load_combine(ptr, ptr) { -; CHECK-LABEL: @test_load_store_load_combine( -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-LABEL: define i32 @test_load_store_load_combine( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] ; CHECK-NEXT: [[F:%.*]] = sitofp i32 [[A]] to float -; CHECK-NEXT: store float [[F]], ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: store float [[F]], ptr [[TMP1]], align 4, !tbaa [[FLOAT_TBAA4:![0-9]+]] ; CHECK-NEXT: ret i32 [[A]] ; %a = load i32, ptr %0, align 4, !tbaa !0 @@ -22,3 +23,11 @@ define i32 @test_load_store_load_combine(ptr, ptr) { !3 = !{!"Simple C++ TBAA"} !4 = !{!5, !5, i64 0} !5 = !{!"float", !2, i64 0} +;. +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[FLOAT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"float", [[META2]], i64 0} +;. 
diff --git a/llvm/test/Transforms/InstCombine/load-store-forward.ll b/llvm/test/Transforms/InstCombine/load-store-forward.ll index 0f03f16062e52..72329f637d8b3 100644 --- a/llvm/test/Transforms/InstCombine/load-store-forward.ll +++ b/llvm/test/Transforms/InstCombine/load-store-forward.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=instcombine < %s | FileCheck %s --check-prefixes=CHECK,LITTLE -; RUN: opt -S -passes=instcombine -data-layout="E" < %s | FileCheck %s --check-prefixes=CHECK,BIG +; RUN: opt -S -passes=instcombine -use-constant-int-for-scalable-splat=false -use-constant-fp-for-scalable-splat=false < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,LITTLE,LITTLE-CV +; RUN: opt -S -passes=instcombine -use-constant-int-for-scalable-splat -use-constant-fp-for-scalable-splat < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,LITTLE,LITTLE-CI +; RUN: opt -S -passes=instcombine -data-layout="E" -use-constant-int-for-scalable-splat=false -use-constant-fp-for-scalable-splat=false < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,BIG,BIG-CV +; RUN: opt -S -passes=instcombine -data-layout="E" -use-constant-int-for-scalable-splat -use-constant-fp-for-scalable-splat < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,BIG,BIG-CI define i8 @load_smaller_int(ptr %p) { ; LITTLE-LABEL: @load_smaller_int( @@ -101,11 +103,16 @@ define i32 @vec_store_load_overlap(ptr %p) { } define i32 @load_i32_store_nxv4i32(ptr %a) { -; CHECK-LABEL: @load_i32_store_nxv4i32( -; CHECK-NEXT: entry: -; CHECK-NEXT: store splat (i32 1), ptr [[A:%.*]], align 16 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: ret i32 [[TMP0]] +; CHECK-CV-LABEL: @load_i32_store_nxv4i32( +; CHECK-CV-NEXT: entry: +; CHECK-CV-NEXT: store splat (i32 1), ptr [[A:%.*]], align 16 +; CHECK-CV-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-CV-NEXT: ret i32 [[TMP0]] +; +; CHECK-CI-LABEL: @load_i32_store_nxv4i32( +; 
CHECK-CI-NEXT: entry: +; CHECK-CI-NEXT: store splat (i32 1), ptr [[A:%.*]], align 16 +; CHECK-CI-NEXT: ret i32 1 ; entry: store splat (i32 1), ptr %a, align 16 @@ -153,11 +160,16 @@ entry: } define float @load_f32_store_nxv4f32(ptr %a) { -; CHECK-LABEL: @load_f32_store_nxv4f32( -; CHECK-NEXT: entry: -; CHECK-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: ret float [[TMP0]] +; CHECK-CV-LABEL: @load_f32_store_nxv4f32( +; CHECK-CV-NEXT: entry: +; CHECK-CV-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 +; CHECK-CV-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +; CHECK-CV-NEXT: ret float [[TMP0]] +; +; CHECK-CI-LABEL: @load_f32_store_nxv4f32( +; CHECK-CI-NEXT: entry: +; CHECK-CI-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 +; CHECK-CI-NEXT: ret float 1.000000e+00 ; entry: store splat (float 1.0), ptr %a, align 16 @@ -166,11 +178,16 @@ entry: } define i32 @load_i32_store_nxv4f32(ptr %a) { -; CHECK-LABEL: @load_i32_store_nxv4f32( -; CHECK-NEXT: entry: -; CHECK-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: ret i32 [[LOAD]] +; CHECK-CV-LABEL: @load_i32_store_nxv4f32( +; CHECK-CV-NEXT: entry: +; CHECK-CV-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 +; CHECK-CV-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-CV-NEXT: ret i32 [[LOAD]] +; +; CHECK-CI-LABEL: @load_i32_store_nxv4f32( +; CHECK-CI-NEXT: entry: +; CHECK-CI-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 +; CHECK-CI-NEXT: ret i32 1065353216 ; entry: store splat (float 1.0), ptr %a, align 16 @@ -506,3 +523,8 @@ define @load_after_memset_0_scalable(ptr %a) { } declare void @llvm.memset.p0.i64(ptr, i8, i64, i1) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; BIG-CI: {{.*}} +; BIG-CV: {{.*}} +; LITTLE-CI: {{.*}} +; LITTLE-CV: {{.*}} diff --git a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll index 859c9b892f156..761129979445c 100644 --- a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll +++ b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll @@ -1,13 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=instcombine -S < %s | FileCheck %s target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128" define i32 @test_load_cast_combine_tbaa(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves TBAA. -; CHECK-LABEL: @test_load_cast_combine_tbaa( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-LABEL: define i32 @test_load_cast_combine_tbaa( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR]], align 4, !tbaa [[SCALAR_TYPE_TBAA0:![0-9]+]] ; CHECK-NEXT: ret i32 [[L1]] ; entry: @@ -18,9 +19,10 @@ entry: define i32 @test_load_cast_combine_noalias(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves no-alias metadata. 
-; CHECK-LABEL: @test_load_cast_combine_noalias( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META3]] +; CHECK-LABEL: define i32 @test_load_cast_combine_noalias( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META3]] ; CHECK-NEXT: ret i32 [[L1]] ; entry: @@ -33,9 +35,10 @@ define float @test_load_cast_combine_range(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) drops range metadata. It ; would be nice to preserve or update it somehow but this is hard when moving ; between types. -; CHECK-LABEL: @test_load_cast_combine_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[PTR:%.*]], align 4 +; CHECK-LABEL: define float @test_load_cast_combine_range( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[PTR]], align 4 ; CHECK-NEXT: ret float [[L1]] ; entry: @@ -46,9 +49,10 @@ entry: define i32 @test_load_cast_combine_invariant(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves invariant metadata. -; CHECK-LABEL: @test_load_cast_combine_invariant( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !invariant.load [[META6:![0-9]+]] +; CHECK-LABEL: define i32 @test_load_cast_combine_invariant( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR]], align 4, !invariant.load [[META6:![0-9]+]] ; CHECK-NEXT: ret i32 [[L1]] ; entry: @@ -60,9 +64,10 @@ entry: define i32 @test_load_cast_combine_nontemporal(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves nontemporal ; metadata. 
-; CHECK-LABEL: @test_load_cast_combine_nontemporal( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !nontemporal [[META7:![0-9]+]] +; CHECK-LABEL: define i32 @test_load_cast_combine_nontemporal( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR]], align 4, !nontemporal [[META7:![0-9]+]] ; CHECK-NEXT: ret i32 [[L1]] ; entry: @@ -74,9 +79,10 @@ entry: define ptr @test_load_cast_combine_align(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves align ; metadata. -; CHECK-LABEL: @test_load_cast_combine_align( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !align [[META8:![0-9]+]] +; CHECK-LABEL: define ptr @test_load_cast_combine_align( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR]], align 8, !align [[META8:![0-9]+]] ; CHECK-NEXT: ret ptr [[L]] ; entry: @@ -87,9 +93,10 @@ entry: define ptr @test_load_cast_combine_deref(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves dereferenceable ; metadata. -; CHECK-LABEL: @test_load_cast_combine_deref( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !dereferenceable [[META8]] +; CHECK-LABEL: define ptr @test_load_cast_combine_deref( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR]], align 8, !dereferenceable [[META8]] ; CHECK-NEXT: ret ptr [[L]] ; entry: @@ -100,9 +107,10 @@ entry: define ptr @test_load_cast_combine_deref_or_null(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves ; dereferenceable_or_null metadata. 
-; CHECK-LABEL: @test_load_cast_combine_deref_or_null( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !dereferenceable_or_null [[META8]] +; CHECK-LABEL: define ptr @test_load_cast_combine_deref_or_null( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR]], align 8, !dereferenceable_or_null [[META8]] ; CHECK-NEXT: ret ptr [[L]] ; entry: @@ -113,21 +121,22 @@ entry: define void @test_load_cast_combine_loop(ptr %src, ptr %dst, i32 %n) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves loop access ; metadata. -; CHECK-LABEL: @test_load_cast_combine_loop( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] +; CHECK-LABEL: define void @test_load_cast_combine_loop( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP1]] ; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[SRC_GEP]], align 4, !llvm.access.group [[ACC_GRP9:![0-9]+]] ; CHECK-NEXT: store i32 [[L1]], ptr [[DST_GEP]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP1:![0-9]+]] -; CHECK: exit: +; CHECK-NEXT: [[CMP:%.*]] = icmp 
slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -149,9 +158,10 @@ exit: } define void @test_load_cast_combine_nonnull(ptr %ptr) { -; CHECK-LABEL: @test_load_cast_combine_nonnull( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !nonnull [[META6]] +; CHECK-LABEL: define void @test_load_cast_combine_nonnull( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR]], align 8, !nonnull [[META6]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 336 ; CHECK-NEXT: store ptr [[P]], ptr [[GEP]], align 8 ; CHECK-NEXT: ret void @@ -164,8 +174,9 @@ entry: } define i32 @test_load_cast_combine_noundef(ptr %ptr) { -; CHECK-LABEL: @test_load_cast_combine_noundef( -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !noundef [[META6]] +; CHECK-LABEL: define i32 @test_load_cast_combine_noundef( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR]], align 4, !noundef [[META6]] ; CHECK-NEXT: ret i32 [[L1]] ; %l = load float, ptr %ptr, !noundef !{} @@ -175,9 +186,10 @@ define i32 @test_load_cast_combine_noundef(ptr %ptr) { define i32 @test_load_cast_combine_noalias_addrspace(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves TBAA. -; CHECK-LABEL: @test_load_cast_combine_noalias_addrspace( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !noalias.addrspace [[META10:![0-9]+]] +; CHECK-LABEL: define i32 @test_load_cast_combine_noalias_addrspace( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR]], align 4, !noalias.addrspace [[META10:![0-9]+]] ; CHECK-NEXT: ret i32 [[L1]] ; entry: @@ -188,11 +200,12 @@ entry: ; Preserve none-UB metadata on loads. 
define ptr @preserve_load_metadata_after_select_transform1(i1 %c, ptr dereferenceable(8) %a, ptr dereferenceable(8) %b) { -; CHECK-LABEL: @preserve_load_metadata_after_select_transform1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[B_VAL:%.*]] = load ptr, ptr [[B:%.*]], align 1, !nonnull [[META6]], !align [[META8]] -; CHECK-NEXT: [[A_VAL:%.*]] = load ptr, ptr [[A:%.*]], align 1, !nonnull [[META6]], !align [[META8]] -; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[C:%.*]], ptr [[B_VAL]], ptr [[A_VAL]] +; CHECK-LABEL: define ptr @preserve_load_metadata_after_select_transform1( +; CHECK-SAME: i1 [[C:%.*]], ptr dereferenceable(8) [[A:%.*]], ptr dereferenceable(8) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B_VAL:%.*]] = load ptr, ptr [[B]], align 1, !nonnull [[META6]], !align [[META8]] +; CHECK-NEXT: [[A_VAL:%.*]] = load ptr, ptr [[A]], align 1, !nonnull [[META6]], !align [[META8]] +; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[C]], ptr [[B_VAL]], ptr [[A_VAL]] ; CHECK-NEXT: ret ptr [[L_SEL]] ; entry: @@ -203,11 +216,12 @@ entry: ; Preserve none-UB metadata on loads. 
define i32 @preserve_load_metadata_after_select_transform_range(i1 %c, ptr dereferenceable(8) %a, ptr dereferenceable(8) %b) { -; CHECK-LABEL: @preserve_load_metadata_after_select_transform_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B:%.*]], align 1, !range [[RNG10:![0-9]+]] -; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A:%.*]], align 1, !range [[RNG10]] -; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[C:%.*]], i32 [[B_VAL]], i32 [[A_VAL]] +; CHECK-LABEL: define i32 @preserve_load_metadata_after_select_transform_range( +; CHECK-SAME: i1 [[C:%.*]], ptr dereferenceable(8) [[A:%.*]], ptr dereferenceable(8) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B]], align 1, !range [[RNG11:![0-9]+]] +; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]], align 1, !range [[RNG11]] +; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[C]], i32 [[B_VAL]], i32 [[A_VAL]] ; CHECK-NEXT: ret i32 [[L_SEL]] ; entry: @@ -217,10 +231,11 @@ entry: } define double @preserve_load_metadata_after_select_transform2(ptr %a, ptr %b) { -; CHECK-LABEL: @preserve_load_metadata_after_select_transform2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]] -; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]] +; CHECK-LABEL: define double @preserve_load_metadata_after_select_transform2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A]], align 8, !tbaa [[SCALAR_TYPE_TBAA0]], !llvm.access.group [[META6]] +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B]], align 8, !tbaa [[SCALAR_TYPE_TBAA0]], !llvm.access.group [[META6]] ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] ; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] ; CHECK-NEXT: ret double [[L_SEL]] @@ -235,10 +250,11 @@ entry: } 
define double @preserve_load_metadata_after_select_transform_metadata_missing_1(ptr %a, ptr %b) { -; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !llvm.access.group [[META6]] -; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]] +; CHECK-LABEL: define double @preserve_load_metadata_after_select_transform_metadata_missing_1( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A]], align 8, !llvm.access.group [[META6]] +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B]], align 8, !tbaa [[SCALAR_TYPE_TBAA0]], !llvm.access.group [[META6]] ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] ; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] ; CHECK-NEXT: ret double [[L_SEL]] @@ -253,10 +269,11 @@ entry: } define double @preserve_load_metadata_after_select_transform_metadata_missing_2(ptr %a, ptr %b) { -; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !llvm.access.group [[META6]] -; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !llvm.access.group [[META6]] +; CHECK-LABEL: define double @preserve_load_metadata_after_select_transform_metadata_missing_2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A]], align 8, !llvm.access.group [[META6]] +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B]], align 8, !llvm.access.group [[META6]] ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] ; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] ; CHECK-NEXT: ret double [[L_SEL]] @@ -271,10 +288,11 @@ entry: } define double 
@preserve_load_metadata_after_select_transform_metadata_missing_3(ptr %a, ptr %b) { -; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_3( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]] -; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]] +; CHECK-LABEL: define double @preserve_load_metadata_after_select_transform_metadata_missing_3( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A]], align 8, !tbaa [[SCALAR_TYPE_TBAA0]], !llvm.access.group [[META6]] +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B]], align 8, !tbaa [[SCALAR_TYPE_TBAA0]], !llvm.access.group [[META6]] ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] ; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] ; CHECK-NEXT: ret double [[L_SEL]] @@ -291,10 +309,11 @@ entry: ; Like preserve_load_metadata_after_select_transform_metadata_missing_3, but ; with different access groups on all loads. 
define double @preserve_load_metadata_after_select_transform_metadata_missing_4(ptr %a, ptr %b) { -; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_4( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META3]], !llvm.access.group [[META6]] -; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !alias.scope [[META11:![0-9]+]], !noalias [[META11]], !llvm.access.group [[ACC_GRP14:![0-9]+]] +; CHECK-LABEL: define double @preserve_load_metadata_after_select_transform_metadata_missing_4( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A]], align 8, !tbaa [[SCALAR_TYPE_TBAA0]], !alias.scope [[META3]], !noalias [[META3]], !llvm.access.group [[META6]] +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B]], align 8, !tbaa [[SCALAR_TYPE_TBAA0]], !alias.scope [[META12:![0-9]+]], !noalias [[META12]], !llvm.access.group [[ACC_GRP15:![0-9]+]] ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] ; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] ; CHECK-NEXT: ret double [[L_SEL]] @@ -327,7 +346,7 @@ entry: !16 = distinct !{!16} ;. 
-; CHECK: [[TBAA0]] = !{[[LOOP1]], [[LOOP1]], i64 0} +; CHECK: [[SCALAR_TYPE_TBAA0]] = !{[[LOOP1]], [[LOOP1]], i64 0} ; CHECK: [[LOOP1]] = !{!"scalar type", [[META2:![0-9]+]]} ; CHECK: [[META2]] = !{!"root"} ; CHECK: [[META3]] = !{[[META4:![0-9]+]]} @@ -337,9 +356,10 @@ entry: ; CHECK: [[META7]] = !{i32 1} ; CHECK: [[META8]] = !{i64 8} ; CHECK: [[ACC_GRP9]] = distinct !{} -; CHECK: [[RNG10]] = !{i32 0, i32 42} -; CHECK: [[META11]] = !{[[META12:![0-9]+]]} -; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]]} -; CHECK: [[META13]] = distinct !{[[META13]]} -; CHECK: [[ACC_GRP14]] = distinct !{} +; CHECK: [[META10]] = !{i32 5, i32 6} +; CHECK: [[RNG11]] = !{i32 0, i32 42} +; CHECK: [[META12]] = !{[[META13:![0-9]+]]} +; CHECK: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]]} +; CHECK: [[META14]] = distinct !{[[META14]]} +; CHECK: [[ACC_GRP15]] = distinct !{} ;. diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics_keep_metadata.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics_keep_metadata.ll index 1a571100323ff..0832561e2b02b 100644 --- a/llvm/test/Transforms/InstCombine/masked_intrinsics_keep_metadata.ll +++ b/llvm/test/Transforms/InstCombine/masked_intrinsics_keep_metadata.ll @@ -1,12 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s @g0 = global <4 x i32> zeroinitializer, align 16 define inreg <4 x i32> @mload1(ptr nocapture readonly %a0) #0 { -; CHECK-LABEL: @mload1( -; CHECK-NEXT: b0: -; CHECK-NEXT: [[UNMASKEDLOAD:%.*]] = load <4 x i32>, ptr [[A0:%.*]], align 16, !tbaa [[TBAA0:![0-9]+]] +; CHECK-LABEL: define inreg <4 x i32> @mload1( +; CHECK-SAME: ptr readonly captures(none) [[A0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[B0:.*:]] +; CHECK-NEXT: [[UNMASKEDLOAD:%.*]] = load <4 x i32>, ptr [[A0]], align 16, !tbaa [[CHAR_TBAA0:![0-9]+]] ; 
CHECK-NEXT: ret <4 x i32> [[UNMASKEDLOAD]] ; b0: @@ -15,9 +16,10 @@ b0: } define inreg <4 x i32> @mload2() #0 { -; CHECK-LABEL: @mload2( -; CHECK-NEXT: b0: -; CHECK-NEXT: [[UNMASKEDLOAD:%.*]] = load <4 x i32>, ptr @g0, align 16, !tbaa [[TBAA0]] +; CHECK-LABEL: define inreg <4 x i32> @mload2( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[B0:.*:]] +; CHECK-NEXT: [[UNMASKEDLOAD:%.*]] = load <4 x i32>, ptr @g0, align 16, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[V01:%.*]] = insertelement <4 x i32> [[UNMASKEDLOAD]], i32 0, i64 0 ; CHECK-NEXT: ret <4 x i32> [[V01]] ; @@ -27,9 +29,10 @@ b0: } define void @mstore(<4 x i32> %a0, ptr nocapture readonly %a1) #0 { -; CHECK-LABEL: @mstore( -; CHECK-NEXT: b0: -; CHECK-NEXT: store <4 x i32> [[A0:%.*]], ptr [[A1:%.*]], align 16, !tbaa [[TBAA0]] +; CHECK-LABEL: define void @mstore( +; CHECK-SAME: <4 x i32> [[A0:%.*]], ptr readonly captures(none) [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:.*:]] +; CHECK-NEXT: store <4 x i32> [[A0]], ptr [[A1]], align 16, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: ret void ; b0: @@ -46,3 +49,8 @@ declare void @llvm.masked.store.v4i1.p0(<4 x i32>, ptr, i32, <4 x i1>) !1 = !{!"omnipotent char", !2, i64 0} !2 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"} +;. diff --git a/llvm/test/Transforms/InstCombine/may-alias-errno.ll b/llvm/test/Transforms/InstCombine/may-alias-errno.ll new file mode 100644 index 0000000000000..40fab8024b362 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/may-alias-errno.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +; sinf clobbering errno, but %p cannot alias errno per C/C++ strict aliasing rules via TBAA. +; Can do constant store-to-load forwarding. 
+define float @does_not_alias_errno(ptr %p, float %f) { +; CHECK-LABEL: define float @does_not_alias_errno( +; CHECK-SAME: ptr [[P:%.*]], float [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store float 0.000000e+00, ptr [[P]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = call float @sinf(float [[F]]) +; CHECK-NEXT: ret float 0.000000e+00 +; +entry: + store float 0.000000e+00, ptr %p, align 4, !tbaa !4 + %call = call float @sinf(float %f) + %0 = load float, ptr %p, align 4, !tbaa !4 + ret float %0 +} + +; sinf clobbering errno, but %p is alloca memory, wich can never aliases errno. +; Can do constant store-to-load forwarding. +define float @does_not_alias_errno_2(float %f) { +; CHECK-LABEL: define float @does_not_alias_errno_2( +; CHECK-SAME: float [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P:%.*]] = alloca float, align 4 +; CHECK-NEXT: call void @escape(ptr nonnull [[P]]) +; CHECK-NEXT: store float 0.000000e+00, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call float @sinf(float [[F]]) +; CHECK-NEXT: ret float 0.000000e+00 +; +entry: + %p = alloca float + call void @escape(ptr %p) + store float 0.0, ptr %p + call float @sinf(float %f) + %v = load float, ptr %p + ret float %v +} + +; sinf clobbering errno, but %p is memory accessed w/ size larger than errno. +; Can do constant store-to-load forwarding. +define double @does_not_alias_errno_3(ptr %p, float %f) { +; CHECK-LABEL: define double @does_not_alias_errno_3( +; CHECK-SAME: ptr [[P:%.*]], float [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @escape(ptr [[P]]) +; CHECK-NEXT: store double 0.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call float @sinf(float [[F]]) +; CHECK-NEXT: ret double 0.000000e+00 +; +entry: + call void @escape(ptr %p) + store double 0.0, ptr %p + call float @sinf(float %f) + %v = load double, ptr %p + ret double %v +} + +; %p may alias errno, but read_errno does not clobber errno. 
+; Can do constant store-to-load forwarding. +define float @may_alias_errno_does_not_clobber(ptr %p, ptr byval(i8) %q) { +; CHECK-LABEL: define float @may_alias_errno_does_not_clobber( +; CHECK-SAME: ptr [[P:%.*]], ptr byval(i8) [[Q:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store float 0.000000e+00, ptr [[P]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = call float @read_errno(ptr nonnull [[Q]]) +; CHECK-NEXT: ret float 0.000000e+00 +; +entry: + store float 0.000000e+00, ptr %p, align 4 + %call = call float @read_errno(ptr %q) + %0 = load float, ptr %p, align 4 + ret float %0 +} + +; sinf clobbering errno, unknown TBAA info, %p may alias errno. +; Cannot do constant store-to-load forwarding. +define float @may_alias_errno(ptr %p, float %f) { +; CHECK-LABEL: define float @may_alias_errno( +; CHECK-SAME: ptr [[P:%.*]], float [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store float 0.000000e+00, ptr [[P]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = call float @sinf(float [[F]]) +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[P]], align 4 +; CHECK-NEXT: ret float [[TMP0]] +; +entry: + store float 0.000000e+00, ptr %p, align 4 + %call = call float @sinf(float %f) + %0 = load float, ptr %p, align 4 + ret float %0 +} + +; sinf clobbering errno, %p, a integer pointer, may alias errno. +; Cannot do constant store-to-load forwarding. 
+define i32 @may_alias_errno_2(ptr %p, float %f) { +; CHECK-LABEL: define i32 @may_alias_errno_2( +; CHECK-SAME: ptr [[P:%.*]], float [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i32 0, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = call float @sinf(float [[F]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: ret i32 [[TMP0]] +; +entry: + store i32 0, ptr %p, align 4, !tbaa !0 + %call = call float @sinf(float %f) + %0 = load i32, ptr %p, align 4, !tbaa !0 + ret i32 %0 +} + +; sinf clobbering errno, but %p is memory accessed w/ vector size larger than errno. +; Can do constant store-to-load forwarding. +define <4 x i32> @does_not_alias_errno_vec(ptr %p, float %f) { +; CHECK-LABEL: define <4 x i32> @does_not_alias_errno_vec( +; CHECK-SAME: ptr [[P:%.*]], float [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @escape(ptr [[P]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[P]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call float @sinf(float [[F]]) +; CHECK-NEXT: ret <4 x i32> zeroinitializer +; +entry: + call void @escape(ptr %p) + store <4 x i32> zeroinitializer, ptr %p + call float @sinf(float %f) + %v = load <4 x i32>, ptr %p + ret <4 x i32> %v +} + +; sinf clobbering errno, but %p is memory accessed w/ scalable vector size larger than errno. +; Can do constant store-to-load forwarding. 
+define @does_not_alias_errno_scalablevec(ptr %p, float %f) { +; CHECK-LABEL: define @does_not_alias_errno_scalablevec( +; CHECK-SAME: ptr [[P:%.*]], float [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @escape(ptr [[P]]) +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call float @sinf(float [[F]]) +; CHECK-NEXT: ret zeroinitializer +; +entry: + call void @escape(ptr %p) + store zeroinitializer, ptr %p + call float @sinf(float %f) + %v = load , ptr %p + ret %v +} + +declare float @sinf(float) memory(errnomem: write) +declare float @read_errno(ptr) memory(argmem: write, errnomem: read) +declare void @escape(ptr %p) + +!llvm.errno.tbaa = !{!0} + +!0 = !{!1, !1, i64 0} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{!5, !5, i64 0} +!5 = !{!"float", !2, i64 0} +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"float", [[META2]], i64 0} +;. diff --git a/llvm/test/Transforms/InstCombine/memcpy-from-global.ll b/llvm/test/Transforms/InstCombine/memcpy-from-global.ll index ff85d827bdcb4..f10ba1e3d27e6 100644 --- a/llvm/test/Transforms/InstCombine/memcpy-from-global.ll +++ b/llvm/test/Transforms/InstCombine/memcpy-from-global.ll @@ -139,13 +139,14 @@ define void @test2_addrspacecast() { ret void } -declare void @bar(ptr) -declare void @bar_as1(ptr addrspace(1)) +declare void @bar(ptr nocapture) +declare void @bar_may_capture(ptr) +declare void @bar_as1(ptr addrspace(1) nocapture) ;; Should be able to eliminate the alloca. 
-define void @test3() { -; CHECK-LABEL: @test3( +define void @test3_nocapture() { +; CHECK-LABEL: @test3_nocapture( ; CHECK-NEXT: call void @bar(ptr nonnull @G) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: ret void ; @@ -155,6 +156,20 @@ define void @test3() { ret void } +; Can not eliminate the alloca, as the function may capture its address. +define void @test3_may_capture() { +; CHECK-LABEL: @test3_may_capture( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(124) [[A]], ptr noundef nonnull align 16 dereferenceable(124) @G, i64 124, i1 false) +; CHECK-NEXT: call void @bar_may_capture(ptr nonnull [[A]]) #[[ATTR3]] +; CHECK-NEXT: ret void +; + %A = alloca %T + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %A, ptr align 4 @G, i64 124, i1 false) + call void @bar_may_capture(ptr %A) readonly + ret void +} + define void @test3_addrspacecast() { ; CHECK-LABEL: @test3_addrspacecast( ; CHECK-NEXT: call void @bar(ptr nonnull @G) #[[ATTR3]] @@ -395,12 +410,12 @@ define void @memcpy_to_capturing_readonly() { ; CHECK-LABEL: @memcpy_to_capturing_readonly( ; CHECK-NEXT: [[A:%.*]] = alloca [[U:%.*]], align 16 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(20) [[A]], ptr noundef nonnull align 16 dereferenceable(20) @H, i64 20, i1 false) -; CHECK-NEXT: call void @bar(ptr nonnull readonly [[A]]) +; CHECK-NEXT: call void @bar_may_capture(ptr nonnull readonly [[A]]) ; CHECK-NEXT: ret void ; %A = alloca %U, align 16 call void @llvm.memcpy.p0.p0.i64(ptr align 4 %A, ptr align 4 @H, i64 20, i1 false) - call void @bar(ptr readonly %A) + call void @bar_may_capture(ptr readonly %A) ret void } diff --git a/llvm/test/Transforms/InstCombine/min-zext.ll b/llvm/test/Transforms/InstCombine/min-zext.ll new file mode 100644 index 0000000000000..f016d1a8de524 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/min-zext.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i32 @test_smin(i32 %arg0, i32 %arg1) { +; CHECK-LABEL: define i32 @test_smin( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[ARG0]], [[ARG1]] +; CHECK-NEXT: [[V3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = tail call i32 @llvm.smin.i32(i32 %arg0, i32 %arg1) + %v1 = add nsw i32 %arg0, 1 + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 %arg1) + %v3 = sub i32 %v2, %v0 + ret i32 %v3 +} + +define i32 @test_umin(i32 %arg0, i32 %arg1) { +; CHECK-LABEL: define i32 @test_umin( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[ARG0]], [[ARG1]] +; CHECK-NEXT: [[V3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = tail call i32 @llvm.umin.i32(i32 %arg0, i32 %arg1) + %v1 = add nuw i32 %arg0, 1 + %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 %arg1) + %v3 = sub i32 %v2, %v0 + ret i32 %v3 +} + +define i1 @test_smin_i1(i1 %arg0, i1 %arg1) { +; CHECK-LABEL: define i1 @test_smin_i1( +; CHECK-SAME: i1 [[ARG0:%.*]], i1 [[ARG1:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = or i1 [[ARG0]], [[ARG1]] +; CHECK-NEXT: [[V3:%.*]] = xor i1 [[V0]], true +; CHECK-NEXT: ret i1 [[V3]] +; + %v0 = tail call i1 @llvm.smin.i1(i1 %arg0, i1 %arg1) + %v1 = add nsw i1 %arg0, 1 + %v2 = tail call i1 @llvm.smin.i1(i1 %v1, i1 %arg1) + %v3 = sub i1 %v2, %v0 + ret i1 %v3 +} + +declare void @use(i2) + +define i2 @test_smin_use_operands(i2 %arg0, i2 %arg1) { +; CHECK-LABEL: define i2 @test_smin_use_operands( +; CHECK-SAME: i2 [[ARG0:%.*]], i2 [[ARG1:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = tail call i2 @llvm.smin.i2(i2 [[ARG0]], i2 [[ARG1]]) +; CHECK-NEXT: [[V1:%.*]] = add nsw i2 [[ARG0]], 1 +; CHECK-NEXT: [[V2:%.*]] = tail call i2 @llvm.smin.i2(i2 [[V1]], i2 [[ARG1]]) +; CHECK-NEXT: [[V3:%.*]] = sub i2 [[V2]], [[V0]] +; CHECK-NEXT: call void 
@use(i2 [[V2]]) +; CHECK-NEXT: call void @use(i2 [[V0]]) +; CHECK-NEXT: ret i2 [[V3]] +; + %v0 = tail call i2 @llvm.smin.i2(i2 %arg0, i2 %arg1) + %v1 = add nsw i2 %arg0, 1 + %v2 = tail call i2 @llvm.smin.i2(i2 %v1, i2 %arg1) + %v3 = sub i2 %v2, %v0 + call void @use(i2 %v2) + call void @use(i2 %v0) + ret i2 %v3 +} + +define i2 @test_smin_use_operand(i2 %arg0, i2 %arg1) { +; CHECK-LABEL: define i2 @test_smin_use_operand( +; CHECK-SAME: i2 [[ARG0:%.*]], i2 [[ARG1:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = add nsw i2 [[ARG0]], 1 +; CHECK-NEXT: [[V2:%.*]] = tail call i2 @llvm.smin.i2(i2 [[V1]], i2 [[ARG1]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i2 [[ARG0]], [[ARG1]] +; CHECK-NEXT: [[V3:%.*]] = zext i1 [[TMP1]] to i2 +; CHECK-NEXT: call void @use(i2 [[V2]]) +; CHECK-NEXT: ret i2 [[V3]] +; + %v0 = tail call i2 @llvm.smin.i2(i2 %arg0, i2 %arg1) + %v1 = add nsw i2 %arg0, 1 + %v2 = tail call i2 @llvm.smin.i2(i2 %v1, i2 %arg1) + %v3 = sub i2 %v2, %v0 + call void @use(i2 %v2) + ret i2 %v3 +} + +define i32 @test_smin_missing_nsw(i32 %arg0, i32 %arg1) { +; CHECK-LABEL: define i32 @test_smin_missing_nsw( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = tail call i32 @llvm.smin.i32(i32 [[ARG0]], i32 [[ARG1]]) +; CHECK-NEXT: [[V1:%.*]] = add i32 [[ARG0]], 1 +; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 [[ARG1]]) +; CHECK-NEXT: [[V3:%.*]] = sub i32 [[V2]], [[V0]] +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = tail call i32 @llvm.smin.i32(i32 %arg0, i32 %arg1) + %v1 = add i32 %arg0, 1 + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 %arg1) + %v3 = sub i32 %v2, %v0 + ret i32 %v3 +} + +define i32 @test_umin_missing_nuw(i32 %arg0, i32 %arg1) { +; CHECK-LABEL: define i32 @test_umin_missing_nuw( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = tail call i32 @llvm.umin.i32(i32 [[ARG0]], i32 [[ARG1]]) +; CHECK-NEXT: [[V1:%.*]] = add i32 [[ARG0]], 1 +; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.umin.i32(i32 
[[V1]], i32 [[ARG1]]) +; CHECK-NEXT: [[V3:%.*]] = sub i32 [[V2]], [[V0]] +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = tail call i32 @llvm.umin.i32(i32 %arg0, i32 %arg1) + %v1 = add i32 %arg0, 1 + %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 %arg1) + %v3 = sub i32 %v2, %v0 + ret i32 %v3 +} + +@tmp = external global i32 + +define i32 @test_mismatched_operands(i32 %arg0, i32 %arg1) { +; CHECK-LABEL: define i32 @test_mismatched_operands( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) { +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr @tmp, align 4 +; CHECK-NEXT: [[V0:%.*]] = tail call i32 @llvm.smin.i32(i32 [[ARG0]], i32 [[TMP]]) +; CHECK-NEXT: [[V1:%.*]] = add nsw i32 [[ARG0]], 1 +; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 [[ARG1]]) +; CHECK-NEXT: [[V3:%.*]] = sub i32 [[V2]], [[V0]] +; CHECK-NEXT: ret i32 [[V3]] +; + %tmp = load i32, ptr @tmp, align 4 + %v0 = tail call i32 @llvm.smin.i32(i32 %arg0, i32 %tmp) + %v1 = add nsw i32 %arg0, 1 + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 %arg1) + %v3 = sub i32 %v2, %v0 + ret i32 %v3 +} + +define i32 @test_disjoint_or(i32 %arg0, i32 %arg1) { +; CHECK-LABEL: define i32 @test_disjoint_or( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[ARG0]], [[ARG1]] +; CHECK-NEXT: [[V3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = tail call i32 @llvm.smin.i32(i32 %arg0, i32 %arg1) + %v1 = or disjoint i32 %arg0, 1 + %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 %arg1) + %v3 = sub i32 %v2, %v0 + ret i32 %v3 +} diff --git a/llvm/test/Transforms/InstCombine/preserve-profile.ll b/llvm/test/Transforms/InstCombine/preserve-profile.ll new file mode 100644 index 0000000000000..dd83805ed3397 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/preserve-profile.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt < %s -passes=instcombine -S | 
FileCheck %s + +define i32 @LHSBin(i1 %C) !prof !0 { +; CHECK-LABEL: define i32 @LHSBin( +; CHECK-SAME: i1 [[C:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NEXT: [[V:%.*]] = select i1 [[C]], i32 1010, i32 20, !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: ret i32 [[V]] +; + %A = select i1 %C, i32 1000, i32 10, !prof !1 + %V = add i32 %A, 10 + ret i32 %V +} + +define i32 @RHSBin(i1 %C) !prof !0 { +; CHECK-LABEL: define i32 @RHSBin( +; CHECK-SAME: i1 [[C:%.*]]) !prof [[PROF0]] { +; CHECK-NEXT: [[V:%.*]] = select i1 [[C]], i32 1010, i32 20, !prof [[PROF1]] +; CHECK-NEXT: ret i32 [[V]] +; + %A = select i1 %C, i32 1000, i32 10, !prof !1 + %V = add i32 10, %A + ret i32 %V; +} + +define i32 @BothBin(i1 %C) !prof !0 { +; CHECK-LABEL: define i32 @BothBin( +; CHECK-SAME: i1 [[C:%.*]]) !prof [[PROF0]] { +; CHECK-NEXT: [[V:%.*]] = select i1 [[C]], i32 2000, i32 20, !prof [[PROF1]] +; CHECK-NEXT: ret i32 [[V]] +; + %A = select i1 %C, i32 1000, i32 10, !prof !1 + %B = select i1 %C, i32 1000, i32 10, !prof !1 + %V = add i32 %A, %B + ret i32 %V; +} + +define i32 @NegBin(i1 %C) !prof !0 { +; CHECK-LABEL: define i32 @NegBin( +; CHECK-SAME: i1 [[C:%.*]]) !prof [[PROF0]] { +; CHECK-NEXT: [[V:%.*]] = select i1 [[C]], i32 1010, i32 0, !prof [[PROF1]] +; CHECK-NEXT: ret i32 [[V]] +; + %A = select i1 %C, i32 1000, i32 -10, !prof !1 + %V = add i32 %A, 10 + ret i32 %V +} + +!0 = !{!"function_entry_count", i64 1000} +!1 = !{!"branch_weights", i32 2, i32 3} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3} +;. 
diff --git a/llvm/test/Transforms/InstCombine/redundant-fcmp.ll b/llvm/test/Transforms/InstCombine/redundant-fcmp.ll new file mode 100644 index 0000000000000..0f5fe9fb9a1b2 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/redundant-fcmp.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i1 @or_fcmp_redundant_or1(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_or1( +; CHECK-NEXT: [[V2:%.*]] = fcmp olt double [[V0:%.*]], 1.990000e+00 +; CHECK-NEXT: ret i1 [[V2]] +; + %v1 = fcmp olt double %v0, 1.000000e-02 + %v2 = fcmp olt double %v0, 1.990000e+00 + %v3 = or i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_or2(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_or2( +; CHECK-NEXT: [[V1:%.*]] = fcmp olt double [[V0:%.*]], 2.300000e+00 +; CHECK-NEXT: ret i1 [[V1]] +; + %v1 = fcmp olt double %v0, 2.300000e+00 + %v2 = fcmp olt double %v0, 1.990000e+00 + %v3 = or i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_or3(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_or3( +; CHECK-NEXT: [[V1:%.*]] = fcmp ogt double [[V0:%.*]], 1.000000e-02 +; CHECK-NEXT: ret i1 [[V1]] +; + %v1 = fcmp ogt double %v0, 1.000000e-02 + %v2 = fcmp ogt double %v0, 1.990000e+00 + %v3 = or i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_or4(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_or4( +; CHECK-NEXT: [[V2:%.*]] = fcmp ogt double [[V0:%.*]], 1.990000e+00 +; CHECK-NEXT: ret i1 [[V2]] +; + %v1 = fcmp ogt double %v0, 2.300000e+00 + %v2 = fcmp ogt double %v0, 1.990000e+00 + %v3 = or i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_or_neg1(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_or_neg1( +; CHECK-NEXT: [[V1:%.*]] = fcmp olt double [[V0:%.*]], 1.000000e-02 +; CHECK-NEXT: [[V2:%.*]] = fcmp ogt double [[V0]], 1.990000e+00 +; CHECK-NEXT: [[V3:%.*]] = or i1 [[V1]], [[V2]] +; CHECK-NEXT: ret i1 [[V3]] +; + %v1 = fcmp olt double %v0, 
1.000000e-02 + %v2 = fcmp ogt double %v0, 1.990000e+00 + %v3 = or i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_or_neg2(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_or_neg2( +; CHECK-NEXT: [[V1:%.*]] = fcmp ogt double [[V0:%.*]], 1.000000e-02 +; CHECK-NEXT: [[V2:%.*]] = fcmp olt double [[V0]], 1.990000e+00 +; CHECK-NEXT: [[V3:%.*]] = or i1 [[V1]], [[V2]] +; CHECK-NEXT: ret i1 [[V3]] +; + %v1 = fcmp ogt double %v0, 1.000000e-02 + %v2 = fcmp olt double %v0, 1.990000e+00 + %v3 = or i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_and1(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_and1( +; CHECK-NEXT: [[V1:%.*]] = fcmp olt double [[V0:%.*]], 1.000000e-02 +; CHECK-NEXT: ret i1 [[V1]] +; + %v1 = fcmp olt double %v0, 1.000000e-02 + %v2 = fcmp olt double %v0, 1.990000e+00 + %v3 = and i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_and2(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_and2( +; CHECK-NEXT: [[V2:%.*]] = fcmp olt double [[V0:%.*]], 1.990000e+00 +; CHECK-NEXT: ret i1 [[V2]] +; + %v1 = fcmp olt double %v0, 2.300000e+00 + %v2 = fcmp olt double %v0, 1.990000e+00 + %v3 = and i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_and3(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_and3( +; CHECK-NEXT: [[V2:%.*]] = fcmp ogt double [[V0:%.*]], 1.990000e+00 +; CHECK-NEXT: ret i1 [[V2]] +; + %v1 = fcmp ogt double %v0, 1.000000e-02 + %v2 = fcmp ogt double %v0, 1.990000e+00 + %v3 = and i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_and4(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_and4( +; CHECK-NEXT: [[V1:%.*]] = fcmp ogt double [[V0:%.*]], 2.300000e+00 +; CHECK-NEXT: ret i1 [[V1]] +; + %v1 = fcmp ogt double %v0, 2.300000e+00 + %v2 = fcmp ogt double %v0, 1.990000e+00 + %v3 = and i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_and_neg1(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_and_neg1( +; CHECK-NEXT: [[V1:%.*]] = fcmp olt double [[V0:%.*]], 1.000000e-02 +; CHECK-NEXT: [[V2:%.*]] = fcmp 
ogt double [[V0]], 1.990000e+00 +; CHECK-NEXT: [[V3:%.*]] = and i1 [[V1]], [[V2]] +; CHECK-NEXT: ret i1 [[V3]] +; + %v1 = fcmp olt double %v0, 1.000000e-02 + %v2 = fcmp ogt double %v0, 1.990000e+00 + %v3 = and i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_and_neg2(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_and_neg2( +; CHECK-NEXT: [[V1:%.*]] = fcmp ogt double [[V0:%.*]], 1.000000e-02 +; CHECK-NEXT: [[V2:%.*]] = fcmp olt double [[V0]], 1.990000e+00 +; CHECK-NEXT: [[V3:%.*]] = and i1 [[V1]], [[V2]] +; CHECK-NEXT: ret i1 [[V3]] +; + %v1 = fcmp ogt double %v0, 1.000000e-02 + %v2 = fcmp olt double %v0, 1.990000e+00 + %v3 = and i1 %v1, %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_select_or1(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_select_or1( +; CHECK-NEXT: [[V1:%.*]] = fcmp olt double [[V0:%.*]], 1.000000e-02 +; CHECK-NEXT: ret i1 [[V1]] +; + %v1 = fcmp olt double %v0, 1.000000e-02 + %v2 = fcmp olt double %v0, 1.990000e+00 + %v3 = select i1 %v1, i1 %v2, i1 false + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_select_or2(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_select_or2( +; CHECK-NEXT: [[V2:%.*]] = fcmp ogt double [[V0:%.*]], 1.990000e+00 +; CHECK-NEXT: ret i1 [[V2]] +; + %v1 = fcmp ogt double %v0, 1.000000e-02 + %v2 = fcmp ogt double %v0, 1.990000e+00 + %v3 = select i1 %v1, i1 %v2, i1 false + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_select_and1(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_select_and1( +; CHECK-NEXT: [[V2:%.*]] = fcmp olt double [[V0:%.*]], 1.990000e+00 +; CHECK-NEXT: ret i1 [[V2]] +; + %v1 = fcmp olt double %v0, 1.000000e-02 + %v2 = fcmp olt double %v0, 1.990000e+00 + %v3 = select i1 %v1, i1 true, i1 %v2 + ret i1 %v3 +} + +define i1 @or_fcmp_redundant_select_and2(double %v0) { +; CHECK-LABEL: @or_fcmp_redundant_select_and2( +; CHECK-NEXT: [[V1:%.*]] = fcmp ogt double [[V0:%.*]], 1.000000e-02 +; CHECK-NEXT: ret i1 [[V1]] +; + %v1 = fcmp ogt double %v0, 1.000000e-02 + %v2 = fcmp ogt double %v0, 
1.990000e+00 + %v3 = select i1 %v1, i1 true, i1 %v2 + ret i1 %v3 +} diff --git a/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll b/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll index 41db7f929dfdf..5a4fb04f5f2c0 100644 --- a/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll +++ b/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll @@ -6,14 +6,18 @@ ; OFF-LABEL: @new_hot_cold() ;; First check with the default hint values (254 = -2, 128 = -128, 222 = -34). -; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=1 -DHOT=-2 -DNOTCOLD=-128 -DAMBIG=-34 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7 +; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -S | FileCheck %s --check-prefixes=HOTCOLD,NOBUILTIN-OFF -DCOLD=1 -DHOT=-2 -DNOTCOLD=-128 -DAMBIG=-34 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7 ;; Next check with the non-default cold and hot hint values (200 =-56). -; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -hot-new-hint-value=200 -notcold-new-hint-value=99 -ambiguous-new-hint-value=44 -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=5 -DHOT=-56 -DAMBIG=44 -DNOTCOLD=99 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7 +; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -hot-new-hint-value=200 -notcold-new-hint-value=99 -ambiguous-new-hint-value=44 -S | FileCheck %s --check-prefixes=HOTCOLD,NOBUILTIN-OFF -DCOLD=5 -DHOT=-56 -DAMBIG=44 -DNOTCOLD=99 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7 + +;; Next check with the same non-default cold and hot hint values (200 =-56), +;; but with transformation of nobuiltin calls enabled. 
+; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -optimize-nobuiltin-hot-cold-new-new -cold-new-hint-value=5 -hot-new-hint-value=200 -notcold-new-hint-value=99 -ambiguous-new-hint-value=44 -S | FileCheck %s --check-prefixes=HOTCOLD,NOBUILTIN-ON -DCOLD=5 -DHOT=-56 -DAMBIG=44 -DNOTCOLD=99 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7 ;; Try again with the non-default cold and hot hint values (200 =-56), and this ;; time specify that existing hints should be updated. -; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -notcold-new-hint-value=100 -hot-new-hint-value=200 -ambiguous-new-hint-value=44 -optimize-existing-hot-cold-new -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=5 -DHOT=-56 -DNOTCOLD=100 -DAMBIG=44 -DPREVHINTCOLD=5 -DPREVHINTNOTCOLD=100 -DPREVHINTHOT=-56 -DPREVHINTAMBIG=44 +; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -notcold-new-hint-value=100 -hot-new-hint-value=200 -ambiguous-new-hint-value=44 -optimize-existing-hot-cold-new -S | FileCheck %s --check-prefixes=HOTCOLD,NOBUILTIN-OFF -DCOLD=5 -DHOT=-56 -DNOTCOLD=100 -DAMBIG=44 -DPREVHINTCOLD=5 -DPREVHINTNOTCOLD=100 -DPREVHINTHOT=-56 -DPREVHINTAMBIG=44 ;; Make sure that values not in 0..255 are flagged with an error ; RUN: not opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=256 -S 2>&1 | FileCheck %s --check-prefix=ERROR @@ -40,8 +44,9 @@ define void @new() { ; HOTCOLD: @_Znwm12__hot_cold_t(i64 10, i8 [[AMBIG]]) %call4 = call ptr @_Znwm(i64 10) #7 call void @dummy(ptr %call4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @_Znwm(i64 10) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. 
+ ; NOBUILTIN-OFF: @_Znwm(i64 10) + ; NOBUILTIN-ON: @_Znwm12__hot_cold_t(i64 10, i8 [[COLD]]) %call3 = call ptr @_Znwm(i64 10) #6 call void @dummy(ptr %call3) ret void @@ -68,8 +73,9 @@ define void @new_align() { ; HOTCOLD: @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[AMBIG]]) %call4 = call ptr @_ZnwmSt11align_val_t(i64 10, i64 8) #7 call void @dummy(ptr %call4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @_ZnwmSt11align_val_t(i64 10, i64 8) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. + ; NOBUILTIN-OFF: @_ZnwmSt11align_val_t(i64 10, i64 8) + ; NOBUILTIN-ON: @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[COLD]]) %call3 = call ptr @_ZnwmSt11align_val_t(i64 10, i64 8) #6 call void @dummy(ptr %call3) ret void @@ -97,8 +103,9 @@ define void @new_nothrow() { ; HOTCOLD: @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[AMBIG]]) %call4 = call ptr @_ZnwmRKSt9nothrow_t(i64 10, ptr %nt) #7 call void @dummy(ptr %call4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @_ZnwmRKSt9nothrow_t(i64 10, ptr nonnull %nt) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. + ; NOBUILTIN-OFF: @_ZnwmRKSt9nothrow_t(i64 10, ptr nonnull %nt) + ; NOBUILTIN-ON: @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[COLD]]) %call3 = call ptr @_ZnwmRKSt9nothrow_t(i64 10, ptr %nt) #6 call void @dummy(ptr %call3) ret void @@ -127,8 +134,9 @@ define void @new_align_nothrow() { ; HOTCOLD: @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[AMBIG]]) %call4 = call ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr %nt) #7 call void @dummy(ptr %call4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr nonnull %nt) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. 
+ ; NOBUILTIN-OFF: @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr nonnull %nt) + ; NOBUILTIN-ON: @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[COLD]]) %call3 = call ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr %nt) #6 call void @dummy(ptr %call3) ret void @@ -154,8 +162,9 @@ define void @array_new() { ; HOTCOLD: @_Znam12__hot_cold_t(i64 10, i8 [[AMBIG]]) %call4 = call ptr @_Znam(i64 10) #7 call void @dummy(ptr %call4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @_Znam(i64 10) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. + ; NOBUILTIN-OFF: @_Znam(i64 10) + ; NOBUILTIN-ON: @_Znam12__hot_cold_t(i64 10, i8 [[COLD]]) %call3 = call ptr @_Znam(i64 10) #6 call void @dummy(ptr %call3) ret void @@ -182,8 +191,9 @@ define void @array_new_align() { ; HOTCOLD: @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[AMBIG]]) %call4 = call ptr @_ZnamSt11align_val_t(i64 10, i64 8) #7 call void @dummy(ptr %call4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @_ZnamSt11align_val_t(i64 10, i64 8) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. + ; NOBUILTIN-OFF: @_ZnamSt11align_val_t(i64 10, i64 8) + ; NOBUILTIN-ON: @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[COLD]]) %call3 = call ptr @_ZnamSt11align_val_t(i64 10, i64 8) #6 call void @dummy(ptr %call3) ret void @@ -211,8 +221,9 @@ define void @array_new_nothrow() { ; HOTCOLD: @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[AMBIG]]) %call4 = call ptr @_ZnamRKSt9nothrow_t(i64 10, ptr %nt) #7 call void @dummy(ptr %call4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @_ZnamRKSt9nothrow_t(i64 10, ptr nonnull %nt) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. 
+ ; NOBUILTIN-OFF: @_ZnamRKSt9nothrow_t(i64 10, ptr nonnull %nt) + ; NOBUILTIN-ON: @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[COLD]]) %call3 = call ptr @_ZnamRKSt9nothrow_t(i64 10, ptr %nt) #6 call void @dummy(ptr %call3) ret void @@ -241,8 +252,9 @@ define void @array_new_align_nothrow() { ; HOTCOLD: @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[AMBIG]]) %call4 = call ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr %nt) #7 call void @dummy(ptr %call4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @_ZnamSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr nonnull %nt) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. + ; NOBUILTIN-OFF: @_ZnamSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr nonnull %nt) + ; NOBUILTIN-ON: @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[COLD]]) %call3 = call ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr %nt) #6 call void @dummy(ptr %call3) ret void @@ -492,8 +504,9 @@ define void @size_returning_test() { %call4 = call {ptr, i64} @__size_returning_new(i64 10) #8 %p4 = extractvalue {ptr, i64} %call4, 0 call void @dummy(ptr %p4) - ;; Attribute cold on a nobuiltin call has no effect. - ; HOTCOLD: @__size_returning_new(i64 10) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. + ; NOBUILTIN-OFF: @__size_returning_new(i64 10) + ; NOBUILTIN-ON: @__size_returning_new_hot_cold(i64 10, i8 [[COLD]]) %call3 = call {ptr, i64} @__size_returning_new(i64 10) #6 %p3 = extractvalue {ptr, i64} %call3, 0 call void @dummy(ptr %p3) @@ -524,8 +537,9 @@ define void @size_returning_aligned_test() { %call4 = call {ptr, i64} @__size_returning_new_aligned(i64 10, i64 8) #8 %p4 = extractvalue {ptr, i64} %call4, 0 call void @dummy(ptr %p4) - ;; Attribute cold on a nobuiltin call has no effect. 
- ; HOTCOLD: @__size_returning_new_aligned(i64 10, i64 8) + ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled. + ; NOBUILTIN-OFF: @__size_returning_new_aligned(i64 10, i64 8) + ; NOBUILTIN-ON: @__size_returning_new_aligned_hot_cold(i64 10, i64 8, i8 [[COLD]]) %call3 = call {ptr, i64} @__size_returning_new_aligned(i64 10, i64 8) #6 %p3 = extractvalue {ptr, i64} %call3, 0 call void @dummy(ptr %p3) diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll index e96452a3cebc8..6fc29bc2c9a28 100644 --- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll +++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -passes=instcombine -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -11,10 +11,11 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounw %struct.test1 = type { float } define void @test1(ptr nocapture %a, ptr nocapture %b) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4, !tbaa [[TBAA0]] +; CHECK-LABEL: define void @test1( +; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B]], align 4, !tbaa [[FLOAT_TBAA0:![0-9]+]] +; CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: ret void ; entry: @@ -25,7 +26,7 @@ entry: %struct.test2 = type { ptr } define ptr @test2() { -; 
CHECK-LABEL: @test2( +; CHECK-LABEL: define ptr @test2() { ; CHECK-NEXT: store i1 true, ptr poison, align 1 ; CHECK-NEXT: ret ptr poison ; @@ -36,10 +37,11 @@ define ptr @test2() { } define void @test3_multiple_fields(ptr nocapture %a, ptr nocapture %b) { -; CHECK-LABEL: @test3_multiple_fields( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 4 -; CHECK-NEXT: store i64 [[TMP0]], ptr [[A:%.*]], align 4 +; CHECK-LABEL: define void @test3_multiple_fields( +; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[B]], align 4 +; CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -48,10 +50,11 @@ entry: } define void @test4_multiple_copy_first_field(ptr nocapture %a, ptr nocapture %b) { -; CHECK-LABEL: @test4_multiple_copy_first_field( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4, !tbaa [[TBAA0]] +; CHECK-LABEL: define void @test4_multiple_copy_first_field( +; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B]], align 4, !tbaa [[FLOAT_TBAA0]] +; CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: ret void ; entry: @@ -60,10 +63,11 @@ entry: } define void @test5_multiple_copy_more_than_first_field(ptr nocapture %a, ptr nocapture %b) { -; CHECK-LABEL: @test5_multiple_copy_more_than_first_field( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4 -; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4 +; CHECK-LABEL: define void @test5_multiple_copy_more_than_first_field( +; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B]], align 
4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -83,7 +87,7 @@ entry: ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[FLOAT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]]} ; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"} ;. diff --git a/llvm/test/Transforms/InstCombine/trunc-lshr.ll b/llvm/test/Transforms/InstCombine/trunc-lshr.ll index c443b35cb1c1e..0e996e5d017fe 100644 --- a/llvm/test/Transforms/InstCombine/trunc-lshr.ll +++ b/llvm/test/Transforms/InstCombine/trunc-lshr.ll @@ -219,3 +219,77 @@ define i1 @negative_test_fold_ashr(i8 %x) { %trunc = trunc i8 %ashr to i1 ret i1 %trunc } + +define i1 @fold_lshr_negated_power_of_2(i8 %x) { +; CHECK-LABEL: define i1 @fold_lshr_negated_power_of_2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TRUNC:%.*]] = icmp ugt i8 [[X]], 3 +; CHECK-NEXT: ret i1 [[TRUNC]] +; + %lshr = lshr i8 -16, %x + %trunc = trunc i8 %lshr to i1 + ret i1 %trunc +} + +define i1 @fold_ashr_negated_power_of_2(i8 %x) { +; CHECK-LABEL: define i1 @fold_ashr_negated_power_of_2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TRUNC:%.*]] = icmp ugt i8 [[X]], 3 +; CHECK-NEXT: ret i1 [[TRUNC]] +; + %ashr = ashr i8 -16, %x + %trunc = trunc i8 %ashr to i1 + ret i1 %trunc +} + +define i1 @fold_lshr_negated_power_of_2_multi_use(i8 %x) { +; CHECK-LABEL: define i1 @fold_lshr_negated_power_of_2_multi_use( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[LSHR:%.*]] = lshr i8 -16, [[X]] +; CHECK-NEXT: call void @use(i8 [[LSHR]]) +; CHECK-NEXT: [[TRUNC:%.*]] = icmp ugt i8 [[X]], 3 +; CHECK-NEXT: ret i1 [[TRUNC]] +; + %lshr = lshr i8 -16, %x + call void @use(i8 %lshr) + %trunc = trunc i8 %lshr to i1 + ret i1 %trunc +} + +define i1 @fold_ashr_negated_power_of_2_multi_use(i8 %x) { +; CHECK-LABEL: define i1 
@fold_ashr_negated_power_of_2_multi_use( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ASHR:%.*]] = ashr i8 -16, [[X]] +; CHECK-NEXT: call void @use(i8 [[ASHR]]) +; CHECK-NEXT: [[TRUNC:%.*]] = icmp ugt i8 [[X]], 3 +; CHECK-NEXT: ret i1 [[TRUNC]] +; + %ashr = ashr i8 -16, %x + call void @use(i8 %ashr) + %trunc = trunc i8 %ashr to i1 + ret i1 %trunc +} + +define i1 @negative_test_fold_lshr_negated_power_of_2(i8 %x) { +; CHECK-LABEL: define i1 @negative_test_fold_lshr_negated_power_of_2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[LSHR:%.*]] = lshr i8 -17, [[X]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[LSHR]] to i1 +; CHECK-NEXT: ret i1 [[TRUNC]] +; + %lshr = lshr i8 -17, %x + %trunc = trunc i8 %lshr to i1 + ret i1 %trunc +} + +define i1 @negative_test_fold_ashr_negated_power_of_2(i8 %x) { +; CHECK-LABEL: define i1 @negative_test_fold_ashr_negated_power_of_2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ASHR1:%.*]] = lshr i8 -17, [[X]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[ASHR1]] to i1 +; CHECK-NEXT: ret i1 [[TRUNC]] +; + %ashr = ashr i8 -17, %x + %trunc = trunc i8 %ashr to i1 + ret i1 %trunc +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/active-lane-mask.ll b/llvm/test/Transforms/InstSimplify/ConstProp/active-lane-mask.ll index ed26deb58eae4..e9d9ac040ea1d 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/active-lane-mask.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/active-lane-mask.ll @@ -3,6 +3,8 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +@glob = external global i32 + define <16 x i1> @v16i1_0() { ; CHECK-LABEL: @v16i1_0( ; CHECK-NEXT: entry: @@ -337,6 +339,40 @@ entry: ret %mask } + +define @nxv16i1_0_constexpr() { +; CHECK-LABEL: @nxv16i1_0_constexpr( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 ptrtoint (ptr @glob to i64)) +; CHECK-NEXT: ret [[MASK]] +; +entry: + %mask = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 
ptrtoint (ptr @glob to i64)) + ret %mask +} + +define @nxv16i1_constexpr_0() { +; CHECK-LABEL: @nxv16i1_constexpr_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret zeroinitializer +; +entry: + %mask = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 ptrtoint (ptr @glob to i64), i64 0) + ret %mask +} + +define @nxv16i1_constexpr_constexpr() { +; CHECK-LABEL: @nxv16i1_constexpr_constexpr( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 ptrtoint (ptr @glob to i64), i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @glob, i64 2) to i64)) +; CHECK-NEXT: ret [[MASK]] +; +entry: + %mask = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 ptrtoint (ptr @glob to i64), i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @glob, i64 2) to i64)) + ret %mask +} + + declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll index d168422a2a9bc..42bb73344995b 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; bswap should be constant folded when it is passed a constant argument ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s @@ -10,32 +11,34 @@ declare i64 @llvm.bswap.i64(i64) declare i80 @llvm.bswap.i80(i80) -; CHECK-LABEL: define i16 @W( define i16 @W() { - ; CHECK: ret i16 256 - %Z = call i16 @llvm.bswap.i16( i16 1 ) ; [#uses=1] - ret i16 %Z +; CHECK-LABEL: define i16 @W() { +; CHECK-NEXT: ret i16 256 +; + %Z = call i16 @llvm.bswap.i16( i16 1 ) ; [#uses=1] + ret i16 %Z } -; CHECK-LABEL: define i32 @X( define i32 @X() { - ; CHECK: ret i32 16777216 - %Z = call i32 @llvm.bswap.i32( i32 1 ) ; 
[#uses=1] - ret i32 %Z +; CHECK-LABEL: define i32 @X() { +; CHECK-NEXT: ret i32 16777216 +; + %Z = call i32 @llvm.bswap.i32( i32 1 ) ; [#uses=1] + ret i32 %Z } -; CHECK-LABEL: define i64 @Y( define i64 @Y() { - ; CHECK: ret i64 72057594037927936 - %Z = call i64 @llvm.bswap.i64( i64 1 ) ; [#uses=1] - ret i64 %Z +; CHECK-LABEL: define i64 @Y() { +; CHECK-NEXT: ret i64 72057594037927936 +; + %Z = call i64 @llvm.bswap.i64( i64 1 ) ; [#uses=1] + ret i64 %Z } -; CHECK-LABEL: define i80 @Z( define i80 @Z() { - ; CHECK: ret i80 -450681596205739728166896 - ; 0xA0908070605040302010 - %Z = call i80 @llvm.bswap.i80( i80 76151636403560493650080 ) - ; 0x102030405060708090A0 - ret i80 %Z +; CHECK-LABEL: define i80 @Z() { +; CHECK-NEXT: ret i80 -450681596205739728166896 +; + %Z = call i80 @llvm.bswap.i80( i80 76151636403560493650080 ) + ret i80 %Z } diff --git a/llvm/test/Transforms/InstSimplify/get_active_lane_mask.ll b/llvm/test/Transforms/InstSimplify/get_active_lane_mask.ll new file mode 100644 index 0000000000000..a3b8e4efbe939 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/get_active_lane_mask.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instsimplify,verify -S | FileCheck %s + +define <4 x i1> @foo_v4i1(i32 %a) { +; CHECK-LABEL: define <4 x i1> @foo_v4i1( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: ret <4 x i1> zeroinitializer +; + %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1(i32 %a, i32 0) + ret <4 x i1> %mask +} + +define @foo_nxv8i1(i32 %a) { +; CHECK-LABEL: define @foo_nxv8i1( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: ret zeroinitializer +; + %mask = call @llvm.get.active.lane.mask.nxv8i1(i32 %a, i32 0) + ret %mask +} diff --git a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll index b37987bbf5cda..edaade329e9ce 100644 --- a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll +++ 
b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=jump-threading -verify-dom-info < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -10,19 +10,20 @@ target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: norecurse noreturn nounwind uwtable define void @hoge() local_unnamed_addr #0 { -; CHECK-LABEL: @hoge( -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[TMP:%.*]] = load i64, ptr @global, align 8, !tbaa [[TBAA1:![0-9]+]] +; CHECK-LABEL: define void @hoge( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP:%.*]] = load i64, ptr @global, align 8, !tbaa [[LONG_TBAA1:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[BB26:%.*]], label [[BB3:%.*]] -; CHECK: bb3: -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @global.1, align 8, !tbaa [[TBAA1]] +; CHECK-NEXT: br i1 [[TMP2]], label %[[BB26:.*]], label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @global.1, align 8, !tbaa [[LONG_TBAA1]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[BB26]], label [[BB26]] -; CHECK: bb26: -; CHECK-NEXT: br label [[BB1]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[BB26]], label %[[BB26]] +; CHECK: [[BB26]]: +; CHECK-NEXT: br label %[[BB1]] ; bb: br label %bb1 @@ -56,3 +57,9 @@ attributes #0 = { norecurse noreturn nounwind uwtable "correctly-rounded-divide- !2 = !{!"long", !3, i64 0} !3 = !{!"omnipotent char", !4, i64 0} !4 = !{!"Simple C/C++ TBAA"} +;. 
+; CHECK: [[LONG_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META2]] = !{!"long", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +;. diff --git a/llvm/test/Transforms/JumpThreading/thread-loads.ll b/llvm/test/Transforms/JumpThreading/thread-loads.ll index 4749de0b248e8..cb10168547d2a 100644 --- a/llvm/test/Transforms/JumpThreading/thread-loads.ll +++ b/llvm/test/Transforms/JumpThreading/thread-loads.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=jump-threading -S | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -S | FileCheck %s @@ -8,23 +8,24 @@ target triple = "i386-apple-darwin7" ; Test that we can thread through the block with the partially redundant load (%2). ; rdar://6402033 define i32 @test1(ptr %P) nounwind { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 (...) @f1() #[[ATTR0:[0-9]+]] +; CHECK-LABEL: define i32 @test1( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 (...) 
@f1() #[[ATTR0]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[BB1_THREAD:%.*]] -; CHECK: bb1.thread: -; CHECK-NEXT: store i32 42, ptr [[P:%.*]], align 4 -; CHECK-NEXT: br label [[BB3:%.*]] -; CHECK: bb1: +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB1_THREAD:.*]] +; CHECK: [[BB1_THREAD]]: +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB1]]: ; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOTPR]], 36 -; CHECK-NEXT: br i1 [[TMP2]], label [[BB3]], label [[BB2:%.*]] -; CHECK: bb2: +; CHECK-NEXT: br i1 [[TMP2]], label %[[BB3]], label %[[BB2:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 (...) @f2() #[[ATTR0]] ; CHECK-NEXT: ret i32 0 -; CHECK: bb3: -; CHECK-NEXT: [[RES_02:%.*]] = phi i32 [ 1, [[BB1_THREAD]] ], [ 0, [[BB1]] ] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[RES_02:%.*]] = phi i32 [ 1, %[[BB1_THREAD]] ], [ 0, %[[BB1]] ] ; CHECK-NEXT: ret i32 [[RES_02]] ; entry: @@ -59,23 +60,24 @@ declare i32 @f2(...) ; rdar://11039258 define i32 @test2(ptr %P) nounwind { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test2( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 (...) 
@f1() #[[ATTR0]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[BB1_THREAD:%.*]] -; CHECK: bb1.thread: -; CHECK-NEXT: store i32 42, ptr [[P:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: br label [[BB3:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB1_THREAD:.*]] +; CHECK: [[BB1_THREAD]]: +; CHECK-NEXT: store i32 42, ptr [[P]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr [[P]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOTPR]], 36 -; CHECK-NEXT: br i1 [[TMP2]], label [[BB3]], label [[BB2:%.*]] -; CHECK: bb2: +; CHECK-NEXT: br i1 [[TMP2]], label %[[BB3]], label %[[BB2:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 (...) @f2() #[[ATTR0]] ; CHECK-NEXT: ret i32 0 -; CHECK: bb3: -; CHECK-NEXT: [[RES_02:%.*]] = phi i32 [ 1, [[BB1_THREAD]] ], [ 0, [[BB1]] ] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[RES_02:%.*]] = phi i32 [ 1, %[[BB1_THREAD]] ], [ 0, %[[BB1]] ] ; CHECK-NEXT: ret i32 [[RES_02]] ; entry: @@ -106,17 +108,18 @@ define i32 @test3(ptr %x, i1 %f) { ; as necessary in the predecessors. This is especially tricky because the same ; predecessor ends up with two entries in the PHI node and they must share ; a single cast. 
-; CHECK-LABEL: @test3( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X:%.*]], align 8 -; CHECK-NEXT: br i1 [[F:%.*]], label [[IF_END57:%.*]], label [[IF_END57]] -; CHECK: if.end57: -; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP1]], [[ENTRY]] ] +; CHECK-LABEL: define i32 @test3( +; CHECK-SAME: ptr [[X:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X]], align 8 +; CHECK-NEXT: br i1 [[F]], label %[[IF_END57:.*]], label %[[IF_END57]] +; CHECK: [[IF_END57]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP1]], %[[ENTRY]] ], [ [[TMP1]], %[[ENTRY]] ] ; CHECK-NEXT: [[TOBOOL59:%.*]] = icmp eq ptr [[TMP3]], null -; CHECK-NEXT: br i1 [[TOBOOL59]], label [[RETURN:%.*]], label [[IF_THEN60:%.*]] -; CHECK: if.then60: +; CHECK-NEXT: br i1 [[TOBOOL59]], label %[[RETURN:.*]], label %[[IF_THEN60:.*]] +; CHECK: [[IF_THEN60]]: ; CHECK-NEXT: ret i32 42 -; CHECK: return: +; CHECK: [[RETURN]]: ; CHECK-NEXT: ret i32 13 ; entry: @@ -139,23 +142,24 @@ return: } define i32 @test4(ptr %P) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test4( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[V0:%.*]] = tail call i32 (...) 
@f1() ; CHECK-NEXT: [[V1:%.*]] = icmp eq i32 [[V0]], 0 -; CHECK-NEXT: br i1 [[V1]], label [[BB1:%.*]], label [[BB1_THREAD:%.*]] -; CHECK: bb1.thread: -; CHECK-NEXT: store atomic i32 42, ptr [[P:%.*]] unordered, align 4 -; CHECK-NEXT: br label [[BB3:%.*]] -; CHECK: bb1: +; CHECK-NEXT: br i1 [[V1]], label %[[BB1:.*]], label %[[BB1_THREAD:.*]] +; CHECK: [[BB1_THREAD]]: +; CHECK-NEXT: store atomic i32 42, ptr [[P]] unordered, align 4 +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB1]]: ; CHECK-NEXT: [[V2_PR:%.*]] = load atomic i32, ptr [[P]] unordered, align 4 ; CHECK-NEXT: [[V3:%.*]] = icmp sgt i32 [[V2_PR]], 36 -; CHECK-NEXT: br i1 [[V3]], label [[BB3]], label [[BB2:%.*]] -; CHECK: bb2: +; CHECK-NEXT: br i1 [[V3]], label %[[BB3]], label %[[BB2:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[V4:%.*]] = tail call i32 (...) @f2() ; CHECK-NEXT: ret i32 0 -; CHECK: bb3: -; CHECK-NEXT: [[RES_04:%.*]] = phi i32 [ 1, [[BB1_THREAD]] ], [ 0, [[BB1]] ] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[RES_04:%.*]] = phi i32 [ 1, %[[BB1_THREAD]] ], [ 0, %[[BB1]] ] ; CHECK-NEXT: ret i32 [[RES_04]] ; entry: @@ -183,23 +187,24 @@ bb3: define i32 @test5(ptr %P) { ; Negative test -; CHECK-LABEL: @test5( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test5( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[V0:%.*]] = tail call i32 (...) 
@f1() ; CHECK-NEXT: [[V1:%.*]] = icmp eq i32 [[V0]], 0 -; CHECK-NEXT: br i1 [[V1]], label [[BB1:%.*]], label [[BB:%.*]] -; CHECK: bb: -; CHECK-NEXT: store atomic i32 42, ptr [[P:%.*]] release, align 4 -; CHECK-NEXT: br label [[BB1]] -; CHECK: bb1: -; CHECK-NEXT: [[RES_0:%.*]] = phi i32 [ 1, [[BB]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[V1]], label %[[BB1:.*]], label %[[BB:.*]] +; CHECK: [[BB]]: +; CHECK-NEXT: store atomic i32 42, ptr [[P]] release, align 4 +; CHECK-NEXT: br label %[[BB1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i32 [ 1, %[[BB]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[V2:%.*]] = load atomic i32, ptr [[P]] acquire, align 4 ; CHECK-NEXT: [[V3:%.*]] = icmp sgt i32 [[V2]], 36 -; CHECK-NEXT: br i1 [[V3]], label [[BB3:%.*]], label [[BB2:%.*]] -; CHECK: bb2: +; CHECK-NEXT: br i1 [[V3]], label %[[BB3:.*]], label %[[BB2:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[V4:%.*]] = tail call i32 (...) @f2() ; CHECK-NEXT: ret i32 [[RES_0]] -; CHECK: bb3: +; CHECK: [[BB3]]: ; CHECK-NEXT: ret i32 [[RES_0]] ; entry: @@ -228,23 +233,24 @@ bb3: define i32 @test6(ptr %P) { ; Negative test -; CHECK-LABEL: @test6( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test6( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[V0:%.*]] = tail call i32 (...) 
@f1() ; CHECK-NEXT: [[V1:%.*]] = icmp eq i32 [[V0]], 0 -; CHECK-NEXT: br i1 [[V1]], label [[BB1:%.*]], label [[BB:%.*]] -; CHECK: bb: -; CHECK-NEXT: store i32 42, ptr [[P:%.*]], align 4 -; CHECK-NEXT: br label [[BB1]] -; CHECK: bb1: -; CHECK-NEXT: [[RES_0:%.*]] = phi i32 [ 1, [[BB]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[V1]], label %[[BB1:.*]], label %[[BB:.*]] +; CHECK: [[BB]]: +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: br label %[[BB1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i32 [ 1, %[[BB]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[V2:%.*]] = load atomic i32, ptr [[P]] acquire, align 4 ; CHECK-NEXT: [[V3:%.*]] = icmp sgt i32 [[V2]], 36 -; CHECK-NEXT: br i1 [[V3]], label [[BB3:%.*]], label [[BB2:%.*]] -; CHECK: bb2: +; CHECK-NEXT: br i1 [[V3]], label %[[BB3:.*]], label %[[BB2:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[V4:%.*]] = tail call i32 (...) @f2() ; CHECK-NEXT: ret i32 [[RES_0]] -; CHECK: bb3: +; CHECK: [[BB3]]: ; CHECK-NEXT: ret i32 [[RES_0]] ; entry: @@ -273,23 +279,24 @@ bb3: define i32 @test7(ptr %P) { ; Negative test -; CHECK-LABEL: @test7( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test7( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[V0:%.*]] = tail call i32 (...) 
@f1() ; CHECK-NEXT: [[V1:%.*]] = icmp eq i32 [[V0]], 0 -; CHECK-NEXT: br i1 [[V1]], label [[BB1:%.*]], label [[BB:%.*]] -; CHECK: bb: -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[P:%.*]], align 4 -; CHECK-NEXT: br label [[BB1]] -; CHECK: bb1: -; CHECK-NEXT: [[RES_0:%.*]] = phi i32 [ 1, [[BB]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[V1]], label %[[BB1:.*]], label %[[BB:.*]] +; CHECK: [[BB]]: +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: br label %[[BB1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i32 [ 1, %[[BB]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[V2:%.*]] = load atomic i32, ptr [[P]] acquire, align 4 ; CHECK-NEXT: [[V3:%.*]] = icmp sgt i32 [[V2]], 36 -; CHECK-NEXT: br i1 [[V3]], label [[BB3:%.*]], label [[BB2:%.*]] -; CHECK: bb2: +; CHECK-NEXT: br i1 [[V3]], label %[[BB3:.*]], label %[[BB2:.*]] +; CHECK: [[BB2]]: ; CHECK-NEXT: [[V4:%.*]] = tail call i32 (...) @f2() ; CHECK-NEXT: ret i32 [[RES_0]] -; CHECK: bb3: +; CHECK: [[BB3]]: ; CHECK-NEXT: ret i32 [[RES_0]] ; entry: @@ -319,10 +326,11 @@ bb3: ; We keep the tbaa and range metadata for the first load, as it dominates the ; second load. Hence we can eliminate the branch. define void @test8(ptr, ptr, ptr) { -; CHECK-LABEL: @test8( -; CHECK-NEXT: ret2: -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]], !range [[RNG4:![0-9]+]], !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]], !noundef [[META10:![0-9]+]] -; CHECK-NEXT: store i32 [[A]], ptr [[TMP1:%.*]], align 4 +; CHECK-LABEL: define void @test8( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]]) { +; CHECK-NEXT: [[RET2:.*:]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA0]], !range [[RNG4:![0-9]+]], !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]], !noundef [[META10:![0-9]+]] +; CHECK-NEXT: store i32 [[A]], ptr [[TMP1]], align 4 ; CHECK-NEXT: [[XXX:%.*]] = tail call i32 (...) 
@f1() #[[ATTR0]] ; CHECK-NEXT: ret void ; @@ -344,24 +352,25 @@ ret2: ; we need to remove metadata from the existing load, and add appropriate ; metadata to the newly inserted load. define void @test9(ptr, ptr, ptr, i1 %c) { -; CHECK-LABEL: @test9( -; CHECK-NEXT: br i1 [[C:%.*]], label [[D1:%.*]], label [[D2:%.*]] -; CHECK: d1: -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP0:%.*]], align 4 -; CHECK-NEXT: br label [[D3:%.*]] -; CHECK: d2: +; CHECK-LABEL: define void @test9( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: br i1 [[C]], label %[[D1:.*]], label %[[D2:.*]] +; CHECK: [[D1]]: +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: br label %[[D3:.*]] +; CHECK: [[D2]]: ; CHECK-NEXT: [[XXXX:%.*]] = tail call i32 (...) @f1() #[[ATTR0]] -; CHECK-NEXT: [[B_PR:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[D3]] -; CHECK: d3: -; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[B_PR]], [[D2]] ], [ [[A]], [[D1]] ] -; CHECK-NEXT: [[P:%.*]] = phi i32 [ 1, [[D2]] ], [ [[A]], [[D1]] ] -; CHECK-NEXT: store i32 [[P]], ptr [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[B_PR:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[D3]] +; CHECK: [[D3]]: +; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[B_PR]], %[[D2]] ], [ [[A]], %[[D1]] ] +; CHECK-NEXT: [[P:%.*]] = phi i32 [ 1, %[[D2]] ], [ [[A]], %[[D1]] ] +; CHECK-NEXT: store i32 [[P]], ptr [[TMP1]], align 4 ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[B]], 8 -; CHECK-NEXT: br i1 [[C2]], label [[RET1:%.*]], label [[RET2:%.*]] -; CHECK: ret1: +; CHECK-NEXT: br i1 [[C2]], label %[[RET1:.*]], label %[[RET2:.*]] +; CHECK: [[RET1]]: ; CHECK-NEXT: ret void -; CHECK: ret2: +; CHECK: [[RET2]]: ; CHECK-NEXT: [[XXX:%.*]] = tail call i32 (...) 
@f1() #[[ATTR0]] ; CHECK-NEXT: ret void ; @@ -391,27 +400,28 @@ ret2: } define i32 @fn_noalias(i1 %c2,ptr noalias %P, ptr noalias %P2) { -; CHECK-LABEL: @fn_noalias( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[C2:%.*]], label [[COND2:%.*]], label [[COND1:%.*]] -; CHECK: cond1: -; CHECK-NEXT: [[L1:%.*]] = load i64, ptr [[P:%.*]], align 4 -; CHECK-NEXT: store i64 42, ptr [[P2:%.*]], align 4 +; CHECK-LABEL: define i32 @fn_noalias( +; CHECK-SAME: i1 [[C2:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[C2]], label %[[COND2:.*]], label %[[COND1:.*]] +; CHECK: [[COND1]]: +; CHECK-NEXT: [[L1:%.*]] = load i64, ptr [[P]], align 4 +; CHECK-NEXT: store i64 42, ptr [[P2]], align 4 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L1]], 0 -; CHECK-NEXT: br i1 [[C]], label [[COND2_THREAD:%.*]], label [[END:%.*]] -; CHECK: cond2.thread: +; CHECK-NEXT: br i1 [[C]], label %[[COND2_THREAD:.*]], label %[[END:.*]] +; CHECK: [[COND2_THREAD]]: ; CHECK-NEXT: call void @fn2(i64 [[L1]]) -; CHECK-NEXT: br label [[COND3:%.*]] -; CHECK: cond2: +; CHECK-NEXT: br label %[[COND3:.*]] +; CHECK: [[COND2]]: ; CHECK-NEXT: [[L2_PR:%.*]] = load i64, ptr [[P]], align 4 ; CHECK-NEXT: call void @fn2(i64 [[L2_PR]]) ; CHECK-NEXT: [[C3:%.*]] = icmp eq i64 [[L2_PR]], 0 -; CHECK-NEXT: br i1 [[C3]], label [[COND3]], label [[END]] -; CHECK: cond3: -; CHECK-NEXT: [[L23:%.*]] = phi i64 [ [[L1]], [[COND2_THREAD]] ], [ [[L2_PR]], [[COND2]] ] +; CHECK-NEXT: br i1 [[C3]], label %[[COND3]], label %[[END]] +; CHECK: [[COND3]]: +; CHECK-NEXT: [[L23:%.*]] = phi i64 [ [[L1]], %[[COND2_THREAD]] ], [ [[L2_PR]], %[[COND2]] ] ; CHECK-NEXT: call void @fn3(i64 [[L23]]) -; CHECK-NEXT: br label [[END]] -; CHECK: end: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret i32 0 ; entry: @@ -447,43 +457,44 @@ end: @last = internal unnamed_addr global [65 x ptr] zeroinitializer, align 8 @next_status = internal unnamed_addr global [65 x %struct.NEXT_MOVE] 
zeroinitializer, align 8 define fastcc i32 @Search(i64 %idxprom.i, i64 %idxprom.i89, i32 %c) { -; CHECK-LABEL: @Search( -; CHECK-NEXT: cond.true282: -; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds [65 x i32], ptr @hash_move, i64 0, i64 [[IDXPROM_I:%.*]] +; CHECK-LABEL: define fastcc i32 @Search( +; CHECK-SAME: i64 [[IDXPROM_I:%.*]], i64 [[IDXPROM_I89:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[COND_TRUE282:.*:]] +; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds [65 x i32], ptr @hash_move, i64 0, i64 [[IDXPROM_I]] ; CHECK-NEXT: [[ARRAYIDX307:%.*]] = getelementptr inbounds [65 x i32], ptr @current_move, i64 0, i64 [[IDXPROM_I]] ; CHECK-NEXT: [[ARRAYIDX89:%.*]] = getelementptr inbounds [65 x ptr], ptr @last, i64 0, i64 [[IDXPROM_I]] ; CHECK-NEXT: [[PHASE:%.*]] = getelementptr inbounds [65 x %struct.NEXT_MOVE], ptr @next_status, i64 0, i64 [[IDXPROM_I]], i32 0 -; CHECK-NEXT: switch i32 [[C:%.*]], label [[CLEANUP:%.*]] [ -; CHECK-NEXT: i32 1, label [[SW_BB_I:%.*]] -; CHECK-NEXT: i32 0, label [[SW_BB21_I:%.*]] +; CHECK-NEXT: switch i32 [[C]], label %[[CLEANUP:.*]] [ +; CHECK-NEXT: i32 1, label %[[SW_BB_I:.*]] +; CHECK-NEXT: i32 0, label %[[SW_BB21_I:.*]] ; CHECK-NEXT: ] -; CHECK: sw.bb.i: +; CHECK: [[SW_BB_I]]: ; CHECK-NEXT: [[CALL_I62:%.*]] = call fastcc ptr @GenerateCheckEvasions() ; CHECK-NEXT: store ptr [[CALL_I62]], ptr [[ARRAYIDX89]], align 8 ; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[ARRAYIDX185]], align 4 ; CHECK-NEXT: [[TOBOOL_I63:%.*]] = icmp eq i32 [[L2]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_I63]], label [[SW_BB21_I_THREAD:%.*]], label [[IF_THEN_I64:%.*]] -; CHECK: sw.bb21.i.thread: +; CHECK-NEXT: br i1 [[TOBOOL_I63]], label %[[SW_BB21_I_THREAD:.*]], label %[[IF_THEN_I64:.*]] +; CHECK: [[SW_BB21_I_THREAD]]: ; CHECK-NEXT: store i32 10, ptr [[PHASE]], align 8 -; CHECK-NEXT: br label [[DO_BODY_PREHEADER_I67:%.*]] -; CHECK: if.then.i64: +; CHECK-NEXT: br label %[[DO_BODY_PREHEADER_I67:.*]] +; CHECK: [[IF_THEN_I64]]: ; CHECK-NEXT: store i32 7, ptr 
[[PHASE]], align 8 ; CHECK-NEXT: store i32 [[L2]], ptr [[ARRAYIDX307]], align 4 ; CHECK-NEXT: [[CALL16_I:%.*]] = call fastcc i32 @ValidMove(i32 [[L2]]) ; CHECK-NEXT: [[TOBOOL17_I:%.*]] = icmp eq i32 [[CALL16_I]], 0 -; CHECK-NEXT: br i1 [[TOBOOL17_I]], label [[IF_ELSE_I65:%.*]], label [[CLEANUP]] -; CHECK: if.else.i65: +; CHECK-NEXT: br i1 [[TOBOOL17_I]], label %[[IF_ELSE_I65:.*]], label %[[CLEANUP]] +; CHECK: [[IF_ELSE_I65]]: ; CHECK-NEXT: call void @f65() -; CHECK-NEXT: br label [[SW_BB21_I]] -; CHECK: sw.bb21.i: +; CHECK-NEXT: br label %[[SW_BB21_I]] +; CHECK: [[SW_BB21_I]]: ; CHECK-NEXT: [[L3_PR:%.*]] = load i32, ptr [[ARRAYIDX185]], align 4 ; CHECK-NEXT: store i32 10, ptr [[PHASE]], align 8 ; CHECK-NEXT: [[TOBOOL27_I:%.*]] = icmp eq i32 [[L3_PR]], 0 -; CHECK-NEXT: br i1 [[TOBOOL27_I]], label [[DO_BODY_PREHEADER_I67]], label [[CLEANUP]] -; CHECK: do.body.preheader.i67: +; CHECK-NEXT: br i1 [[TOBOOL27_I]], label %[[DO_BODY_PREHEADER_I67]], label %[[CLEANUP]] +; CHECK: [[DO_BODY_PREHEADER_I67]]: ; CHECK-NEXT: call void @f67() ; CHECK-NEXT: ret i32 67 -; CHECK: cleanup: +; CHECK: [[CLEANUP]]: ; CHECK-NEXT: call void @Cleanup() ; CHECK-NEXT: ret i32 0 ; @@ -543,22 +554,23 @@ declare void @Cleanup() declare void @f65() define i32 @fn_SinglePred(i1 %c2,ptr %P) { -; CHECK-LABEL: @fn_SinglePred( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-LABEL: define i32 @fn_SinglePred( +; CHECK-SAME: i1 [[C2:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L1:%.*]] = load i64, ptr [[P]], align 4 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L1]], 0 -; CHECK-NEXT: br i1 [[C]], label [[COND3:%.*]], label [[COND1:%.*]] -; CHECK: cond1: -; CHECK-NEXT: br i1 [[C2:%.*]], label [[COND2:%.*]], label [[END:%.*]] -; CHECK: cond2: -; CHECK-NEXT: [[L2:%.*]] = phi i64 [ [[L1]], [[COND1]] ] +; CHECK-NEXT: br i1 [[C]], label %[[COND3:.*]], label %[[COND1:.*]] +; CHECK: [[COND1]]: +; CHECK-NEXT: br i1 [[C2]], label %[[COND2:.*]], label 
%[[END:.*]] +; CHECK: [[COND2]]: +; CHECK-NEXT: [[L2:%.*]] = phi i64 [ [[L1]], %[[COND1]] ] ; CHECK-NEXT: call void @fn2(i64 [[L2]]) -; CHECK-NEXT: br label [[END]] -; CHECK: cond3: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[COND3]]: ; CHECK-NEXT: call void @fn2(i64 [[L1]]) ; CHECK-NEXT: call void @fn3(i64 [[L1]]) -; CHECK-NEXT: br label [[END]] -; CHECK: end: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret i32 0 ; @@ -585,24 +597,25 @@ end: } define i32 @fn_SinglePredMultihop(i1 %c1, i1 %c2,ptr %P) { -; CHECK-LABEL: @fn_SinglePredMultihop( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-LABEL: define i32 @fn_SinglePredMultihop( +; CHECK-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L1:%.*]] = load i64, ptr [[P]], align 4 ; CHECK-NEXT: [[C0:%.*]] = icmp eq i64 [[L1]], 0 -; CHECK-NEXT: br i1 [[C0]], label [[COND3:%.*]], label [[COND0:%.*]] -; CHECK: cond0: -; CHECK-NEXT: br i1 [[C1:%.*]], label [[COND1:%.*]], label [[END:%.*]] -; CHECK: cond1: -; CHECK-NEXT: br i1 [[C2:%.*]], label [[COND2:%.*]], label [[END]] -; CHECK: cond2: -; CHECK-NEXT: [[L2:%.*]] = phi i64 [ [[L1]], [[COND1]] ] +; CHECK-NEXT: br i1 [[C0]], label %[[COND3:.*]], label %[[COND0:.*]] +; CHECK: [[COND0]]: +; CHECK-NEXT: br i1 [[C1]], label %[[COND1:.*]], label %[[END:.*]] +; CHECK: [[COND1]]: +; CHECK-NEXT: br i1 [[C2]], label %[[COND2:.*]], label %[[END]] +; CHECK: [[COND2]]: +; CHECK-NEXT: [[L2:%.*]] = phi i64 [ [[L1]], %[[COND1]] ] ; CHECK-NEXT: call void @fn2(i64 [[L2]]) -; CHECK-NEXT: br label [[END]] -; CHECK: cond3: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[COND3]]: ; CHECK-NEXT: call void @fn2(i64 [[L1]]) ; CHECK-NEXT: call void @fn3(i64 [[L1]]) -; CHECK-NEXT: br label [[END]] -; CHECK: end: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret i32 0 ; @@ -640,19 +653,20 @@ declare void @fn3(i64) ; store. 
; define i32 @phi_translate_partial_redundant_loads(i32, ptr, ptr) { -; CHECK-LABEL: @phi_translate_partial_redundant_loads( -; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i32 [[TMP0:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP0]], label [[MERGE_THREAD:%.*]], label [[MERGE:%.*]] -; CHECK: merge.thread: -; CHECK-NEXT: store i32 1, ptr [[TMP1:%.*]], align 4 -; CHECK-NEXT: br label [[LEFT_X:%.*]] -; CHECK: merge: -; CHECK-NEXT: [[NEWLOAD_PR:%.*]] = load i32, ptr [[TMP2:%.*]], align 4 +; CHECK-LABEL: define i32 @phi_translate_partial_redundant_loads( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]]) { +; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[CMP0]], label %[[MERGE_THREAD:.*]], label %[[MERGE:.*]] +; CHECK: [[MERGE_THREAD]]: +; CHECK-NEXT: store i32 1, ptr [[TMP1]], align 4 +; CHECK-NEXT: br label %[[LEFT_X:.*]] +; CHECK: [[MERGE]]: +; CHECK-NEXT: [[NEWLOAD_PR:%.*]] = load i32, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[NEWLOAD_PR]], 5 -; CHECK-NEXT: br i1 [[CMP1]], label [[LEFT_X]], label [[RIGHT_X:%.*]] -; CHECK: left_x: +; CHECK-NEXT: br i1 [[CMP1]], label %[[LEFT_X]], label %[[RIGHT_X:.*]] +; CHECK: [[LEFT_X]]: ; CHECK-NEXT: ret i32 20 -; CHECK: right_x: +; CHECK: [[RIGHT_X]]: ; CHECK-NEXT: ret i32 10 ; %cmp0 = icmp ne i32 %0, 0 @@ -693,7 +707,7 @@ right_x: !10 = !{!8} !11 = !{} ;. 
-; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]]} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} ; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} diff --git a/llvm/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll b/llvm/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll index 33027189dc5c0..0d32e508edf5f 100644 --- a/llvm/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll +++ b/llvm/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=licm -S | FileCheck %s ; PR9634 @@ -7,21 +7,21 @@ define void @f() { ; CHECK-LABEL: define void @f() { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[L_87_I:%.*]] = alloca [9 x i16], align 16 -; CHECK-NEXT: [[G_58_PROMOTED:%.*]] = load i32, ptr @g_58, align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[TMP31:%.*]] = phi i32 [ [[G_58_PROMOTED]], [[ENTRY:%.*]] ], [ [[OR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INC12:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[G_58_PROMOTED:%.*]] = load i32, ptr @g_58, align 4, !tbaa [[INT_TBAA0:![0-9]+]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[TMP31:%.*]] = phi i32 [ [[G_58_PROMOTED]], %[[ENTRY]] ], [ [[OR:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INC12:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[OR]] = or i32 [[TMP31]], 10 ; CHECK-NEXT: [[INC]] = add nsw i32 [[INC12]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 4 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] -; CHECK: for.end: -; 
CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ] -; CHECK-NEXT: store ptr @g_58, ptr @g_116, align 8, !tbaa [[TBAA4:![0-9]+]] -; CHECK-NEXT: store i32 [[OR_LCSSA]], ptr @g_58, align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], %[[FOR_BODY]] ] +; CHECK-NEXT: store ptr @g_58, ptr @g_116, align 8, !tbaa [[ANYPTR_TBAA4:![0-9]+]] +; CHECK-NEXT: store i32 [[OR_LCSSA]], ptr @g_58, align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: ret void ; @@ -51,3 +51,11 @@ for.end: ; preds = %for.inc !4 = !{!6, !6, i64 0} !5 = !{!"any pointer", !1} !6 = !{!"int", !1} +;. +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[ANYPTR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"any pointer", [[META2]]} +;. diff --git a/llvm/test/Transforms/LICM/licm-coroutine.ll b/llvm/test/Transforms/LICM/licm-coroutine.ll deleted file mode 100644 index a4765acfb93f8..0000000000000 --- a/llvm/test/Transforms/LICM/licm-coroutine.ll +++ /dev/null @@ -1,78 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=licm -S | FileCheck %s - -; %fca.0 and %fca.1 should not be hoisted out of the loop because the ramp -; function and resume function have different stack frames, so %pointer1 and -; %pointer2 have different values before and after @llvm.coro.suspend. 
- -define ptr @f(i32 %n) presplitcoroutine { -; CHECK-LABEL: define ptr @f( -; CHECK-SAME: i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[POINTER1:%.*]] = alloca ptr, align 8 -; CHECK-NEXT: [[POINTER2:%.*]] = alloca ptr, align 8 -; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) -; CHECK-NEXT: [[SIZE:%.*]] = call i32 @llvm.coro.size.i32() -; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i32 [[SIZE]]) -; CHECK-NEXT: [[HDL:%.*]] = call noalias ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]]) -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[N_VAL:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[INC:%.*]], %[[RESUME:.*]] ] -; CHECK-NEXT: [[INC]] = add nsw i32 [[N_VAL]], 1 -; CHECK-NEXT: call void @print(i32 [[N_VAL]]) -; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.coro.suspend(token none, i1 false) -; CHECK-NEXT: switch i8 [[TMP0]], label %[[SUSPEND_LOOPEXIT:.*]] [ -; CHECK-NEXT: i8 0, label %[[RESUME]] -; CHECK-NEXT: i8 1, label %[[CLEANUP:.*]] -; CHECK-NEXT: ] -; CHECK: [[RESUME]]: -; CHECK-NEXT: [[FCA_0:%.*]] = insertvalue [2 x ptr] poison, ptr [[POINTER1]], 0 -; CHECK-NEXT: [[FCA_1:%.*]] = insertvalue [2 x ptr] [[FCA_0]], ptr [[POINTER2]], 1 -; CHECK-NEXT: call void @foo([2 x ptr] [[FCA_1]]) -; CHECK-NEXT: br label %[[LOOP]] -; CHECK: [[CLEANUP]]: -; CHECK-NEXT: [[MEM:%.*]] = call ptr @llvm.coro.free(token [[ID]], ptr [[HDL]]) -; CHECK-NEXT: call void @free(ptr [[MEM]]) -; CHECK-NEXT: br label %[[SUSPEND:.*]] -; CHECK: [[SUSPEND_LOOPEXIT]]: -; CHECK-NEXT: br label %[[SUSPEND]] -; CHECK: [[SUSPEND]]: -; CHECK-NEXT: [[UNUSED:%.*]] = call i1 @llvm.coro.end(ptr [[HDL]], i1 false, token none) -; CHECK-NEXT: ret ptr [[HDL]] -; -entry: - %pointer1 = alloca ptr - %pointer2 = alloca ptr - %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) - %size = call i32 @llvm.coro.size.i32() - %alloc = call ptr @malloc(i32 %size) - %hdl = call noalias ptr @llvm.coro.begin(token %id, 
ptr %alloc) - br label %loop - -loop: - %n.val = phi i32 [ %n, %entry ], [ %inc, %resume ] - %inc = add nsw i32 %n.val, 1 - call void @print(i32 %n.val) - %0 = call i8 @llvm.coro.suspend(token none, i1 false) - switch i8 %0, label %suspend [i8 0, label %resume - i8 1, label %cleanup] - -resume: - %fca.0 = insertvalue [2 x ptr] poison, ptr %pointer1, 0 - %fca.1 = insertvalue [2 x ptr] %fca.0, ptr %pointer2, 1 - call void @foo([2 x ptr] %fca.1) - br label %loop - -cleanup: - %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) - call void @free(ptr %mem) - br label %suspend -suspend: - %unused = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) - ret ptr %hdl -} - -declare void @free(ptr) -declare ptr @malloc(i32) -declare void @print(i32) -declare void @foo([2 x ptr]) diff --git a/llvm/test/Transforms/LICM/pr50367.ll b/llvm/test/Transforms/LICM/pr50367.ll index 7fd176b6c6bb6..6aafff74f61d8 100644 --- a/llvm/test/Transforms/LICM/pr50367.ll +++ b/llvm/test/Transforms/LICM/pr50367.ll @@ -1,23 +1,24 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes='loop-mssa(licm)' < %s | FileCheck %s @e = external dso_local global ptr, align 8 define void @main(i1 %arg, ptr %arg1) { -; CHECK-LABEL: @main( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP1:%.*]] -; CHECK: loop1: -; CHECK-NEXT: br label [[LOOP2:%.*]] -; CHECK: loop2: -; CHECK-NEXT: br i1 [[ARG:%.*]], label [[LOOP2_LATCH:%.*]], label [[LOOP_LATCH:%.*]] -; CHECK: loop2.latch: -; CHECK-NEXT: store i32 0, ptr [[ARG1:%.*]], align 4 -; CHECK-NEXT: br label [[LOOP2]] -; CHECK: loop.latch: -; CHECK-NEXT: store ptr null, ptr @e, align 8, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[PTR:%.*]] = load ptr, ptr @e, align 8, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 0, ptr [[PTR]], align 4, !tbaa [[TBAA4:![0-9]+]] -; CHECK-NEXT: br label [[LOOP1]] +; CHECK-LABEL: define void @main( +; 
CHECK-SAME: i1 [[ARG:%.*]], ptr [[ARG1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br i1 [[ARG]], label %[[LOOP2_LATCH:.*]], label %[[LOOP_LATCH:.*]] +; CHECK: [[LOOP2_LATCH]]: +; CHECK-NEXT: store i32 0, ptr [[ARG1]], align 4 +; CHECK-NEXT: br label %[[LOOP2]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: store ptr null, ptr @e, align 8, !tbaa [[ANYPTR_TBAA0:![0-9]+]] +; CHECK-NEXT: [[PTR:%.*]] = load ptr, ptr @e, align 8, !tbaa [[ANYPTR_TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[PTR]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] +; CHECK-NEXT: br label %[[LOOP1]] ; entry: br label %loop1 @@ -40,19 +41,20 @@ loop.latch: } define void @store_null(i1 %arg) { -; CHECK-LABEL: @store_null( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP1:%.*]] -; CHECK: loop1: -; CHECK-NEXT: br label [[LOOP2:%.*]] -; CHECK: loop2: -; CHECK-NEXT: br i1 [[ARG:%.*]], label [[LOOP2_LATCH:%.*]], label [[LOOP_LATCH:%.*]] -; CHECK: loop2.latch: +; CHECK-LABEL: define void @store_null( +; CHECK-SAME: i1 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br i1 [[ARG]], label %[[LOOP2_LATCH:.*]], label %[[LOOP_LATCH:.*]] +; CHECK: [[LOOP2_LATCH]]: ; CHECK-NEXT: store i32 0, ptr null, align 4 -; CHECK-NEXT: br label [[LOOP2]] -; CHECK: loop.latch: -; CHECK-NEXT: store i32 0, ptr null, align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: br label [[LOOP1]] +; CHECK-NEXT: br label %[[LOOP2]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: store i32 0, ptr null, align 4, !tbaa [[INT_TBAA4]] +; CHECK-NEXT: br label %[[LOOP1]] ; entry: br label %loop1 @@ -80,3 +82,11 @@ loop.latch: !3 = !{!"Simple C/C++ TBAA"} !4 = !{!5, !5, i64 0} !5 = !{!"int", !2, i64 0} +;. 
+; CHECK: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"any pointer", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"int", [[META2]], i64 0} +;. diff --git a/llvm/test/Transforms/LICM/promote-capture.ll b/llvm/test/Transforms/LICM/promote-capture.ll index eac670e6f3edd..19c5700740e7c 100644 --- a/llvm/test/Transforms/LICM/promote-capture.ll +++ b/llvm/test/Transforms/LICM/promote-capture.ll @@ -2,7 +2,7 @@ ; RUN: opt -S -passes='loop-mssa(licm)' < %s | FileCheck %s declare i1 @cond(i32 %v) readnone -declare void @capture(ptr %p) readnone +declare ptr @capture(ptr %p) readnone define void @test_captured_after_loop(i32 %len) { ; CHECK-LABEL: @test_captured_after_loop( @@ -27,7 +27,7 @@ define void @test_captured_after_loop(i32 %len) { ; CHECK: exit: ; CHECK-NEXT: [[C_INC1_LCSSA:%.*]] = phi i32 [ [[C_INC1]], [[LATCH]] ] ; CHECK-NEXT: store i32 [[C_INC1_LCSSA]], ptr [[COUNT]], align 4 -; CHECK-NEXT: call void @capture(ptr [[COUNT]]) +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @capture(ptr [[COUNT]]) ; CHECK-NEXT: ret void ; entry: @@ -52,7 +52,7 @@ latch: br i1 %cmp, label %exit, label %loop exit: - call void @capture(ptr %count) + call ptr @capture(ptr %count) ret void } @@ -71,7 +71,7 @@ define void @test_captured_in_loop(i32 %len) { ; CHECK: if: ; CHECK-NEXT: [[C_INC:%.*]] = add i32 [[C_INC2]], 1 ; CHECK-NEXT: store i32 [[C_INC]], ptr [[COUNT]], align 4 -; CHECK-NEXT: call void @capture(ptr [[COUNT]]) +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @capture(ptr [[COUNT]]) ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[C_INC1]] = phi i32 [ [[C_INC]], [[IF]] ], [ [[C_INC2]], [[LOOP]] ] @@ -95,7 +95,7 @@ if: %c = load i32, ptr %count %c.inc = add i32 %c, 1 store i32 %c.inc, ptr %count - call void @capture(ptr %count) + call ptr @capture(ptr 
%count) br label %latch latch: @@ -112,7 +112,7 @@ define void @test_captured_before_loop(i32 %len) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COUNT:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, ptr [[COUNT]], align 4 -; CHECK-NEXT: call void @capture(ptr [[COUNT]]) +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @capture(ptr [[COUNT]]) ; CHECK-NEXT: [[COUNT_PROMOTED:%.*]] = load i32, ptr [[COUNT]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: @@ -135,7 +135,7 @@ define void @test_captured_before_loop(i32 %len) { entry: %count = alloca i32 store i32 0, ptr %count - call void @capture(ptr %count) + call ptr @capture(ptr %count) br label %loop loop: @@ -163,7 +163,7 @@ define void @test_captured_before_loop_address_only(i32 %len) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COUNT:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, ptr [[COUNT]], align 4 -; CHECK-NEXT: call void @capture(ptr captures(address) [[COUNT]]) +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @capture(ptr captures(address) [[COUNT]]) ; CHECK-NEXT: [[COUNT_PROMOTED:%.*]] = load i32, ptr [[COUNT]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: @@ -187,7 +187,7 @@ define void @test_captured_before_loop_address_only(i32 %len) { entry: %count = alloca i32 store i32 0, ptr %count - call void @capture(ptr captures(address) %count) + call ptr @capture(ptr captures(address) %count) br label %loop loop: @@ -216,7 +216,7 @@ define void @test_captured_before_loop_byval(ptr byval(i32) align 4 %count, i32 ; CHECK-LABEL: @test_captured_before_loop_byval( ; CHECK-NEXT: entry: ; CHECK-NEXT: store i32 0, ptr [[COUNT:%.*]], align 4 -; CHECK-NEXT: call void @capture(ptr [[COUNT]]) +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @capture(ptr [[COUNT]]) ; CHECK-NEXT: [[COUNT_PROMOTED:%.*]] = load i32, ptr [[COUNT]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: @@ -238,7 +238,7 @@ define void @test_captured_before_loop_byval(ptr byval(i32) align 4 %count, i32 ; entry: store i32 0, ptr %count - call 
void @capture(ptr %count) + call ptr @capture(ptr %count) br label %loop loop: @@ -283,7 +283,7 @@ define void @test_captured_after_loop_byval(ptr byval(i32) align 4 %count, i32 % ; CHECK: exit: ; CHECK-NEXT: [[C_INC1_LCSSA:%.*]] = phi i32 [ [[C_INC1]], [[LATCH]] ] ; CHECK-NEXT: store i32 [[C_INC1_LCSSA]], ptr [[COUNT]], align 4 -; CHECK-NEXT: call void @capture(ptr [[COUNT]]) +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @capture(ptr [[COUNT]]) ; CHECK-NEXT: ret void ; entry: @@ -307,6 +307,6 @@ latch: br i1 %cmp, label %exit, label %loop exit: - call void @capture(ptr %count) + call ptr @capture(ptr %count) ret void } diff --git a/llvm/test/Transforms/LICM/scalar-promote.ll b/llvm/test/Transforms/LICM/scalar-promote.ll index bd3960e846b42..3af65df55a099 100644 --- a/llvm/test/Transforms/LICM/scalar-promote.ll +++ b/llvm/test/Transforms/LICM/scalar-promote.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 6 ; RUN: opt < %s -passes=licm -S | FileCheck %s ; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='require,require,require,require,loop-mssa(licm)' -S %s | FileCheck %s target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" @@ -6,19 +6,20 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 @X = global i32 7 ; [#uses=4] define void @test1(i32 %i) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: Entry: +; CHECK-LABEL: define void @test1( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[X_PROMOTED:%.*]] = load i32, ptr @X, align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: Loop: -; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[X_PROMOTED]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; 
CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[X_PROMOTED]], %[[ENTRY]] ], [ [[X2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[X2]] = add i32 [[X21]], 1 ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[OUT:%.*]], label [[LOOP]] -; CHECK: Out: -; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], [[LOOP]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[OUT:.*]], label %[[LOOP]] +; CHECK: [[OUT]]: +; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], %[[LOOP]] ] ; CHECK-NEXT: store i32 [[X2_LCSSA]], ptr @X, align 4 ; CHECK-NEXT: ret void ; @@ -39,18 +40,19 @@ Out: } define void @test2(i32 %i) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: Entry: +; CHECK-LABEL: define void @test2( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[X1:%.*]] = getelementptr i32, ptr @X, i64 1 ; CHECK-NEXT: [[X2:%.*]] = getelementptr i32, ptr @X, i64 1 ; CHECK-NEXT: [[X1_PROMOTED:%.*]] = load i32, ptr [[X1]], align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: Loop: -; CHECK-NEXT: [[A1:%.*]] = phi i32 [ [[V:%.*]], [[LOOP]] ], [ [[X1_PROMOTED]], [[ENTRY:%.*]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[A1:%.*]] = phi i32 [ [[V:%.*]], %[[LOOP]] ], [ [[X1_PROMOTED]], %[[ENTRY]] ] ; CHECK-NEXT: [[V]] = add i32 [[A1]], 1 -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] -; CHECK: Exit: -; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i32 [ [[V]], [[LOOP]] ] +; CHECK-NEXT: br i1 false, label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i32 [ [[V]], %[[LOOP]] ] ; CHECK-NEXT: store i32 [[V_LCSSA]], ptr [[X1]], align 4 ; CHECK-NEXT: ret void ; @@ -70,14 +72,15 @@ Exit: ; preds = %Loop } define void @test3(i32 %i) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: Loop: +; CHECK-LABEL: 
define void @test3( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: ; CHECK-NEXT: [[X:%.*]] = load volatile i32, ptr @X, align 4 ; CHECK-NEXT: [[X2:%.*]] = add i32 [[X]], 1 ; CHECK-NEXT: store i32 [[X2]], ptr @X, align 4 -; CHECK-NEXT: br i1 true, label [[OUT:%.*]], label [[LOOP]] -; CHECK: Out: +; CHECK-NEXT: br i1 true, label %[[OUT:.*]], label %[[LOOP]] +; CHECK: [[OUT]]: ; CHECK-NEXT: ret void ; br label %Loop @@ -94,14 +97,15 @@ Out: ; preds = %Loop ; Should not promote this to a register define void @test3b(i32 %i) { -; CHECK-LABEL: @test3b( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: Loop: +; CHECK-LABEL: define void @test3b( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: ; CHECK-NEXT: [[X:%.*]] = load i32, ptr @X, align 4 ; CHECK-NEXT: [[X2:%.*]] = add i32 [[X]], 1 ; CHECK-NEXT: store volatile i32 [[X2]], ptr @X, align 4 -; CHECK-NEXT: br i1 true, label [[OUT:%.*]], label [[LOOP]] -; CHECK: Out: +; CHECK-NEXT: br i1 true, label %[[OUT:.*]], label %[[LOOP]] +; CHECK: [[OUT]]: ; CHECK-NEXT: ret void ; br label %Loop @@ -119,30 +123,31 @@ Out: ; preds = %Loop ; Should have promoted 'handle2' accesses. ; Should not have promoted offsetx1 loads. 
define void @test4(ptr %x, i8 %n) { -; CHECK-LABEL: @test4( +; CHECK-LABEL: define void @test4( +; CHECK-SAME: ptr [[X:%.*]], i8 [[N:%.*]]) { ; CHECK-NEXT: [[HANDLE1:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: [[HANDLE2:%.*]] = alloca ptr, align 8 -; CHECK-NEXT: store ptr [[X:%.*]], ptr [[HANDLE1]], align 8 +; CHECK-NEXT: store ptr [[X]], ptr [[HANDLE1]], align 8 ; CHECK-NEXT: [[TMP:%.*]] = getelementptr i8, ptr [[X]], i64 8 ; CHECK-NEXT: [[OFFSETX1:%.*]] = load ptr, ptr [[HANDLE1]], align 8 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: br label [[SUBLOOP:%.*]] -; CHECK: subloop: -; CHECK-NEXT: [[NEWOFFSETX21:%.*]] = phi ptr [ [[TMP]], [[LOOP]] ], [ [[NEWOFFSETX2:%.*]], [[SUBLOOP]] ] -; CHECK-NEXT: [[COUNT:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[NEXTCOUNT:%.*]], [[SUBLOOP]] ] -; CHECK-NEXT: store i8 [[N:%.*]], ptr [[NEWOFFSETX21]], align 1 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[SUBLOOP:.*]] +; CHECK: [[SUBLOOP]]: +; CHECK-NEXT: [[NEWOFFSETX21:%.*]] = phi ptr [ [[TMP]], %[[LOOP]] ], [ [[NEWOFFSETX2:%.*]], %[[SUBLOOP]] ] +; CHECK-NEXT: [[COUNT:%.*]] = phi i8 [ 0, %[[LOOP]] ], [ [[NEXTCOUNT:%.*]], %[[SUBLOOP]] ] +; CHECK-NEXT: store i8 [[N]], ptr [[NEWOFFSETX21]], align 1 ; CHECK-NEXT: [[NEWOFFSETX2]] = getelementptr i8, ptr [[NEWOFFSETX21]], i64 -1 ; CHECK-NEXT: [[NEXTCOUNT]] = add i8 [[COUNT]], 1 ; CHECK-NEXT: [[INNEREXITCOND:%.*]] = icmp sge i8 [[NEXTCOUNT]], 8 -; CHECK-NEXT: br i1 [[INNEREXITCOND]], label [[INNEREXIT:%.*]], label [[SUBLOOP]] -; CHECK: innerexit: -; CHECK-NEXT: [[NEWOFFSETX2_LCSSA:%.*]] = phi ptr [ [[NEWOFFSETX2]], [[SUBLOOP]] ] +; CHECK-NEXT: br i1 [[INNEREXITCOND]], label %[[INNEREXIT:.*]], label %[[SUBLOOP]] +; CHECK: [[INNEREXIT]]: +; CHECK-NEXT: [[NEWOFFSETX2_LCSSA:%.*]] = phi ptr [ [[NEWOFFSETX2]], %[[SUBLOOP]] ] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[OFFSETX1]], align 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[VAL]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], 
label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: [[NEWOFFSETX2_LCSSA_LCSSA:%.*]] = phi ptr [ [[NEWOFFSETX2_LCSSA]], [[INNEREXIT]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[NEWOFFSETX2_LCSSA_LCSSA:%.*]] = phi ptr [ [[NEWOFFSETX2_LCSSA]], %[[INNEREXIT]] ] ; CHECK-NEXT: store ptr [[NEWOFFSETX2_LCSSA_LCSSA]], ptr [[HANDLE2]], align 8 ; CHECK-NEXT: ret void ; @@ -177,20 +182,21 @@ exit: } define void @test5(i32 %i, ptr noalias %P2) { -; CHECK-LABEL: @test5( -; CHECK-NEXT: Entry: +; CHECK-LABEL: define void @test5( +; CHECK-SAME: i32 [[I:%.*]], ptr noalias [[P2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[X_PROMOTED:%.*]] = load i32, ptr @X, align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: Loop: -; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[X_PROMOTED]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[X_PROMOTED]], %[[ENTRY]] ], [ [[X2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[X2]] = add i32 [[X21]], 1 -; CHECK-NEXT: store atomic ptr @X, ptr [[P2:%.*]] monotonic, align 8 +; CHECK-NEXT: store atomic ptr @X, ptr [[P2]] monotonic, align 8 ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[OUT:%.*]], label [[LOOP]] -; CHECK: Out: -; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], [[LOOP]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[OUT:.*]], label %[[LOOP]] +; CHECK: [[OUT]]: +; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], %[[LOOP]] ] ; CHECK-NEXT: store i32 [[X2_LCSSA]], ptr @X, align 4 ; CHECK-NEXT: ret void ; @@ -217,28 +223,29 @@ Out: ; PR14753 - Preserve TBAA tags when promoting values in a loop. 
define void @test6(i32 %n, ptr nocapture %a, ptr %gi) { -; CHECK-LABEL: @test6( -; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 0, ptr [[GI:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body.lr.ph: -; CHECK-NEXT: [[GI_PROMOTED:%.*]] = load i32, ptr [[GI]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INC1:%.*]] = phi i32 [ [[GI_PROMOTED]], [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[STOREMERGE2:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC]], [[FOR_BODY]] ] +; CHECK-LABEL: define void @test6( +; CHECK-SAME: i32 [[N:%.*]], ptr captures(none) [[A:%.*]], ptr [[GI:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i32 0, ptr [[GI]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[N]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[GI_PROMOTED:%.*]] = load i32, ptr [[GI]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INC1:%.*]] = phi i32 [ [[GI_PROMOTED]], %[[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[STOREMERGE2:%.*]] = phi i32 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INC]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STOREMERGE2]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA4:![0-9]+]] ; CHECK-NEXT: [[INC]] = add nsw i32 [[INC1]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] -; 
CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]] -; CHECK: for.cond.for.end_crit_edge: -; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_BODY]] ] -; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr [[GI]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_FOR_END_CRIT_EDGE:.*]] +; CHECK: [[FOR_COND_FOR_END_CRIT_EDGE]]: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ] +; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr [[GI]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; entry: @@ -273,21 +280,21 @@ declare void @capture(ptr) ; We can promote even if opaque may throw. define i32 @test7() { -; CHECK-LABEL: @test7( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test7() { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[LOCAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @capture(ptr [[LOCAL]]) ; CHECK-NEXT: [[LOCAL_PROMOTED:%.*]] = load i32, ptr [[LOCAL]], align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[LOCAL_PROMOTED]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[LOCAL_PROMOTED]], %[[ENTRY]] ], [ [[X2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[X2]] = call i32 @opaque(i32 [[X21]]) ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], [[LOOP]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ 
[[X2]], %[[LOOP]] ] ; CHECK-NEXT: store i32 [[X2_LCSSA]], ptr [[LOCAL]], align 4 ; CHECK-NEXT: [[RET:%.*]] = load i32, ptr [[LOCAL]], align 4 ; CHECK-NEXT: ret i32 [[RET]] @@ -314,27 +321,27 @@ exit: ; Hoist the load even if we cannot sink the store, since the store is really ; control-flow dependent. define i32 @test7bad() { -; CHECK-LABEL: @test7bad( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test7bad() { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[LOCAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @capture(ptr [[LOCAL]]) ; CHECK-NEXT: [[LOCAL_PROMOTED:%.*]] = load i32, ptr [[LOCAL]], align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[X22:%.*]] = phi i32 [ [[LOCAL_PROMOTED]], [[ENTRY:%.*]] ], [ [[X21:%.*]], [[ELSE:%.*]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[NEXT:%.*]], [[ELSE]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X22:%.*]] = phi i32 [ [[LOCAL_PROMOTED]], %[[ENTRY]] ], [ [[X21:%.*]], %[[ELSE:.*]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[ELSE]] ] ; CHECK-NEXT: [[X2:%.*]] = call i32 @opaque(i32 [[X22]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X2]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[ELSE]] -; CHECK: if: +; CHECK-NEXT: br i1 [[CMP]], label %[[IF:.*]], label %[[ELSE]] +; CHECK: [[IF]]: ; CHECK-NEXT: store i32 [[X2]], ptr [[LOCAL]], align 4 -; CHECK-NEXT: br label [[ELSE]] -; CHECK: else: -; CHECK-NEXT: [[X21]] = phi i32 [ [[X2]], [[IF]] ], [ [[X22]], [[LOOP]] ] +; CHECK-NEXT: br label %[[ELSE]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[X21]] = phi i32 [ [[X2]], %[[IF]] ], [ [[X22]], %[[LOOP]] ] ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RET:%.*]] = load i32, ptr [[LOCAL]], align 4 
; CHECK-NEXT: ret i32 [[RET]] ; @@ -367,22 +374,22 @@ exit: ; opaque() may throw, we can still promote - the load not being guaranteed ; doesn't block us, because %local is always dereferenceable. define i32 @test8() { -; CHECK-LABEL: @test8( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test8() { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[LOCAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @capture(ptr [[LOCAL]]) ; CHECK-NEXT: [[LOCAL_PROMOTED:%.*]] = load i32, ptr [[LOCAL]], align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[LOCAL_PROMOTED]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[LOCAL_PROMOTED]], %[[ENTRY]] ], [ [[X2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[THROWAWAY:%.*]] = call i32 @opaque(i32 [[J]]) ; CHECK-NEXT: [[X2]] = call i32 @opaque(i32 [[X21]]) ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], [[LOOP]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], %[[LOOP]] ] ; CHECK-NEXT: store i32 [[X2_LCSSA]], ptr [[LOCAL]], align 4 ; CHECK-NEXT: [[RET:%.*]] = load i32, ptr [[LOCAL]], align 4 ; CHECK-NEXT: ret i32 [[RET]] @@ -412,27 +419,27 @@ exit: ; control flow, we can only promote if the pointer is otherwise known to be ; dereferenceable define i32 @test9() { -; CHECK-LABEL: @test9( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test9() { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[LOCAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @capture(ptr [[LOCAL]]) ; 
CHECK-NEXT: [[LOCAL_PROMOTED:%.*]] = load i32, ptr [[LOCAL]], align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[LOCAL_PROMOTED]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[ELSE:%.*]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[NEXT:%.*]], [[ELSE]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[LOCAL_PROMOTED]], %[[ENTRY]] ], [ [[X2:%.*]], %[[ELSE:.*]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[ELSE]] ] ; CHECK-NEXT: [[J2:%.*]] = call i32 @opaque(i32 [[J]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[J2]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[ELSE]] -; CHECK: if: -; CHECK-NEXT: br label [[ELSE]] -; CHECK: else: -; CHECK-NEXT: [[X2]] = phi i32 [ 0, [[LOOP]] ], [ [[X21]], [[IF]] ] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF:.*]], label %[[ELSE]] +; CHECK: [[IF]]: +; CHECK-NEXT: br label %[[ELSE]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[X2]] = phi i32 [ 0, %[[LOOP]] ], [ [[X21]], %[[IF]] ] ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], [[ELSE]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], %[[ELSE]] ] ; CHECK-NEXT: store i32 [[X2_LCSSA]], ptr [[LOCAL]], align 4 ; CHECK-NEXT: [[RET:%.*]] = load i32, ptr [[LOCAL]], align 4 ; CHECK-NEXT: ret i32 [[RET]] @@ -465,27 +472,28 @@ exit: } define i32 @test9bad(i32 %i) { -; CHECK-LABEL: @test9bad( -; CHECK-NEXT: entry: +; CHECK-LABEL: define i32 @test9bad( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[LOCAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @capture(ptr [[LOCAL]]) -; CHECK-NEXT: [[NOTDEREF:%.*]] = getelementptr i32, ptr [[LOCAL]], i32 [[I:%.*]] -; 
CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[ELSE:%.*]] ] +; CHECK-NEXT: [[NOTDEREF:%.*]] = getelementptr i32, ptr [[LOCAL]], i32 [[I]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[ELSE:.*]] ] ; CHECK-NEXT: [[J2:%.*]] = call i32 @opaque(i32 [[J]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[J2]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[ELSE]] -; CHECK: if: +; CHECK-NEXT: br i1 [[CMP]], label %[[IF:.*]], label %[[ELSE]] +; CHECK: [[IF]]: ; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[NOTDEREF]], align 4 -; CHECK-NEXT: br label [[ELSE]] -; CHECK: else: -; CHECK-NEXT: [[X2:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[X]], [[IF]] ] +; CHECK-NEXT: br label %[[ELSE]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[X2:%.*]] = phi i32 [ 0, %[[LOOP]] ], [ [[X]], %[[IF]] ] ; CHECK-NEXT: store i32 [[X2]], ptr [[NOTDEREF]], align 4 ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RET:%.*]] = load i32, ptr [[NOTDEREF]], align 4 ; CHECK-NEXT: ret i32 [[RET]] ; @@ -518,19 +526,20 @@ exit: } define void @test10(i32 %i) { -; CHECK-LABEL: @test10( -; CHECK-NEXT: Entry: +; CHECK-LABEL: define void @test10( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[X_PROMOTED:%.*]] = load atomic i32, ptr @X unordered, align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: Loop: -; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[X_PROMOTED]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[X_PROMOTED]], %[[ENTRY]] ], [ 
[[X2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[X2]] = add i32 [[X21]], 1 ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[OUT:%.*]], label [[LOOP]] -; CHECK: Out: -; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], [[LOOP]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[OUT:.*]], label %[[LOOP]] +; CHECK: [[OUT]]: +; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], %[[LOOP]] ] ; CHECK-NEXT: store atomic i32 [[X2_LCSSA]], ptr @X unordered, align 4 ; CHECK-NEXT: ret void ; @@ -555,26 +564,27 @@ Out: ; Early exit is known not to be taken on first iteration and thus doesn't ; effect whether load is known to execute. define void @test11(i32 %i) { -; CHECK-LABEL: @test11( -; CHECK-NEXT: Entry: +; CHECK-LABEL: define void @test11( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[X_PROMOTED:%.*]] = load i32, ptr @X, align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: Loop: -; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[X_PROMOTED]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BODY:%.*]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[NEXT:%.*]], [[BODY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[X21:%.*]] = phi i32 [ [[X_PROMOTED]], %[[ENTRY]] ], [ [[X2:%.*]], %[[BODY:.*]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[BODY]] ] ; CHECK-NEXT: [[EARLY_TEST:%.*]] = icmp ult i32 [[J]], 32 -; CHECK-NEXT: br i1 [[EARLY_TEST]], label [[BODY]], label [[EARLY:%.*]] -; CHECK: body: +; CHECK-NEXT: br i1 [[EARLY_TEST]], label %[[BODY]], label %[[EARLY:.*]] +; CHECK: [[BODY]]: ; CHECK-NEXT: [[X2]] = add i32 [[X21]], 1 ; CHECK-NEXT: [[NEXT]] = add i32 [[J]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[NEXT]], 0 -; CHECK-NEXT: br i1 [[COND]], label [[OUT:%.*]], label [[LOOP]] -; CHECK: Early: -; CHECK-NEXT: [[X21_LCSSA:%.*]] = phi i32 [ [[X21]], 
[[LOOP]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[OUT:.*]], label %[[LOOP]] +; CHECK: [[EARLY]]: +; CHECK-NEXT: [[X21_LCSSA:%.*]] = phi i32 [ [[X21]], %[[LOOP]] ] ; CHECK-NEXT: store i32 [[X21_LCSSA]], ptr @X, align 4 ; CHECK-NEXT: ret void -; CHECK: Out: -; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], [[BODY]] ] +; CHECK: [[OUT]]: +; CHECK-NEXT: [[X2_LCSSA:%.*]] = phi i32 [ [[X2]], %[[BODY]] ] ; CHECK-NEXT: store i32 [[X2_LCSSA]], ptr @X, align 4 ; CHECK-NEXT: ret void ; @@ -603,21 +613,22 @@ Out: define i8 @test_hoistable_existing_load_sinkable_store_writeonly(ptr dereferenceable(8) %ptr, i8 %start) writeonly { ; CHECK: Function Attrs: memory(write) -; CHECK-LABEL: @test_hoistable_existing_load_sinkable_store_writeonly( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTR_PROMOTED:%.*]] = load i8, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[INC1:%.*]] = phi i8 [ [[PTR_PROMOTED]], [[ENTRY:%.*]] ], [ [[INC1]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP_LATCH]] ] +; CHECK-LABEL: define i8 @test_hoistable_existing_load_sinkable_store_writeonly( +; CHECK-SAME: ptr dereferenceable(8) [[PTR:%.*]], i8 [[START:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[PTR_PROMOTED:%.*]] = load i8, ptr [[PTR]], align 1 +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[INC1:%.*]] = phi i8 [ [[PTR_PROMOTED]], %[[ENTRY]] ], [ [[INC1]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i8 [ [[START]], %[[ENTRY]] ], [ [[ADD:%.*]], %[[LOOP_LATCH]] ] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I]], 4 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT:%.*]] -; CHECK: loop.latch: +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: store i8 [[INC1]], ptr [[PTR]], align 1 ; CHECK-NEXT: [[ADD]] = add i8 [[I]], [[INC1]] -; CHECK-NEXT: br 
label [[LOOP_HEADER]] -; CHECK: exit: -; CHECK-NEXT: [[I_LCSSA:%.*]] = phi i8 [ [[I]], [[LOOP_HEADER]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[I_LCSSA:%.*]] = phi i8 [ [[I]], %[[LOOP_HEADER]] ] ; CHECK-NEXT: ret i8 [[I_LCSSA]] ; entry: @@ -644,20 +655,21 @@ exit: ; Test case for PR51248. define void @test_sink_store_only() writeonly { ; CHECK: Function Attrs: memory(write) -; CHECK-LABEL: @test_sink_store_only( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[DIV1:%.*]] = phi i8 [ poison, [[ENTRY:%.*]] ], [ [[DIV:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP_LATCH]] ] +; CHECK-LABEL: define void @test_sink_store_only( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[DIV1:%.*]] = phi i8 [ poison, %[[ENTRY]] ], [ [[DIV:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LOOP_LATCH]] ] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I]], 4 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT:%.*]] -; CHECK: loop.latch: +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[DIV]] = sdiv i8 [[I]], 3 ; CHECK-NEXT: [[ADD]] = add i8 [[I]], 4 -; CHECK-NEXT: br label [[LOOP_HEADER]] -; CHECK: exit: -; CHECK-NEXT: [[DIV1_LCSSA:%.*]] = phi i8 [ [[DIV1]], [[LOOP_HEADER]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[DIV1_LCSSA:%.*]] = phi i8 [ [[DIV1]], %[[LOOP_HEADER]] ] ; CHECK-NEXT: store i8 [[DIV1_LCSSA]], ptr @glb, align 1 ; CHECK-NEXT: ret void ; @@ -681,21 +693,22 @@ exit: define void @test_sink_store_to_local_object_only_loop_must_execute() writeonly { ; CHECK: Function Attrs: memory(write) -; CHECK-LABEL: @test_sink_store_to_local_object_only_loop_must_execute( -; CHECK-NEXT: 
entry: +; CHECK-LABEL: define void @test_sink_store_to_local_object_only_loop_must_execute( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[DIV1:%.*]] = phi i8 [ poison, [[ENTRY:%.*]] ], [ [[DIV:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[DIV1:%.*]] = phi i8 [ poison, %[[ENTRY]] ], [ [[DIV:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LOOP_LATCH]] ] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I]], 4 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT:%.*]] -; CHECK: loop.latch: +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[DIV]] = sdiv i8 [[I]], 3 ; CHECK-NEXT: [[ADD]] = add i8 [[I]], 4 -; CHECK-NEXT: br label [[LOOP_HEADER]] -; CHECK: exit: -; CHECK-NEXT: [[DIV1_LCSSA:%.*]] = phi i8 [ [[DIV1]], [[LOOP_HEADER]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[DIV1_LCSSA:%.*]] = phi i8 [ [[DIV1]], %[[LOOP_HEADER]] ] ; CHECK-NEXT: store i8 [[DIV1_LCSSA]], ptr [[A]], align 1 ; CHECK-NEXT: ret void ; @@ -722,22 +735,23 @@ exit: ; pre-header. Make sure the writeonly attribute is dropped. 
define void @test_sink_store_to_local_object_only_loop_may_not_execute(i8 %n) writeonly { ; CHECK: Function Attrs: memory(write) -; CHECK-LABEL: @test_sink_store_to_local_object_only_loop_may_not_execute( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test_sink_store_to_local_object_only_loop_may_not_execute( +; CHECK-SAME: i8 [[N:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 ; CHECK-NEXT: [[A_PROMOTED:%.*]] = load i8, ptr [[A]], align 1 -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[DIV1:%.*]] = phi i8 [ [[A_PROMOTED]], [[ENTRY:%.*]] ], [ [[DIV:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP_LATCH]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT:%.*]] -; CHECK: loop.latch: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[DIV1:%.*]] = phi i8 [ [[A_PROMOTED]], %[[ENTRY]] ], [ [[DIV:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[DIV]] = sdiv i8 [[I]], 3 ; CHECK-NEXT: [[ADD]] = add i8 [[I]], 4 -; CHECK-NEXT: br label [[LOOP_HEADER]] -; CHECK: exit: -; CHECK-NEXT: [[DIV1_LCSSA:%.*]] = phi i8 [ [[DIV1]], [[LOOP_HEADER]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[DIV1_LCSSA:%.*]] = phi i8 [ [[DIV1]], %[[LOOP_HEADER]] ] ; CHECK-NEXT: store i8 [[DIV1_LCSSA]], ptr [[A]], align 1 ; CHECK-NEXT: ret void ; @@ -764,22 +778,23 @@ declare dereferenceable(8) noalias ptr @alloc_writeonly() writeonly define void @test_sink_store_to_noalias_call_object_only_loop_may_not_execute1(i8 %n) writeonly { ; CHECK: Function Attrs: memory(write) -; CHECK-LABEL: 
@test_sink_store_to_noalias_call_object_only_loop_may_not_execute1( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test_sink_store_to_noalias_call_object_only_loop_may_not_execute1( +; CHECK-SAME: i8 [[N:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[A:%.*]] = call noalias dereferenceable(8) ptr @alloc_writeonly() ; CHECK-NEXT: [[A_PROMOTED:%.*]] = load i8, ptr [[A]], align 1 -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[DIV1:%.*]] = phi i8 [ [[A_PROMOTED]], [[ENTRY:%.*]] ], [ [[DIV:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP_LATCH]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT:%.*]] -; CHECK: loop.latch: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[DIV1:%.*]] = phi i8 [ [[A_PROMOTED]], %[[ENTRY]] ], [ [[DIV:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[DIV]] = sdiv i8 [[I]], 3 ; CHECK-NEXT: [[ADD]] = add i8 [[I]], 4 -; CHECK-NEXT: br label [[LOOP_HEADER]] -; CHECK: exit: -; CHECK-NEXT: [[DIV1_LCSSA:%.*]] = phi i8 [ [[DIV1]], [[LOOP_HEADER]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[DIV1_LCSSA:%.*]] = phi i8 [ [[DIV1]], %[[LOOP_HEADER]] ] ; CHECK-NEXT: store i8 [[DIV1_LCSSA]], ptr [[A]], align 1 ; CHECK-NEXT: ret void ; @@ -804,17 +819,18 @@ exit: define void @test_sink_store_only_no_phi_needed() writeonly { ; CHECK: Function Attrs: memory(write) -; CHECK-LABEL: @test_sink_store_only_no_phi_needed( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[LOOP]] ] +; 
CHECK-LABEL: define void @test_sink_store_only_no_phi_needed( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I]], 4 ; CHECK-NEXT: [[DIV:%.*]] = sdiv i8 [[I]], 3 ; CHECK-NEXT: [[ADD]] = add i8 [[I]], 4 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: [[DIV_LCSSA:%.*]] = phi i8 [ [[DIV]], [[LOOP]] ] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[DIV_LCSSA:%.*]] = phi i8 [ [[DIV]], %[[LOOP]] ] ; CHECK-NEXT: store i8 [[DIV_LCSSA]], ptr @glb, align 1 ; CHECK-NEXT: ret void ; @@ -834,28 +850,29 @@ exit: } define void @sink_store_lcssa_phis(ptr %ptr, i1 %c) { -; CHECK-LABEL: @sink_store_lcssa_phis( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]] -; CHECK: loop.1.header: -; CHECK-NEXT: br label [[LOOP_2_HEADER:%.*]] -; CHECK: loop.2.header: -; CHECK-NEXT: br i1 false, label [[LOOP_3_HEADER_PREHEADER:%.*]], label [[LOOP_1_LATCH:%.*]] -; CHECK: loop.3.header.preheader: -; CHECK-NEXT: br label [[LOOP_3_HEADER:%.*]] -; CHECK: loop.3.header: -; CHECK-NEXT: [[I_11:%.*]] = phi i32 [ [[I_1:%.*]], [[LOOP_3_LATCH:%.*]] ], [ poison, [[LOOP_3_HEADER_PREHEADER]] ] -; CHECK-NEXT: [[I_1]] = phi i32 [ 1, [[LOOP_3_LATCH]] ], [ 0, [[LOOP_3_HEADER_PREHEADER]] ] -; CHECK-NEXT: br i1 true, label [[LOOP_3_LATCH]], label [[LOOP_2_LATCH:%.*]] -; CHECK: loop.3.latch: -; CHECK-NEXT: br label [[LOOP_3_HEADER]] -; CHECK: loop.2.latch: -; CHECK-NEXT: [[I_11_LCSSA:%.*]] = phi i32 [ [[I_11]], [[LOOP_3_HEADER]] ] -; CHECK-NEXT: store i32 [[I_11_LCSSA]], ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: br label [[LOOP_2_HEADER]] -; CHECK: loop.1.latch: -; CHECK-NEXT: br i1 [[C:%.*]], label [[LOOP_1_HEADER]], label [[EXIT:%.*]] -; CHECK: exit: +; CHECK-LABEL: define void @sink_store_lcssa_phis( +; 
CHECK-SAME: ptr [[PTR:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]] +; CHECK: [[LOOP_1_HEADER]]: +; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] +; CHECK: [[LOOP_2_HEADER]]: +; CHECK-NEXT: br i1 false, label %[[LOOP_3_HEADER_PREHEADER:.*]], label %[[LOOP_1_LATCH:.*]] +; CHECK: [[LOOP_3_HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_3_HEADER:.*]] +; CHECK: [[LOOP_3_HEADER]]: +; CHECK-NEXT: [[I_11:%.*]] = phi i32 [ [[I_1:%.*]], %[[LOOP_3_LATCH:.*]] ], [ poison, %[[LOOP_3_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[I_1]] = phi i32 [ 1, %[[LOOP_3_LATCH]] ], [ 0, %[[LOOP_3_HEADER_PREHEADER]] ] +; CHECK-NEXT: br i1 true, label %[[LOOP_3_LATCH]], label %[[LOOP_2_LATCH:.*]] +; CHECK: [[LOOP_3_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_3_HEADER]] +; CHECK: [[LOOP_2_LATCH]]: +; CHECK-NEXT: [[I_11_LCSSA:%.*]] = phi i32 [ [[I_11]], %[[LOOP_3_HEADER]] ] +; CHECK-NEXT: store i32 [[I_11_LCSSA]], ptr [[PTR]], align 4 +; CHECK-NEXT: br label %[[LOOP_2_HEADER]] +; CHECK: [[LOOP_1_LATCH]]: +; CHECK-NEXT: br i1 [[C]], label %[[LOOP_1_HEADER]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -886,18 +903,19 @@ exit: } define void @cond_store_writable_dereferenceable(ptr noalias writable dereferenceable(4) %ptr) { -; CHECK-LABEL: @cond_store_writable_dereferenceable( -; CHECK-NEXT: [[PTR_PROMOTED:%.*]] = load i32, ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[V_INC1:%.*]] = phi i32 [ [[V_INC:%.*]], [[LOOP_LATCH:%.*]] ], [ [[PTR_PROMOTED]], [[TMP0:%.*]] ] +; CHECK-LABEL: define void @cond_store_writable_dereferenceable( +; CHECK-SAME: ptr noalias writable dereferenceable(4) [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR_PROMOTED:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[V_INC1:%.*]] = phi i32 [ [[V_INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[PTR_PROMOTED]], [[TMP0:%.*]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 
[[V_INC1]], 10 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[EXIT:%.*]] -; CHECK: loop.latch: +; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[V_INC]] = add i32 [[V_INC1]], 1 -; CHECK-NEXT: br label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: [[V_INC1_LCSSA:%.*]] = phi i32 [ [[V_INC1]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[V_INC1_LCSSA:%.*]] = phi i32 [ [[V_INC1]], %[[LOOP]] ] ; CHECK-NEXT: store i32 [[V_INC1_LCSSA]], ptr [[PTR]], align 4 ; CHECK-NEXT: ret void ; @@ -918,18 +936,19 @@ exit: } define void @cond_store_writable_not_sufficiently_dereferenceable(ptr noalias writable dereferenceable(2) %ptr) { -; CHECK-LABEL: @cond_store_writable_not_sufficiently_dereferenceable( -; CHECK-NEXT: [[PTR_PROMOTED:%.*]] = load i32, ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[V_INC1:%.*]] = phi i32 [ [[V_INC:%.*]], [[LOOP_LATCH:%.*]] ], [ [[PTR_PROMOTED]], [[TMP0:%.*]] ] +; CHECK-LABEL: define void @cond_store_writable_not_sufficiently_dereferenceable( +; CHECK-SAME: ptr noalias writable dereferenceable(2) [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR_PROMOTED:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[V_INC1:%.*]] = phi i32 [ [[V_INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[PTR_PROMOTED]], [[TMP0:%.*]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[V_INC1]], 10 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[EXIT:%.*]] -; CHECK: loop.latch: +; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[V_INC]] = add i32 [[V_INC1]], 1 ; CHECK-NEXT: store i32 [[V_INC]], ptr [[PTR]], align 4 -; CHECK-NEXT: br label [[LOOP]] -; CHECK: exit: +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; br label %loop @@ -954,3 +973,11 @@ exit: !3 = !{!5, !5, i64 0} !4 = !{!"int", !1} !5 = 
!{!"float", !1} +;. +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[FLOAT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"float", [[META2]]} +;. diff --git a/llvm/test/Transforms/LICM/variant-aainfo.ll b/llvm/test/Transforms/LICM/variant-aainfo.ll index 1e2a33ec990c5..4eac3f2770f67 100644 --- a/llvm/test/Transforms/LICM/variant-aainfo.ll +++ b/llvm/test/Transforms/LICM/variant-aainfo.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes=licm | FileCheck %s ; See https://discourse.llvm.org/t/rfc-dont-merge-memory-locations-in-aliassettracker/73336 @@ -8,21 +8,21 @@ define void @_Z4testP1S(ptr %s) { ; CHECK-LABEL: define void @_Z4testP1S( ; CHECK-SAME: ptr [[S:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[S_PROMOTED:%.*]] = load ptr, ptr [[S]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD_PTR_I_LCSSA:%.*]] = phi ptr [ [[ADD_PTR_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: store ptr [[ADD_PTR_I_LCSSA]], ptr [[S]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[S_PROMOTED:%.*]] = load ptr, ptr [[S]], align 4, !tbaa [[ANYPTR_TBAA0:![0-9]+]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: [[ADD_PTR_I_LCSSA:%.*]] = phi ptr [ [[ADD_PTR_I:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: store ptr [[ADD_PTR_I_LCSSA]], ptr [[S]], align 4, !tbaa [[ANYPTR_TBAA0]] ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[ADD_PTR_I1:%.*]] = phi ptr [ [[S_PROMOTED]], [[ENTRY:%.*]] ], [ [[ADD_PTR_I]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ 0, 
[[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: store i32 [[I_05]], ptr [[ADD_PTR_I1]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[ADD_PTR_I1:%.*]] = phi ptr [ [[S_PROMOTED]], %[[ENTRY]] ], [ [[ADD_PTR_I]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: store i32 [[I_05]], ptr [[ADD_PTR_I1]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ADD_PTR_I]] = getelementptr inbounds i32, ptr [[ADD_PTR_I1]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_05]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 100 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; entry: br label %for.body @@ -50,10 +50,10 @@ for.body: ; preds = %entry, %for.body !6 = !{!"int", !3, i64 0} !7 = !{!2, !2, i64 0} ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"any pointer", [[META2:![0-9]+]], i64 0} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"Simple C++ TBAA"} -; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} ; CHECK: [[META5]] = !{!"int", [[META2]], i64 0} ;. 
diff --git a/llvm/test/Transforms/LoopIdiom/memmove-tbaa.ll b/llvm/test/Transforms/LoopIdiom/memmove-tbaa.ll index 881931e0ccc2c..218b7f4487cb5 100644 --- a/llvm/test/Transforms/LoopIdiom/memmove-tbaa.ll +++ b/llvm/test/Transforms/LoopIdiom/memmove-tbaa.ll @@ -1,21 +1,22 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes="loop-idiom" < %s -S | FileCheck %s define void @looper(ptr nocapture %out) { -; CHECK-LABEL: @looper( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[M:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 16 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[OUT]], ptr align 8 [[M]], i64 256, i1 false), !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: br label [[FOR_BODY4:%.*]] -; CHECK: for.body4: -; CHECK-NEXT: [[J_020:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY4]] ] +; CHECK-LABEL: define void @looper( +; CHECK-SAME: ptr captures(none) [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[M:%.*]] = getelementptr double, ptr [[OUT]], i32 16 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[OUT]], ptr align 8 [[M]], i64 256, i1 false), !tbaa [[DOUBLE_TBAA0:![0-9]+]] +; CHECK-NEXT: br label %[[FOR_BODY4:.*]] +; CHECK: [[FOR_BODY4]]: +; CHECK-NEXT: [[J_020:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY4]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[M]], i64 [[J_020]] -; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[J_020]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[J_020]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[J_020]], 31 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: 
for.cond.cleanup: +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY4]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; entry: @@ -38,20 +39,21 @@ for.cond.cleanup: ; preds = %for.cond.cleanup3 define void @looperBadMerge(ptr nocapture %out) { -; CHECK-LABEL: @looperBadMerge( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[M:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 16 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[OUT]], ptr align 8 [[M]], i64 256, i1 false), !tbaa [[TBAA4:![0-9]+]] -; CHECK-NEXT: br label [[FOR_BODY4:%.*]] -; CHECK: for.body4: -; CHECK-NEXT: [[J_020:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY4]] ] +; CHECK-LABEL: define void @looperBadMerge( +; CHECK-SAME: ptr captures(none) [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[M:%.*]] = getelementptr double, ptr [[OUT]], i32 16 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[OUT]], ptr align 8 [[M]], i64 256, i1 false), !tbaa [[CHAR_TBAA4:![0-9]+]] +; CHECK-NEXT: br label %[[FOR_BODY4:.*]] +; CHECK: [[FOR_BODY4]]: +; CHECK-NEXT: [[J_020:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY4]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[M]], i64 [[J_020]] -; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[J_020]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[J_020]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[J_020]], 31 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.cond.cleanup: +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY4]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; entry: @@ -73,20 +75,21 @@ for.cond.cleanup: ; preds = %for.cond.cleanup3 } define void 
@looperGoodMerge(ptr nocapture %out) { -; CHECK-LABEL: @looperGoodMerge( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[M:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 16 +; CHECK-LABEL: define void @looperGoodMerge( +; CHECK-SAME: ptr captures(none) [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[M:%.*]] = getelementptr double, ptr [[OUT]], i32 16 ; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[OUT]], ptr align 8 [[M]], i64 256, i1 false) -; CHECK-NEXT: br label [[FOR_BODY4:%.*]] -; CHECK: for.body4: -; CHECK-NEXT: [[J_020:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: br label %[[FOR_BODY4:.*]] +; CHECK: [[FOR_BODY4]]: +; CHECK-NEXT: [[J_020:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY4]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[M]], i64 [[J_020]] -; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[J_020]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[J_020]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[J_020]], 31 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.cond.cleanup: +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY4]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; entry: @@ -114,3 +117,10 @@ for.cond.cleanup: ; preds = %for.cond.cleanup3 !6 = !{!"double", !7, i64 0} !7 = !{!"omnipotent char", !8, i64 0} !8 = !{!"Simple C++ TBAA"} +;. +; CHECK: [[DOUBLE_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"double", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +;. 
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll new file mode 100644 index 0000000000000..db30fd23b0c9d --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=aarch64-none-elf -loop-reduce -lsr-preferred-addressing-mode=all < %s | FileCheck %s + +define i32 @postindex_loop(ptr %p, i64 %n) { +; CHECK-LABEL: define i32 @postindex_loop( +; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[P]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ] +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-NEXT: [[ADD]] = add i32 [[RET]], [[VAL]] +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + br label %for.body + +for.body: + %idx = phi i64 [ %idx.next, %for.body ], [ 0, %entry ] + %ret = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + %val = load i32, ptr %arrayidx, align 4 + %add = add i32 %ret, %val + %idx.next = add nuw nsw i64 %idx, 1 + %exitcond = icmp eq i64 %idx.next, %n + br i1 %exitcond, label %exit, label %for.body + +exit: + ret i32 %add +} + +; Preindex saves a setup instruction compared to postindex +; FIXME: We currently don't recognize that 
preindex is possible here +define i32 @preindex_loop(ptr %p, i64 %n) { +; CHECK-LABEL: define i32 @preindex_loop( +; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_BODY]] ], [ [[SCEVGEP]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ] +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-NEXT: [[ADD]] = add i32 [[RET]], [[VAL]] +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1 +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + br label %for.body + +for.body: + %idx = phi i64 [ %idx.next, %for.body ], [ 0, %entry ] + %ret = phi i32 [ %add, %for.body ], [ 0, %entry ] + %idx.next = add nuw nsw i64 %idx, 1 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx.next + %val = load i32, ptr %arrayidx, align 4 + %add = add i32 %ret, %val + %exitcond = icmp eq i64 %idx.next, %n + br i1 %exitcond, label %exit, label %for.body + +exit: + ret i32 %add +} + +; We should use offset addressing here as postindex uses an extra register. +; FIXME: We currently use postindex as we don't realize the load of val2 is also +; a use of p that needs it to be live in the loop. 
+define i64 @offset_loop(ptr %p, i64 %n) { +; CHECK-LABEL: define i64 @offset_loop( +; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[P]], %[[ENTRY]] ] +; CHECK-NEXT: [[RET:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IDX_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[VAL1:%.*]] = load i64, ptr [[LSR_IV]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i64, ptr [[P]], i64 [[VAL1]] +; CHECK-NEXT: [[VAL2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD]] = add i64 [[VAL2]], [[RET]] +; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], [[VAL1]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret i64 [[ADD]] +; +entry: + br label %for.body + +for.body: + %ret = phi i64 [ 0, %entry ], [ %add, %for.body ] + %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.body ] + %arrayidx1 = getelementptr inbounds nuw i64, ptr %p, i64 %idx + %val1 = load i64, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds nuw i64, ptr %p, i64 %val1 + %val2 = load i64, ptr %arrayidx2, align 4 + %add = add i64 %val2, %ret + %idx.next = add nuw nsw i64 %idx, 1 + %cmp = icmp eq i64 %idx.next, %val1 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i64 %add +} + +; We can't use postindex addressing on the conditional load of qval and can't +; convert the loop condition to a compare with zero, so we should instead use +; offset addressing. +; FIXME: Currently we don't notice the load of qval is conditional, and attempt +; postindex addressing anyway. 
+define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { +; CHECK-LABEL: define i32 @conditional_load( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ] +; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[PVAL]], 0 +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RET]], [[QVAL]] +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1 +; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[RET_NEXT]] +; +entry: + br label %for.body + +for.body: + %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ] + %ret = phi i32 [ %ret.next, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + %pval = load i32, ptr %arrayidx, align 4 + %tobool.not = icmp eq i32 %pval, 0 + br i1 %tobool.not, label %for.inc, label %if.then + +if.then: + %arrayidx1 = getelementptr inbounds nuw 
i32, ptr %q, i64 %idx + %qval = load i32, ptr %arrayidx1, align 4 + %add = add i32 %ret, %qval + br label %for.inc + +for.inc: + %ret.next = phi i32 [ %add, %if.then ], [ %ret, %for.body ] + %idx.next = add nuw nsw i64 %idx, 1 + %nval = load volatile i64, ptr %n, align 8 + %cmp = icmp slt i64 %idx.next, %nval + br i1 %cmp, label %for.body, label %exit + +exit: + ret i32 %ret.next +} diff --git a/llvm/test/Transforms/LoopStrengthReduce/duplicated-phis.ll b/llvm/test/Transforms/LoopStrengthReduce/duplicated-phis.ll index c59f7d9c2a41a..43389b5df8f00 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/duplicated-phis.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/duplicated-phis.ll @@ -83,3 +83,41 @@ for.end: %res.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.end.loopexit.unr-lcssa ] ret i64 %res.0.lcssa } + +define i64 @duplicated_phis_compare_uses_mul_udiv(i64 %x) { +; CHECK-LABEL: define i64 @duplicated_phis_compare_uses_mul_udiv( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MUL_2:%.*]] = shl i64 [[X]], 1 +; CHECK-NEXT: [[DIV_16:%.*]] = lshr i64 [[MUL_2]], 4 +; CHECK-NEXT: [[MASKED:%.*]] = and i64 [[DIV_16]], 1152921504606846974 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void @clobber() +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 2 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[MASKED]], [[IV_1_NEXT]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i64 [[IV_1_NEXT]] +; +entry: + %mul.2 = shl i64 %x, 1 + %div.16 = lshr exact i64 %mul.2, 4 + %masked = and i64 %div.16, 1152921504606846974 + br label %loop + +loop: + %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ] + %iv.2 = phi i64 [ 0, %entry ], [ %iv.2.next, %loop ] + call void @clobber() + %iv.1.next = add i64 %iv.1, 2 + %iv.2.next = add i64 %iv.2, 2 + %ec = icmp eq i64 %iv.2.next, %masked + 
br i1 %ec, label %exit, label %loop + +exit: + ret i64 %iv.1.next +} + +declare void @clobber() diff --git a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll index 89ce66767ccc9..6f48c41a2ad06 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s ; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='loop-unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s @@ -6,137 +6,138 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" ; Tests for(i) { sum = 0; for(j) sum += B[j]; A[i] = sum; } define void @test1(i32 %I, i32 %E, ptr noalias nocapture %A, ptr noalias nocapture readonly %B) #0 { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[E:%.*]], 0 -; CHECK-NEXT: [[CMPJ:%.*]] = icmp ne i32 [[I:%.*]], 0 +; CHECK-LABEL: define void @test1( +; CHECK-SAME: i32 [[I:%.*]], i32 [[E:%.*]], ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[E]], 0 +; CHECK-NEXT: [[CMPJ:%.*]] = icmp ne i32 [[I]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMPJ]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.outer.preheader: +; CHECK-NEXT: br i1 [[OR_COND]], label %[[FOR_OUTER_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_OUTER_PREHEADER]]: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1 ; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3 ; CHECK-NEXT: [[TMP1:%.*]] = icmp 
ult i32 [[TMP0]], 3 -; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_OUTER_PREHEADER_NEW:%.*]] -; CHECK: for.outer.preheader.new: +; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_OUTER_PREHEADER_NEW:.*]] +; CHECK: [[FOR_OUTER_PREHEADER_NEW]]: ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]] -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD8_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTER_PREHEADER_NEW]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_LATCH]] ] +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD8_3:%.*]], %[[FOR_LATCH:.*]] ], [ 0, %[[FOR_OUTER_PREHEADER_NEW]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_LATCH]] ] ; CHECK-NEXT: [[ADD8:%.*]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[ADD8_1:%.*]] = add nuw nsw i32 [[I]], 2 ; CHECK-NEXT: [[ADD8_2:%.*]] = add nuw nsw i32 [[I]], 3 ; CHECK-NEXT: [[ADD8_3]] = add nuw i32 [[I]], 4 ; CHECK-NEXT: [[NITER_NEXT_3]] = add i32 [[NITER]], 4 -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.inner: -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ 
[[ADD_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[J]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] ; CHECK-NEXT: [[ADD]] = add i32 [[TMP2]], [[SUM]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[J]], 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_1]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_1]] = add i32 [[TMP3]], [[SUM_1]] ; CHECK-NEXT: [[INC_1]] = add nuw i32 [[J_1]], 1 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_2]] = add i32 [[TMP4]], [[SUM_2]] ; CHECK-NEXT: 
[[INC_2]] = add nuw i32 [[J_2]], 1 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_3]] = add i32 [[TMP5]], [[SUM_3]] ; CHECK-NEXT: [[INC_3]] = add nuw i32 [[J_3]], 1 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]] -; CHECK: for.latch: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I]] -; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label %[[FOR_LATCH]], label %[[FOR_INNER]] +; CHECK: [[FOR_LATCH]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]] +; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[ARRAYIDX6]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], ptr [[ARRAYIDX6_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], ptr [[ARRAYIDX6_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8_1]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], 
ptr [[ARRAYIDX6_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], ptr [[ARRAYIDX6_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8_2]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], ptr [[ARRAYIDX6_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], ptr [[ARRAYIDX6_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.end.loopexit.unr-lcssa.loopexit: -; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD8_3]], [[FOR_LATCH]] ] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]] -; CHECK: for.end.loopexit.unr-lcssa: -; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_OUTER]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD8_3]], %[[FOR_LATCH]] ] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT_UNR_LCSSA]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, %[[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], %[[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.outer.epil.preheader: -; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]] -; CHECK: for.outer.epil: -; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]] -; CHECK: for.inner.epil: -; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ 
[[ADD_EPIL:%.*]], [[FOR_INNER_EPIL]] ] +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_OUTER_EPIL_PREHEADER:.*]], label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[FOR_OUTER_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_OUTER_EPIL:.*]] +; CHECK: [[FOR_OUTER_EPIL]]: +; CHECK-NEXT: br label %[[FOR_INNER_EPIL:.*]] +; CHECK: [[FOR_INNER_EPIL]]: +; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] ; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL]] = add i32 [[TMP6]], [[SUM_EPIL]] ; CHECK-NEXT: [[INC_EPIL]] = add nuw i32 [[J_EPIL]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[INC_EPIL]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label [[FOR_LATCH_EPIL:%.*]], label [[FOR_INNER_EPIL]] -; CHECK: for.latch.epil: -; CHECK-NEXT: [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], [[FOR_INNER_EPIL]] ] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label %[[FOR_LATCH_EPIL:.*]], label %[[FOR_INNER_EPIL]] +; CHECK: [[FOR_LATCH_EPIL]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], %[[FOR_INNER_EPIL]] ] ; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_UNR]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL]], ptr [[ARRAYIDX6_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL]], ptr [[ARRAYIDX6_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD8_EPIL:%.*]] = add nuw i32 [[I_UNR]], 1 ; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 1, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_OUTER_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]] -; CHECK: for.outer.epil.1: -; 
CHECK-NEXT: br label [[FOR_INNER_EPIL_1:%.*]] -; CHECK: for.inner.epil.1: -; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[INC_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] -; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_OUTER_EPIL_1:.*]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA:.*]] +; CHECK: [[FOR_OUTER_EPIL_1]]: +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_1:.*]] +; CHECK: [[FOR_INNER_EPIL_1]]: +; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ [[INC_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] ; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL_1]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL_1]] = add i32 [[TMP7]], [[SUM_EPIL_1]] ; CHECK-NEXT: [[INC_EPIL_1]] = add nuw i32 [[J_EPIL_1]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_1:%.*]] = icmp eq i32 [[INC_EPIL_1]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label [[FOR_LATCH_EPIL_1:%.*]], label [[FOR_INNER_EPIL_1]] -; CHECK: for.latch.epil.1: -; CHECK-NEXT: [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], [[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label %[[FOR_LATCH_EPIL_1:.*]], label %[[FOR_INNER_EPIL_1]] +; CHECK: [[FOR_LATCH_EPIL_1]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], %[[FOR_INNER_EPIL_1]] ] ; CHECK-NEXT: [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8_EPIL]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_1]], ptr [[ARRAYIDX6_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_1]], ptr [[ARRAYIDX6_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; 
CHECK-NEXT: [[ADD8_EPIL_1:%.*]] = add nuw i32 [[I_UNR]], 2 ; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 2, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_OUTER_EPIL_2:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.outer.epil.2: -; CHECK-NEXT: br label [[FOR_INNER_EPIL_2:%.*]] -; CHECK: for.inner.epil.2: -; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] -; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[ADD_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_OUTER_EPIL_2:.*]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: [[FOR_OUTER_EPIL_2]]: +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_2:.*]] +; CHECK: [[FOR_INNER_EPIL_2]]: +; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_2]] ], [ [[ADD_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] ; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL_2]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL_2]] = add i32 [[TMP8]], [[SUM_EPIL_2]] ; CHECK-NEXT: [[INC_EPIL_2]] = add nuw i32 [[J_EPIL_2]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_2:%.*]] = icmp eq i32 [[INC_EPIL_2]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label [[FOR_LATCH_EPIL_2:%.*]], label [[FOR_INNER_EPIL_2]] -; CHECK: for.latch.epil.2: -; CHECK-NEXT: [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], [[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label %[[FOR_LATCH_EPIL_2:.*]], label %[[FOR_INNER_EPIL_2]] +; CHECK: [[FOR_LATCH_EPIL_2]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], %[[FOR_INNER_EPIL_2]] ] ; CHECK-NEXT: 
[[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8_EPIL_1]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_2]], ptr [[ARRAYIDX6_EPIL_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.end.loopexit.epilog-lcssa: -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_2]], ptr [[ARRAYIDX6_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]: +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; entry: @@ -181,144 +182,145 @@ for.end: ; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i] = sum; } ; A[i] load/store dependency should not block unroll-and-jam define void @test2(i32 %I, i32 %E, ptr noalias nocapture %A, ptr noalias nocapture readonly %B) #0 { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[E:%.*]], 0 -; CHECK-NEXT: [[CMP125:%.*]] = icmp ne i32 [[I:%.*]], 0 +; CHECK-LABEL: define void @test2( +; CHECK-SAME: i32 [[I:%.*]], i32 [[E:%.*]], ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[E]], 0 +; CHECK-NEXT: [[CMP125:%.*]] = icmp ne i32 [[I]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP125]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END10:%.*]] -; CHECK: for.outer.preheader: +; CHECK-NEXT: br i1 [[OR_COND]], label %[[FOR_OUTER_PREHEADER:.*]], label %[[FOR_END10:.*]] +; CHECK: [[FOR_OUTER_PREHEADER]]: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1 ; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3 -; CHECK-NEXT: br i1 [[TMP1]], 
label [[FOR_END10_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_OUTER_PREHEADER_NEW:%.*]] -; CHECK: for.outer.preheader.new: +; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_END10_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_OUTER_PREHEADER_NEW:.*]] +; CHECK: [[FOR_OUTER_PREHEADER_NEW]]: ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]] -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD9_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTER_PREHEADER_NEW]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_LATCH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD9_3:%.*]], %[[FOR_LATCH:.*]] ], [ 0, %[[FOR_OUTER_PREHEADER_NEW]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_LATCH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9:%.*]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD9]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_1:%.*]] = add nuw nsw i32 [[I]], 2 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD9_1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_2:%.*]] = add nuw nsw i32 [[I]], 3 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr 
inbounds i32, ptr [[A]], i32 [[ADD9_2]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_3]] = add nuw i32 [[I]], 4 ; CHECK-NEXT: [[NITER_NEXT_3]] = add i32 [[NITER]], 4 -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.inner: -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[TMP2]], [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ [[TMP3]], [[FOR_OUTER]] ], [ [[ADD_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ [[TMP4]], [[FOR_OUTER]] ], [ [[ADD_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ [[TMP5]], [[FOR_OUTER]] ], [ [[ADD_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[J]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[TMP2]], %[[FOR_OUTER]] ], [ [[ADD:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ [[TMP3]], %[[FOR_OUTER]] ], [ [[ADD_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ [[TMP4]], %[[FOR_OUTER]] ], [ [[ADD_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_3:%.*]] = phi 
i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ [[TMP5]], %[[FOR_OUTER]] ], [ [[ADD_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD]] = add i32 [[TMP6]], [[SUM]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[J]], 1 ; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_1]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX6_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX6_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_1]] = add i32 [[TMP7]], [[SUM_1]] ; CHECK-NEXT: [[INC_1]] = add nuw i32 [[J_1]], 1 ; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_2]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_2]] = add i32 [[TMP8]], [[SUM_2]] ; CHECK-NEXT: [[INC_2]] = add nuw i32 [[J_2]], 1 ; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_3]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_3]] = add i32 [[TMP9]], [[SUM_3]] ; CHECK-NEXT: [[INC_3]] = add nuw i32 [[J_3]], 1 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]] -; CHECK: for.latch: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], [[FOR_INNER]] ] -; CHECK-NEXT: 
store i32 [[ADD_LCSSA]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label %[[FOR_LATCH]], label %[[FOR_INNER]] +; CHECK: [[FOR_LATCH]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], %[[FOR_INNER]] ] +; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], ptr [[ARRAYIDX_1]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], ptr [[ARRAYIDX_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END10_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: for.end10.loopexit.unr-lcssa.loopexit: -; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD9_3]], [[FOR_LATCH]] ] -; CHECK-NEXT: br label [[FOR_END10_LOOPEXIT_UNR_LCSSA]] -; CHECK: for.end10.loopexit.unr-lcssa: -; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], [[FOR_END10_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label %[[FOR_END10_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_OUTER]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[FOR_END10_LOOPEXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD9_3]], %[[FOR_LATCH]] ] +; CHECK-NEXT: br label %[[FOR_END10_LOOPEXIT_UNR_LCSSA]] +; 
CHECK: [[FOR_END10_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, %[[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], %[[FOR_END10_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END10_LOOPEXIT:%.*]] -; CHECK: for.outer.epil.preheader: -; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]] -; CHECK: for.outer.epil: +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_OUTER_EPIL_PREHEADER:.*]], label %[[FOR_END10_LOOPEXIT:.*]] +; CHECK: [[FOR_OUTER_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_OUTER_EPIL:.*]] +; CHECK: [[FOR_OUTER_EPIL]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_UNR]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]] -; CHECK: for.inner.epil: -; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ [[TMP10]], [[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], [[FOR_INNER_EPIL]] ] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL:.*]] +; CHECK: [[FOR_INNER_EPIL]]: +; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ [[TMP10]], %[[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] ; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX6_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX6_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL]] = add i32 [[TMP11]], [[SUM_EPIL]] ; CHECK-NEXT: [[INC_EPIL]] = add nuw i32 [[J_EPIL]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i32 
[[INC_EPIL]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label [[FOR_LATCH_EPIL:%.*]], label [[FOR_INNER_EPIL]] -; CHECK: for.latch.epil: -; CHECK-NEXT: [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL]], ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label %[[FOR_LATCH_EPIL:.*]], label %[[FOR_INNER_EPIL]] +; CHECK: [[FOR_LATCH_EPIL]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL]], ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_EPIL:%.*]] = add nuw i32 [[I_UNR]], 1 ; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 1, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_OUTER_EPIL_1:%.*]], label [[FOR_END10_LOOPEXIT_EPILOG_LCSSA:%.*]] -; CHECK: for.outer.epil.1: +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_OUTER_EPIL_1:.*]], label %[[FOR_END10_LOOPEXIT_EPILOG_LCSSA:.*]] +; CHECK: [[FOR_OUTER_EPIL_1]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD9_EPIL]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL_1:%.*]] -; CHECK: for.inner.epil.1: -; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[INC_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] -; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ [[TMP12]], [[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_1:.*]] +; CHECK: [[FOR_INNER_EPIL_1]]: +; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ [[INC_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ [[TMP12]], %[[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] ; CHECK-NEXT: 
[[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL_1]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX6_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX6_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL_1]] = add i32 [[TMP13]], [[SUM_EPIL_1]] ; CHECK-NEXT: [[INC_EPIL_1]] = add nuw i32 [[J_EPIL_1]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_1:%.*]] = icmp eq i32 [[INC_EPIL_1]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label [[FOR_LATCH_EPIL_1:%.*]], label [[FOR_INNER_EPIL_1]] -; CHECK: for.latch.epil.1: -; CHECK-NEXT: [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], [[FOR_INNER_EPIL_1]] ] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_1]], ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label %[[FOR_LATCH_EPIL_1:.*]], label %[[FOR_INNER_EPIL_1]] +; CHECK: [[FOR_LATCH_EPIL_1]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], %[[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_1]], ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_EPIL_1:%.*]] = add nuw i32 [[I_UNR]], 2 ; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 2, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_OUTER_EPIL_2:%.*]], label [[FOR_END10_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.outer.epil.2: +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_OUTER_EPIL_2:.*]], label %[[FOR_END10_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: [[FOR_OUTER_EPIL_2]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD9_EPIL_1]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL_2:%.*]] -; CHECK: for.inner.epil.2: -; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] -; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ [[TMP14]], [[FOR_OUTER_EPIL_2]] ], 
[ [[ADD_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_2:.*]] +; CHECK: [[FOR_INNER_EPIL_2]]: +; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ [[TMP14]], %[[FOR_OUTER_EPIL_2]] ], [ [[ADD_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] ; CHECK-NEXT: [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL_2]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX6_EPIL_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX6_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL_2]] = add i32 [[TMP15]], [[SUM_EPIL_2]] ; CHECK-NEXT: [[INC_EPIL_2]] = add nuw i32 [[J_EPIL_2]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_2:%.*]] = icmp eq i32 [[INC_EPIL_2]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label [[FOR_LATCH_EPIL_2:%.*]], label [[FOR_INNER_EPIL_2]] -; CHECK: for.latch.epil.2: -; CHECK-NEXT: [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], [[FOR_INNER_EPIL_2]] ] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_2]], ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_END10_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.end10.loopexit.epilog-lcssa: -; CHECK-NEXT: br label [[FOR_END10_LOOPEXIT]] -; CHECK: for.end10.loopexit: -; CHECK-NEXT: br label [[FOR_END10]] -; CHECK: for.end10: +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label %[[FOR_LATCH_EPIL_2:.*]], label %[[FOR_INNER_EPIL_2]] +; CHECK: [[FOR_LATCH_EPIL_2]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], %[[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_2]], ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_END10_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: [[FOR_END10_LOOPEXIT_EPILOG_LCSSA]]: +; CHECK-NEXT: br label %[[FOR_END10_LOOPEXIT]] +; CHECK: 
[[FOR_END10_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END10]] +; CHECK: [[FOR_END10]]: ; CHECK-NEXT: ret void ; entry: @@ -363,61 +365,62 @@ for.end10: ; Tests Complete unroll-and-jam of the outer loop define void @test3(i32 %I, i32 %E, ptr noalias nocapture %A, ptr noalias nocapture readonly %B) #0 { -; CHECK-LABEL: @test3( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[E:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_PREHEADER:%.*]] -; CHECK: for.preheader: -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.inner: -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[J]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]] +; CHECK-LABEL: define void @test3( +; CHECK-SAME: i32 [[I:%.*]], i32 [[E:%.*]], ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[E]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_PREHEADER:.*]] +; CHECK: [[FOR_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: br label 
%[[FOR_INNER:.*]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[SUM]], 10 ; CHECK-NEXT: [[ADD]] = sub i32 [[SUB]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[J]], 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[SUB_1:%.*]] = add i32 [[SUM_1]], 10 ; CHECK-NEXT: [[ADD_1]] = sub i32 [[SUB_1]], [[TMP1]] ; CHECK-NEXT: [[INC_1]] = add nuw i32 [[J_1]], 1 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_2]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[SUB_2:%.*]] = add i32 [[SUM_2]], 10 ; CHECK-NEXT: [[ADD_2]] = sub i32 [[SUB_2]], [[TMP2]] ; CHECK-NEXT: [[INC_2]] = add nuw i32 [[J_2]], 1 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds 
i32, ptr [[B]], i32 [[J_3]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[SUB_3:%.*]] = add i32 [[SUM_3]], 10 ; CHECK-NEXT: [[ADD_3]] = sub i32 [[SUB_3]], [[TMP3]] ; CHECK-NEXT: [[INC_3]] = add nuw i32 [[J_3]], 1 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH:%.*]], label [[FOR_INNER]] -; CHECK: for.latch: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], [[FOR_INNER]] ] -; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[A:%.*]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label %[[FOR_LATCH:.*]], label %[[FOR_INNER]] +; CHECK: [[FOR_LATCH]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], %[[FOR_INNER]] ] +; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 -; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], ptr [[ARRAYIDX6_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], ptr [[ARRAYIDX6_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 -; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], ptr [[ARRAYIDX6_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], ptr [[ARRAYIDX6_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 3 -; CHECK-NEXT: 
store i32 [[ADD_LCSSA_3]], ptr [[ARRAYIDX6_3]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: +; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], ptr [[ARRAYIDX6_3]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; entry: @@ -456,31 +459,32 @@ for.end: ; Tests Complete unroll-and-jam with a trip count of 1 define void @test4(i32 %I, i32 %E, ptr noalias nocapture %A, ptr noalias nocapture readonly %B) #0 { -; CHECK-LABEL: @test4( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[E:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_PREHEADER:%.*]] -; CHECK: for.preheader: -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.inner: -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[J]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]] +; CHECK-LABEL: define void @test4( +; CHECK-SAME: i32 [[I:%.*]], i32 [[E:%.*]], ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[E]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_PREHEADER:.*]] +; CHECK: [[FOR_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: br label %[[FOR_INNER:.*]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD:%.*]], 
%[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[SUM]], 10 ; CHECK-NEXT: [[ADD]] = sub i32 [[SUB]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[J]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_LATCH:%.*]], label [[FOR_INNER]] -; CHECK: for.latch: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ] -; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[A:%.*]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_LATCH:.*]], label %[[FOR_INNER]] +; CHECK: [[FOR_LATCH]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_INNER]] ] +; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; entry: @@ -520,47 +524,47 @@ for.end: ; Multiple SubLoopBlocks @a = hidden global [1 x i32] zeroinitializer, align 4 define i32 @test5() #0 { -; CHECK-LABEL: @test5( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.inner: -; CHECK-NEXT: [[INC8_SINK15:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC8:%.*]], [[FOR_INC_1:%.*]] ] -; CHECK-NEXT: [[INC8_SINK15_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC8_1:%.*]], [[FOR_INC_1]] ] -; CHECK-NEXT: br label [[FOR_INNER2:%.*]] -; CHECK: for.inner2: +; CHECK-LABEL: define i32 @test5() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: br label %[[FOR_INNER:.*]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: 
[[INC8_SINK15:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC8:%.*]], %[[FOR_INC_1:.*]] ] +; CHECK-NEXT: [[INC8_SINK15_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC8_1:%.*]], %[[FOR_INC_1]] ] +; CHECK-NEXT: br label %[[FOR_INNER2:.*]] +; CHECK: [[FOR_INNER2]]: ; CHECK-NEXT: [[L1:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[L1]], 0 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[FOR_COND4:%.*]], label [[FOR_INC:%.*]] -; CHECK: for.cond4: +; CHECK-NEXT: br i1 [[TOBOOL]], label %[[FOR_COND4:.*]], label %[[FOR_INC:.*]] +; CHECK: [[FOR_COND4]]: ; CHECK-NEXT: [[L0:%.*]] = load i32, ptr getelementptr inbounds ([1 x i32], ptr @a, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TOBOOL_1:%.*]] = icmp eq i32 [[L0]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_1]], label [[FOR_COND4A:%.*]], label [[FOR_INC]] -; CHECK: for.cond4a: -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: +; CHECK-NEXT: br i1 [[TOBOOL_1]], label %[[FOR_COND4A:.*]], label %[[FOR_INC]] +; CHECK: [[FOR_COND4A]]: +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: ; CHECK-NEXT: [[INC8]] = add nuw nsw i32 [[INC8_SINK15]], 1 ; CHECK-NEXT: [[L1_1:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: [[TOBOOL_11:%.*]] = icmp eq i32 [[L1_1]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_11]], label [[FOR_COND4_1:%.*]], label [[FOR_INC_1]] -; CHECK: for.latch: -; CHECK-NEXT: [[DOTLCSSA_1:%.*]] = phi i32 [ [[L2_1:%.*]], [[FOR_INC_1]] ] -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: [[DOTLCSSA_LCSSA:%.*]] = phi i32 [ [[DOTLCSSA_1]], [[FOR_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[TOBOOL_11]], label %[[FOR_COND4_1:.*]], label %[[FOR_INC_1]] +; CHECK: [[FOR_LATCH:.*]]: +; CHECK-NEXT: [[DOTLCSSA_1:%.*]] = phi i32 [ [[L2_1:%.*]], %[[FOR_INC_1]] ] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[DOTLCSSA_LCSSA:%.*]] = phi i32 [ [[DOTLCSSA_1]], %[[FOR_LATCH]] ] ; CHECK-NEXT: ret i32 0 -; CHECK: for.cond4.1: +; CHECK: [[FOR_COND4_1]]: ; CHECK-NEXT: [[L0_1:%.*]] = load 
i32, ptr getelementptr inbounds ([1 x i32], ptr @a, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TOBOOL_1_1:%.*]] = icmp eq i32 [[L0_1]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_1_1]], label [[FOR_COND4A_1:%.*]], label [[FOR_INC_1]] -; CHECK: for.cond4a.1: -; CHECK-NEXT: br label [[FOR_INC_1]] -; CHECK: for.inc.1: -; CHECK-NEXT: [[L2_1]] = phi i32 [ 0, [[FOR_INC]] ], [ 1, [[FOR_COND4_1]] ], [ 2, [[FOR_COND4A_1]] ] +; CHECK-NEXT: br i1 [[TOBOOL_1_1]], label %[[FOR_COND4A_1:.*]], label %[[FOR_INC_1]] +; CHECK: [[FOR_COND4A_1]]: +; CHECK-NEXT: br label %[[FOR_INC_1]] +; CHECK: [[FOR_INC_1]]: +; CHECK-NEXT: [[L2_1]] = phi i32 [ 0, %[[FOR_INC]] ], [ 1, %[[FOR_COND4_1]] ], [ 2, %[[FOR_COND4A_1]] ] ; CHECK-NEXT: [[INC8_1]] = add nuw nsw i32 [[INC8_SINK15_1]], 1 ; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC8_1]], 3 -; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[FOR_LATCH]], label [[FOR_INNER]] +; CHECK-NEXT: br i1 [[EXITCOND_1]], label %[[FOR_LATCH]], label %[[FOR_INNER]] ; entry: br label %for.outer @@ -608,57 +612,57 @@ for.end: ; Test odd uses of phi nodes @f = hidden global i32 0, align 4 define i32 @test6() #0 { -; CHECK-LABEL: @test6( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[F_PROMOTED10:%.*]] = load i32, ptr @f, align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br i1 false, label [[FOR_END_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]] -; CHECK: entry.new: -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: [[INC5_SINK9:%.*]] = phi i32 [ 2, [[ENTRY_NEW]] ], [ [[INC5_3:%.*]], [[FOR_LATCH:%.*]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_LATCH]] ] +; CHECK-LABEL: define i32 @test6() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[F_PROMOTED10:%.*]] = load i32, ptr @f, align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br i1 false, label %[[FOR_END_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: [[INC5_SINK9:%.*]] = phi i32 [ 2, 
%[[ENTRY_NEW]] ], [ [[INC5_3:%.*]], %[[FOR_LATCH:.*]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_LATCH]] ] ; CHECK-NEXT: [[INC5_3]] = add nuw nsw i32 [[INC5_SINK9]], 4 ; CHECK-NEXT: [[NITER_NEXT_3]] = add nuw nsw i32 [[NITER]], 4 -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.inner: -; CHECK-NEXT: [[INC_SINK8:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[INC_SINK8_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[INC_SINK8_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[INC_SINK8_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[INC_SINK8:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[INC_SINK8_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[INC_SINK8_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[INC_SINK8_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_3:%.*]], %[[FOR_INNER]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[INC_SINK8]], 1 ; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC_SINK8_1]], 1 ; CHECK-NEXT: [[INC_2]] = add nuw nsw i32 [[INC_SINK8_2]], 1 ; CHECK-NEXT: [[INC_3]] = add nuw nsw i32 [[INC_SINK8_3]], 1 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp ne i32 [[INC_3]], 7 -; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_INNER]], label [[FOR_LATCH]] -; CHECK: for.latch: -; CHECK-NEXT: br i1 false, label [[FOR_OUTER]], label [[FOR_END_UNR_LCSSA_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: for.end.unr-lcssa.loopexit: -; CHECK-NEXT: [[DOTLCSSA_LCSSA_PH_PH:%.*]] = phi i32 [ 2, [[FOR_LATCH]] ] -; CHECK-NEXT: [[INC_LCSSA_LCSSA_PH_PH:%.*]] = phi i32 [ 7, [[FOR_LATCH]] ] -; CHECK-NEXT: [[P0_UNR_PH:%.*]] = phi i32 [ 2, [[FOR_LATCH]] ] -; CHECK-NEXT: 
br label [[FOR_END_UNR_LCSSA]] -; CHECK: for.end.unr-lcssa: -; CHECK-NEXT: [[DOTLCSSA_LCSSA_PH:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[DOTLCSSA_LCSSA_PH_PH]], [[FOR_END_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[INC_LCSSA_LCSSA_PH:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[INC_LCSSA_LCSSA_PH_PH]], [[FOR_END_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[P0_UNR:%.*]] = phi i32 [ [[F_PROMOTED10]], [[ENTRY]] ], [ [[P0_UNR_PH]], [[FOR_END_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: br i1 true, label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.outer.epil.preheader: -; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]] -; CHECK: for.outer.epil: -; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]] -; CHECK: for.inner.epil: -; CHECK-NEXT: [[P1_EPIL:%.*]] = phi i32 [ [[P0_UNR]], [[FOR_OUTER_EPIL]] ], [ 2, [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: [[INC_SINK8_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], [[FOR_INNER_EPIL]] ] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label %[[FOR_INNER]], label %[[FOR_LATCH]] +; CHECK: [[FOR_LATCH]]: +; CHECK-NEXT: br i1 false, label %[[FOR_OUTER]], label %[[FOR_END_UNR_LCSSA_LOOPEXIT:.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[FOR_END_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[DOTLCSSA_LCSSA_PH_PH:%.*]] = phi i32 [ 2, %[[FOR_LATCH]] ] +; CHECK-NEXT: [[INC_LCSSA_LCSSA_PH_PH:%.*]] = phi i32 [ 7, %[[FOR_LATCH]] ] +; CHECK-NEXT: [[P0_UNR_PH:%.*]] = phi i32 [ 2, %[[FOR_LATCH]] ] +; CHECK-NEXT: br label %[[FOR_END_UNR_LCSSA]] +; CHECK: [[FOR_END_UNR_LCSSA]]: +; CHECK-NEXT: [[DOTLCSSA_LCSSA_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[DOTLCSSA_LCSSA_PH_PH]], %[[FOR_END_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[INC_LCSSA_LCSSA_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[INC_LCSSA_LCSSA_PH_PH]], %[[FOR_END_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[P0_UNR:%.*]] = phi i32 [ [[F_PROMOTED10]], %[[ENTRY]] ], [ [[P0_UNR_PH]], %[[FOR_END_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: br i1 true, label %[[FOR_OUTER_EPIL_PREHEADER:.*]], label 
%[[FOR_END:.*]] +; CHECK: [[FOR_OUTER_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_OUTER_EPIL:.*]] +; CHECK: [[FOR_OUTER_EPIL]]: +; CHECK-NEXT: br label %[[FOR_INNER_EPIL:.*]] +; CHECK: [[FOR_INNER_EPIL]]: +; CHECK-NEXT: [[P1_EPIL:%.*]] = phi i32 [ [[P0_UNR]], %[[FOR_OUTER_EPIL]] ], [ 2, %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: [[INC_SINK8_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] ; CHECK-NEXT: [[INC_EPIL]] = add nuw nsw i32 [[INC_SINK8_EPIL]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp ne i32 [[INC_EPIL]], 7 -; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label [[FOR_INNER_EPIL]], label [[FOR_LATCH_EPIL:%.*]] -; CHECK: for.latch.epil: -; CHECK-NEXT: [[DOTLCSSA_EPIL:%.*]] = phi i32 [ [[P1_EPIL]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: [[DOTLCSSA_LCSSA:%.*]] = phi i32 [ [[DOTLCSSA_LCSSA_PH]], [[FOR_END_UNR_LCSSA]] ], [ [[DOTLCSSA_EPIL]], [[FOR_LATCH_EPIL]] ] -; CHECK-NEXT: [[INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[INC_LCSSA_LCSSA_PH]], [[FOR_END_UNR_LCSSA]] ], [ 7, [[FOR_LATCH_EPIL]] ] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label %[[FOR_INNER_EPIL]], label %[[FOR_LATCH_EPIL:.*]] +; CHECK: [[FOR_LATCH_EPIL]]: +; CHECK-NEXT: [[DOTLCSSA_EPIL:%.*]] = phi i32 [ [[P1_EPIL]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[DOTLCSSA_LCSSA:%.*]] = phi i32 [ [[DOTLCSSA_LCSSA_PH]], %[[FOR_END_UNR_LCSSA]] ], [ [[DOTLCSSA_EPIL]], %[[FOR_LATCH_EPIL]] ] +; CHECK-NEXT: [[INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[INC_LCSSA_LCSSA_PH]], %[[FOR_END_UNR_LCSSA]] ], [ 7, %[[FOR_LATCH_EPIL]] ] ; CHECK-NEXT: ret i32 0 ; entry: @@ -693,159 +697,160 @@ for.end: ; Has a positive dependency between two stores. Still valid. 
; The negative dependecy is in unroll-and-jam-disabled.ll define void @test7(i32 %I, i32 %E, ptr noalias nocapture %A, ptr noalias nocapture readonly %B) #0 { -; CHECK-LABEL: @test7( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[E:%.*]], 0 -; CHECK-NEXT: [[CMP128:%.*]] = icmp ne i32 [[I:%.*]], 0 +; CHECK-LABEL: define void @test7( +; CHECK-SAME: i32 [[I:%.*]], i32 [[E:%.*]], ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[E]], 0 +; CHECK-NEXT: [[CMP128:%.*]] = icmp ne i32 [[I]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP128]], [[CMP]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.preheader: +; CHECK-NEXT: br i1 [[OR_COND]], label %[[FOR_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_PREHEADER]]: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1 ; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3 -; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_PREHEADER_NEW:%.*]] -; CHECK: for.preheader.new: +; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_PREHEADER_NEW:.*]] +; CHECK: [[FOR_PREHEADER_NEW]]: ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]] -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_PREHEADER_NEW]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_LATCH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD_3:%.*]], %[[FOR_LATCH:.*]] ], [ 0, %[[FOR_PREHEADER_NEW]] ] +; 
CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[FOR_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_LATCH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[I]], 2 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_1]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_1]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[I]], 3 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_2]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_2]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_3]] = add nuw i32 [[I]], 4 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_3]] -; 
CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[NITER_NEXT_3]] = add i32 [[NITER]], 4 -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.latch: -; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i32 [ [[ADD9:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD9_LCSSA_1:%.*]] = phi i32 [ [[ADD9_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD9_LCSSA_2:%.*]] = phi i32 [ [[ADD9_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD9_LCSSA_3:%.*]] = phi i32 [ [[ADD9_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: store i32 [[ADD9_LCSSA]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_1]], ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_2]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_3]], ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] +; CHECK: [[FOR_LATCH]]: +; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i32 [ [[ADD9:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD9_LCSSA_1:%.*]] = phi i32 [ [[ADD9_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD9_LCSSA_2:%.*]] = phi i32 [ [[ADD9_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD9_LCSSA_3:%.*]] = phi i32 [ [[ADD9_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: store i32 [[ADD9_LCSSA]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_1]], ptr [[ARRAYIDX_1]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_2]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_3]], ptr [[ARRAYIDX_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: for.inner: -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD9]], 
[[FOR_INNER]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD10:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD9_1]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD10_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD9_2]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD10_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD9_3]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD10_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[J]] -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_OUTER]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD9]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD10:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD9_1]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD10_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD9_2]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD10_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD9_3]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD10_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J]] +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9]] = add i32 [[L1]], [[SUM]] ; CHECK-NEXT: [[ADD10]] = add nuw 
i32 [[J]], 1 ; CHECK-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_1]] -; CHECK-NEXT: [[L1_1:%.*]] = load i32, ptr [[ARRAYIDX7_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_1:%.*]] = load i32, ptr [[ARRAYIDX7_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_1]] = add i32 [[L1_1]], [[SUM_1]] ; CHECK-NEXT: [[ADD10_1]] = add nuw i32 [[J_1]], 1 ; CHECK-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_2]] -; CHECK-NEXT: [[L1_2:%.*]] = load i32, ptr [[ARRAYIDX7_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_2:%.*]] = load i32, ptr [[ARRAYIDX7_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_2]] = add i32 [[L1_2]], [[SUM_2]] ; CHECK-NEXT: [[ADD10_2]] = add nuw i32 [[J_2]], 1 ; CHECK-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_3]] -; CHECK-NEXT: [[L1_3:%.*]] = load i32, ptr [[ARRAYIDX7_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_3:%.*]] = load i32, ptr [[ARRAYIDX7_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_3]] = add i32 [[L1_3]], [[SUM_3]] ; CHECK-NEXT: [[ADD10_3]] = add nuw i32 [[J_3]], 1 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[ADD10_3]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]] -; CHECK: for.end.loopexit.unr-lcssa.loopexit: -; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD_3]], [[FOR_LATCH]] ] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]] -; CHECK: for.end.loopexit.unr-lcssa: -; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_PREHEADER]] ], [ [[I_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label %[[FOR_LATCH]], label %[[FOR_INNER]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD_3]], %[[FOR_LATCH]] ] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT_UNR_LCSSA]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, %[[FOR_PREHEADER]] ], [ [[I_UNR_PH]], 
%[[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.outer.epil.preheader: -; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]] -; CHECK: for.outer.epil: +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_OUTER_EPIL_PREHEADER:.*]], label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[FOR_OUTER_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_OUTER_EPIL:.*]] +; CHECK: [[FOR_OUTER_EPIL]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_UNR]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL:%.*]] = add nuw i32 [[I_UNR]], 1 ; CHECK-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_EPIL]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]] -; CHECK: for.inner.epil: -; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD9_EPIL:%.*]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD10_EPIL:%.*]], [[FOR_INNER_EPIL]] ] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_EPIL]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL:.*]] +; CHECK: [[FOR_INNER_EPIL]]: +; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[ADD9_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[ADD10_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] ; CHECK-NEXT: [[ARRAYIDX7_EPIL:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL]] -; CHECK-NEXT: [[L1_EPIL:%.*]] = load i32, ptr [[ARRAYIDX7_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_EPIL:%.*]] = load i32, ptr [[ARRAYIDX7_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: 
[[ADD9_EPIL]] = add i32 [[L1_EPIL]], [[SUM_EPIL]] ; CHECK-NEXT: [[ADD10_EPIL]] = add nuw i32 [[J_EPIL]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[ADD10_EPIL]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label [[FOR_LATCH_EPIL:%.*]], label [[FOR_INNER_EPIL]] -; CHECK: for.latch.epil: -; CHECK-NEXT: [[ADD9_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD9_EPIL]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL]], ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label %[[FOR_LATCH_EPIL:.*]], label %[[FOR_INNER_EPIL]] +; CHECK: [[FOR_LATCH_EPIL]]: +; CHECK-NEXT: [[ADD9_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD9_EPIL]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL]], ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 1, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_OUTER_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]] -; CHECK: for.outer.epil.1: +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_OUTER_EPIL_1:.*]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA:.*]] +; CHECK: [[FOR_OUTER_EPIL_1]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL_1:%.*]] = add nuw i32 [[I_UNR]], 2 ; CHECK-NEXT: [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL_1]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_EPIL_1]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL_1:%.*]] -; CHECK: for.inner.epil.1: -; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD9_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] -; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD10_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: store i32 2, ptr 
[[ARRAYIDX2_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_1:.*]] +; CHECK: [[FOR_INNER_EPIL_1]]: +; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ [[ADD9_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ [[ADD10_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] ; CHECK-NEXT: [[ARRAYIDX7_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL_1]] -; CHECK-NEXT: [[L1_EPIL_1:%.*]] = load i32, ptr [[ARRAYIDX7_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_EPIL_1:%.*]] = load i32, ptr [[ARRAYIDX7_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_EPIL_1]] = add i32 [[L1_EPIL_1]], [[SUM_EPIL_1]] ; CHECK-NEXT: [[ADD10_EPIL_1]] = add nuw i32 [[J_EPIL_1]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_1:%.*]] = icmp eq i32 [[ADD10_EPIL_1]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label [[FOR_LATCH_EPIL_1:%.*]], label [[FOR_INNER_EPIL_1]] -; CHECK: for.latch.epil.1: -; CHECK-NEXT: [[ADD9_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD9_EPIL_1]], [[FOR_INNER_EPIL_1]] ] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL_1]], ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label %[[FOR_LATCH_EPIL_1:.*]], label %[[FOR_INNER_EPIL_1]] +; CHECK: [[FOR_LATCH_EPIL_1]]: +; CHECK-NEXT: [[ADD9_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD9_EPIL_1]], %[[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL_1]], ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 2, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_OUTER_EPIL_2:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.outer.epil.2: +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_OUTER_EPIL_2:.*]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: [[FOR_OUTER_EPIL_2]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL_1]] -; CHECK-NEXT: store 
i32 0, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL_2:%.*]] = add nuw i32 [[I_UNR]], 3 ; CHECK-NEXT: [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL_2]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_EPIL_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL_2:%.*]] -; CHECK: for.inner.epil.2: -; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[ADD9_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] -; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[ADD10_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX2_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_2:.*]] +; CHECK: [[FOR_INNER_EPIL_2]]: +; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_2]] ], [ [[ADD9_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_2]] ], [ [[ADD10_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] ; CHECK-NEXT: [[ARRAYIDX7_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL_2]] -; CHECK-NEXT: [[L1_EPIL_2:%.*]] = load i32, ptr [[ARRAYIDX7_EPIL_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_EPIL_2:%.*]] = load i32, ptr [[ARRAYIDX7_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_EPIL_2]] = add i32 [[L1_EPIL_2]], [[SUM_EPIL_2]] ; CHECK-NEXT: [[ADD10_EPIL_2]] = add nuw i32 [[J_EPIL_2]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_2:%.*]] = icmp eq i32 [[ADD10_EPIL_2]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label [[FOR_LATCH_EPIL_2:%.*]], label [[FOR_INNER_EPIL_2]] -; CHECK: for.latch.epil.2: -; CHECK-NEXT: [[ADD9_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD9_EPIL_2]], [[FOR_INNER_EPIL_2]] ] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL_2]], ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: 
for.end.loopexit.epilog-lcssa: -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label %[[FOR_LATCH_EPIL_2:.*]], label %[[FOR_INNER_EPIL_2]] +; CHECK: [[FOR_LATCH_EPIL_2]]: +; CHECK-NEXT: [[ADD9_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD9_EPIL_2]], %[[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL_2]], ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]: +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; entry: @@ -888,166 +893,167 @@ for.end: ; Same as test7 with an extra outer loop nest define void @test8(i32 %I, i32 %E, ptr noalias nocapture %A, ptr noalias nocapture readonly %B) #0 { -; CHECK-LABEL: @test8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[E:%.*]], 0 -; CHECK-NEXT: [[CMP336:%.*]] = icmp eq i32 [[I:%.*]], 0 +; CHECK-LABEL: define void @test8( +; CHECK-SAME: i32 [[I:%.*]], i32 [[E:%.*]], ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[E]], 0 +; CHECK-NEXT: [[CMP336:%.*]] = icmp eq i32 [[I]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP336]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_END:%.*]], label [[FOR_PREHEADER:%.*]] -; CHECK: for.preheader: +; CHECK-NEXT: br i1 [[OR_COND]], label %[[FOR_END:.*]], label %[[FOR_PREHEADER:.*]] +; CHECK: [[FOR_PREHEADER]]: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1 -; CHECK-NEXT: br label [[FOR_OUTEST:%.*]] -; CHECK: for.outest: -; CHECK-NEXT: [[X_038:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_CLEANUP:%.*]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_OUTEST:.*]] +; CHECK: [[FOR_OUTEST]]: +; CHECK-NEXT: [[X_038:%.*]] = phi 
i32 [ [[INC:%.*]], %[[FOR_CLEANUP:.*]] ], [ 0, %[[FOR_PREHEADER]] ] ; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3 -; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_CLEANUP_UNR_LCSSA:%.*]], label [[FOR_OUTEST_NEW:%.*]] -; CHECK: for.outest.new: +; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_CLEANUP_UNR_LCSSA:.*]], label %[[FOR_OUTEST_NEW:.*]] +; CHECK: [[FOR_OUTEST_NEW]]: ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]] -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTEST_NEW]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_OUTEST_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_LATCH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD_3:%.*]], %[[FOR_LATCH:.*]] ], [ 0, %[[FOR_OUTEST_NEW]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[FOR_OUTEST_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_LATCH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[I]], 2 ; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, ptr [[A]], 
i32 [[ADD_1]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_1]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[I]], 3 ; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_2]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_2]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_3]] = add nuw i32 [[I]], 4 ; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_3]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[NITER_NEXT_3]] = add i32 [[NITER]], 4 -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.inner: -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD9:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD10:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD9_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD10_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD9_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD10_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD9_3:%.*]], [[FOR_INNER]] ] -; 
CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD10_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[J]] -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD9:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD10:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD9_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD10_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD9_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD10_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD9_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD10_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J]] +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9]] = add i32 [[L1]], [[SUM]] ; CHECK-NEXT: [[ADD10]] = add nuw i32 [[J]], 1 ; CHECK-NEXT: [[ARRAYIDX11_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_1]] -; CHECK-NEXT: [[L1_1:%.*]] = load i32, ptr [[ARRAYIDX11_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_1:%.*]] = load i32, ptr [[ARRAYIDX11_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_1]] = add i32 [[L1_1]], [[SUM_1]] ; CHECK-NEXT: [[ADD10_1]] = add nuw i32 [[J_1]], 1 ; CHECK-NEXT: [[ARRAYIDX11_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_2]] -; CHECK-NEXT: [[L1_2:%.*]] = load i32, ptr [[ARRAYIDX11_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_2:%.*]] = load i32, ptr [[ARRAYIDX11_2]], align 4, !tbaa [[INT_TBAA0]] 
; CHECK-NEXT: [[ADD9_2]] = add i32 [[L1_2]], [[SUM_2]] ; CHECK-NEXT: [[ADD10_2]] = add nuw i32 [[J_2]], 1 ; CHECK-NEXT: [[ARRAYIDX11_3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_3]] -; CHECK-NEXT: [[L1_3:%.*]] = load i32, ptr [[ARRAYIDX11_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_3:%.*]] = load i32, ptr [[ARRAYIDX11_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_3]] = add i32 [[L1_3]], [[SUM_3]] ; CHECK-NEXT: [[ADD10_3]] = add nuw i32 [[J_3]], 1 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[ADD10_3]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]] -; CHECK: for.latch: -; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i32 [ [[ADD9]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD9_LCSSA_1:%.*]] = phi i32 [ [[ADD9_1]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD9_LCSSA_2:%.*]] = phi i32 [ [[ADD9_2]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD9_LCSSA_3:%.*]] = phi i32 [ [[ADD9_3]], [[FOR_INNER]] ] -; CHECK-NEXT: store i32 [[ADD9_LCSSA]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_1]], ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_2]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_3]], ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label %[[FOR_LATCH]], label %[[FOR_INNER]] +; CHECK: [[FOR_LATCH]]: +; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i32 [ [[ADD9]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD9_LCSSA_1:%.*]] = phi i32 [ [[ADD9_1]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD9_LCSSA_2:%.*]] = phi i32 [ [[ADD9_2]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD9_LCSSA_3:%.*]] = phi i32 [ [[ADD9_3]], %[[FOR_INNER]] ] +; CHECK-NEXT: store i32 [[ADD9_LCSSA]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_1]], ptr [[ARRAYIDX_1]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_2]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: store i32 
[[ADD9_LCSSA_3]], ptr [[ARRAYIDX_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_CLEANUP_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop [[LOOP9:![0-9]+]] -; CHECK: for.cleanup.unr-lcssa.loopexit: -; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD_3]], [[FOR_LATCH]] ] -; CHECK-NEXT: br label [[FOR_CLEANUP_UNR_LCSSA]] -; CHECK: for.cleanup.unr-lcssa: -; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_OUTEST]] ], [ [[I_UNR_PH]], [[FOR_CLEANUP_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label %[[FOR_CLEANUP_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_OUTER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[FOR_CLEANUP_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD_3]], %[[FOR_LATCH]] ] +; CHECK-NEXT: br label %[[FOR_CLEANUP_UNR_LCSSA]] +; CHECK: [[FOR_CLEANUP_UNR_LCSSA]]: +; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, %[[FOR_OUTEST]] ], [ [[I_UNR_PH]], %[[FOR_CLEANUP_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_CLEANUP]] -; CHECK: for.outer.epil.preheader: -; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]] -; CHECK: for.outer.epil: +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_OUTER_EPIL_PREHEADER:.*]], label %[[FOR_CLEANUP]] +; CHECK: [[FOR_OUTER_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_OUTER_EPIL:.*]] +; CHECK: [[FOR_OUTER_EPIL]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_UNR]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL:%.*]] = add nuw i32 [[I_UNR]], 1 ; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_EPIL]], align 4, 
!tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]] -; CHECK: for.inner.epil: -; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD9_EPIL:%.*]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD10_EPIL:%.*]], [[FOR_INNER_EPIL]] ] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_EPIL]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL:.*]] +; CHECK: [[FOR_INNER_EPIL]]: +; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[ADD9_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[ADD10_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] ; CHECK-NEXT: [[ARRAYIDX11_EPIL:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL]] -; CHECK-NEXT: [[L1_EPIL:%.*]] = load i32, ptr [[ARRAYIDX11_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_EPIL:%.*]] = load i32, ptr [[ARRAYIDX11_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_EPIL]] = add i32 [[L1_EPIL]], [[SUM_EPIL]] ; CHECK-NEXT: [[ADD10_EPIL]] = add nuw i32 [[J_EPIL]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[ADD10_EPIL]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label [[FOR_LATCH_EPIL:%.*]], label [[FOR_INNER_EPIL]] -; CHECK: for.latch.epil: -; CHECK-NEXT: [[ADD9_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD9_EPIL]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL]], ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label %[[FOR_LATCH_EPIL:.*]], label %[[FOR_INNER_EPIL]] +; CHECK: [[FOR_LATCH_EPIL]]: +; CHECK-NEXT: [[ADD9_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD9_EPIL]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL]], ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 1, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_OUTER_EPIL_1:%.*]], label [[FOR_CLEANUP_EPILOG_LCSSA:%.*]] -; CHECK: for.outer.epil.1: +; 
CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_OUTER_EPIL_1:.*]], label %[[FOR_CLEANUP_EPILOG_LCSSA:.*]] +; CHECK: [[FOR_OUTER_EPIL_1]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL_1:%.*]] = add nuw i32 [[I_UNR]], 2 ; CHECK-NEXT: [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL_1]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_EPIL_1]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL_1:%.*]] -; CHECK: for.inner.epil.1: -; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD9_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] -; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD10_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_1:.*]] +; CHECK: [[FOR_INNER_EPIL_1]]: +; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ [[ADD9_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ [[ADD10_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] ; CHECK-NEXT: [[ARRAYIDX11_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL_1]] -; CHECK-NEXT: [[L1_EPIL_1:%.*]] = load i32, ptr [[ARRAYIDX11_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_EPIL_1:%.*]] = load i32, ptr [[ARRAYIDX11_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_EPIL_1]] = add i32 [[L1_EPIL_1]], [[SUM_EPIL_1]] ; CHECK-NEXT: [[ADD10_EPIL_1]] = add nuw i32 [[J_EPIL_1]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_1:%.*]] = icmp eq i32 [[ADD10_EPIL_1]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label [[FOR_LATCH_EPIL_1:%.*]], label [[FOR_INNER_EPIL_1]] -; CHECK: for.latch.epil.1: -; CHECK-NEXT: 
[[ADD9_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD9_EPIL_1]], [[FOR_INNER_EPIL_1]] ] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL_1]], ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label %[[FOR_LATCH_EPIL_1:.*]], label %[[FOR_INNER_EPIL_1]] +; CHECK: [[FOR_LATCH_EPIL_1]]: +; CHECK-NEXT: [[ADD9_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD9_EPIL_1]], %[[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL_1]], ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 2, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_OUTER_EPIL_2:%.*]], label [[FOR_CLEANUP_EPILOG_LCSSA]] -; CHECK: for.outer.epil.2: +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_OUTER_EPIL_2:.*]], label %[[FOR_CLEANUP_EPILOG_LCSSA]] +; CHECK: [[FOR_OUTER_EPIL_2]]: ; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL_1]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD_EPIL_2:%.*]] = add nuw i32 [[I_UNR]], 3 ; CHECK-NEXT: [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD_EPIL_2]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_EPIL_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_INNER_EPIL_2:%.*]] -; CHECK: for.inner.epil.2: -; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[ADD9_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] -; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[ADD10_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX6_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_2:.*]] +; CHECK: [[FOR_INNER_EPIL_2]]: +; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_2]] ], [ [[ADD9_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, 
%[[FOR_OUTER_EPIL_2]] ], [ [[ADD10_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] ; CHECK-NEXT: [[ARRAYIDX11_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[J_EPIL_2]] -; CHECK-NEXT: [[L1_EPIL_2:%.*]] = load i32, ptr [[ARRAYIDX11_EPIL_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L1_EPIL_2:%.*]] = load i32, ptr [[ARRAYIDX11_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD9_EPIL_2]] = add i32 [[L1_EPIL_2]], [[SUM_EPIL_2]] ; CHECK-NEXT: [[ADD10_EPIL_2]] = add nuw i32 [[J_EPIL_2]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_2:%.*]] = icmp eq i32 [[ADD10_EPIL_2]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label [[FOR_LATCH_EPIL_2:%.*]], label [[FOR_INNER_EPIL_2]] -; CHECK: for.latch.epil.2: -; CHECK-NEXT: [[ADD9_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD9_EPIL_2]], [[FOR_INNER_EPIL_2]] ] -; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL_2]], ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_CLEANUP_EPILOG_LCSSA]] -; CHECK: for.cleanup.epilog-lcssa: -; CHECK-NEXT: br label [[FOR_CLEANUP]] -; CHECK: for.cleanup: +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label %[[FOR_LATCH_EPIL_2:.*]], label %[[FOR_INNER_EPIL_2]] +; CHECK: [[FOR_LATCH_EPIL_2]]: +; CHECK-NEXT: [[ADD9_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD9_EPIL_2]], %[[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: store i32 [[ADD9_LCSSA_EPIL_2]], ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_CLEANUP_EPILOG_LCSSA]] +; CHECK: [[FOR_CLEANUP_EPILOG_LCSSA]]: +; CHECK-NEXT: br label %[[FOR_CLEANUP]] +; CHECK: [[FOR_CLEANUP]]: ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[X_038]], 1 ; CHECK-NEXT: [[EXITCOND41:%.*]] = icmp eq i32 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND41]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_OUTEST]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: +; CHECK-NEXT: br i1 [[EXITCOND41]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_OUTEST]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: 
[[FOR_END]]: ; CHECK-NEXT: ret void ; entry: @@ -1099,144 +1105,145 @@ for.end: ; Same as test1 with tbaa, not noalias define void @test9(i32 %I, i32 %E, ptr nocapture %A, ptr nocapture readonly %B) #0 { -; CHECK-LABEL: @test9( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[E:%.*]], 0 -; CHECK-NEXT: [[CMPJ:%.*]] = icmp ne i32 [[I:%.*]], 0 +; CHECK-LABEL: define void @test9( +; CHECK-SAME: i32 [[I:%.*]], i32 [[E:%.*]], ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[E]], 0 +; CHECK-NEXT: [[CMPJ:%.*]] = icmp ne i32 [[I]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMPJ]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.outer.preheader: +; CHECK-NEXT: br i1 [[OR_COND]], label %[[FOR_OUTER_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_OUTER_PREHEADER]]: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1 ; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3 -; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_OUTER_PREHEADER_NEW:%.*]] -; CHECK: for.outer.preheader.new: +; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_OUTER_PREHEADER_NEW:.*]] +; CHECK: [[FOR_OUTER_PREHEADER_NEW]]: ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]] -; CHECK-NEXT: br label [[FOR_OUTER:%.*]] -; CHECK: for.outer: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD8_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTER_PREHEADER_NEW]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_LATCH]] ] +; CHECK-NEXT: br label %[[FOR_OUTER:.*]] +; CHECK: [[FOR_OUTER]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD8_3:%.*]], %[[FOR_LATCH:.*]] ], [ 0, %[[FOR_OUTER_PREHEADER_NEW]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[FOR_OUTER_PREHEADER_NEW]] ], [ 
[[NITER_NEXT_3:%.*]], %[[FOR_LATCH]] ] ; CHECK-NEXT: [[ADD8:%.*]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[ADD8_1:%.*]] = add nuw nsw i32 [[I]], 2 ; CHECK-NEXT: [[ADD8_2:%.*]] = add nuw nsw i32 [[I]], 3 ; CHECK-NEXT: [[ADD8_3]] = add nuw i32 [[I]], 4 ; CHECK-NEXT: [[NITER_NEXT_3]] = add i32 [[NITER]], 4 -; CHECK-NEXT: br label [[FOR_INNER:%.*]] -; CHECK: for.inner: -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_1:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_2:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_3:%.*]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[J]] -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA10:![0-9]+]] +; CHECK-NEXT: br label %[[FOR_INNER:.*]] +; CHECK: [[FOR_INNER]]: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_1:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_2:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[INC_3:%.*]], 
%[[FOR_INNER]] ] +; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, %[[FOR_OUTER]] ], [ [[ADD_3:%.*]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[J]] +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 4, !tbaa [[SHORT_TBAA10:![0-9]+]] ; CHECK-NEXT: [[SEXT:%.*]] = sext i16 [[TMP2]] to i32 ; CHECK-NEXT: [[ADD]] = add i32 [[SEXT]], [[SUM]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[J]], 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[J_1]] -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 4, !tbaa [[SHORT_TBAA10]] ; CHECK-NEXT: [[SEXT_1:%.*]] = sext i16 [[TMP3]] to i32 ; CHECK-NEXT: [[ADD_1]] = add i32 [[SEXT_1]], [[SUM_1]] ; CHECK-NEXT: [[INC_1]] = add nuw i32 [[J_1]], 1 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[J_2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 4, !tbaa [[SHORT_TBAA10]] ; CHECK-NEXT: [[SEXT_2:%.*]] = sext i16 [[TMP4]] to i32 ; CHECK-NEXT: [[ADD_2]] = add i32 [[SEXT_2]], [[SUM_2]] ; CHECK-NEXT: [[INC_2]] = add nuw i32 [[J_2]], 1 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[J_3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 4, !tbaa [[SHORT_TBAA10]] ; CHECK-NEXT: [[SEXT_3:%.*]] = sext i16 [[TMP5]] to i32 ; CHECK-NEXT: [[ADD_3]] = add i32 [[SEXT_3]], [[SUM_3]] ; CHECK-NEXT: [[INC_3]] = add nuw i32 [[J_3]], 1 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]] -; CHECK: for.latch: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ 
[[ADD_1]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], [[FOR_INNER]] ] -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I]] -; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label %[[FOR_LATCH]], label %[[FOR_INNER]] +; CHECK: [[FOR_LATCH]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], %[[FOR_INNER]] ] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]] +; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr [[ARRAYIDX6]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], ptr [[ARRAYIDX6_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], ptr [[ARRAYIDX6_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8_1]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], ptr [[ARRAYIDX6_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], ptr [[ARRAYIDX6_2]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8_2]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], ptr [[ARRAYIDX6_3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], ptr [[ARRAYIDX6_3]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: for.end.loopexit.unr-lcssa.loopexit: 
-; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD8_3]], [[FOR_LATCH]] ] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]] -; CHECK: for.end.loopexit.unr-lcssa: -; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_OUTER]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD8_3]], %[[FOR_LATCH]] ] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT_UNR_LCSSA]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, %[[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], %[[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.outer.epil.preheader: -; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]] -; CHECK: for.outer.epil: -; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]] -; CHECK: for.inner.epil: -; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], [[FOR_INNER_EPIL]] ] -; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], [[FOR_INNER_EPIL]] ] +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_OUTER_EPIL_PREHEADER:.*]], label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[FOR_OUTER_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_OUTER_EPIL:.*]] +; CHECK: [[FOR_OUTER_EPIL]]: +; CHECK-NEXT: br label %[[FOR_INNER_EPIL:.*]] +; CHECK: [[FOR_INNER_EPIL]]: +; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] +; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], %[[FOR_INNER_EPIL]] ] ; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[J_EPIL]] -; CHECK-NEXT: [[TMP6:%.*]] = 
load i16, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[SHORT_TBAA10]] ; CHECK-NEXT: [[SEXT_EPIL:%.*]] = sext i16 [[TMP6]] to i32 ; CHECK-NEXT: [[ADD_EPIL]] = add i32 [[SEXT_EPIL]], [[SUM_EPIL]] ; CHECK-NEXT: [[INC_EPIL]] = add nuw i32 [[J_EPIL]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[INC_EPIL]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label [[FOR_LATCH_EPIL:%.*]], label [[FOR_INNER_EPIL]] -; CHECK: for.latch.epil: -; CHECK-NEXT: [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], [[FOR_INNER_EPIL]] ] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label %[[FOR_LATCH_EPIL:.*]], label %[[FOR_INNER_EPIL]] +; CHECK: [[FOR_LATCH_EPIL]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], %[[FOR_INNER_EPIL]] ] ; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_UNR]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL]], ptr [[ARRAYIDX6_EPIL]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL]], ptr [[ARRAYIDX6_EPIL]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD8_EPIL:%.*]] = add nuw i32 [[I_UNR]], 1 ; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 1, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_OUTER_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]] -; CHECK: for.outer.epil.1: -; CHECK-NEXT: br label [[FOR_INNER_EPIL_1:%.*]] -; CHECK: for.inner.epil.1: -; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[INC_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] -; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_OUTER_EPIL_1:.*]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA:.*]] +; CHECK: [[FOR_OUTER_EPIL_1]]: +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_1:.*]] +; CHECK: [[FOR_INNER_EPIL_1]]: +; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ 
[[INC_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], %[[FOR_INNER_EPIL_1]] ] ; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[J_EPIL_1]] -; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_EPIL_1]], align 4, !tbaa [[SHORT_TBAA10]] ; CHECK-NEXT: [[SEXT_EPIL_1:%.*]] = sext i16 [[TMP7]] to i32 ; CHECK-NEXT: [[ADD_EPIL_1]] = add i32 [[SEXT_EPIL_1]], [[SUM_EPIL_1]] ; CHECK-NEXT: [[INC_EPIL_1]] = add nuw i32 [[J_EPIL_1]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_1:%.*]] = icmp eq i32 [[INC_EPIL_1]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label [[FOR_LATCH_EPIL_1:%.*]], label [[FOR_INNER_EPIL_1]] -; CHECK: for.latch.epil.1: -; CHECK-NEXT: [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], [[FOR_INNER_EPIL_1]] ] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label %[[FOR_LATCH_EPIL_1:.*]], label %[[FOR_INNER_EPIL_1]] +; CHECK: [[FOR_LATCH_EPIL_1]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], %[[FOR_INNER_EPIL_1]] ] ; CHECK-NEXT: [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8_EPIL]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_1]], ptr [[ARRAYIDX6_EPIL_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_1]], ptr [[ARRAYIDX6_EPIL_1]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[ADD8_EPIL_1:%.*]] = add nuw i32 [[I_UNR]], 2 ; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 2, [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_OUTER_EPIL_2:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.outer.epil.2: -; CHECK-NEXT: br label [[FOR_INNER_EPIL_2:%.*]] -; CHECK: for.inner.epil.2: -; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] -; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ 
[[ADD_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_OUTER_EPIL_2:.*]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: [[FOR_OUTER_EPIL_2]]: +; CHECK-NEXT: br label %[[FOR_INNER_EPIL_2:.*]] +; CHECK: [[FOR_INNER_EPIL_2]]: +; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, %[[FOR_OUTER_EPIL_2]] ], [ [[ADD_EPIL_2:%.*]], %[[FOR_INNER_EPIL_2]] ] ; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[J_EPIL_2]] -; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_EPIL_2]], align 4, !tbaa [[SHORT_TBAA10]] ; CHECK-NEXT: [[SEXT_EPIL_2:%.*]] = sext i16 [[TMP8]] to i32 ; CHECK-NEXT: [[ADD_EPIL_2]] = add i32 [[SEXT_EPIL_2]], [[SUM_EPIL_2]] ; CHECK-NEXT: [[INC_EPIL_2]] = add nuw i32 [[J_EPIL_2]], 1 ; CHECK-NEXT: [[EXITCOND_EPIL_2:%.*]] = icmp eq i32 [[INC_EPIL_2]], [[E]] -; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label [[FOR_LATCH_EPIL_2:%.*]], label [[FOR_INNER_EPIL_2]] -; CHECK: for.latch.epil.2: -; CHECK-NEXT: [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], [[FOR_INNER_EPIL_2]] ] +; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label %[[FOR_LATCH_EPIL_2:.*]], label %[[FOR_INNER_EPIL_2]] +; CHECK: [[FOR_LATCH_EPIL_2]]: +; CHECK-NEXT: [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], %[[FOR_INNER_EPIL_2]] ] ; CHECK-NEXT: [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[ADD8_EPIL_1]] -; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_2]], ptr [[ARRAYIDX6_EPIL_2]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.end.loopexit.epilog-lcssa: -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: +; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_2]], ptr 
[[ARRAYIDX6_EPIL_2]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]: +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; entry: @@ -1284,114 +1291,115 @@ for.end: @g = common global %struct.a zeroinitializer, align 8 @c = common global [1 x i8] zeroinitializer, align 1 define signext i16 @test10(i32 %k) #0 { -; CHECK-LABEL: @test10( -; CHECK-NEXT: entry: +; CHECK-LABEL: define signext i16 @test10( +; CHECK-SAME: i32 [[K:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @c, align 1 ; CHECK-NEXT: [[TOBOOL9:%.*]] = icmp eq i8 [[TMP0]], 0 -; CHECK-NEXT: [[TOBOOL13:%.*]] = icmp ne i32 [[K:%.*]], 0 -; CHECK-NEXT: br i1 false, label [[FOR_END26_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]] -; CHECK: entry.new: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[STOREMERGE82:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[INC25_3:%.*]], [[FOR_INC24:%.*]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_INC24]] ] +; CHECK-NEXT: [[TOBOOL13:%.*]] = icmp ne i32 [[K]], 0 +; CHECK-NEXT: br i1 false, label %[[FOR_END26_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[STOREMERGE82:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[INC25_3:%.*]], %[[FOR_INC24:.*]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_INC24]] ] ; CHECK-NEXT: [[INC25_3]] = add nuw nsw i64 [[STOREMERGE82]], 4 ; CHECK-NEXT: [[NITER_NEXT_3]] = add nuw nsw i64 [[NITER]], 4 -; CHECK-NEXT: br label [[FOR_BODY2:%.*]] -; CHECK: for.body2: -; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i64 [ 4, [[FOR_BODY]] ], [ [[DEC:%.*]], [[FOR_INC21_3:%.*]] ] -; CHECK-NEXT: [[STOREMERGE_14:%.*]] = phi i64 [ 4, 
[[FOR_BODY]] ], [ [[DEC_1:%.*]], [[FOR_INC21_3]] ] -; CHECK-NEXT: [[STOREMERGE_25:%.*]] = phi i64 [ 4, [[FOR_BODY]] ], [ [[DEC_2:%.*]], [[FOR_INC21_3]] ] -; CHECK-NEXT: [[STOREMERGE_36:%.*]] = phi i64 [ 4, [[FOR_BODY]] ], [ [[DEC_3:%.*]], [[FOR_INC21_3]] ] -; CHECK-NEXT: br i1 [[TOBOOL9]], label [[FOR_BODY2_SPLIT:%.*]], label [[FOR_BODY2_SPLIT2:%.*]] -; CHECK: for.body2.split2: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21:%.*]], label [[FOR_INC21_IF:%.*]] -; CHECK: for.body2.split: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21]], label [[FOR_INC21_THEN:%.*]] -; CHECK: for.inc21.if: -; CHECK-NEXT: br label [[FOR_INC21]] -; CHECK: for.inc21.then: -; CHECK-NEXT: br label [[FOR_INC21]] -; CHECK: for.inc21: +; CHECK-NEXT: br label %[[FOR_BODY2:.*]] +; CHECK: [[FOR_BODY2]]: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i64 [ 4, %[[FOR_BODY]] ], [ [[DEC:%.*]], %[[FOR_INC21_3:.*]] ] +; CHECK-NEXT: [[STOREMERGE_14:%.*]] = phi i64 [ 4, %[[FOR_BODY]] ], [ [[DEC_1:%.*]], %[[FOR_INC21_3]] ] +; CHECK-NEXT: [[STOREMERGE_25:%.*]] = phi i64 [ 4, %[[FOR_BODY]] ], [ [[DEC_2:%.*]], %[[FOR_INC21_3]] ] +; CHECK-NEXT: [[STOREMERGE_36:%.*]] = phi i64 [ 4, %[[FOR_BODY]] ], [ [[DEC_3:%.*]], %[[FOR_INC21_3]] ] +; CHECK-NEXT: br i1 [[TOBOOL9]], label %[[FOR_BODY2_SPLIT:.*]], label %[[FOR_BODY2_SPLIT2:.*]] +; CHECK: [[FOR_BODY2_SPLIT2]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21:.*]], label %[[FOR_INC21_IF:.*]] +; CHECK: [[FOR_BODY2_SPLIT]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21]], label %[[FOR_INC21_THEN:.*]] +; CHECK: [[FOR_INC21_IF]]: +; CHECK-NEXT: br label %[[FOR_INC21]] +; CHECK: [[FOR_INC21_THEN]]: +; CHECK-NEXT: br label %[[FOR_INC21]] +; CHECK: [[FOR_INC21]]: ; CHECK-NEXT: [[DEC]] = add nsw i64 [[STOREMERGE]], -1 -; CHECK-NEXT: br i1 [[TOBOOL9]], label [[FOR_BODY2_SPLIT_1:%.*]], label [[FOR_BODY2_SPLIT2_1:%.*]] -; CHECK: for.inc24: -; CHECK-NEXT: [[STOREMERGE_4_LCSSA_3:%.*]] = phi i64 [ [[STOREMERGE_4_3:%.*]], [[FOR_INC21_3]] ] -; CHECK-NEXT: br i1 
false, label [[FOR_BODY]], label [[FOR_END26_UNR_LCSSA_LOOPEXIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] -; CHECK: for.end26.unr-lcssa.loopexit: -; CHECK-NEXT: [[DEC_LCSSA_LCSSA_PH_PH:%.*]] = phi i64 [ 0, [[FOR_INC24]] ] -; CHECK-NEXT: [[STOREMERGE_4_LCSSA_LCSSA_PH_PH:%.*]] = phi i64 [ [[STOREMERGE_4_LCSSA_3]], [[FOR_INC24]] ] -; CHECK-NEXT: [[STOREMERGE_5_LCSSA_LCSSA_PH_PH:%.*]] = phi i32 [ 0, [[FOR_INC24]] ] -; CHECK-NEXT: br label [[FOR_END26_UNR_LCSSA]] -; CHECK: for.end26.unr-lcssa: -; CHECK-NEXT: [[DEC_LCSSA_LCSSA_PH:%.*]] = phi i64 [ poison, [[ENTRY:%.*]] ], [ [[DEC_LCSSA_LCSSA_PH_PH]], [[FOR_END26_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[STOREMERGE_4_LCSSA_LCSSA_PH:%.*]] = phi i64 [ poison, [[ENTRY]] ], [ [[STOREMERGE_4_LCSSA_LCSSA_PH_PH]], [[FOR_END26_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[STOREMERGE_5_LCSSA_LCSSA_PH:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[STOREMERGE_5_LCSSA_LCSSA_PH_PH]], [[FOR_END26_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: br i1 true, label [[FOR_BODY_EPIL_PREHEADER:%.*]], label [[FOR_END26:%.*]] -; CHECK: for.body.epil.preheader: -; CHECK-NEXT: br label [[FOR_BODY_EPIL:%.*]] -; CHECK: for.body.epil: -; CHECK-NEXT: br label [[FOR_BODY2_EPIL:%.*]] -; CHECK: for.body2.epil: -; CHECK-NEXT: [[STOREMERGE_EPIL:%.*]] = phi i64 [ 4, [[FOR_BODY_EPIL]] ], [ [[DEC_EPIL:%.*]], [[FOR_INC21_EPIL:%.*]] ] -; CHECK-NEXT: br i1 [[TOBOOL9]], label [[FOR_BODY2_SPLIT_EPIL:%.*]], label [[FOR_BODY2_SPLIT2_EPIL:%.*]] -; CHECK: for.body2.split2.epil: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21_EPIL]], label [[FOR_INC21_IF_EPIL:%.*]] -; CHECK: for.inc21.if.epil: -; CHECK-NEXT: br label [[FOR_INC21_EPIL]] -; CHECK: for.body2.split.epil: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21_EPIL]], label [[FOR_INC21_THEN_EPIL:%.*]] -; CHECK: for.inc21.then.epil: -; CHECK-NEXT: br label [[FOR_INC21_EPIL]] -; CHECK: for.inc21.epil: -; CHECK-NEXT: [[STOREMERGE_4_EPIL:%.*]] = phi i64 [ 0, [[FOR_INC21_IF_EPIL]] ], [ 0, [[FOR_INC21_THEN_EPIL]] ], [ 4, 
[[FOR_BODY2_SPLIT2_EPIL]] ], [ 4, [[FOR_BODY2_SPLIT_EPIL]] ] +; CHECK-NEXT: br i1 [[TOBOOL9]], label %[[FOR_BODY2_SPLIT_1:.*]], label %[[FOR_BODY2_SPLIT2_1:.*]] +; CHECK: [[FOR_INC24]]: +; CHECK-NEXT: [[STOREMERGE_4_LCSSA_3:%.*]] = phi i64 [ [[STOREMERGE_4_3:%.*]], %[[FOR_INC21_3]] ] +; CHECK-NEXT: br i1 false, label %[[FOR_BODY]], label %[[FOR_END26_UNR_LCSSA_LOOPEXIT:.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[FOR_END26_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[DEC_LCSSA_LCSSA_PH_PH:%.*]] = phi i64 [ 0, %[[FOR_INC24]] ] +; CHECK-NEXT: [[STOREMERGE_4_LCSSA_LCSSA_PH_PH:%.*]] = phi i64 [ [[STOREMERGE_4_LCSSA_3]], %[[FOR_INC24]] ] +; CHECK-NEXT: [[STOREMERGE_5_LCSSA_LCSSA_PH_PH:%.*]] = phi i32 [ 0, %[[FOR_INC24]] ] +; CHECK-NEXT: br label %[[FOR_END26_UNR_LCSSA]] +; CHECK: [[FOR_END26_UNR_LCSSA]]: +; CHECK-NEXT: [[DEC_LCSSA_LCSSA_PH:%.*]] = phi i64 [ poison, %[[ENTRY]] ], [ [[DEC_LCSSA_LCSSA_PH_PH]], %[[FOR_END26_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[STOREMERGE_4_LCSSA_LCSSA_PH:%.*]] = phi i64 [ poison, %[[ENTRY]] ], [ [[STOREMERGE_4_LCSSA_LCSSA_PH_PH]], %[[FOR_END26_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[STOREMERGE_5_LCSSA_LCSSA_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[STOREMERGE_5_LCSSA_LCSSA_PH_PH]], %[[FOR_END26_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: br i1 true, label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_END26:.*]] +; CHECK: [[FOR_BODY_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY_EPIL:.*]] +; CHECK: [[FOR_BODY_EPIL]]: +; CHECK-NEXT: br label %[[FOR_BODY2_EPIL:.*]] +; CHECK: [[FOR_BODY2_EPIL]]: +; CHECK-NEXT: [[STOREMERGE_EPIL:%.*]] = phi i64 [ 4, %[[FOR_BODY_EPIL]] ], [ [[DEC_EPIL:%.*]], %[[FOR_INC21_EPIL:.*]] ] +; CHECK-NEXT: br i1 [[TOBOOL9]], label %[[FOR_BODY2_SPLIT_EPIL:.*]], label %[[FOR_BODY2_SPLIT2_EPIL:.*]] +; CHECK: [[FOR_BODY2_SPLIT2_EPIL]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21_EPIL]], label %[[FOR_INC21_IF_EPIL:.*]] +; CHECK: [[FOR_INC21_IF_EPIL]]: +; CHECK-NEXT: br label %[[FOR_INC21_EPIL]] +; CHECK: 
[[FOR_BODY2_SPLIT_EPIL]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21_EPIL]], label %[[FOR_INC21_THEN_EPIL:.*]] +; CHECK: [[FOR_INC21_THEN_EPIL]]: +; CHECK-NEXT: br label %[[FOR_INC21_EPIL]] +; CHECK: [[FOR_INC21_EPIL]]: +; CHECK-NEXT: [[STOREMERGE_4_EPIL:%.*]] = phi i64 [ 0, %[[FOR_INC21_IF_EPIL]] ], [ 0, %[[FOR_INC21_THEN_EPIL]] ], [ 4, %[[FOR_BODY2_SPLIT2_EPIL]] ], [ 4, %[[FOR_BODY2_SPLIT_EPIL]] ] ; CHECK-NEXT: [[DEC_EPIL]] = add nsw i64 [[STOREMERGE_EPIL]], -1 ; CHECK-NEXT: [[TOBOOL_EPIL:%.*]] = icmp eq i64 [[DEC_EPIL]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_EPIL]], label [[FOR_INC24_EPIL:%.*]], label [[FOR_BODY2_EPIL]] -; CHECK: for.inc24.epil: -; CHECK-NEXT: [[STOREMERGE_4_LCSSA_EPIL:%.*]] = phi i64 [ [[STOREMERGE_4_EPIL]], [[FOR_INC21_EPIL]] ] -; CHECK-NEXT: br label [[FOR_END26]] -; CHECK: for.end26: -; CHECK-NEXT: [[DEC_LCSSA_LCSSA:%.*]] = phi i64 [ [[DEC_LCSSA_LCSSA_PH]], [[FOR_END26_UNR_LCSSA]] ], [ 0, [[FOR_INC24_EPIL]] ] -; CHECK-NEXT: [[STOREMERGE_4_LCSSA_LCSSA:%.*]] = phi i64 [ [[STOREMERGE_4_LCSSA_LCSSA_PH]], [[FOR_END26_UNR_LCSSA]] ], [ [[STOREMERGE_4_LCSSA_EPIL]], [[FOR_INC24_EPIL]] ] -; CHECK-NEXT: [[STOREMERGE_5_LCSSA_LCSSA:%.*]] = phi i32 [ [[STOREMERGE_5_LCSSA_LCSSA_PH]], [[FOR_END26_UNR_LCSSA]] ], [ 0, [[FOR_INC24_EPIL]] ] +; CHECK-NEXT: br i1 [[TOBOOL_EPIL]], label %[[FOR_INC24_EPIL:.*]], label %[[FOR_BODY2_EPIL]] +; CHECK: [[FOR_INC24_EPIL]]: +; CHECK-NEXT: [[STOREMERGE_4_LCSSA_EPIL:%.*]] = phi i64 [ [[STOREMERGE_4_EPIL]], %[[FOR_INC21_EPIL]] ] +; CHECK-NEXT: br label %[[FOR_END26]] +; CHECK: [[FOR_END26]]: +; CHECK-NEXT: [[DEC_LCSSA_LCSSA:%.*]] = phi i64 [ [[DEC_LCSSA_LCSSA_PH]], %[[FOR_END26_UNR_LCSSA]] ], [ 0, %[[FOR_INC24_EPIL]] ] +; CHECK-NEXT: [[STOREMERGE_4_LCSSA_LCSSA:%.*]] = phi i64 [ [[STOREMERGE_4_LCSSA_LCSSA_PH]], %[[FOR_END26_UNR_LCSSA]] ], [ [[STOREMERGE_4_LCSSA_EPIL]], %[[FOR_INC24_EPIL]] ] +; CHECK-NEXT: [[STOREMERGE_5_LCSSA_LCSSA:%.*]] = phi i32 [ [[STOREMERGE_5_LCSSA_LCSSA_PH]], %[[FOR_END26_UNR_LCSSA]] ], [ 0, 
%[[FOR_INC24_EPIL]] ] ; CHECK-NEXT: store i64 [[DEC_LCSSA_LCSSA]], ptr @g, align 8 ; CHECK-NEXT: ret i16 0 -; CHECK: for.body2.split2.1: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21_1:%.*]], label [[FOR_INC21_IF_1:%.*]] -; CHECK: for.inc21.if.1: -; CHECK-NEXT: br label [[FOR_INC21_1]] -; CHECK: for.body2.split.1: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21_1]], label [[FOR_INC21_THEN_1:%.*]] -; CHECK: for.inc21.then.1: -; CHECK-NEXT: br label [[FOR_INC21_1]] -; CHECK: for.inc21.1: +; CHECK: [[FOR_BODY2_SPLIT2_1]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21_1:.*]], label %[[FOR_INC21_IF_1:.*]] +; CHECK: [[FOR_INC21_IF_1]]: +; CHECK-NEXT: br label %[[FOR_INC21_1]] +; CHECK: [[FOR_BODY2_SPLIT_1]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21_1]], label %[[FOR_INC21_THEN_1:.*]] +; CHECK: [[FOR_INC21_THEN_1]]: +; CHECK-NEXT: br label %[[FOR_INC21_1]] +; CHECK: [[FOR_INC21_1]]: ; CHECK-NEXT: [[DEC_1]] = add nsw i64 [[STOREMERGE_14]], -1 -; CHECK-NEXT: br i1 [[TOBOOL9]], label [[FOR_BODY2_SPLIT_2:%.*]], label [[FOR_BODY2_SPLIT2_2:%.*]] -; CHECK: for.body2.split2.2: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21_2:%.*]], label [[FOR_INC21_IF_2:%.*]] -; CHECK: for.inc21.if.2: -; CHECK-NEXT: br label [[FOR_INC21_2]] -; CHECK: for.body2.split.2: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21_2]], label [[FOR_INC21_THEN_2:%.*]] -; CHECK: for.inc21.then.2: -; CHECK-NEXT: br label [[FOR_INC21_2]] -; CHECK: for.inc21.2: +; CHECK-NEXT: br i1 [[TOBOOL9]], label %[[FOR_BODY2_SPLIT_2:.*]], label %[[FOR_BODY2_SPLIT2_2:.*]] +; CHECK: [[FOR_BODY2_SPLIT2_2]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21_2:.*]], label %[[FOR_INC21_IF_2:.*]] +; CHECK: [[FOR_INC21_IF_2]]: +; CHECK-NEXT: br label %[[FOR_INC21_2]] +; CHECK: [[FOR_BODY2_SPLIT_2]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21_2]], label %[[FOR_INC21_THEN_2:.*]] +; CHECK: [[FOR_INC21_THEN_2]]: +; CHECK-NEXT: br label %[[FOR_INC21_2]] +; CHECK: [[FOR_INC21_2]]: ; 
CHECK-NEXT: [[DEC_2]] = add nsw i64 [[STOREMERGE_25]], -1 -; CHECK-NEXT: br i1 [[TOBOOL9]], label [[FOR_BODY2_SPLIT_3:%.*]], label [[FOR_BODY2_SPLIT2_3:%.*]] -; CHECK: for.body2.split2.3: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21_3]], label [[FOR_INC21_IF_3:%.*]] -; CHECK: for.inc21.if.3: -; CHECK-NEXT: br label [[FOR_INC21_3]] -; CHECK: for.body2.split.3: -; CHECK-NEXT: br i1 [[TOBOOL13]], label [[FOR_INC21_3]], label [[FOR_INC21_THEN_3:%.*]] -; CHECK: for.inc21.then.3: -; CHECK-NEXT: br label [[FOR_INC21_3]] -; CHECK: for.inc21.3: -; CHECK-NEXT: [[STOREMERGE_4_3]] = phi i64 [ 0, [[FOR_INC21_IF_3]] ], [ 0, [[FOR_INC21_THEN_3]] ], [ 4, [[FOR_BODY2_SPLIT2_3]] ], [ 4, [[FOR_BODY2_SPLIT_3]] ] +; CHECK-NEXT: br i1 [[TOBOOL9]], label %[[FOR_BODY2_SPLIT_3:.*]], label %[[FOR_BODY2_SPLIT2_3:.*]] +; CHECK: [[FOR_BODY2_SPLIT2_3]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21_3]], label %[[FOR_INC21_IF_3:.*]] +; CHECK: [[FOR_INC21_IF_3]]: +; CHECK-NEXT: br label %[[FOR_INC21_3]] +; CHECK: [[FOR_BODY2_SPLIT_3]]: +; CHECK-NEXT: br i1 [[TOBOOL13]], label %[[FOR_INC21_3]], label %[[FOR_INC21_THEN_3:.*]] +; CHECK: [[FOR_INC21_THEN_3]]: +; CHECK-NEXT: br label %[[FOR_INC21_3]] +; CHECK: [[FOR_INC21_3]]: +; CHECK-NEXT: [[STOREMERGE_4_3]] = phi i64 [ 0, %[[FOR_INC21_IF_3]] ], [ 0, %[[FOR_INC21_THEN_3]] ], [ 4, %[[FOR_BODY2_SPLIT2_3]] ], [ 4, %[[FOR_BODY2_SPLIT_3]] ] ; CHECK-NEXT: [[DEC_3]] = add nsw i64 [[STOREMERGE_36]], -1 ; CHECK-NEXT: [[TOBOOL_3:%.*]] = icmp eq i64 [[DEC_3]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_3]], label [[FOR_INC24]], label [[FOR_BODY2]] +; CHECK-NEXT: br i1 [[TOBOOL_3]], label %[[FOR_INC24]], label %[[FOR_BODY2]] ; entry: %0 = load i8, ptr @c, align 1 @@ -1451,3 +1459,19 @@ for.end26: !8 = !{!"Simple C/C++ TBAA"} !9 = !{!10, !10, i64 0} !10 = !{!"short", !7, i64 0} +;. 
+; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META5]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META5]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META5]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META5]]} +; CHECK: [[SHORT_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0} +; CHECK: [[META11]] = !{!"short", [[META2]], i64 0} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll index fc459a376710d..f39c6bd4c0d0d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll @@ -22,11 +22,11 @@ define void @foo_i32(i64 %n) { ; CHECK-V1-IC1: [[VECTOR_BODY]]: ; CHECK-V1-IC1: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK-V1-IC1: [[MIDDLE_BLOCK]]: -; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF4:![0-9]+]] +; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]] ; CHECK-V1-IC1: [[SCALAR_PH]]: ; CHECK-V1-IC1: br label %[[FOR_BODY:.*]] ; CHECK-V1-IC1: [[FOR_BODY]]: -; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop 
[[LOOP7:![0-9]+]] ; CHECK-V1-IC1: [[FOR_COND_CLEANUP]]: ; ; CHECK-V1-IC1-FORCE-EPI4-LABEL: define void @foo_i32( @@ -40,19 +40,19 @@ define void @foo_i32(i64 %n) { ; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_BODY]]: ; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[MIDDLE_BLOCK]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF4:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_PH]]: ; CHECK-V1-IC1-FORCE-EPI4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP9:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP9:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF7:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF9:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-V1-IC1-FORCE-EPI4: br label %[[FOR_BODY:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[FOR_BODY]]: -; 
CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[FOR_COND_CLEANUP]]: ; ; CHECK-V2-IC1-LABEL: define void @foo_i32( @@ -64,11 +64,11 @@ define void @foo_i32(i64 %n) { ; CHECK-V2-IC1: [[VECTOR_BODY]]: ; CHECK-V2-IC1: br i1 [[TMP2:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-V2-IC1: [[MIDDLE_BLOCK]]: -; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]] +; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF6:![0-9]+]] ; CHECK-V2-IC1: [[SCALAR_PH]]: ; CHECK-V2-IC1: br label %[[FOR_BODY:.*]] ; CHECK-V2-IC1: [[FOR_BODY]]: -; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]: ; ; CHECK-V2-IC4-LABEL: define void @foo_i32( @@ -82,19 +82,19 @@ define void @foo_i32(i64 %n) { ; CHECK-V2-IC4: [[VECTOR_BODY]]: ; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-V2-IC4: [[MIDDLE_BLOCK]]: -; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]] +; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF6:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], 
label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]] +; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_PH]]: ; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]] +; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF10:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-V2-IC4: br label %[[FOR_BODY:.*]] ; CHECK-V2-IC4: [[FOR_BODY]]: -; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF11:![0-9]+]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]: ; entry: @@ -124,21 +124,21 @@ define void @foo_i8(i64 %n) { ; CHECK-V1-IC1: [[VECTOR_PH]]: ; CHECK-V1-IC1: br label %[[VECTOR_BODY:.*]] ; CHECK-V1-IC1: [[VECTOR_BODY]]: -; CHECK-V1-IC1: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-V1-IC1: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-V1-IC1: [[MIDDLE_BLOCK]]: -; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF7]] +; 
CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF9]] ; CHECK-V1-IC1: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V1-IC1: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF9:![0-9]+]] +; CHECK-V1-IC1: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]] ; CHECK-V1-IC1: [[VEC_EPILOG_PH]]: ; CHECK-V1-IC1: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V1-IC1: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V1-IC1: br i1 [[TMP15:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-V1-IC1: br i1 [[TMP9:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-V1-IC1: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-V1-IC1: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF11:![0-9]+]] +; CHECK-V1-IC1: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF14:![0-9]+]] ; CHECK-V1-IC1: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-V1-IC1: br label %[[FOR_BODY:.*]] ; CHECK-V1-IC1: [[FOR_BODY]]: -; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-V1-IC1: [[FOR_COND_CLEANUP]]: ; ; CHECK-V1-IC1-FORCE-EPI4-LABEL: define void @foo_i8( @@ -150,21 +150,21 @@ define void @foo_i8(i64 %n) { ; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_PH]]: ; CHECK-V1-IC1-FORCE-EPI4: br label %[[VECTOR_BODY:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_BODY]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]] +; 
CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF12:![0-9]+]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[MIDDLE_BLOCK]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF10]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF12]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF15:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_PH]]: ; CHECK-V1-IC1-FORCE-EPI4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP9:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP9:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF7]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF9]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-V1-IC1-FORCE-EPI4: br label %[[FOR_BODY:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[FOR_BODY]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF8]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label 
%[[FOR_BODY]], !prof [[PROF10]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[FOR_COND_CLEANUP]]: ; ; CHECK-V2-IC1-LABEL: define void @foo_i8( @@ -176,21 +176,21 @@ define void @foo_i8(i64 %n) { ; CHECK-V2-IC1: [[VECTOR_PH]]: ; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]] ; CHECK-V2-IC1: [[VECTOR_BODY]]: -; CHECK-V2-IC1: br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-V2-IC1: br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-V2-IC1: [[MIDDLE_BLOCK]]: -; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF10:![0-9]+]] +; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF13:![0-9]+]] ; CHECK-V2-IC1: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V2-IC1: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]] +; CHECK-V2-IC1: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF14:![0-9]+]] ; CHECK-V2-IC1: [[VEC_EPILOG_PH]]: ; CHECK-V2-IC1: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V2-IC1: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V2-IC1: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-V2-IC1: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-V2-IC1: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-V2-IC1: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF5]] +; CHECK-V2-IC1: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF6]] ; CHECK-V2-IC1: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-V2-IC1: br label 
%[[FOR_BODY:.*]] ; CHECK-V2-IC1: [[FOR_BODY]]: -; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF7]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]: ; ; CHECK-V2-IC4-LABEL: define void @foo_i8( @@ -202,21 +202,21 @@ define void @foo_i8(i64 %n) { ; CHECK-V2-IC4: [[VECTOR_PH]]: ; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF5]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-V2-IC4: [[MIDDLE_BLOCK]]: ; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF1]] ; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]] +; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF15:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_PH]]: ; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF14:![0-9]+]] +; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label 
%[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF17:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-V2-IC4: br label %[[FOR_BODY:.*]] ; CHECK-V2-IC4: [[FOR_BODY]]: -; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF11]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]: ; entry: @@ -244,13 +244,13 @@ define void @foo_i32_no_bw(i64 %n) { ; CHECK-V1-IC1: [[VECTOR_PH]]: ; CHECK-V1-IC1: br label %[[VECTOR_BODY:.*]] ; CHECK-V1-IC1: [[VECTOR_BODY]]: -; CHECK-V1-IC1: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-V1-IC1: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-V1-IC1: [[MIDDLE_BLOCK]]: ; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] ; CHECK-V1-IC1: [[SCALAR_PH]]: ; CHECK-V1-IC1: br label %[[FOR_BODY:.*]] ; CHECK-V1-IC1: [[FOR_BODY]]: -; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-V1-IC1: [[FOR_COND_CLEANUP]]: ; ; CHECK-V1-IC1-FORCE-EPI4-LABEL: define void @foo_i32_no_bw( @@ -262,21 +262,21 @@ define void @foo_i32_no_bw(i64 %n) { ; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_PH]]: ; CHECK-V1-IC1-FORCE-EPI4: br label %[[VECTOR_BODY:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_BODY]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[MIDDLE_BLOCK]]: ; CHECK-V1-IC1-FORCE-EPI4: 
br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF5]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_PH]]: ; CHECK-V1-IC1-FORCE-EPI4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP9:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP9:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-V1-IC1-FORCE-EPI4: br label %[[FOR_BODY:.*]] ; CHECK-V1-IC1-FORCE-EPI4: [[FOR_BODY]]: -; CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-V1-IC1-FORCE-EPI4: [[FOR_COND_CLEANUP]]: ; ; CHECK-V2-IC1-LABEL: define void @foo_i32_no_bw( @@ -286,13 +286,13 @@ define void @foo_i32_no_bw(i64 %n) { ; CHECK-V2-IC1: [[VECTOR_PH]]: ; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]] ; CHECK-V2-IC1: [[VECTOR_BODY]]: -; CHECK-V2-IC1: br i1 [[TMP2:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-V2-IC1: br i1 [[TMP2:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-V2-IC1: 
[[MIDDLE_BLOCK]]: ; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] ; CHECK-V2-IC1: [[SCALAR_PH]]: ; CHECK-V2-IC1: br label %[[FOR_BODY:.*]] ; CHECK-V2-IC1: [[FOR_BODY]]: -; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]: ; ; CHECK-V2-IC4-LABEL: define void @foo_i32_no_bw( @@ -304,21 +304,21 @@ define void @foo_i32_no_bw(i64 %n) { ; CHECK-V2-IC4: [[VECTOR_PH]]: ; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-V2-IC4: [[MIDDLE_BLOCK]]: ; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6]] +; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7]] ; CHECK-V2-IC4: [[VEC_EPILOG_PH]]: ; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]: ; 
CHECK-V2-IC4: br label %[[FOR_BODY:.*]] ; CHECK-V2-IC4: [[FOR_BODY]]: -; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]: ; entry: @@ -341,74 +341,86 @@ for.cond.cleanup: ; preds = %for.body !0 = !{!"branch_weights", i32 1, i32 1023} ;. ; CHECK-V1-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127} -; CHECK-V1-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK-V1-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]} ; CHECK-V1-IC1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-V1-IC1: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-V1-IC1: [[PROF4]] = !{!"branch_weights", i32 1, i32 7} -; CHECK-V1-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} -; CHECK-V1-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META2]]} -; CHECK-V1-IC1: [[PROF7]] = !{!"branch_weights", i32 1, i32 31} -; CHECK-V1-IC1: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META3]]} -; CHECK-V1-IC1: [[PROF9]] = !{!"branch_weights", i32 16, i32 16} -; CHECK-V1-IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META3]]} -; CHECK-V1-IC1: [[PROF11]] = !{!"branch_weights", i32 1, i32 15} -; CHECK-V1-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META2]]} -; CHECK-V1-IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]]} -; CHECK-V1-IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META2]]} +; CHECK-V1-IC1: [[META4]] = !{!"llvm.loop.estimated_trip_count", i32 128} +; CHECK-V1-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 7} +; CHECK-V1-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0} +; CHECK-V1-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META2]], [[META8:![0-9]+]]} +; CHECK-V1-IC1: [[META8]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; 
CHECK-V1-IC1: [[PROF9]] = !{!"branch_weights", i32 1, i32 31} +; CHECK-V1-IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META3]], [[META11:![0-9]+]]} +; CHECK-V1-IC1: [[META11]] = !{!"llvm.loop.estimated_trip_count", i32 32} +; CHECK-V1-IC1: [[PROF12]] = !{!"branch_weights", i32 16, i32 16} +; CHECK-V1-IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META8]], [[META3]]} +; CHECK-V1-IC1: [[PROF14]] = !{!"branch_weights", i32 1, i32 15} +; CHECK-V1-IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META3]], [[META2]], [[META8]]} +; CHECK-V1-IC1: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META3]]} +; CHECK-V1-IC1: [[LOOP17]] = distinct !{[[LOOP17]], [[META3]], [[META2]]} ;. ; CHECK-V1-IC1-FORCE-EPI4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]} ; CHECK-V1-IC1-FORCE-EPI4: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-V1-IC1-FORCE-EPI4: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-V1-IC1-FORCE-EPI4: [[PROF4]] = !{!"branch_weights", i32 1, i32 7} -; CHECK-V1-IC1-FORCE-EPI4: [[PROF5]] = !{!"branch_weights", i32 4, i32 4} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]]} -; CHECK-V1-IC1-FORCE-EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 3} -; CHECK-V1-IC1-FORCE-EPI4: [[PROF8]] = !{!"branch_weights", i32 0, i32 0} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META2]]} -; CHECK-V1-IC1-FORCE-EPI4: [[PROF10]] = !{!"branch_weights", i32 1, i32 31} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META3]]} -; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 4, i32 28} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]]} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP14]] = distinct !{[[LOOP14]], 
[[META3]], [[META2]]} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META3]]} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META3]]} -; CHECK-V1-IC1-FORCE-EPI4: [[LOOP17]] = distinct !{[[LOOP17]], [[META3]], [[META2]]} +; CHECK-V1-IC1-FORCE-EPI4: [[META4]] = !{!"llvm.loop.estimated_trip_count", i32 128} +; CHECK-V1-IC1-FORCE-EPI4: [[PROF5]] = !{!"branch_weights", i32 1, i32 7} +; CHECK-V1-IC1-FORCE-EPI4: [[PROF6]] = !{!"branch_weights", i32 4, i32 4} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META8:![0-9]+]], [[META3]]} +; CHECK-V1-IC1-FORCE-EPI4: [[META8]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; CHECK-V1-IC1-FORCE-EPI4: [[PROF9]] = !{!"branch_weights", i32 1, i32 3} +; CHECK-V1-IC1-FORCE-EPI4: [[PROF10]] = !{!"branch_weights", i32 0, i32 0} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META3]], [[META2]], [[META8]]} +; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 1, i32 31} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]], [[META14:![0-9]+]]} +; CHECK-V1-IC1-FORCE-EPI4: [[META14]] = !{!"llvm.loop.estimated_trip_count", i32 32} +; CHECK-V1-IC1-FORCE-EPI4: [[PROF15]] = !{!"branch_weights", i32 4, i32 28} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META8]], [[META3]]} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP17]] = distinct !{[[LOOP17]], [[META3]], [[META2]], [[META8]]} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META3]]} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META3]]} +; CHECK-V1-IC1-FORCE-EPI4: [[LOOP20]] = distinct !{[[LOOP20]], [[META3]], [[META2]]} ;. 
; CHECK-V2-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127} ; CHECK-V2-IC1: [[PROF1]] = !{!"branch_weights", i32 1, i32 255} -; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3} -; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0} -; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]} -; CHECK-V2-IC1: [[PROF8]] = !{!"branch_weights", i32 1, i32 63} -; CHECK-V2-IC1: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META4]]} -; CHECK-V2-IC1: [[PROF10]] = !{!"branch_weights", i32 1, i32 15} -; CHECK-V2-IC1: [[PROF11]] = !{!"branch_weights", i32 4, i32 12} -; CHECK-V2-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META4]]} -; CHECK-V2-IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]], [[META3]]} -; CHECK-V2-IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META4]]} -; CHECK-V2-IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]], [[META3]]} +; CHECK-V2-IC1: [[META5]] = !{!"llvm.loop.estimated_trip_count", i32 256} +; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 1, i32 3} +; CHECK-V2-IC1: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} +; CHECK-V2-IC1: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]], [[META3]], [[META9:![0-9]+]]} +; CHECK-V2-IC1: [[META9]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; CHECK-V2-IC1: [[PROF10]] = !{!"branch_weights", i32 1, i32 63} +; CHECK-V2-IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META3]], [[META4]], [[META12:![0-9]+]]} +; CHECK-V2-IC1: [[META12]] = !{!"llvm.loop.estimated_trip_count", i32 64} +; CHECK-V2-IC1: [[PROF13]] = !{!"branch_weights", i32 1, i32 15} +; CHECK-V2-IC1: [[PROF14]] = !{!"branch_weights", i32 4, i32 12} +; CHECK-V2-IC1: [[LOOP15]] = 
distinct !{[[LOOP15]], [[META3]], [[META9]], [[META4]]} +; CHECK-V2-IC1: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]], [[META3]], [[META9]]} +; CHECK-V2-IC1: [[LOOP17]] = distinct !{[[LOOP17]], [[META3]], [[META4]]} +; CHECK-V2-IC1: [[LOOP18]] = distinct !{[[LOOP18]], [[META4]], [[META3]]} ;. ; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127} ; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63} -; CHECK-V2-IC4: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK-V2-IC4: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15} -; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 4, i32 12} -; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]} -; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3} -; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} -; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]} -; CHECK-V2-IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META3]], [[META4]]} -; CHECK-V2-IC4: [[PROF12]] = !{!"branch_weights", i32 8, i32 56} -; CHECK-V2-IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META3]], [[META4]]} -; CHECK-V2-IC4: [[PROF14]] = !{!"branch_weights", i32 1, i32 7} -; CHECK-V2-IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]], [[META3]]} -; CHECK-V2-IC4: [[LOOP16]] = distinct !{[[LOOP16]], [[META3]], [[META4]]} -; CHECK-V2-IC4: [[LOOP17]] = distinct !{[[LOOP17]], [[META3]], [[META4]]} -; CHECK-V2-IC4: [[LOOP18]] = distinct !{[[LOOP18]], [[META4]], [[META3]]} +; CHECK-V2-IC4: [[META5]] = !{!"llvm.loop.estimated_trip_count", i32 64} +; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 1, i32 15} +; CHECK-V2-IC4: [[PROF7]] = !{!"branch_weights", i32 4, i32 12} +; CHECK-V2-IC4: [[LOOP8]] = distinct !{[[LOOP8]], 
[[META3]], [[META9:![0-9]+]], [[META4]]} +; CHECK-V2-IC4: [[META9]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; CHECK-V2-IC4: [[PROF10]] = !{!"branch_weights", i32 1, i32 3} +; CHECK-V2-IC4: [[PROF11]] = !{!"branch_weights", i32 0, i32 0} +; CHECK-V2-IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]], [[META3]], [[META9]]} +; CHECK-V2-IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META3]], [[META4]], [[META14:![0-9]+]]} +; CHECK-V2-IC4: [[META14]] = !{!"llvm.loop.estimated_trip_count", i32 16} +; CHECK-V2-IC4: [[PROF15]] = !{!"branch_weights", i32 8, i32 56} +; CHECK-V2-IC4: [[LOOP16]] = distinct !{[[LOOP16]], [[META3]], [[META9]], [[META4]]} +; CHECK-V2-IC4: [[PROF17]] = !{!"branch_weights", i32 1, i32 7} +; CHECK-V2-IC4: [[LOOP18]] = distinct !{[[LOOP18]], [[META4]], [[META3]], [[META9]]} +; CHECK-V2-IC4: [[LOOP19]] = distinct !{[[LOOP19]], [[META3]], [[META4]]} +; CHECK-V2-IC4: [[LOOP20]] = distinct !{[[LOOP20]], [[META3]], [[META4]]} +; CHECK-V2-IC4: [[LOOP21]] = distinct !{[[LOOP21]], [[META4]], [[META3]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index f094b9a72d85b..ffac7e6faeaf4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -8,21 +8,26 @@ target triple = "arm64-apple-macosx14.0.0" define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; DEFAULT-LABEL: define void @iv_casts( ; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[ITER_CHECK:.*]]: ; DEFAULT-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 ; DEFAULT-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4 +; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; DEFAULT: [[VECTOR_MEMCHECK]]: ; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8 ; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]] ; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] -; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; DEFAULT: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 4 +; DEFAULT-NEXT: 
[[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], [[TMP8]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16 @@ -30,56 +35,89 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; DEFAULT-NEXT: [[TMP13:%.*]] = trunc [[BROADCAST_SPLAT]] to +; DEFAULT-NEXT: [[TMP11:%.*]] = trunc [[BROADCAST_SPLAT]] to ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] -; DEFAULT-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP24:%.*]] = shl nuw i64 [[TMP23]], 3 -; DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP24]] -; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP20]], align 1 -; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP25]], align 1 -; DEFAULT-NEXT: [[TMP26:%.*]] = zext [[WIDE_LOAD]] to -; DEFAULT-NEXT: [[TMP27:%.*]] = zext [[WIDE_LOAD4]] to -; DEFAULT-NEXT: [[TMP28:%.*]] = mul [[TMP26]], [[TMP13]] -; DEFAULT-NEXT: [[TMP29:%.*]] = mul [[TMP27]], [[TMP13]] -; DEFAULT-NEXT: [[TMP32:%.*]] = or [[TMP28]], [[TMP26]] -; DEFAULT-NEXT: [[TMP33:%.*]] = or [[TMP29]], [[TMP27]] -; DEFAULT-NEXT: [[TMP34:%.*]] = lshr [[TMP32]], splat (i16 1) -; DEFAULT-NEXT: [[TMP35:%.*]] = lshr [[TMP33]], splat (i16 1) -; DEFAULT-NEXT: [[TMP36:%.*]] = trunc [[TMP34]] to -; DEFAULT-NEXT: [[TMP37:%.*]] = trunc [[TMP35]] to -; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 
[[INDEX]] -; DEFAULT-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP42:%.*]] = shl nuw i64 [[TMP41]], 3 -; DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]] -; DEFAULT-NEXT: store [[TMP36]], ptr [[TMP38]], align 1 -; DEFAULT-NEXT: store [[TMP37]], ptr [[TMP43]], align 1 +; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] +; DEFAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 3 +; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP14]] +; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1 +; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1 +; DEFAULT-NEXT: [[TMP16:%.*]] = zext [[WIDE_LOAD]] to +; DEFAULT-NEXT: [[TMP17:%.*]] = zext [[WIDE_LOAD4]] to +; DEFAULT-NEXT: [[TMP18:%.*]] = mul [[TMP16]], [[TMP11]] +; DEFAULT-NEXT: [[TMP19:%.*]] = mul [[TMP17]], [[TMP11]] +; DEFAULT-NEXT: [[TMP20:%.*]] = or [[TMP18]], [[TMP16]] +; DEFAULT-NEXT: [[TMP21:%.*]] = or [[TMP19]], [[TMP17]] +; DEFAULT-NEXT: [[TMP22:%.*]] = lshr [[TMP20]], splat (i16 1) +; DEFAULT-NEXT: [[TMP23:%.*]] = lshr [[TMP21]], splat (i16 1) +; DEFAULT-NEXT: [[TMP24:%.*]] = trunc [[TMP22]] to +; DEFAULT-NEXT: [[TMP25:%.*]] = trunc [[TMP23]] to +; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] +; DEFAULT-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP28:%.*]] = shl nuw i64 [[TMP27]], 3 +; DEFAULT-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP26]], i64 [[TMP28]] +; DEFAULT-NEXT: store [[TMP24]], ptr [[TMP26]], align 1 +; DEFAULT-NEXT: store [[TMP25]], ptr [[TMP29]], align 1 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; DEFAULT-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DEFAULT-NEXT: [[TMP30:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; DEFAULT: [[VEC_EPILOG_ITER_CHECK]]: +; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; DEFAULT-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP32:%.*]] = shl nuw i64 [[TMP31]], 2 +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP32]] +; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] +; DEFAULT: [[VEC_EPILOG_PH]]: +; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; DEFAULT-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 4 +; DEFAULT-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[TMP0]], [[TMP34]] +; DEFAULT-NEXT: [[N_VEC6:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF5]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP35:%.*]] = trunc [[BROADCAST_SPLAT8]] to +; DEFAULT-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr 
[[SRC]], i64 [[INDEX9]] +; DEFAULT-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP36]], align 1 +; DEFAULT-NEXT: [[TMP37:%.*]] = zext [[WIDE_LOAD10]] to +; DEFAULT-NEXT: [[TMP38:%.*]] = mul [[TMP37]], [[TMP35]] +; DEFAULT-NEXT: [[TMP39:%.*]] = or [[TMP38]], [[TMP37]] +; DEFAULT-NEXT: [[TMP40:%.*]] = lshr [[TMP39]], splat (i16 1) +; DEFAULT-NEXT: [[TMP41:%.*]] = trunc [[TMP40]] to +; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX9]] +; DEFAULT-NEXT: store [[TMP41]], ptr [[TMP42]], align 1 +; DEFAULT-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX9]], [[TMP34]] +; DEFAULT-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]] +; DEFAULT-NEXT: br i1 [[TMP43]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; DEFAULT-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC6]] +; DEFAULT-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; DEFAULT: [[VEC_EPILOG_SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ITER_CHECK]] ] ; DEFAULT-NEXT: br label %[[LOOP:.*]] ; DEFAULT: [[LOOP]]: -; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; DEFAULT-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]] ; DEFAULT-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1 ; DEFAULT-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 -; DEFAULT-NEXT: [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]] +; DEFAULT-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], [[X]] ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; DEFAULT-NEXT: [[CONV25_US:%.*]] = zext i8 [[L]] to i32 -; DEFAULT-NEXT: [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]] -; 
DEFAULT-NEXT: [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1 -; DEFAULT-NEXT: [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8 +; DEFAULT-NEXT: [[L_EXT_2:%.*]] = zext i8 [[L]] to i32 +; DEFAULT-NEXT: [[OR:%.*]] = or i32 [[MUL]], [[L_EXT_2]] +; DEFAULT-NEXT: [[LSHR:%.*]] = lshr i32 [[OR]], 1 +; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i8 ; DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] -; DEFAULT-NEXT: store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1 +; DEFAULT-NEXT: store i8 [[TRUNC]], ptr [[GEP_DST]], align 1 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[EXIT]]: ; DEFAULT-NEXT: ret void ; @@ -101,31 +139,31 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16 ; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 ; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 4 -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]] -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]] -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 +; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 4 +; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP0]], [[TMP7]] +; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP0]], [[TMP7]] +; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 ; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[TMP16:%.*]] = trunc [[BROADCAST_SPLAT]] to +; PRED-NEXT: [[TMP11:%.*]] = trunc [[BROADCAST_SPLAT]] to 
; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP17:%.*]] = zext [[WIDE_MASKED_LOAD]] to -; PRED-NEXT: [[TMP22:%.*]] = mul [[TMP17]], [[TMP16]] -; PRED-NEXT: [[TMP20:%.*]] = or [[TMP22]], [[TMP17]] -; PRED-NEXT: [[TMP21:%.*]] = lshr [[TMP20]], splat (i16 1) -; PRED-NEXT: [[TMP23:%.*]] = trunc [[TMP21]] to -; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP23]], ptr [[TMP26]], i32 1, [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP13:%.*]] = zext [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[TMP14:%.*]] = mul [[TMP13]], [[TMP11]] +; PRED-NEXT: [[TMP15:%.*]] = or [[TMP14]], [[TMP13]] +; PRED-NEXT: [[TMP16:%.*]] = lshr [[TMP15]], splat (i16 1) +; PRED-NEXT: [[TMP17:%.*]] = trunc [[TMP16]] to +; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] +; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP17]], ptr [[TMP18]], i32 1, [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP25:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP27:%.*]] = xor i1 [[TMP25]], true -; PRED-NEXT: br i1 [[TMP27]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP10]]) +; PRED-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true +; PRED-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: @@ -136,14 +174,14 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]] ; PRED-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1 ; PRED-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 -; PRED-NEXT: [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]] +; PRED-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], [[X]] ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; PRED-NEXT: [[CONV25_US:%.*]] = zext i8 [[L]] to i32 -; PRED-NEXT: [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]] -; PRED-NEXT: [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1 -; PRED-NEXT: [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8 +; PRED-NEXT: [[L_EXT_2:%.*]] = zext i8 [[L]] to i32 +; PRED-NEXT: [[OR:%.*]] = or i32 [[MUL]], [[L_EXT_2]] +; PRED-NEXT: [[LSHR:%.*]] = lshr i32 [[OR]], 1 +; PRED-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i8 ; PRED-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] -; PRED-NEXT: store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1 +; PRED-NEXT: store i8 [[TRUNC]], ptr [[GEP_DST]], align 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] ; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; PRED: [[EXIT]]: @@ -204,18 +242,18 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP13:%.*]] = trunc i64 
[[INDEX]] to i32 -; DEFAULT-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], 1 -; DEFAULT-NEXT: [[TMP16:%.*]] = mul i32 [[MUL_X]], [[TMP13]] -; DEFAULT-NEXT: [[TMP17:%.*]] = mul i32 [[MUL_X]], [[TMP15]] +; DEFAULT-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 1 +; DEFAULT-NEXT: [[TMP15:%.*]] = mul i32 [[MUL_X]], [[TMP13]] +; DEFAULT-NEXT: [[TMP16:%.*]] = mul i32 [[MUL_X]], [[TMP14]] +; DEFAULT-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 ; DEFAULT-NEXT: [[TMP18:%.*]] = zext i32 [[TMP16]] to i64 -; DEFAULT-NEXT: [[TMP19:%.*]] = zext i32 [[TMP17]] to i64 +; DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP17]] ; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP18]] -; DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] +; DEFAULT-NEXT: store i32 1, ptr [[TMP19]], align 4 ; DEFAULT-NEXT: store i32 1, ptr [[TMP20]], align 4 -; DEFAULT-NEXT: store i32 1, ptr [[TMP21]], align 4 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; DEFAULT-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -231,7 +269,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT-NEXT: store i32 1, ptr [[GEP]], align 4 ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; DEFAULT: [[EXIT]]: ; DEFAULT-NEXT: 
ret void ; @@ -361,31 +399,31 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; DEFAULT-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; DEFAULT-NEXT: [[TMP14:%.*]] = trunc i64 [[N_VEC]] to i32 ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 ; DEFAULT-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 1 ; DEFAULT-NEXT: [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32 -; DEFAULT-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], 1 -; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[MUL]], [[TMP16]] -; DEFAULT-NEXT: [[TMP20:%.*]] = mul i32 [[MUL]], [[TMP18]] +; DEFAULT-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 1 +; DEFAULT-NEXT: [[TMP18:%.*]] = mul i32 [[MUL]], [[TMP16]] +; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[MUL]], [[TMP17]] +; DEFAULT-NEXT: [[TMP20:%.*]] = zext i32 [[TMP18]] to i64 ; DEFAULT-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 -; DEFAULT-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64 +; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]] ; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP21]] -; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]] -; DEFAULT-NEXT: store i32 [[OFFSET_IDX]], ptr [[TMP23]], align 4 -; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[TMP24]], align 4 +; DEFAULT-NEXT: store i32 [[OFFSET_IDX]], ptr [[TMP22]], align 4 +; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[TMP23]], align 4 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; DEFAULT-NEXT: 
[[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; DEFAULT: [[SCALAR_PH]]: ; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] -; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; DEFAULT-NEXT: br label %[[LOOP:.*]] ; DEFAULT: [[LOOP]]: ; DEFAULT-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] @@ -398,7 +436,7 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; DEFAULT-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; DEFAULT: [[EXIT]]: ; DEFAULT-NEXT: ret void ; @@ -553,31 +591,31 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; DEFAULT-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; DEFAULT-NEXT: [[TMP13:%.*]] = trunc i64 [[N_VEC]] to i32 ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: 
[[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 ; DEFAULT-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 1 ; DEFAULT-NEXT: [[TMP15:%.*]] = trunc i64 [[INDEX]] to i32 -; DEFAULT-NEXT: [[TMP17:%.*]] = add i32 [[TMP15]], 1 -; DEFAULT-NEXT: [[TMP18:%.*]] = mul i32 [[ADD]], [[TMP15]] -; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[ADD]], [[TMP17]] +; DEFAULT-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 1 +; DEFAULT-NEXT: [[TMP17:%.*]] = mul i32 [[ADD]], [[TMP15]] +; DEFAULT-NEXT: [[TMP18:%.*]] = mul i32 [[ADD]], [[TMP16]] +; DEFAULT-NEXT: [[TMP19:%.*]] = zext i32 [[TMP17]] to i64 ; DEFAULT-NEXT: [[TMP20:%.*]] = zext i32 [[TMP18]] to i64 -; DEFAULT-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 +; DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] ; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]] -; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP21]] -; DEFAULT-NEXT: store i32 [[OFFSET_IDX]], ptr [[TMP22]], align 4 -; DEFAULT-NEXT: store i32 [[TMP14]], ptr [[TMP23]], align 4 +; DEFAULT-NEXT: store i32 [[OFFSET_IDX]], ptr [[TMP21]], align 4 +; DEFAULT-NEXT: store i32 [[TMP14]], ptr [[TMP22]], align 4 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; DEFAULT-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; DEFAULT-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; DEFAULT: [[SCALAR_PH]]: ; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] -; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], 
%[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; DEFAULT-NEXT: br label %[[LOOP:.*]] ; DEFAULT: [[LOOP]]: ; DEFAULT-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] @@ -590,7 +628,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; DEFAULT: [[EXIT]]: ; DEFAULT-NEXT: ret void ; @@ -727,32 +765,32 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; DEFAULT: [[VECTOR_SCEVCHECK]]: ; DEFAULT-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 -; DEFAULT-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32 -; DEFAULT-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]] -; DEFAULT-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 -; DEFAULT-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 -; DEFAULT-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] -; DEFAULT-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; DEFAULT-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +; DEFAULT-NEXT: [[TMP2:%.*]] = add i32 1, [[TMP1]] +; DEFAULT-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 1 +; DEFAULT-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 +; DEFAULT-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; DEFAULT-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 2 ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]] -; DEFAULT-NEXT: 
[[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; DEFAULT-NEXT: [[TMP6:%.*]] = trunc i64 [[N_VEC]] to i32 ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; DEFAULT-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 -; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[INDEX]], i32 2 -; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP8]], i32 2 +; DEFAULT-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1 +; DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[INDEX]], i32 2 +; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP7]], i32 2 +; DEFAULT-NEXT: store i32 0, ptr [[TMP8]], align 8 ; DEFAULT-NEXT: store i32 0, ptr [[TMP9]], align 8 -; DEFAULT-NEXT: store i32 0, ptr [[TMP10]], align 8 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; DEFAULT-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; 
DEFAULT-NEXT: br label %[[LOOP:.*]] ; DEFAULT: [[LOOP]]: @@ -763,7 +801,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; DEFAULT-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; DEFAULT-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64 ; DEFAULT-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]] -; DEFAULT-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; DEFAULT-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; DEFAULT: [[EXIT]]: ; DEFAULT-NEXT: ret void ; @@ -775,45 +813,45 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED: [[VECTOR_SCEVCHECK]]: ; PRED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) ; PRED-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 -; PRED-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32 -; PRED-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]] -; PRED-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 -; PRED-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 -; PRED-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] -; PRED-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; PRED-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +; PRED-NEXT: [[TMP2:%.*]] = add i32 1, [[TMP1]] +; PRED-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 1 +; PRED-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 +; PRED-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; PRED-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1 -; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; PRED-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], -; PRED-NEXT: [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT3]] -; PRED-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 -; PRED-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT3]], +; PRED-NEXT: [[TMP6:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; PRED-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; PRED-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP10:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP9]], i32 2 -; PRED-NEXT: store i32 0, ptr [[TMP10]], align 8 +; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; PRED-NEXT: [[TMP9:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP8]], i32 2 +; PRED-NEXT: store i32 0, ptr [[TMP9]], align 8 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] ; PRED: 
[[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; PRED-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; PRED-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; PRED-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] ; PRED: [[PRED_STORE_IF4]]: -; PRED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1 -; PRED-NEXT: [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2 -; PRED-NEXT: store i32 0, ptr [[TMP13]], align 8 +; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; PRED-NEXT: [[TMP12:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP11]], i32 2 +; PRED-NEXT: store i32 0, ptr [[TMP12]], align 8 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]] ; PRED: [[PRED_STORE_CONTINUE5]]: ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; PRED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll index 7ff4609f8ec4b..a869cf647b5ce 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll @@ -24,59 +24,59 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE-4: vector.body: ; INTERLEAVE-4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; 
INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] -; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 -; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1 -; INTERLEAVE-4-NEXT: [[TMP12]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]] -; INTERLEAVE-4-NEXT: [[TMP14]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]] -; INTERLEAVE-4-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] +; INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; 
INTERLEAVE-4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] +; INTERLEAVE-4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4 +; INTERLEAVE-4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8 +; INTERLEAVE-4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1 +; INTERLEAVE-4-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; INTERLEAVE-4-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]] +; INTERLEAVE-4-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]] +; INTERLEAVE-4-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] ; INTERLEAVE-4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-4-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE-4: middle.block: -; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[TMP12]] -; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP14]], [[BIN_RDX]] -; INTERLEAVE-4-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX8]] -; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX9]]) +; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]] +; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]] +; INTERLEAVE-4-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> 
[[TMP7]], [[BIN_RDX8]] +; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX9]]) ; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; INTERLEAVE-4: vec.epilog.iter.check: ; INTERLEAVE-4-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] ; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; INTERLEAVE-4: vec.epilog.ph: ; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; INTERLEAVE-4-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 4 ; INTERLEAVE-4-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]] -; INTERLEAVE-4-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 +; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 ; INTERLEAVE-4-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; INTERLEAVE-4: vec.epilog.vector.body: ; INTERLEAVE-4-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX12]] 
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP20]], align 1 -; INTERLEAVE-4-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI13]], [[WIDE_LOAD14]] +; INTERLEAVE-4-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i32> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX12]] +; INTERLEAVE-4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1 +; INTERLEAVE-4-NEXT: [[TMP12]] = add <4 x i32> [[VEC_PHI13]], [[WIDE_LOAD14]] ; INTERLEAVE-4-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 4 -; INTERLEAVE-4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]] -; INTERLEAVE-4-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]] +; INTERLEAVE-4-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; INTERLEAVE-4: vec.epilog.middle.block: -; INTERLEAVE-4-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP22]]) +; INTERLEAVE-4-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) ; INTERLEAVE-4-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC11]] ; INTERLEAVE-4-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; INTERLEAVE-4: vec.epilog.scalar.ph: ; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i32 [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i32 [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; 
INTERLEAVE-4-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-4: loop: ; INTERLEAVE-4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -86,9 +86,9 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[L]] ; INTERLEAVE-4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; INTERLEAVE-4-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; INTERLEAVE-4-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; INTERLEAVE-4: exit: -; INTERLEAVE-4-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; INTERLEAVE-4-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; INTERLEAVE-4-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; ; INTERLEAVE-2-LABEL: @interleave_integer_reduction( @@ -101,25 +101,25 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-2-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE-2: vector.body: ; INTERLEAVE-2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-2-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] -; INTERLEAVE-2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4 -; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 -; INTERLEAVE-2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1 -; INTERLEAVE-2-NEXT: [[TMP6]] = add <4 x i32> 
[[VEC_PHI]], [[WIDE_LOAD]] -; INTERLEAVE-2-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD2]] +; INTERLEAVE-2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-2-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] +; INTERLEAVE-2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4 +; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 1 +; INTERLEAVE-2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1 +; INTERLEAVE-2-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; INTERLEAVE-2-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD2]] ; INTERLEAVE-2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; INTERLEAVE-2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-2-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE-2: middle.block: -; INTERLEAVE-2-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP7]], [[TMP6]] -; INTERLEAVE-2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; INTERLEAVE-2-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]] +; INTERLEAVE-2-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; INTERLEAVE-2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE-2: scalar.ph: ; INTERLEAVE-2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 
[[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; INTERLEAVE-2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; INTERLEAVE-2-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-2: loop: ; INTERLEAVE-2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -131,15 +131,18 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; INTERLEAVE-2-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; INTERLEAVE-2: exit: -; INTERLEAVE-2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-2-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; ; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction( -; INTERLEAVE-4-VLA-NEXT: entry: +; INTERLEAVE-4-VLA-NEXT: iter.check: +; INTERLEAVE-4-VLA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2 +; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; INTERLEAVE-4-VLA: vector.main.loop.iter.check: ; INTERLEAVE-4-VLA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-4-VLA-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 -; INTERLEAVE-4-VLA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] -; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; INTERLEAVE-4-VLA-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE-4-VLA: vector.ph: ; INTERLEAVE-4-VLA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-4-VLA-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 @@ -148,53 +151,77 @@ define i32 @interleave_integer_reduction(ptr %src, 
i64 %N) { ; INTERLEAVE-4-VLA-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE-4-VLA: vector.body: ; INTERLEAVE-4-VLA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-VLA-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] -; INTERLEAVE-4-VLA-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-4-VLA-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2 -; INTERLEAVE-4-VLA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 [[TMP8]] -; INTERLEAVE-4-VLA-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-4-VLA-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 3 -; INTERLEAVE-4-VLA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 [[TMP11]] -; INTERLEAVE-4-VLA-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-4-VLA-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 12 -; INTERLEAVE-4-VLA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 [[TMP14]] -; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 -; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP9]], align 1 -; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP12]], align 1 -; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP15]], align 1 -; INTERLEAVE-4-VLA-NEXT: [[TMP16]] = add [[VEC_PHI]], [[WIDE_LOAD]] -; INTERLEAVE-4-VLA-NEXT: [[TMP17]] = add [[VEC_PHI1]], [[WIDE_LOAD4]] -; INTERLEAVE-4-VLA-NEXT: [[TMP18]] = add [[VEC_PHI2]], 
[[WIDE_LOAD5]] -; INTERLEAVE-4-VLA-NEXT: [[TMP19]] = add [[VEC_PHI3]], [[WIDE_LOAD6]] +; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-VLA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] +; INTERLEAVE-4-VLA-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-4-VLA-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 +; INTERLEAVE-4-VLA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP6]] +; INTERLEAVE-4-VLA-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-4-VLA-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 +; INTERLEAVE-4-VLA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP9]] +; INTERLEAVE-4-VLA-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-4-VLA-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 12 +; INTERLEAVE-4-VLA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP12]] +; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP7]], align 1 +; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP10]], align 1 +; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP13]], align 1 +; INTERLEAVE-4-VLA-NEXT: [[TMP14]] = add [[VEC_PHI]], [[WIDE_LOAD]] +; INTERLEAVE-4-VLA-NEXT: [[TMP15]] = add [[VEC_PHI2]], [[WIDE_LOAD5]] +; INTERLEAVE-4-VLA-NEXT: [[TMP16]] = add [[VEC_PHI3]], [[WIDE_LOAD6]] +; INTERLEAVE-4-VLA-NEXT: [[TMP17]] = add [[VEC_PHI4]], [[WIDE_LOAD7]] ; INTERLEAVE-4-VLA-NEXT: [[INDEX_NEXT]] = add nuw 
i64 [[INDEX]], [[TMP3]] -; INTERLEAVE-4-VLA-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-4-VLA-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; INTERLEAVE-4-VLA-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-4-VLA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE-4-VLA: middle.block: -; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX:%.*]] = add [[TMP17]], [[TMP16]] -; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX7:%.*]] = add [[TMP18]], [[BIN_RDX]] -; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX8:%.*]] = add [[TMP19]], [[BIN_RDX7]] -; INTERLEAVE-4-VLA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX8]]) +; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX:%.*]] = add [[TMP15]], [[TMP14]] +; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX8:%.*]] = add [[TMP16]], [[BIN_RDX]] +; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX9:%.*]] = add [[TMP17]], [[BIN_RDX8]] +; INTERLEAVE-4-VLA-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX9]]) ; INTERLEAVE-4-VLA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; INTERLEAVE-4-VLA-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; INTERLEAVE-4-VLA: scalar.ph: -; INTERLEAVE-4-VLA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-4-VLA-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; INTERLEAVE-4-VLA-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; INTERLEAVE-4-VLA: vec.epilog.iter.check: +; INTERLEAVE-4-VLA-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; INTERLEAVE-4-VLA-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] +; INTERLEAVE-4-VLA: vec.epilog.ph: +; 
INTERLEAVE-4-VLA-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; INTERLEAVE-4-VLA-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; INTERLEAVE-4-VLA-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 2 +; INTERLEAVE-4-VLA-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]] +; INTERLEAVE-4-VLA-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 +; INTERLEAVE-4-VLA-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; INTERLEAVE-4-VLA: vec.epilog.vector.body: +; INTERLEAVE-4-VLA-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI13:%.*]] = phi <2 x i32> [ [[TMP20]], [[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-VLA-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX12]] +; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x i32>, ptr [[TMP21]], align 1 +; INTERLEAVE-4-VLA-NEXT: [[TMP22]] = add <2 x i32> [[VEC_PHI13]], [[WIDE_LOAD14]] +; INTERLEAVE-4-VLA-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 2 +; INTERLEAVE-4-VLA-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]] +; INTERLEAVE-4-VLA-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; INTERLEAVE-4-VLA: vec.epilog.middle.block: +; INTERLEAVE-4-VLA-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP22]]) +; INTERLEAVE-4-VLA-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC11]] +; INTERLEAVE-4-VLA-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; INTERLEAVE-4-VLA: vec.epilog.scalar.ph: +; INTERLEAVE-4-VLA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], 
[[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; INTERLEAVE-4-VLA-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i32 [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; INTERLEAVE-4-VLA-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-4-VLA: loop: -; INTERLEAVE-4-VLA-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; INTERLEAVE-4-VLA-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-4-VLA-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-4-VLA-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] ; INTERLEAVE-4-VLA-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] ; INTERLEAVE-4-VLA-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 ; INTERLEAVE-4-VLA-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[L]] ; INTERLEAVE-4-VLA-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-4-VLA-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; INTERLEAVE-4-VLA-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-4-VLA-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; INTERLEAVE-4-VLA: exit: -; INTERLEAVE-4-VLA-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-4-VLA-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; INTERLEAVE-4-VLA-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index c444d5bcc82c7..f4784b6259ce1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -33,10 +33,10 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] ; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -116,10 +116,10 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], 
[[TMP16]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -186,9 +186,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) ; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -268,9 
+268,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -338,9 +338,9 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -421,9 +421,9 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -493,10 +493,10 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = 
sub nsw <16 x i32> zeroinitializer, [[TMP10]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] ; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -577,10 +577,10 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sub zeroinitializer, [[TMP18]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call 
@llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP19]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -650,11 +650,11 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) ; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -736,11 +736,11 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -813,12 +813,12 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] ; CHECK-NEON-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -901,12 +901,12 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = 
mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw [[TMP14]], [[TMP15]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP20]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP20]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -979,8 +979,8 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP9]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP9]]) ; CHECK-NEON-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1059,8 +1059,8 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP15]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP15]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1123,8 +1123,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEON-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x 
i32> [[TMP6]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -1195,8 +1195,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP11]]) -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE2]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP12]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP11]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE2]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP12]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -1259,9 +1259,9 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> 
[[WIDE_LOAD2]] to <16 x i32> -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -1339,9 +1339,9 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP15]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP15]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP16]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: 
[[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -1381,6 +1381,131 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 } +define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { +; CHECK-NEON-LABEL: define i32 @red_extended_add_chain( +; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0 +; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> 
[[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEON: scalar.ph: +; +; CHECK-SVE-LABEL: define i32 @red_extended_add_chain( +; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-SVE-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-SVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[OFFSET]], i64 0 +; 
CHECK-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP]], align 1 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP8:%.*]] = add [[VEC_PHI]], [[TMP7]] +; CHECK-SVE-NEXT: [[TMP9]] = add [[TMP8]], [[BROADCAST_SPLAT]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP9]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-SVE: scalar.ph: +; +; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_chain( +; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[OFFSET]], i64 0 +; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv8i32.nxv8i32( [[VEC_PHI]], [[TMP7]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP8]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; 
CHECK-SVE-MAXBW: scalar.ph: +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %add = add i32 %red, %l.ext + %red.next = add i32 %add, %offset + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} attributes #0 = { vscale_range(1,16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 9d4a969b571e7..1f72cc2be856d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -21,7 +21,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -77,7 +77,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32> ; 
CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -491,7 +491,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> ; CHECK-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 8766d6540ed19..2535de7a2b0c6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -38,8 +38,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD4]] to ; CHECK-NEXT: [[TMP20:%.*]] = mul [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP21:%.*]] = mul 
[[TMP19]], [[TMP12]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP20]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP21]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP20]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP21]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -82,8 +82,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD4]] to ; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul [[TMP18]], [[TMP11]] ; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] -; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP20]]) -; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP21]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP20]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP21]]) ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -149,8 +149,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD4]] to ; CHECK-NEXT: [[TMP20:%.*]] = mul [[TMP18]], [[TMP11]] ; CHECK-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] -; 
CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP20]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP21]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP20]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP21]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -193,8 +193,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD4]] to ; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul [[TMP18]], [[TMP11]] ; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] -; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP20]]) -; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP21]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP20]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP21]]) ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -252,8 +252,8 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]] ; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> 
[[TMP9]], [[TMP4]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -345,8 +345,8 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]] ; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index cc3203cdff46f..7b6c52cd2f39b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -23,7 +23,7 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -56,8 +56,8 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> 
[[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -83,7 +83,7 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -712,25 +712,25 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVE1-NEXT: 
[[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -785,8 +785,8 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]]) ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 @@ -799,8 +799,8 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: 
[[TMP29:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP27]] ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]]) ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1 @@ -813,8 +813,8 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]] ; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x 
i32> [[VEC_PHI3]], <16 x i32> [[TMP40]]) ; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1 @@ -827,8 +827,8 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP47]] ; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP48]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -876,25 +876,25 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 ; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> ; 
CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1288,7 +1288,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-INTERLEAVE1-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-INTERLEAVE1-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1623,7 +1623,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-INTERLEAVED-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-INTERLEAVED-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1958,7 +1958,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-MAXBW-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-MAXBW-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 17fbbbd1d6843..67e2c08139efe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -101,7 +101,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP14]], align 1 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] -; CHECK-MAXBW-NEXT: 
[[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -1275,25 +1275,25 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP8]], align 1 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext [[WIDE_LOAD9]] to ; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw [[TMP29]], [[TMP23]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI7]], [[TMP31]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI7]], [[TMP31]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP10]], align 1 ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD12]] to ; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP11]], align 1 ; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext [[WIDE_LOAD14]] to ; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw [[TMP37]], [[TMP43]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI6]], [[TMP45]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI6]], [[TMP45]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP13]], align 1 ; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext [[WIDE_LOAD18]] to ; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP14]], align 1 ; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext [[WIDE_LOAD20]] to ; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw [[TMP51]], [[TMP57]] -; CHECK-MAXBW-NEXT: 
[[PARTIAL_REDUCE17]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI5]], [[TMP59]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI5]], [[TMP59]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load , ptr [[TMP16]], align 1 ; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext [[WIDE_LOAD24]] to ; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load , ptr [[TMP17]], align 1 ; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD26]] to ; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw [[TMP65]], [[TMP71]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -1462,7 +1462,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP18]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP18]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -2441,28 +2441,28 @@ define dso_local void @not_dotp_high_register_pressure(ptr 
%a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP31]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP24]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> 
[[VEC_PHI4]], <16 x i32> [[TMP20]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP34]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 
; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] @@ -2541,28 +2541,28 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> ; 
CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) ; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> ; 
CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] @@ -2641,28 +2641,28 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; 
CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll index e24b47db14008..b308b925181b1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll @@ -27,8 +27,8 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star ; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP4]], [[TMP4]] ; IC2-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP5]] -; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP7]]) +; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> 
@llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP7]]) ; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; IC2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -88,10 +88,10 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star ; IC4-NEXT: [[TMP11:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]] ; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]] ; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]] -; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; IC4-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) -; IC4-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]]) -; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]]) +; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; IC4-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; IC4-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]]) +; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]]) ; IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 
64 ; IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll index 64cb33181cc1e..efffadc559f62 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll @@ -105,7 +105,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]] ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP14]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP14]]) ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 370bfc641001a..40cce22116db9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -73,7 +73,7 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 { ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP8]]) +; CHECK-MAXBW-NEXT: 
[[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP8]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -114,7 +114,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 { ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -139,8 +139,8 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> 
@llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -162,7 +162,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 { ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -255,7 +255,7 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv16i64( [[VEC_PHI]], [[TMP9]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -349,7 +349,7 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 2 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[TMP9]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -391,7 +391,7 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -416,8 +416,8 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, 
ptr [[TMP3]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -443,7 +443,7 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -546,7 +546,7 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 { ; CHECK-MAXBW-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext [[WIDE_MASKED_LOAD]] to ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -817,7 +817,7 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1061,7 +1061,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-MAXBW-NEXT: store zeroinitializer, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 8b2da8c4a7047..55e24486de4b6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -376,12 +376,17 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; ; VSCALEFORTUNING2-LABEL: define i16 @reduce_udiv( ; VSCALEFORTUNING2-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; VSCALEFORTUNING2-NEXT: [[ENTRY:.*]]: +; VSCALEFORTUNING2-NEXT: [[ITER_CHECK:.*]]: ; VSCALEFORTUNING2-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; VSCALEFORTUNING2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VSCALEFORTUNING2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3 +; VSCALEFORTUNING2-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP1]], 1 +; VSCALEFORTUNING2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], [[TMP5]] +; VSCALEFORTUNING2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; VSCALEFORTUNING2: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; VSCALEFORTUNING2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; VSCALEFORTUNING2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP6]], 3 ; VSCALEFORTUNING2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; VSCALEFORTUNING2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; 
VSCALEFORTUNING2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; VSCALEFORTUNING2: [[VECTOR_PH]]: ; VSCALEFORTUNING2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; VSCALEFORTUNING2-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8 @@ -411,23 +416,54 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; VSCALEFORTUNING2-NEXT: [[BIN_RDX:%.*]] = or [[TMP16]], [[TMP15]] ; VSCALEFORTUNING2-NEXT: [[TMP18:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16( [[BIN_RDX]]) ; VSCALEFORTUNING2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; VSCALEFORTUNING2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; VSCALEFORTUNING2: [[SCALAR_PH]]: -; VSCALEFORTUNING2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; VSCALEFORTUNING2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VSCALEFORTUNING2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; VSCALEFORTUNING2: [[VEC_EPILOG_ITER_CHECK]]: +; VSCALEFORTUNING2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; VSCALEFORTUNING2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; VSCALEFORTUNING2-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP22]], 1 +; VSCALEFORTUNING2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]] +; VSCALEFORTUNING2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; VSCALEFORTUNING2: [[VEC_EPILOG_PH]]: +; VSCALEFORTUNING2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; VSCALEFORTUNING2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; VSCALEFORTUNING2-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; VSCALEFORTUNING2-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 2 +; VSCALEFORTUNING2-NEXT: 
[[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], [[TMP20]] +; VSCALEFORTUNING2-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] +; VSCALEFORTUNING2-NEXT: [[TMP21:%.*]] = insertelement zeroinitializer, i16 [[BC_MERGE_RDX]], i32 0 +; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i16 [[X]], i64 0 +; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer +; VSCALEFORTUNING2-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; VSCALEFORTUNING2: [[VEC_EPILOG_VECTOR_BODY]]: +; VSCALEFORTUNING2-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; VSCALEFORTUNING2-NEXT: [[VEC_PHI9:%.*]] = phi [ [[TMP21]], %[[VEC_EPILOG_PH]] ], [ [[TMP24:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; VSCALEFORTUNING2-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]] +; VSCALEFORTUNING2-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[GEP]], align 2 +; VSCALEFORTUNING2-NEXT: [[TMP23:%.*]] = udiv [[WIDE_LOAD10]], [[BROADCAST_SPLAT7]] +; VSCALEFORTUNING2-NEXT: [[TMP24]] = or [[TMP23]], [[VEC_PHI9]] +; VSCALEFORTUNING2-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[IV]], [[TMP20]] +; VSCALEFORTUNING2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]] +; VSCALEFORTUNING2-NEXT: br i1 [[TMP25]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VSCALEFORTUNING2: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; VSCALEFORTUNING2-NEXT: [[TMP26:%.*]] = call i16 @llvm.vector.reduce.or.nxv2i16( [[TMP24]]) +; VSCALEFORTUNING2-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] +; VSCALEFORTUNING2-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; VSCALEFORTUNING2: [[VEC_EPILOG_SCALAR_PH]]: +; VSCALEFORTUNING2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; 
VSCALEFORTUNING2-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i16 [ [[TMP26]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; VSCALEFORTUNING2-NEXT: br label %[[LOOP:.*]] ; VSCALEFORTUNING2: [[LOOP]]: -; VSCALEFORTUNING2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VSCALEFORTUNING2-NEXT: [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; VSCALEFORTUNING2-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]] -; VSCALEFORTUNING2-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2 +; VSCALEFORTUNING2-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VSCALEFORTUNING2-NEXT: [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; VSCALEFORTUNING2-NEXT: [[GEP1:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV1]] +; VSCALEFORTUNING2-NEXT: [[L:%.*]] = load i16, ptr [[GEP1]], align 2 ; VSCALEFORTUNING2-NEXT: [[DIV:%.*]] = udiv i16 [[L]], [[X]] ; VSCALEFORTUNING2-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]] -; VSCALEFORTUNING2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VSCALEFORTUNING2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; VSCALEFORTUNING2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VSCALEFORTUNING2-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1 +; VSCALEFORTUNING2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N]] +; VSCALEFORTUNING2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; VSCALEFORTUNING2: [[EXIT]]: -; VSCALEFORTUNING2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; VSCALEFORTUNING2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP26]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; VSCALEFORTUNING2-NEXT: ret i16 [[RED_NEXT_LCSSA]] ; ; 
PRED-LABEL: define i16 @reduce_udiv( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index 01d103264fafe..c61361bb3df76 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -74,7 +74,7 @@ define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 ; CHECK: LV(REG): VF = 16 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 48 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 47 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item entry: %cmp100 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll index 6f605acd7ecbe..32485c7428908 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -518,11 +518,8 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP13:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 2 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP15]] ; CHECK-NEXT: store [[TMP12]], ptr [[TMP8]], align 4 -; CHECK-NEXT: store [[TMP13]], ptr [[TMP16]], align 4 +; CHECK-NEXT: store [[TMP13]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -584,11 +581,8 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1 ; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-VF8-NEXT: [[TMP10:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-VF8-NEXT: [[TMP11:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-VF8-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 2 -; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP13]] ; CHECK-VF8-NEXT: store [[TMP10]], ptr [[TMP6]], align 4 -; CHECK-VF8-NEXT: store [[TMP11]], ptr [[TMP14]], align 4 +; CHECK-VF8-NEXT: store [[TMP11]], ptr [[TMP9]], align 4 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-VF8-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -656,11 +650,8 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalia ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP13:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 2 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP15]] ; CHECK-NEXT: store [[TMP12]], ptr [[TMP8]], align 4 -; CHECK-NEXT: store [[TMP13]], ptr [[TMP16]], align 4 +; CHECK-NEXT: store [[TMP13]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -719,11 +710,8 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr 
noalias %a, ptr noalia ; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-VF8-NEXT: [[TMP10:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-VF8-NEXT: [[TMP11:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-VF8-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 2 -; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP13]] ; CHECK-VF8-NEXT: store [[TMP10]], ptr [[TMP6]], align 4 -; CHECK-VF8-NEXT: store [[TMP11]], ptr [[TMP14]], align 4 +; CHECK-VF8-NEXT: store [[TMP11]], ptr [[TMP9]], align 4 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-VF8-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll index 4444be36c3567..643ff29136d22 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll @@ -76,11 +76,8 @@ define void @vscale_mul_8(ptr noalias noundef readonly captures(none) %a, ptr n ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP18:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = shl nuw i64 [[TMP20]], 2 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP21]] ; CHECK-NEXT: store [[TMP17]], ptr [[B]], align 4 -; CHECK-NEXT: store [[TMP18]], ptr [[TMP22]], align 4 +; CHECK-NEXT: store [[TMP18]], ptr [[TMP16]], align 4 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[MUL1]], [[N_VEC]] ; CHECK-NEXT: br i1 
[[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY:.*]] ; CHECK: [[FOR_COND_CLEANUP]]: @@ -216,11 +213,8 @@ define void @vscale_mul_31(ptr noalias noundef readonly captures(none) %a, ptr n ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP18:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 2 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]] ; CHECK-NEXT: store [[TMP17]], ptr [[TMP12]], align 4 -; CHECK-NEXT: store [[TMP18]], ptr [[TMP21]], align 4 +; CHECK-NEXT: store [[TMP18]], ptr [[TMP16]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -296,11 +290,8 @@ define void @vscale_mul_64(ptr noalias noundef readonly captures(none) %a, ptr n ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP18:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 2 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]] ; CHECK-NEXT: store [[TMP17]], ptr [[TMP12]], align 4 -; CHECK-NEXT: store [[TMP18]], ptr [[TMP21]], align 4 +; CHECK-NEXT: store [[TMP18]], ptr [[TMP16]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -378,11 +369,8 @@ define void @trip_count_with_overflow(ptr noalias 
noundef readonly captures(none ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP19:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = shl nuw i64 [[TMP20]], 2 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]] ; CHECK-NEXT: store [[TMP18]], ptr [[TMP13]], align 4 -; CHECK-NEXT: store [[TMP19]], ptr [[TMP22]], align 4 +; CHECK-NEXT: store [[TMP19]], ptr [[TMP17]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -455,11 +443,8 @@ define void @trip_count_too_big_for_element_count(ptr noalias noundef readonly c ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP19:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = shl nuw i64 [[TMP20]], 2 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]] ; CHECK-NEXT: store [[TMP18]], ptr [[TMP13]], align 4 -; CHECK-NEXT: store [[TMP19]], ptr [[TMP22]], align 4 +; CHECK-NEXT: store [[TMP19]], ptr [[TMP17]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index 3aad98145e2aa..92eb562f5caa6 
100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -437,4 +437,64 @@ exit: ret void } +; FIXME: We should interleave by 2 after narrowing interleave groups to saturate +; load/store units. +define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) { +; CHECK-LABEL: define void @test_interleave_after_narrowing( +; CHECK-SAME: i32 [[N:%.*]], ptr [[X:%.*]], ptr noalias [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.x = getelementptr inbounds nuw float, ptr %x, i64 %iv + %l.x = load float, ptr %gep.x, align 4 + %neg.0 = fneg float %l.x + %gep.y = getelementptr inbounds nuw float, ptr %y, i64 %iv + store float %neg.0, ptr %gep.y, align 4 + %iv.1 = or disjoint i64 %iv, 1 + %gep.x.1 
= getelementptr inbounds nuw float, ptr %x, i64 %iv.1 + %l.x.1 = load float, ptr %gep.x.1, align 4 + %neg.1 = fneg float %l.x.1 + %gep.y.1 = getelementptr inbounds nuw float, ptr %y, i64 %iv.1 + store float %neg.1, ptr %gep.y.1, align 4 + %iv.2 = or disjoint i64 %iv, 2 + %gep.x.2 = getelementptr inbounds nuw float, ptr %x, i64 %iv.2 + %l.x.2 = load float, ptr %gep.x.2, align 4 + %neg.2 = fneg float %l.x.2 + %gep.y.2 = getelementptr inbounds nuw float, ptr %y, i64 %iv.2 + store float %neg.2, ptr %gep.y.2, align 4 + %iv.3 = or disjoint i64 %iv, 3 + %gep.x.3 = getelementptr inbounds nuw float, ptr %x, i64 %iv.3 + %l.x.3 = load float, ptr %gep.x.3, align 4 + %neg.3 = fneg float %l.x.3 + %gep.y.3 = getelementptr inbounds nuw float, ptr %y, i64 %iv.3 + store float %neg.3, ptr %gep.y.3, align 4 + %iv.next = add nuw nsw i64 %iv, 4 + %ec = icmp samesign ult i64 %iv, 1020 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + attributes #0 = { "target-cpu"="neoverse-v2" } diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll index 0b23206134bc0..43cce8005bbf6 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll @@ -10,28 +10,43 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) { ; CHECK-SAME: ptr noalias [[R:%.*]], ptr noalias [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-NEXT: br i1 [[CMP24]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_END13:.*]] -; CHECK: [[FOR_COND1_PREHEADER_PREHEADER]]: +; CHECK-NEXT: br i1 [[CMP24]], label %[[ITER_CHECK:.*]], label %[[FOR_END13:.*]] +; CHECK: [[ITER_CHECK]]: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP69:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP67:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP131:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP132:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP133:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP134:%.*]], 
%[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP135:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 10 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDEX]], i64 0, i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP1]], i64 0, i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP2]], i64 0, i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP3]], i64 0, i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP4]], i64 0, i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP5]], i64 0, i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP6]], i64 0, i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP7]], i64 0, i32 0 ; CHECK-NEXT: [[WIDE_VEC35:%.*]] = load <12 x float>, ptr [[TMP13]], align 8 ; CHECK-NEXT: [[STRIDED_VEC36:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC37:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32> @@ -60,116 +75,252 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) { ; CHECK-NEXT: [[STRIDED_VEC60:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC61:%.*]] = shufflevector <12 x float> 
[[WIDE_VEC56]], <12 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC62:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <12 x float>, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC49:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC56:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC63:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC64:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC65:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[WIDE_VEC43:%.*]] = load <12 x float>, ptr [[TMP17]], align 8 +; CHECK-NEXT: [[STRIDED_VEC66:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC67:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC68:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC69:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC70:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC71:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[WIDE_VEC50:%.*]] = load <12 x float>, ptr [[TMP18]], align 8 +; CHECK-NEXT: [[STRIDED_VEC72:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC73:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: 
[[STRIDED_VEC80:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC81:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC82:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC83:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[WIDE_VEC57:%.*]] = load <12 x float>, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[STRIDED_VEC84:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC85:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC86:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC87:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC88:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC89:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP64:%.*]] = fmul fast <2 x float> [[STRIDED_VEC36]], [[STRIDED_VEC36]] ; CHECK-NEXT: [[TMP97:%.*]] = fmul fast <2 x float> [[STRIDED_VEC43]], [[STRIDED_VEC43]] ; CHECK-NEXT: [[TMP98:%.*]] = fmul fast <2 x float> [[STRIDED_VEC50]], [[STRIDED_VEC50]] ; CHECK-NEXT: [[TMP99:%.*]] = fmul fast <2 x float> [[STRIDED_VEC57]], [[STRIDED_VEC57]] +; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC42]], [[STRIDED_VEC42]] +; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC66]], [[STRIDED_VEC66]] +; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC72]], [[STRIDED_VEC72]] +; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC84]], [[STRIDED_VEC84]] ; CHECK-NEXT: [[TMP72:%.*]] = fmul fast <2 x float> [[STRIDED_VEC37]], 
[[STRIDED_VEC37]] ; CHECK-NEXT: [[TMP105:%.*]] = fmul fast <2 x float> [[STRIDED_VEC44]], [[STRIDED_VEC44]] ; CHECK-NEXT: [[TMP106:%.*]] = fmul fast <2 x float> [[STRIDED_VEC51]], [[STRIDED_VEC51]] ; CHECK-NEXT: [[TMP107:%.*]] = fmul fast <2 x float> [[STRIDED_VEC58]], [[STRIDED_VEC58]] +; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC49]], [[STRIDED_VEC49]] +; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC67]], [[STRIDED_VEC67]] +; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC73]], [[STRIDED_VEC73]] +; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC85]], [[STRIDED_VEC85]] ; CHECK-NEXT: [[TMP80:%.*]] = fadd fast <2 x float> [[TMP72]], [[TMP64]] ; CHECK-NEXT: [[TMP113:%.*]] = fadd fast <2 x float> [[TMP105]], [[TMP97]] ; CHECK-NEXT: [[TMP114:%.*]] = fadd fast <2 x float> [[TMP106]], [[TMP98]] ; CHECK-NEXT: [[TMP115:%.*]] = fadd fast <2 x float> [[TMP107]], [[TMP99]] -; CHECK-NEXT: [[TMP21:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double> -; CHECK-NEXT: [[TMP22:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double> -; CHECK-NEXT: [[TMP23:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double> -; CHECK-NEXT: [[TMP24:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double> -; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x double> [[TMP21]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x double> [[TMP22]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x double> [[TMP23]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x double> [[TMP24]], [[VEC_PHI3]] -; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]] -; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]] -; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]] -; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]] -; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <2 x float> 
[[STRIDED_VEC39]], [[STRIDED_VEC39]] -; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]] -; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]] -; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]] ; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <2 x float> [[TMP108]], [[TMP100]] ; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <2 x float> [[TMP109]], [[TMP101]] ; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <2 x float> [[TMP110]], [[TMP102]] ; CHECK-NEXT: [[TMP119:%.*]] = fadd fast <2 x float> [[TMP111]], [[TMP103]] +; CHECK-NEXT: [[TMP40:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double> +; CHECK-NEXT: [[TMP52:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double> +; CHECK-NEXT: [[TMP53:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double> +; CHECK-NEXT: [[TMP54:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double> ; CHECK-NEXT: [[TMP41:%.*]] = fpext <2 x float> [[TMP116]] to <2 x double> ; CHECK-NEXT: [[TMP42:%.*]] = fpext <2 x float> [[TMP117]] to <2 x double> ; CHECK-NEXT: [[TMP43:%.*]] = fpext <2 x float> [[TMP118]] to <2 x double> ; CHECK-NEXT: [[TMP44:%.*]] = fpext <2 x float> [[TMP119]] to <2 x double> +; CHECK-NEXT: [[TMP55:%.*]] = fadd fast <2 x double> [[TMP40]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP49:%.*]] = fadd fast <2 x double> [[TMP52]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP50:%.*]] = fadd fast <2 x double> [[TMP53]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP51:%.*]] = fadd fast <2 x double> [[TMP54]], [[VEC_PHI4]] ; CHECK-NEXT: [[TMP45:%.*]] = fadd fast <2 x double> [[TMP41]], [[TMP25]] ; CHECK-NEXT: [[TMP46:%.*]] = fadd fast <2 x double> [[TMP42]], [[TMP26]] ; CHECK-NEXT: [[TMP47:%.*]] = fadd fast <2 x double> [[TMP43]], [[TMP27]] ; CHECK-NEXT: [[TMP48:%.*]] = fadd fast <2 x double> [[TMP44]], [[TMP28]] -; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]] -; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], 
[[STRIDED_VEC47]] -; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]] -; CHECK-NEXT: [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]] -; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]] -; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]] -; CHECK-NEXT: [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]] -; CHECK-NEXT: [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]] +; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]] +; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]] +; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]] +; CHECK-NEXT: [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]] +; CHECK-NEXT: [[TMP60:%.*]] = fmul fast <2 x float> [[STRIDED_VEC56]], [[STRIDED_VEC56]] +; CHECK-NEXT: [[TMP67:%.*]] = fmul fast <2 x float> [[STRIDED_VEC68]], [[STRIDED_VEC68]] +; CHECK-NEXT: [[TMP73:%.*]] = fmul fast <2 x float> [[STRIDED_VEC80]], [[STRIDED_VEC80]] +; CHECK-NEXT: [[TMP74:%.*]] = fmul fast <2 x float> [[STRIDED_VEC86]], [[STRIDED_VEC86]] +; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]] +; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]] +; CHECK-NEXT: [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]] +; CHECK-NEXT: [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]] +; CHECK-NEXT: [[TMP75:%.*]] = fmul fast <2 x float> [[STRIDED_VEC63]], [[STRIDED_VEC63]] +; CHECK-NEXT: [[TMP81:%.*]] = fmul fast <2 x float> [[STRIDED_VEC69]], [[STRIDED_VEC69]] +; CHECK-NEXT: [[TMP70:%.*]] = fmul fast <2 x float> [[STRIDED_VEC81]], [[STRIDED_VEC81]] +; CHECK-NEXT: [[TMP71:%.*]] = fmul fast <2 x float> [[STRIDED_VEC87]], 
[[STRIDED_VEC87]] ; CHECK-NEXT: [[TMP120:%.*]] = fadd fast <2 x float> [[TMP112]], [[TMP104]] ; CHECK-NEXT: [[TMP144:%.*]] = fadd fast <2 x float> [[TMP143]], [[TMP142]] ; CHECK-NEXT: [[TMP149:%.*]] = fadd fast <2 x float> [[TMP148]], [[TMP147]] ; CHECK-NEXT: [[TMP154:%.*]] = fadd fast <2 x float> [[TMP153]], [[TMP152]] +; CHECK-NEXT: [[TMP76:%.*]] = fadd fast <2 x float> [[TMP75]], [[TMP60]] +; CHECK-NEXT: [[TMP77:%.*]] = fadd fast <2 x float> [[TMP81]], [[TMP67]] +; CHECK-NEXT: [[TMP78:%.*]] = fadd fast <2 x float> [[TMP70]], [[TMP73]] +; CHECK-NEXT: [[TMP79:%.*]] = fadd fast <2 x float> [[TMP71]], [[TMP74]] ; CHECK-NEXT: [[TMP61:%.*]] = fpext <2 x float> [[TMP120]] to <2 x double> ; CHECK-NEXT: [[TMP62:%.*]] = fpext <2 x float> [[TMP144]] to <2 x double> ; CHECK-NEXT: [[TMP63:%.*]] = fpext <2 x float> [[TMP149]] to <2 x double> ; CHECK-NEXT: [[TMP155:%.*]] = fpext <2 x float> [[TMP154]] to <2 x double> -; CHECK-NEXT: [[TMP69]] = fadd fast <2 x double> [[TMP61]], [[TMP45]] -; CHECK-NEXT: [[TMP65]] = fadd fast <2 x double> [[TMP62]], [[TMP46]] -; CHECK-NEXT: [[TMP66]] = fadd fast <2 x double> [[TMP63]], [[TMP47]] -; CHECK-NEXT: [[TMP67]] = fadd fast <2 x double> [[TMP155]], [[TMP48]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP84:%.*]] = fpext <2 x float> [[TMP76]] to <2 x double> +; CHECK-NEXT: [[TMP85:%.*]] = fpext <2 x float> [[TMP77]] to <2 x double> +; CHECK-NEXT: [[TMP86:%.*]] = fpext <2 x float> [[TMP78]] to <2 x double> +; CHECK-NEXT: [[TMP87:%.*]] = fpext <2 x float> [[TMP79]] to <2 x double> +; CHECK-NEXT: [[TMP88:%.*]] = fadd fast <2 x double> [[TMP61]], [[TMP55]] +; CHECK-NEXT: [[TMP89:%.*]] = fadd fast <2 x double> [[TMP62]], [[TMP49]] +; CHECK-NEXT: [[TMP90:%.*]] = fadd fast <2 x double> [[TMP63]], [[TMP50]] +; CHECK-NEXT: [[TMP91:%.*]] = fadd fast <2 x double> [[TMP155]], [[TMP51]] +; CHECK-NEXT: [[TMP92:%.*]] = fadd fast <2 x double> [[TMP84]], [[TMP45]] +; CHECK-NEXT: [[TMP93:%.*]] = fadd fast <2 x double> 
[[TMP85]], [[TMP46]] +; CHECK-NEXT: [[TMP94:%.*]] = fadd fast <2 x double> [[TMP86]], [[TMP47]] +; CHECK-NEXT: [[TMP95:%.*]] = fadd fast <2 x double> [[TMP87]], [[TMP48]] +; CHECK-NEXT: [[TMP96:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]] +; CHECK-NEXT: [[TMP128:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]] +; CHECK-NEXT: [[TMP129:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]] +; CHECK-NEXT: [[TMP130:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]] +; CHECK-NEXT: [[TMP136:%.*]] = fmul fast <2 x float> [[STRIDED_VEC64]], [[STRIDED_VEC64]] +; CHECK-NEXT: [[TMP137:%.*]] = fmul fast <2 x float> [[STRIDED_VEC70]], [[STRIDED_VEC70]] +; CHECK-NEXT: [[TMP139:%.*]] = fmul fast <2 x float> [[STRIDED_VEC82]], [[STRIDED_VEC82]] +; CHECK-NEXT: [[TMP157:%.*]] = fmul fast <2 x float> [[STRIDED_VEC88]], [[STRIDED_VEC88]] +; CHECK-NEXT: [[TMP159:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]] +; CHECK-NEXT: [[TMP160:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]] +; CHECK-NEXT: [[TMP161:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]] +; CHECK-NEXT: [[TMP162:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]] +; CHECK-NEXT: [[TMP163:%.*]] = fmul fast <2 x float> [[STRIDED_VEC65]], [[STRIDED_VEC65]] +; CHECK-NEXT: [[TMP164:%.*]] = fmul fast <2 x float> [[STRIDED_VEC71]], [[STRIDED_VEC71]] +; CHECK-NEXT: [[TMP165:%.*]] = fmul fast <2 x float> [[STRIDED_VEC83]], [[STRIDED_VEC83]] +; CHECK-NEXT: [[TMP166:%.*]] = fmul fast <2 x float> [[STRIDED_VEC89]], [[STRIDED_VEC89]] +; CHECK-NEXT: [[TMP167:%.*]] = fadd fast <2 x float> [[TMP159]], [[TMP96]] +; CHECK-NEXT: [[TMP168:%.*]] = fadd fast <2 x float> [[TMP160]], [[TMP128]] +; CHECK-NEXT: [[TMP169:%.*]] = fadd fast <2 x float> [[TMP161]], [[TMP129]] +; CHECK-NEXT: [[TMP170:%.*]] = fadd fast <2 x float> [[TMP162]], [[TMP130]] +; CHECK-NEXT: [[TMP171:%.*]] = fadd fast <2 x float> [[TMP163]], 
[[TMP136]] +; CHECK-NEXT: [[TMP172:%.*]] = fadd fast <2 x float> [[TMP164]], [[TMP137]] +; CHECK-NEXT: [[TMP173:%.*]] = fadd fast <2 x float> [[TMP165]], [[TMP139]] +; CHECK-NEXT: [[TMP174:%.*]] = fadd fast <2 x float> [[TMP166]], [[TMP157]] +; CHECK-NEXT: [[TMP175:%.*]] = fpext <2 x float> [[TMP167]] to <2 x double> +; CHECK-NEXT: [[TMP121:%.*]] = fpext <2 x float> [[TMP168]] to <2 x double> +; CHECK-NEXT: [[TMP122:%.*]] = fpext <2 x float> [[TMP169]] to <2 x double> +; CHECK-NEXT: [[TMP123:%.*]] = fpext <2 x float> [[TMP170]] to <2 x double> +; CHECK-NEXT: [[TMP124:%.*]] = fpext <2 x float> [[TMP171]] to <2 x double> +; CHECK-NEXT: [[TMP125:%.*]] = fpext <2 x float> [[TMP172]] to <2 x double> +; CHECK-NEXT: [[TMP126:%.*]] = fpext <2 x float> [[TMP173]] to <2 x double> +; CHECK-NEXT: [[TMP127:%.*]] = fpext <2 x float> [[TMP174]] to <2 x double> +; CHECK-NEXT: [[TMP69]] = fadd fast <2 x double> [[TMP175]], [[TMP88]] +; CHECK-NEXT: [[TMP65]] = fadd fast <2 x double> [[TMP121]], [[TMP89]] +; CHECK-NEXT: [[TMP66]] = fadd fast <2 x double> [[TMP122]], [[TMP90]] +; CHECK-NEXT: [[TMP131]] = fadd fast <2 x double> [[TMP123]], [[TMP91]] +; CHECK-NEXT: [[TMP132]] = fadd fast <2 x double> [[TMP124]], [[TMP92]] +; CHECK-NEXT: [[TMP133]] = fadd fast <2 x double> [[TMP125]], [[TMP93]] +; CHECK-NEXT: [[TMP134]] = fadd fast <2 x double> [[TMP126]], [[TMP94]] +; CHECK-NEXT: [[TMP135]] = fadd fast <2 x double> [[TMP127]], [[TMP95]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP68]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP65]], [[TMP69]] ; CHECK-NEXT: [[BIN_RDX30:%.*]] = fadd fast <2 x double> [[TMP66]], [[BIN_RDX]] -; CHECK-NEXT: [[TMP156:%.*]] = fadd fast <2 x double> [[TMP67]], [[BIN_RDX30]] +; CHECK-NEXT: [[BIN_RDX64:%.*]] = fadd fast <2 x double> 
[[TMP131]], [[BIN_RDX30]] +; CHECK-NEXT: [[BIN_RDX65:%.*]] = fadd fast <2 x double> [[TMP132]], [[BIN_RDX64]] +; CHECK-NEXT: [[BIN_RDX66:%.*]] = fadd fast <2 x double> [[TMP133]], [[BIN_RDX65]] +; CHECK-NEXT: [[BIN_RDX67:%.*]] = fadd fast <2 x double> [[TMP134]], [[BIN_RDX66]] +; CHECK-NEXT: [[TMP156:%.*]] = fadd fast <2 x double> [[TMP135]], [[BIN_RDX67]] ; CHECK-NEXT: [[TMP158:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP156]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_COND1_PREHEADER_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF69:%.*]] = urem i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_VEC70:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF69]] +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <2 x double> zeroinitializer, double [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT80:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI72:%.*]] = phi <2 x double> [ [[TMP138]], %[[VEC_EPILOG_PH]] ], [ [[TMP176:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0 +; CHECK-NEXT: [[WIDE_VEC73:%.*]] = load <12 x float>, ptr [[ARRAYIDX5_REALP]], align 8 +; CHECK-NEXT: [[STRIDED_VEC74:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC75:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC76:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC77:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC78:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC79:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP140:%.*]] = fmul fast <2 x float> [[STRIDED_VEC74]], [[STRIDED_VEC74]] +; CHECK-NEXT: [[TMP141:%.*]] = fmul fast <2 x float> [[STRIDED_VEC75]], [[STRIDED_VEC75]] +; CHECK-NEXT: [[TMP177:%.*]] = fadd fast <2 x float> [[TMP141]], [[TMP140]] +; CHECK-NEXT: [[TMP178:%.*]] = fpext <2 x float> [[TMP177]] to <2 x double> +; CHECK-NEXT: [[TMP179:%.*]] = fadd fast <2 x double> [[TMP178]], [[VEC_PHI72]] +; CHECK-NEXT: [[TMP145:%.*]] = fmul fast <2 x float> [[STRIDED_VEC76]], [[STRIDED_VEC76]] +; CHECK-NEXT: [[TMP146:%.*]] = fmul fast <2 x float> [[STRIDED_VEC77]], [[STRIDED_VEC77]] +; CHECK-NEXT: [[TMP180:%.*]] = fadd fast <2 x float> [[TMP146]], [[TMP145]] +; CHECK-NEXT: [[TMP181:%.*]] = fpext <2 x float> [[TMP180]] to <2 x double> +; CHECK-NEXT: [[TMP182:%.*]] = fadd fast <2 x double> [[TMP181]], [[TMP179]] 
+; CHECK-NEXT: [[TMP150:%.*]] = fmul fast <2 x float> [[STRIDED_VEC78]], [[STRIDED_VEC78]] +; CHECK-NEXT: [[TMP151:%.*]] = fmul fast <2 x float> [[STRIDED_VEC79]], [[STRIDED_VEC79]] +; CHECK-NEXT: [[TMP183:%.*]] = fadd fast <2 x float> [[TMP151]], [[TMP150]] +; CHECK-NEXT: [[TMP184:%.*]] = fpext <2 x float> [[TMP183]] to <2 x double> +; CHECK-NEXT: [[TMP176]] = fadd fast <2 x double> [[TMP184]], [[TMP182]] +; CHECK-NEXT: [[INDEX_NEXT80]] = add nuw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[TMP185:%.*]] = icmp eq i64 [[INDEX_NEXT80]], [[N_VEC70]] +; CHECK-NEXT: br i1 [[TMP185]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP186:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP176]]) +; CHECK-NEXT: [[CMP_N81:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC70]] +; CHECK-NEXT: br i1 [[CMP_N81]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC70]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX82:%.*]] = phi double [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]] ; CHECK: [[FOR_COND1_PREHEADER]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0 -; CHECK-NEXT: [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP]], align 8 -; CHECK-NEXT: [[ARRAYIDX5_IMAGP:%.*]] 
= getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 1 +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX82]], %[[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX5_REALP1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 0, i32 0 +; CHECK-NEXT: [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP1]], align 8 +; CHECK-NEXT: [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 0, i32 1 ; CHECK-NEXT: [[ARRAYIDX5_IMAG:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP]], align 8 ; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[ARRAYIDX5_REAL]], [[ARRAYIDX5_REAL]] ; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[ARRAYIDX5_IMAG]], [[ARRAYIDX5_IMAG]] ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL9]], [[MUL]] ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[ADD]] to double ; CHECK-NEXT: [[ADD10:%.*]] = fadd fast double [[CONV]], [[SUM_026]] -; CHECK-NEXT: [[ARRAYIDX5_REALP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 1, i32 0 +; CHECK-NEXT: [[ARRAYIDX5_REALP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 1, i32 0 ; CHECK-NEXT: [[ARRAYIDX5_REAL_1:%.*]] = load float, ptr [[ARRAYIDX5_REALP_1]], align 8 -; CHECK-NEXT: [[ARRAYIDX5_IMAGP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 1, i32 1 +; CHECK-NEXT: [[ARRAYIDX5_IMAGP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 1, i32 1 ; CHECK-NEXT: [[ARRAYIDX5_IMAG_1:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP_1]], align 8 ; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[ARRAYIDX5_REAL_1]], [[ARRAYIDX5_REAL_1]] ; CHECK-NEXT: 
[[MUL9_1:%.*]] = fmul fast float [[ARRAYIDX5_IMAG_1]], [[ARRAYIDX5_IMAG_1]] ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL9_1]], [[MUL_1]] ; CHECK-NEXT: [[CONV_1:%.*]] = fpext float [[ADD_1]] to double ; CHECK-NEXT: [[ADD10_1:%.*]] = fadd fast double [[CONV_1]], [[ADD10]] -; CHECK-NEXT: [[ARRAYIDX5_REALP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 2, i32 0 +; CHECK-NEXT: [[ARRAYIDX5_REALP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 2, i32 0 ; CHECK-NEXT: [[ARRAYIDX5_REAL_2:%.*]] = load float, ptr [[ARRAYIDX5_REALP_2]], align 8 -; CHECK-NEXT: [[ARRAYIDX5_IMAGP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 2, i32 1 +; CHECK-NEXT: [[ARRAYIDX5_IMAGP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 2, i32 1 ; CHECK-NEXT: [[ARRAYIDX5_IMAG_2:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP_2]], align 8 ; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[ARRAYIDX5_REAL_2]], [[ARRAYIDX5_REAL_2]] ; CHECK-NEXT: [[MUL9_2:%.*]] = fmul fast float [[ARRAYIDX5_IMAG_2]], [[ARRAYIDX5_IMAG_2]] ; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL9_2]], [[MUL_2]] ; CHECK-NEXT: [[CONV_2:%.*]] = fpext float [[ADD_2]] to double ; CHECK-NEXT: [[ADD10_2]] = fadd fast double [[CONV_2]], [[ADD10_1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[FOR_COND_FOR_END13_CRIT_EDGE]]: -; CHECK-NEXT: 
[[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[PHITMP:%.*]] = fptrunc double [[ADD10_2_LCSSA]] to float ; CHECK-NEXT: br label %[[FOR_END13]] ; CHECK: [[FOR_END13]]: @@ -234,5 +385,6 @@ for.end13: ; preds = %for.cond.for.end13_ ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll index dff6c793897da..c37404c36f155 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll @@ -65,7 +65,7 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 1 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD1]] to ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] -; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -134,8 +134,8 @@ define i32 
@vqdot(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] ; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -225,7 +225,7 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 1 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD1]] to ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] -; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -294,8 +294,8 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> ; 
FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] ; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -385,7 +385,7 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 1 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD1]] to ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] -; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -454,8 +454,8 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] ; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 
x i32> [[TMP9]], [[TMP4]] -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -544,7 +544,7 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 1 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD1]] to ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] -; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -613,8 +613,8 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] ; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> 
@llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll index 346f1cbcc7e3d..097f05d222cf6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll @@ -1,14 +1,11 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -prefer-predicate-over-epilogue=scalar-epilogue -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s - -; TODO: -prefer-predicate-over-epilogue=scalar-epilogue was added to allow -; unrolling. Calculate register pressure for all VPlans, not just unrolled ones, -; and remove. 
+; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) { ; CHECK-LABEL: add -; CHECK: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; CHECK: LV(REG): VF = vscale x 4 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll index b25bc485a9ca7..8bbfdf39a0624 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll @@ -1,20 +1,19 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfh -prefer-predicate-over-epilogue=scalar-epilogue -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFH -; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -prefer-predicate-over-epilogue=scalar-epilogue -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFHMIN - -; TODO: -prefer-predicate-over-epilogue=scalar-epilogue was added to allow -; unrolling. Calculate register pressure for all VPlans, not just unrolled ones, -; and remove. 
+; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfh -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFH +; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFHMIN define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) { -; CHECK-LABEL: add -; ZVFH: LV(REG): Found max usage: 2 item -; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; ZVFH-LABEL: add +; ZVFH: LV(REG): VF = vscale x 4 +; ZVFH-NEXT: LV(REG): Found max usage: 2 item +; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers ; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; ZVFH-NEXT: LV(REG): Found invariant usage: 1 item ; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers -; ZVFHMIN: LV(REG): Found max usage: 2 item -; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; ZVFHMIN-LABEL: add +; ZVFHMIN: LV(REG): VF = vscale x 4 +; ZVFHMIN-NEXT: LV(REG): Found max usage: 2 item +; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers ; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; ZVFHMIN-NEXT: LV(REG): Found invariant usage: 1 item ; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll new file mode 100644 index 0000000000000..42f12ec2e4859 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -S < %s | FileCheck %s 
+; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -vectorizer-consider-reg-pressure=true -S < %s | FileCheck %s +; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -vectorizer-consider-reg-pressure=false -S < %s | FileCheck %s --check-prefix=NO-REG-PRESSURE-CHECK + +define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) { +; CHECK-LABEL: define void @f( +; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul [[TMP0]], splat (i64 2) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul [[TMP2]], splat (i64 3) +; CHECK-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i64 4) +; CHECK-NEXT: [[INDUCTION2:%.*]] = add zeroinitializer, [[TMP5]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND3:%.*]] = phi [ [[INDUCTION1]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi [ [[INDUCTION2]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: 
[[TMP8:%.*]] = mul i64 4, [[TMP7]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP9]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP11]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = sub [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[P0]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i8.nxv4p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) +; CHECK-NEXT: [[TMP15:%.*]] = sub [[VEC_IND3]], splat (i64 1) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[P0]], [[TMP15]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call @llvm.vp.gather.nxv4i8.nxv4p0( align 1 [[TMP16]], splat (i1 true), i32 [[TMP6]]) +; CHECK-NEXT: [[TMP17:%.*]] = sub [[VEC_IND4]], splat (i64 1) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[P0]], [[TMP17]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call @llvm.vp.gather.nxv4i8.nxv4p0( align 1 [[TMP18]], splat (i1 true), i32 [[TMP6]]) +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[EVL_BASED_IV]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i8 0 +; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP6]], 3 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv12i8( 
[[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER9]], [[WIDE_MASKED_GATHER10]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv12i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP21]], splat (i1 true), i32 [[INTERLEAVE_EVL]]) +; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[VEC_IND_NEXT11]] = add [[VEC_IND3]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[VEC_IND_NEXT12]] = add [[VEC_IND4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[WIDE_IV_0:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_0_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[WIDE_IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[WIDE_IV_2:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_2_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[WIDE_IV_0_SUB:%.*]] = sub i64 [[WIDE_IV_0]], 1 +; CHECK-NEXT: [[A_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_0_SUB]] +; CHECK-NEXT: [[A:%.*]] = load i8, ptr [[A_GEP0]], align 1 +; CHECK-NEXT: [[WIDE_IV_1_SUB:%.*]] = sub i64 [[WIDE_IV_1]], 1 +; CHECK-NEXT: [[B_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_1_SUB]] +; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[B_GEP0]], align 1 +; CHECK-NEXT: [[WIDE_IV_2_SUB:%.*]] = sub i64 [[WIDE_IV_2]], 1 +; CHECK-NEXT: [[C_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_2_SUB]] +; CHECK-NEXT: [[C:%.*]] = load i8, ptr [[C_GEP0]], align 1 +; CHECK-NEXT: 
[[IV_MUL:%.*]] = mul i64 [[IV]], 3 +; CHECK-NEXT: [[BASE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[IV_MUL]] +; CHECK-NEXT: [[A_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 0 +; CHECK-NEXT: store i8 [[A]], ptr [[A_GEP1]], align 1 +; CHECK-NEXT: [[B_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 1 +; CHECK-NEXT: store i8 [[B]], ptr [[B_GEP1]], align 1 +; CHECK-NEXT: [[C_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 2 +; CHECK-NEXT: store i8 [[C]], ptr [[C_GEP1]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[WIDE_IV_0_NEXT]] = add i64 [[WIDE_IV_0]], 2 +; CHECK-NEXT: [[WIDE_IV_1_NEXT]] = add i64 [[WIDE_IV_1]], 3 +; CHECK-NEXT: [[WIDE_IV_2_NEXT]] = add i64 [[WIDE_IV_2]], 4 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; NO-REG-PRESSURE-CHECK-LABEL: define void @f( +; NO-REG-PRESSURE-CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-REG-PRESSURE-CHECK-NEXT: [[ENTRY:.*:]] +; NO-REG-PRESSURE-CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-REG-PRESSURE-CHECK: [[VECTOR_PH]]: +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv8i64() +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP1:%.*]] = mul [[TMP0]], splat (i64 2) +; NO-REG-PRESSURE-CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv8i64() +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP3:%.*]] = mul [[TMP2]], splat (i64 3) +; NO-REG-PRESSURE-CHECK-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP3]] +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv8i64() +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i64 4) +; NO-REG-PRESSURE-CHECK-NEXT: [[INDUCTION2:%.*]] = add zeroinitializer, [[TMP5]] +; NO-REG-PRESSURE-CHECK-NEXT: br label 
%[[VECTOR_BODY:.*]] +; NO-REG-PRESSURE-CHECK: [[VECTOR_BODY]]: +; NO-REG-PRESSURE-CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[VEC_IND3:%.*]] = phi [ [[INDUCTION1]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[VEC_IND4:%.*]] = phi [ [[INDUCTION2]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT12:%.*]], %[[VECTOR_BODY]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP8:%.*]] = mul i64 4, [[TMP7]] +; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP9]] +; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP6]] to i64 +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP11]] +; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 +; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP13:%.*]] = sub 
[[VEC_IND]], splat (i64 1) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[P0]], [[TMP13]] +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv8i8.nxv8p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP15:%.*]] = sub [[VEC_IND3]], splat (i64 1) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[P0]], [[TMP15]] +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call @llvm.vp.gather.nxv8i8.nxv8p0( align 1 [[TMP16]], splat (i1 true), i32 [[TMP6]]) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP17:%.*]] = sub [[VEC_IND4]], splat (i64 1) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[P0]], [[TMP17]] +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call @llvm.vp.gather.nxv8i8.nxv8p0( align 1 [[TMP18]], splat (i1 true), i32 [[TMP6]]) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[EVL_BASED_IV]], 3 +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP19]] +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i8 0 +; NO-REG-PRESSURE-CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP6]], 3 +; NO-REG-PRESSURE-CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv24i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER9]], [[WIDE_MASKED_GATHER10]]) +; NO-REG-PRESSURE-CHECK-NEXT: call void @llvm.vp.store.nxv24i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP21]], splat (i1 true), i32 [[INTERLEAVE_EVL]]) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 +; NO-REG-PRESSURE-CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]] +; NO-REG-PRESSURE-CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; NO-REG-PRESSURE-CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT8]] +; NO-REG-PRESSURE-CHECK-NEXT: [[VEC_IND_NEXT11]] = add [[VEC_IND3]], [[BROADCAST_SPLAT6]] +; NO-REG-PRESSURE-CHECK-NEXT: 
[[VEC_IND_NEXT12]] = add [[VEC_IND4]], [[BROADCAST_SPLAT]] +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; NO-REG-PRESSURE-CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-REG-PRESSURE-CHECK: [[MIDDLE_BLOCK]]: +; NO-REG-PRESSURE-CHECK-NEXT: br label %[[EXIT:.*]] +; NO-REG-PRESSURE-CHECK: [[SCALAR_PH]]: +; NO-REG-PRESSURE-CHECK-NEXT: br label %[[LOOP:.*]] +; NO-REG-PRESSURE-CHECK: [[LOOP]]: +; NO-REG-PRESSURE-CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_0:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_0_NEXT:%.*]], %[[LOOP]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_1_NEXT:%.*]], %[[LOOP]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_2:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_2_NEXT:%.*]], %[[LOOP]] ] +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_0_SUB:%.*]] = sub i64 [[WIDE_IV_0]], 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[A_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_0_SUB]] +; NO-REG-PRESSURE-CHECK-NEXT: [[A:%.*]] = load i8, ptr [[A_GEP0]], align 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_1_SUB:%.*]] = sub i64 [[WIDE_IV_1]], 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[B_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_1_SUB]] +; NO-REG-PRESSURE-CHECK-NEXT: [[B:%.*]] = load i8, ptr [[B_GEP0]], align 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_2_SUB:%.*]] = sub i64 [[WIDE_IV_2]], 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[C_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_2_SUB]] +; NO-REG-PRESSURE-CHECK-NEXT: [[C:%.*]] = load i8, ptr [[C_GEP0]], align 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV]], 3 +; NO-REG-PRESSURE-CHECK-NEXT: [[BASE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[IV_MUL]] +; NO-REG-PRESSURE-CHECK-NEXT: [[A_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 0 +; NO-REG-PRESSURE-CHECK-NEXT: 
store i8 [[A]], ptr [[A_GEP1]], align 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[B_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 1 +; NO-REG-PRESSURE-CHECK-NEXT: store i8 [[B]], ptr [[B_GEP1]], align 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[C_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 2 +; NO-REG-PRESSURE-CHECK-NEXT: store i8 [[C]], ptr [[C_GEP1]], align 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_0_NEXT]] = add i64 [[WIDE_IV_0]], 2 +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_1_NEXT]] = add i64 [[WIDE_IV_1]], 3 +; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_2_NEXT]] = add i64 [[WIDE_IV_2]], 4 +; NO-REG-PRESSURE-CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 1024 +; NO-REG-PRESSURE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-REG-PRESSURE-CHECK: [[EXIT]]: +; NO-REG-PRESSURE-CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %wide.iv.0 = phi i64 [ 0, %entry ], [ %wide.iv.0.next, %loop ] + %wide.iv.1 = phi i64 [ 0, %entry ], [ %wide.iv.1.next, %loop ] + %wide.iv.2 = phi i64 [ 0, %entry ], [ %wide.iv.2.next, %loop ] + + %wide.iv.0.sub = sub i64 %wide.iv.0, 1 + %a.gep0 = getelementptr i8, ptr %p0, i64 %wide.iv.0.sub + %a = load i8, ptr %a.gep0 + + %wide.iv.1.sub = sub i64 %wide.iv.1, 1 + %b.gep0 = getelementptr i8, ptr %p0, i64 %wide.iv.1.sub + %b = load i8, ptr %b.gep0 + + %wide.iv.2.sub = sub i64 %wide.iv.2, 1 + %c.gep0 = getelementptr i8, ptr %p0, i64 %wide.iv.2.sub + %c = load i8, ptr %c.gep0 + + %iv.mul = mul i64 %iv, 3 + %base = getelementptr i8, ptr %p1, i64 %iv.mul + + %a.gep1 = getelementptr i8, ptr %base, i8 0 + store i8 %a, ptr %a.gep1 + + %b.gep1 = getelementptr i8, ptr %base, i8 1 + store i8 %b, ptr %b.gep1 + + %c.gep1 = getelementptr i8, ptr %base, i8 2 + store i8 %c, ptr %c.gep1 + + %iv.next = add i64 %iv, 1 + %wide.iv.0.next = add i64 %wide.iv.0, 2 + %wide.iv.1.next = add i64 %wide.iv.1, 3 + 
%wide.iv.2.next = add i64 %wide.iv.2, 4 + %done = icmp eq i64 %iv, 1024 + br i1 %done, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll index 116ccc9961795..99139da67bb78 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll @@ -5,50 +5,54 @@ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SCALAR ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ -; RUN: -riscv-v-register-bit-width-lmul=1 -prefer-predicate-over-epilogue=scalar-epilogue \ +; RUN: -riscv-v-register-bit-width-lmul=1 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL1 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ -; RUN: -riscv-v-register-bit-width-lmul=2 -prefer-predicate-over-epilogue=scalar-epilogue \ +; RUN: -riscv-v-register-bit-width-lmul=2 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL2 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ -; RUN: -riscv-v-register-bit-width-lmul=4 -prefer-predicate-over-epilogue=scalar-epilogue \ +; RUN: -riscv-v-register-bit-width-lmul=4 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL4 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ -; RUN: -riscv-v-register-bit-width-lmul=8 -prefer-predicate-over-epilogue=scalar-epilogue \ +; RUN: -riscv-v-register-bit-width-lmul=8 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL8 -; TODO: -prefer-predicate-over-epilogue=scalar-epilogue was added to allow -; unrolling. 
Calculate register pressure for all VPlans, not just unrolled ones, -; and remove. - define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) { -; CHECK-LABEL: add +; CHECK-SCALAR-LABEL: add ; CHECK-SCALAR: LV(REG): VF = 1 ; CHECK-SCALAR-NEXT: LV(REG): Found max usage: 2 item ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::FPRRC, 2 registers ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers -; CHECK-LMUL1: LV(REG): Found max usage: 2 item -; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; CHECK-LMUL1-LABEL: add +; CHECK-LMUL1: LV(REG): VF = vscale x 2 +; CHECK-LMUL1-NEXT: LV(REG): Found max usage: 2 item +; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-LMUL1-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers -; CHECK-LMUL2: LV(REG): Found max usage: 2 item -; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; CHECK-LMUL2-LABEL: add +; CHECK-LMUL2: LV(REG): VF = vscale x 4 +; CHECK-LMUL2-NEXT: LV(REG): Found max usage: 2 item +; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; CHECK-LMUL2-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers -; CHECK-LMUL4: LV(REG): Found max usage: 2 item -; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; CHECK-LMUL4-LABEL: add +; CHECK-LMUL4: LV(REG): VF = vscale x 8 +; CHECK-LMUL4-NEXT: LV(REG): Found max usage: 2 item +; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers ; CHECK-LMUL4-NEXT: 
LV(REG): RegisterClass: RISCV::VRRC, 8 registers ; CHECK-LMUL4-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers -; CHECK-LMUL8: LV(REG): Found max usage: 2 item -; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; CHECK-LMUL8-LABEL: add +; CHECK-LMUL8: LV(REG): VF = vscale x 16 +; CHECK-LMUL8-NEXT: LV(REG): Found max usage: 2 item +; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers ; CHECK-LMUL8-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers @@ -76,22 +80,26 @@ for.body: } define void @goo(ptr nocapture noundef %a, i32 noundef signext %n) { -; CHECK-LABEL: goo +; CHECK-SCALAR-LABEL: goo ; CHECK-SCALAR: LV(REG): VF = 1 ; CHECK-SCALAR-NEXT: LV(REG): Found max usage: 1 item ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-LMUL1: LV(REG): Found max usage: 2 item -; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers -; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 1 registers -; CHECK-LMUL2: LV(REG): Found max usage: 2 item -; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers -; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers -; CHECK-LMUL4: LV(REG): Found max usage: 2 item -; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers -; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers -; CHECK-LMUL8: LV(REG): Found max usage: 2 item -; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers -; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers +; CHECK-LMUL1: LV(REG): VF = vscale x 2 +; CHECK-LMUL1-NEXT: LV(REG): Found max usage: 2 item +; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers +; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; 
CHECK-LMUL2: LV(REG): VF = vscale x 4 +; CHECK-LMUL2-NEXT: LV(REG): Found max usage: 2 item +; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers +; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers +; CHECK-LMUL4: LV(REG): VF = vscale x 8 +; CHECK-LMUL4-NEXT: LV(REG): Found max usage: 2 item +; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers +; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers +; CHECK-LMUL8: LV(REG): VF = vscale x 16 +; CHECK-LMUL8-NEXT: LV(REG): Found max usage: 2 item +; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers +; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers entry: %cmp3 = icmp sgt i32 %n, 0 br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 7a3d81b240394..ce3e734bdd84d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -430,11 +430,8 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], splat (i32 1) ; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = add [[WIDE_LOAD1]], splat (i32 1) -; NOSTRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 -; NOSTRIDED-UF2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP4]], i64 [[TMP11]] ; NOSTRIDED-UF2-NEXT: store [[TMP8]], ptr [[TMP4]], align 4 -; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP12]], align 4 +; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -557,11 +554,8 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], splat (i32 1) ; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = add [[WIDE_LOAD1]], splat (i32 1) -; NOSTRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 -; NOSTRIDED-UF2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP4]], i64 [[TMP11]] ; NOSTRIDED-UF2-NEXT: store [[TMP8]], ptr [[TMP4]], align 4 -; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP12]], align 4 +; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1060,11 +1054,8 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], splat (i32 1) ; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = add [[WIDE_LOAD1]], splat (i32 1) -; NOSTRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 -; NOSTRIDED-UF2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP4]], i64 [[TMP11]] ; NOSTRIDED-UF2-NEXT: store [[TMP8]], ptr [[TMP4]], align 4 -; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP12]], align 4 +; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP14:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll index 6c1b2568d872a..a3623ddddeef0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -892,6 +892,458 @@ exit: ret void } +; Test case for https://github.com/llvm/llvm-project/issues/158660. +define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { +; CHECK-LABEL: @test_predicated_udiv( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i1> [[BROADCAST_SPLATINSERT]], <32 x i1> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <32 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE62:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UDIV_CONTINUE62]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i32> @llvm.usub.sat.v32i32(<32 x i32> [[VEC_IND]], <32 x i32> splat (i32 1)) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK: pred.udiv.if: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[D:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <32 x i32> poison, i32 [[TMP4]], i32 0 +; CHECK-NEXT: br label 
[[PRED_UDIV_CONTINUE]] +; CHECK: pred.udiv.continue: +; CHECK-NEXT: [[TMP6:%.*]] = phi <32 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <32 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2:%.*]] +; CHECK: pred.udiv.if1: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = udiv i32 [[TMP8]], [[D]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <32 x i32> [[TMP6]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] +; CHECK: pred.udiv.continue2: +; CHECK-NEXT: [[TMP11:%.*]] = phi <32 x i32> [ [[TMP6]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] +; CHECK: pred.udiv.if3: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[D]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <32 x i32> [[TMP11]], i32 [[TMP14]], i32 2 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE4]] +; CHECK: pred.udiv.continue4: +; CHECK-NEXT: [[TMP16:%.*]] = phi <32 x i32> [ [[TMP11]], [[PRED_UDIV_CONTINUE2]] ], [ [[TMP15]], [[PRED_UDIV_IF3]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <32 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] +; CHECK: pred.udiv.if5: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = udiv i32 [[TMP18]], [[D]] +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <32 x i32> [[TMP16]], i32 [[TMP19]], i32 3 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE6]] +; CHECK: pred.udiv.continue6: +; CHECK-NEXT: [[TMP21:%.*]] = phi <32 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_UDIV_IF5]] ] +; CHECK-NEXT: 
[[TMP22:%.*]] = extractelement <32 x i1> [[TMP0]], i32 4 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] +; CHECK: pred.udiv.if7: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP24:%.*]] = udiv i32 [[TMP23]], [[D]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <32 x i32> [[TMP21]], i32 [[TMP24]], i32 4 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE8]] +; CHECK: pred.udiv.continue8: +; CHECK-NEXT: [[TMP26:%.*]] = phi <32 x i32> [ [[TMP21]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_UDIV_IF7]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP0]], i32 5 +; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] +; CHECK: pred.udiv.if9: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP28]], [[D]] +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <32 x i32> [[TMP26]], i32 [[TMP29]], i32 5 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE10]] +; CHECK: pred.udiv.continue10: +; CHECK-NEXT: [[TMP31:%.*]] = phi <32 x i32> [ [[TMP26]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP30]], [[PRED_UDIV_IF9]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <32 x i1> [[TMP0]], i32 6 +; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] +; CHECK: pred.udiv.if11: +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <32 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = udiv i32 [[TMP33]], [[D]] +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <32 x i32> [[TMP31]], i32 [[TMP34]], i32 6 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE12]] +; CHECK: pred.udiv.continue12: +; CHECK-NEXT: [[TMP36:%.*]] = phi <32 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP35]], [[PRED_UDIV_IF11]] ] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <32 x i1> [[TMP0]], i32 7 +; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] +; CHECK: 
pred.udiv.if13: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <32 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP39:%.*]] = udiv i32 [[TMP38]], [[D]] +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <32 x i32> [[TMP36]], i32 [[TMP39]], i32 7 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE14]] +; CHECK: pred.udiv.continue14: +; CHECK-NEXT: [[TMP41:%.*]] = phi <32 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP40]], [[PRED_UDIV_IF13]] ] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <32 x i1> [[TMP0]], i32 8 +; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]] +; CHECK: pred.udiv.if15: +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i32> [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP44:%.*]] = udiv i32 [[TMP43]], [[D]] +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <32 x i32> [[TMP41]], i32 [[TMP44]], i32 8 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE16]] +; CHECK: pred.udiv.continue16: +; CHECK-NEXT: [[TMP46:%.*]] = phi <32 x i32> [ [[TMP41]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP45]], [[PRED_UDIV_IF15]] ] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <32 x i1> [[TMP0]], i32 9 +; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18:%.*]] +; CHECK: pred.udiv.if17: +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <32 x i32> [[TMP1]], i32 9 +; CHECK-NEXT: [[TMP49:%.*]] = udiv i32 [[TMP48]], [[D]] +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <32 x i32> [[TMP46]], i32 [[TMP49]], i32 9 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE18]] +; CHECK: pred.udiv.continue18: +; CHECK-NEXT: [[TMP51:%.*]] = phi <32 x i32> [ [[TMP46]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP50]], [[PRED_UDIV_IF17]] ] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <32 x i1> [[TMP0]], i32 10 +; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20:%.*]] +; CHECK: pred.udiv.if19: +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <32 x i32> [[TMP1]], i32 10 +; CHECK-NEXT: [[TMP54:%.*]] = udiv i32 [[TMP53]], [[D]] +; 
CHECK-NEXT: [[TMP55:%.*]] = insertelement <32 x i32> [[TMP51]], i32 [[TMP54]], i32 10 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE20]] +; CHECK: pred.udiv.continue20: +; CHECK-NEXT: [[TMP56:%.*]] = phi <32 x i32> [ [[TMP51]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP55]], [[PRED_UDIV_IF19]] ] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <32 x i1> [[TMP0]], i32 11 +; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_UDIV_IF21:%.*]], label [[PRED_UDIV_CONTINUE22:%.*]] +; CHECK: pred.udiv.if21: +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <32 x i32> [[TMP1]], i32 11 +; CHECK-NEXT: [[TMP59:%.*]] = udiv i32 [[TMP58]], [[D]] +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <32 x i32> [[TMP56]], i32 [[TMP59]], i32 11 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE22]] +; CHECK: pred.udiv.continue22: +; CHECK-NEXT: [[TMP61:%.*]] = phi <32 x i32> [ [[TMP56]], [[PRED_UDIV_CONTINUE20]] ], [ [[TMP60]], [[PRED_UDIV_IF21]] ] +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <32 x i1> [[TMP0]], i32 12 +; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_UDIV_IF23:%.*]], label [[PRED_UDIV_CONTINUE24:%.*]] +; CHECK: pred.udiv.if23: +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <32 x i32> [[TMP1]], i32 12 +; CHECK-NEXT: [[TMP64:%.*]] = udiv i32 [[TMP63]], [[D]] +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <32 x i32> [[TMP61]], i32 [[TMP64]], i32 12 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE24]] +; CHECK: pred.udiv.continue24: +; CHECK-NEXT: [[TMP66:%.*]] = phi <32 x i32> [ [[TMP61]], [[PRED_UDIV_CONTINUE22]] ], [ [[TMP65]], [[PRED_UDIV_IF23]] ] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <32 x i1> [[TMP0]], i32 13 +; CHECK-NEXT: br i1 [[TMP67]], label [[PRED_UDIV_IF25:%.*]], label [[PRED_UDIV_CONTINUE26:%.*]] +; CHECK: pred.udiv.if25: +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <32 x i32> [[TMP1]], i32 13 +; CHECK-NEXT: [[TMP69:%.*]] = udiv i32 [[TMP68]], [[D]] +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <32 x i32> [[TMP66]], i32 [[TMP69]], i32 13 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE26]] +; CHECK: 
pred.udiv.continue26: +; CHECK-NEXT: [[TMP71:%.*]] = phi <32 x i32> [ [[TMP66]], [[PRED_UDIV_CONTINUE24]] ], [ [[TMP70]], [[PRED_UDIV_IF25]] ] +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <32 x i1> [[TMP0]], i32 14 +; CHECK-NEXT: br i1 [[TMP72]], label [[PRED_UDIV_IF27:%.*]], label [[PRED_UDIV_CONTINUE28:%.*]] +; CHECK: pred.udiv.if27: +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <32 x i32> [[TMP1]], i32 14 +; CHECK-NEXT: [[TMP74:%.*]] = udiv i32 [[TMP73]], [[D]] +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <32 x i32> [[TMP71]], i32 [[TMP74]], i32 14 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE28]] +; CHECK: pred.udiv.continue28: +; CHECK-NEXT: [[TMP76:%.*]] = phi <32 x i32> [ [[TMP71]], [[PRED_UDIV_CONTINUE26]] ], [ [[TMP75]], [[PRED_UDIV_IF27]] ] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <32 x i1> [[TMP0]], i32 15 +; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_UDIV_IF29:%.*]], label [[PRED_UDIV_CONTINUE30:%.*]] +; CHECK: pred.udiv.if29: +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <32 x i32> [[TMP1]], i32 15 +; CHECK-NEXT: [[TMP79:%.*]] = udiv i32 [[TMP78]], [[D]] +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <32 x i32> [[TMP76]], i32 [[TMP79]], i32 15 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE30]] +; CHECK: pred.udiv.continue30: +; CHECK-NEXT: [[TMP81:%.*]] = phi <32 x i32> [ [[TMP76]], [[PRED_UDIV_CONTINUE28]] ], [ [[TMP80]], [[PRED_UDIV_IF29]] ] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <32 x i1> [[TMP0]], i32 16 +; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_UDIV_IF31:%.*]], label [[PRED_UDIV_CONTINUE32:%.*]] +; CHECK: pred.udiv.if31: +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <32 x i32> [[TMP1]], i32 16 +; CHECK-NEXT: [[TMP84:%.*]] = udiv i32 [[TMP83]], [[D]] +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <32 x i32> [[TMP81]], i32 [[TMP84]], i32 16 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE32]] +; CHECK: pred.udiv.continue32: +; CHECK-NEXT: [[TMP86:%.*]] = phi <32 x i32> [ [[TMP81]], [[PRED_UDIV_CONTINUE30]] ], [ [[TMP85]], [[PRED_UDIV_IF31]] ] +; 
CHECK-NEXT: [[TMP87:%.*]] = extractelement <32 x i1> [[TMP0]], i32 17 +; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_UDIV_IF33:%.*]], label [[PRED_UDIV_CONTINUE34:%.*]] +; CHECK: pred.udiv.if33: +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <32 x i32> [[TMP1]], i32 17 +; CHECK-NEXT: [[TMP89:%.*]] = udiv i32 [[TMP88]], [[D]] +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <32 x i32> [[TMP86]], i32 [[TMP89]], i32 17 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE34]] +; CHECK: pred.udiv.continue34: +; CHECK-NEXT: [[TMP91:%.*]] = phi <32 x i32> [ [[TMP86]], [[PRED_UDIV_CONTINUE32]] ], [ [[TMP90]], [[PRED_UDIV_IF33]] ] +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <32 x i1> [[TMP0]], i32 18 +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_UDIV_IF35:%.*]], label [[PRED_UDIV_CONTINUE36:%.*]] +; CHECK: pred.udiv.if35: +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <32 x i32> [[TMP1]], i32 18 +; CHECK-NEXT: [[TMP94:%.*]] = udiv i32 [[TMP93]], [[D]] +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <32 x i32> [[TMP91]], i32 [[TMP94]], i32 18 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE36]] +; CHECK: pred.udiv.continue36: +; CHECK-NEXT: [[TMP96:%.*]] = phi <32 x i32> [ [[TMP91]], [[PRED_UDIV_CONTINUE34]] ], [ [[TMP95]], [[PRED_UDIV_IF35]] ] +; CHECK-NEXT: [[TMP97:%.*]] = extractelement <32 x i1> [[TMP0]], i32 19 +; CHECK-NEXT: br i1 [[TMP97]], label [[PRED_UDIV_IF37:%.*]], label [[PRED_UDIV_CONTINUE38:%.*]] +; CHECK: pred.udiv.if37: +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <32 x i32> [[TMP1]], i32 19 +; CHECK-NEXT: [[TMP99:%.*]] = udiv i32 [[TMP98]], [[D]] +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <32 x i32> [[TMP96]], i32 [[TMP99]], i32 19 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE38]] +; CHECK: pred.udiv.continue38: +; CHECK-NEXT: [[TMP101:%.*]] = phi <32 x i32> [ [[TMP96]], [[PRED_UDIV_CONTINUE36]] ], [ [[TMP100]], [[PRED_UDIV_IF37]] ] +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <32 x i1> [[TMP0]], i32 20 +; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_UDIV_IF39:%.*]], label 
[[PRED_UDIV_CONTINUE40:%.*]] +; CHECK: pred.udiv.if39: +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <32 x i32> [[TMP1]], i32 20 +; CHECK-NEXT: [[TMP104:%.*]] = udiv i32 [[TMP103]], [[D]] +; CHECK-NEXT: [[TMP105:%.*]] = insertelement <32 x i32> [[TMP101]], i32 [[TMP104]], i32 20 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE40]] +; CHECK: pred.udiv.continue40: +; CHECK-NEXT: [[TMP106:%.*]] = phi <32 x i32> [ [[TMP101]], [[PRED_UDIV_CONTINUE38]] ], [ [[TMP105]], [[PRED_UDIV_IF39]] ] +; CHECK-NEXT: [[TMP107:%.*]] = extractelement <32 x i1> [[TMP0]], i32 21 +; CHECK-NEXT: br i1 [[TMP107]], label [[PRED_UDIV_IF41:%.*]], label [[PRED_UDIV_CONTINUE42:%.*]] +; CHECK: pred.udiv.if41: +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <32 x i32> [[TMP1]], i32 21 +; CHECK-NEXT: [[TMP109:%.*]] = udiv i32 [[TMP108]], [[D]] +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <32 x i32> [[TMP106]], i32 [[TMP109]], i32 21 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE42]] +; CHECK: pred.udiv.continue42: +; CHECK-NEXT: [[TMP111:%.*]] = phi <32 x i32> [ [[TMP106]], [[PRED_UDIV_CONTINUE40]] ], [ [[TMP110]], [[PRED_UDIV_IF41]] ] +; CHECK-NEXT: [[TMP112:%.*]] = extractelement <32 x i1> [[TMP0]], i32 22 +; CHECK-NEXT: br i1 [[TMP112]], label [[PRED_UDIV_IF43:%.*]], label [[PRED_UDIV_CONTINUE44:%.*]] +; CHECK: pred.udiv.if43: +; CHECK-NEXT: [[TMP113:%.*]] = extractelement <32 x i32> [[TMP1]], i32 22 +; CHECK-NEXT: [[TMP114:%.*]] = udiv i32 [[TMP113]], [[D]] +; CHECK-NEXT: [[TMP115:%.*]] = insertelement <32 x i32> [[TMP111]], i32 [[TMP114]], i32 22 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE44]] +; CHECK: pred.udiv.continue44: +; CHECK-NEXT: [[TMP116:%.*]] = phi <32 x i32> [ [[TMP111]], [[PRED_UDIV_CONTINUE42]] ], [ [[TMP115]], [[PRED_UDIV_IF43]] ] +; CHECK-NEXT: [[TMP117:%.*]] = extractelement <32 x i1> [[TMP0]], i32 23 +; CHECK-NEXT: br i1 [[TMP117]], label [[PRED_UDIV_IF45:%.*]], label [[PRED_UDIV_CONTINUE46:%.*]] +; CHECK: pred.udiv.if45: +; CHECK-NEXT: [[TMP118:%.*]] = extractelement <32 x i32> 
[[TMP1]], i32 23 +; CHECK-NEXT: [[TMP119:%.*]] = udiv i32 [[TMP118]], [[D]] +; CHECK-NEXT: [[TMP120:%.*]] = insertelement <32 x i32> [[TMP116]], i32 [[TMP119]], i32 23 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE46]] +; CHECK: pred.udiv.continue46: +; CHECK-NEXT: [[TMP121:%.*]] = phi <32 x i32> [ [[TMP116]], [[PRED_UDIV_CONTINUE44]] ], [ [[TMP120]], [[PRED_UDIV_IF45]] ] +; CHECK-NEXT: [[TMP122:%.*]] = extractelement <32 x i1> [[TMP0]], i32 24 +; CHECK-NEXT: br i1 [[TMP122]], label [[PRED_UDIV_IF47:%.*]], label [[PRED_UDIV_CONTINUE48:%.*]] +; CHECK: pred.udiv.if47: +; CHECK-NEXT: [[TMP123:%.*]] = extractelement <32 x i32> [[TMP1]], i32 24 +; CHECK-NEXT: [[TMP124:%.*]] = udiv i32 [[TMP123]], [[D]] +; CHECK-NEXT: [[TMP125:%.*]] = insertelement <32 x i32> [[TMP121]], i32 [[TMP124]], i32 24 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE48]] +; CHECK: pred.udiv.continue48: +; CHECK-NEXT: [[TMP126:%.*]] = phi <32 x i32> [ [[TMP121]], [[PRED_UDIV_CONTINUE46]] ], [ [[TMP125]], [[PRED_UDIV_IF47]] ] +; CHECK-NEXT: [[TMP127:%.*]] = extractelement <32 x i1> [[TMP0]], i32 25 +; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_UDIV_IF49:%.*]], label [[PRED_UDIV_CONTINUE50:%.*]] +; CHECK: pred.udiv.if49: +; CHECK-NEXT: [[TMP128:%.*]] = extractelement <32 x i32> [[TMP1]], i32 25 +; CHECK-NEXT: [[TMP129:%.*]] = udiv i32 [[TMP128]], [[D]] +; CHECK-NEXT: [[TMP130:%.*]] = insertelement <32 x i32> [[TMP126]], i32 [[TMP129]], i32 25 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE50]] +; CHECK: pred.udiv.continue50: +; CHECK-NEXT: [[TMP131:%.*]] = phi <32 x i32> [ [[TMP126]], [[PRED_UDIV_CONTINUE48]] ], [ [[TMP130]], [[PRED_UDIV_IF49]] ] +; CHECK-NEXT: [[TMP132:%.*]] = extractelement <32 x i1> [[TMP0]], i32 26 +; CHECK-NEXT: br i1 [[TMP132]], label [[PRED_UDIV_IF51:%.*]], label [[PRED_UDIV_CONTINUE52:%.*]] +; CHECK: pred.udiv.if51: +; CHECK-NEXT: [[TMP133:%.*]] = extractelement <32 x i32> [[TMP1]], i32 26 +; CHECK-NEXT: [[TMP134:%.*]] = udiv i32 [[TMP133]], [[D]] +; CHECK-NEXT: [[TMP135:%.*]] = 
insertelement <32 x i32> [[TMP131]], i32 [[TMP134]], i32 26 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE52]] +; CHECK: pred.udiv.continue52: +; CHECK-NEXT: [[TMP136:%.*]] = phi <32 x i32> [ [[TMP131]], [[PRED_UDIV_CONTINUE50]] ], [ [[TMP135]], [[PRED_UDIV_IF51]] ] +; CHECK-NEXT: [[TMP137:%.*]] = extractelement <32 x i1> [[TMP0]], i32 27 +; CHECK-NEXT: br i1 [[TMP137]], label [[PRED_UDIV_IF53:%.*]], label [[PRED_UDIV_CONTINUE54:%.*]] +; CHECK: pred.udiv.if53: +; CHECK-NEXT: [[TMP138:%.*]] = extractelement <32 x i32> [[TMP1]], i32 27 +; CHECK-NEXT: [[TMP139:%.*]] = udiv i32 [[TMP138]], [[D]] +; CHECK-NEXT: [[TMP140:%.*]] = insertelement <32 x i32> [[TMP136]], i32 [[TMP139]], i32 27 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE54]] +; CHECK: pred.udiv.continue54: +; CHECK-NEXT: [[TMP141:%.*]] = phi <32 x i32> [ [[TMP136]], [[PRED_UDIV_CONTINUE52]] ], [ [[TMP140]], [[PRED_UDIV_IF53]] ] +; CHECK-NEXT: [[TMP142:%.*]] = extractelement <32 x i1> [[TMP0]], i32 28 +; CHECK-NEXT: br i1 [[TMP142]], label [[PRED_UDIV_IF55:%.*]], label [[PRED_UDIV_CONTINUE56:%.*]] +; CHECK: pred.udiv.if55: +; CHECK-NEXT: [[TMP143:%.*]] = extractelement <32 x i32> [[TMP1]], i32 28 +; CHECK-NEXT: [[TMP144:%.*]] = udiv i32 [[TMP143]], [[D]] +; CHECK-NEXT: [[TMP145:%.*]] = insertelement <32 x i32> [[TMP141]], i32 [[TMP144]], i32 28 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE56]] +; CHECK: pred.udiv.continue56: +; CHECK-NEXT: [[TMP146:%.*]] = phi <32 x i32> [ [[TMP141]], [[PRED_UDIV_CONTINUE54]] ], [ [[TMP145]], [[PRED_UDIV_IF55]] ] +; CHECK-NEXT: [[TMP147:%.*]] = extractelement <32 x i1> [[TMP0]], i32 29 +; CHECK-NEXT: br i1 [[TMP147]], label [[PRED_UDIV_IF57:%.*]], label [[PRED_UDIV_CONTINUE58:%.*]] +; CHECK: pred.udiv.if57: +; CHECK-NEXT: [[TMP148:%.*]] = extractelement <32 x i32> [[TMP1]], i32 29 +; CHECK-NEXT: [[TMP149:%.*]] = udiv i32 [[TMP148]], [[D]] +; CHECK-NEXT: [[TMP150:%.*]] = insertelement <32 x i32> [[TMP146]], i32 [[TMP149]], i32 29 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE58]] +; 
CHECK: pred.udiv.continue58: +; CHECK-NEXT: [[TMP151:%.*]] = phi <32 x i32> [ [[TMP146]], [[PRED_UDIV_CONTINUE56]] ], [ [[TMP150]], [[PRED_UDIV_IF57]] ] +; CHECK-NEXT: [[TMP152:%.*]] = extractelement <32 x i1> [[TMP0]], i32 30 +; CHECK-NEXT: br i1 [[TMP152]], label [[PRED_UDIV_IF59:%.*]], label [[PRED_UDIV_CONTINUE60:%.*]] +; CHECK: pred.udiv.if59: +; CHECK-NEXT: [[TMP153:%.*]] = extractelement <32 x i32> [[TMP1]], i32 30 +; CHECK-NEXT: [[TMP154:%.*]] = udiv i32 [[TMP153]], [[D]] +; CHECK-NEXT: [[TMP155:%.*]] = insertelement <32 x i32> [[TMP151]], i32 [[TMP154]], i32 30 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE60]] +; CHECK: pred.udiv.continue60: +; CHECK-NEXT: [[TMP156:%.*]] = phi <32 x i32> [ [[TMP151]], [[PRED_UDIV_CONTINUE58]] ], [ [[TMP155]], [[PRED_UDIV_IF59]] ] +; CHECK-NEXT: [[TMP157:%.*]] = extractelement <32 x i1> [[TMP0]], i32 31 +; CHECK-NEXT: br i1 [[TMP157]], label [[PRED_UDIV_IF61:%.*]], label [[PRED_UDIV_CONTINUE62]] +; CHECK: pred.udiv.if61: +; CHECK-NEXT: [[TMP158:%.*]] = extractelement <32 x i32> [[TMP1]], i32 31 +; CHECK-NEXT: [[TMP159:%.*]] = udiv i32 [[TMP158]], [[D]] +; CHECK-NEXT: [[TMP160:%.*]] = insertelement <32 x i32> [[TMP156]], i32 [[TMP159]], i32 31 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE62]] +; CHECK: pred.udiv.continue62: +; CHECK-NEXT: [[TMP161:%.*]] = phi <32 x i32> [ [[TMP156]], [[PRED_UDIV_CONTINUE60]] ], [ [[TMP160]], [[PRED_UDIV_IF61]] ] +; CHECK-NEXT: [[TMP162:%.*]] = zext <32 x i32> [[TMP161]] to <32 x i64> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[BROADCAST_SPLAT]], <32 x i64> zeroinitializer, <32 x i64> [[TMP162]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], splat (i32 32) +; CHECK-NEXT: [[TMP163:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP164:%.*]] = extractelement <32 x i64> 
[[PREDPHI]], i32 31 +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF13:![0-9]+]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <8 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT64:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT63]], <8 x i1> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP165:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT64]], splat (i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT65:%.*]] = insertelement <8 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT66:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT65]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT66]], +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX67:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT86:%.*]], [[PRED_UDIV_CONTINUE84:%.*]] ] +; CHECK-NEXT: [[VEC_IND68:%.*]] = phi <8 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT87:%.*]], [[PRED_UDIV_CONTINUE84]] ] +; CHECK-NEXT: [[TMP166:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[VEC_IND68]], <8 x i32> splat (i32 1)) +; CHECK-NEXT: [[TMP167:%.*]] = extractelement <8 x i1> [[TMP165]], i32 0 +; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF69:%.*]], label [[PRED_UDIV_CONTINUE70:%.*]] +; CHECK: pred.udiv.if69: +; CHECK-NEXT: [[TMP168:%.*]] = extractelement <8 x i32> [[TMP166]], i32 0 +; CHECK-NEXT: [[TMP169:%.*]] = udiv i32 [[TMP168]], [[D]] +; CHECK-NEXT: [[TMP170:%.*]] = insertelement <8 x i32> poison, i32 [[TMP169]], i32 0 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE70]] +; CHECK: 
pred.udiv.continue70: +; CHECK-NEXT: [[TMP171:%.*]] = phi <8 x i32> [ poison, [[VEC_EPILOG_VECTOR_BODY]] ], [ [[TMP170]], [[PRED_UDIV_IF69]] ] +; CHECK-NEXT: [[TMP172:%.*]] = extractelement <8 x i1> [[TMP165]], i32 1 +; CHECK-NEXT: br i1 [[TMP172]], label [[PRED_UDIV_IF71:%.*]], label [[PRED_UDIV_CONTINUE72:%.*]] +; CHECK: pred.udiv.if71: +; CHECK-NEXT: [[TMP173:%.*]] = extractelement <8 x i32> [[TMP166]], i32 1 +; CHECK-NEXT: [[TMP174:%.*]] = udiv i32 [[TMP173]], [[D]] +; CHECK-NEXT: [[TMP175:%.*]] = insertelement <8 x i32> [[TMP171]], i32 [[TMP174]], i32 1 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE72]] +; CHECK: pred.udiv.continue72: +; CHECK-NEXT: [[TMP176:%.*]] = phi <8 x i32> [ [[TMP171]], [[PRED_UDIV_CONTINUE70]] ], [ [[TMP175]], [[PRED_UDIV_IF71]] ] +; CHECK-NEXT: [[TMP177:%.*]] = extractelement <8 x i1> [[TMP165]], i32 2 +; CHECK-NEXT: br i1 [[TMP177]], label [[PRED_UDIV_IF73:%.*]], label [[PRED_UDIV_CONTINUE74:%.*]] +; CHECK: pred.udiv.if73: +; CHECK-NEXT: [[TMP178:%.*]] = extractelement <8 x i32> [[TMP166]], i32 2 +; CHECK-NEXT: [[TMP179:%.*]] = udiv i32 [[TMP178]], [[D]] +; CHECK-NEXT: [[TMP180:%.*]] = insertelement <8 x i32> [[TMP176]], i32 [[TMP179]], i32 2 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE74]] +; CHECK: pred.udiv.continue74: +; CHECK-NEXT: [[TMP181:%.*]] = phi <8 x i32> [ [[TMP176]], [[PRED_UDIV_CONTINUE72]] ], [ [[TMP180]], [[PRED_UDIV_IF73]] ] +; CHECK-NEXT: [[TMP182:%.*]] = extractelement <8 x i1> [[TMP165]], i32 3 +; CHECK-NEXT: br i1 [[TMP182]], label [[PRED_UDIV_IF75:%.*]], label [[PRED_UDIV_CONTINUE76:%.*]] +; CHECK: pred.udiv.if75: +; CHECK-NEXT: [[TMP183:%.*]] = extractelement <8 x i32> [[TMP166]], i32 3 +; CHECK-NEXT: [[TMP184:%.*]] = udiv i32 [[TMP183]], [[D]] +; CHECK-NEXT: [[TMP185:%.*]] = insertelement <8 x i32> [[TMP181]], i32 [[TMP184]], i32 3 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE76]] +; CHECK: pred.udiv.continue76: +; CHECK-NEXT: [[TMP186:%.*]] = phi <8 x i32> [ [[TMP181]], [[PRED_UDIV_CONTINUE74]] ], [ [[TMP185]], 
[[PRED_UDIV_IF75]] ] +; CHECK-NEXT: [[TMP187:%.*]] = extractelement <8 x i1> [[TMP165]], i32 4 +; CHECK-NEXT: br i1 [[TMP187]], label [[PRED_UDIV_IF77:%.*]], label [[PRED_UDIV_CONTINUE78:%.*]] +; CHECK: pred.udiv.if77: +; CHECK-NEXT: [[TMP188:%.*]] = extractelement <8 x i32> [[TMP166]], i32 4 +; CHECK-NEXT: [[TMP189:%.*]] = udiv i32 [[TMP188]], [[D]] +; CHECK-NEXT: [[TMP190:%.*]] = insertelement <8 x i32> [[TMP186]], i32 [[TMP189]], i32 4 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE78]] +; CHECK: pred.udiv.continue78: +; CHECK-NEXT: [[TMP191:%.*]] = phi <8 x i32> [ [[TMP186]], [[PRED_UDIV_CONTINUE76]] ], [ [[TMP190]], [[PRED_UDIV_IF77]] ] +; CHECK-NEXT: [[TMP192:%.*]] = extractelement <8 x i1> [[TMP165]], i32 5 +; CHECK-NEXT: br i1 [[TMP192]], label [[PRED_UDIV_IF79:%.*]], label [[PRED_UDIV_CONTINUE80:%.*]] +; CHECK: pred.udiv.if79: +; CHECK-NEXT: [[TMP193:%.*]] = extractelement <8 x i32> [[TMP166]], i32 5 +; CHECK-NEXT: [[TMP194:%.*]] = udiv i32 [[TMP193]], [[D]] +; CHECK-NEXT: [[TMP195:%.*]] = insertelement <8 x i32> [[TMP191]], i32 [[TMP194]], i32 5 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE80]] +; CHECK: pred.udiv.continue80: +; CHECK-NEXT: [[TMP196:%.*]] = phi <8 x i32> [ [[TMP191]], [[PRED_UDIV_CONTINUE78]] ], [ [[TMP195]], [[PRED_UDIV_IF79]] ] +; CHECK-NEXT: [[TMP197:%.*]] = extractelement <8 x i1> [[TMP165]], i32 6 +; CHECK-NEXT: br i1 [[TMP197]], label [[PRED_UDIV_IF81:%.*]], label [[PRED_UDIV_CONTINUE82:%.*]] +; CHECK: pred.udiv.if81: +; CHECK-NEXT: [[TMP198:%.*]] = extractelement <8 x i32> [[TMP166]], i32 6 +; CHECK-NEXT: [[TMP199:%.*]] = udiv i32 [[TMP198]], [[D]] +; CHECK-NEXT: [[TMP200:%.*]] = insertelement <8 x i32> [[TMP196]], i32 [[TMP199]], i32 6 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE82]] +; CHECK: pred.udiv.continue82: +; CHECK-NEXT: [[TMP201:%.*]] = phi <8 x i32> [ [[TMP196]], [[PRED_UDIV_CONTINUE80]] ], [ [[TMP200]], [[PRED_UDIV_IF81]] ] +; CHECK-NEXT: [[TMP202:%.*]] = extractelement <8 x i1> [[TMP165]], i32 7 +; CHECK-NEXT: br i1 
[[TMP202]], label [[PRED_UDIV_IF83:%.*]], label [[PRED_UDIV_CONTINUE84]] +; CHECK: pred.udiv.if83: +; CHECK-NEXT: [[TMP203:%.*]] = extractelement <8 x i32> [[TMP166]], i32 7 +; CHECK-NEXT: [[TMP204:%.*]] = udiv i32 [[TMP203]], [[D]] +; CHECK-NEXT: [[TMP205:%.*]] = insertelement <8 x i32> [[TMP201]], i32 [[TMP204]], i32 7 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE84]] +; CHECK: pred.udiv.continue84: +; CHECK-NEXT: [[TMP206:%.*]] = phi <8 x i32> [ [[TMP201]], [[PRED_UDIV_CONTINUE82]] ], [ [[TMP205]], [[PRED_UDIV_IF83]] ] +; CHECK-NEXT: [[TMP207:%.*]] = zext <8 x i32> [[TMP206]] to <8 x i64> +; CHECK-NEXT: [[PREDPHI85:%.*]] = select <8 x i1> [[BROADCAST_SPLAT64]], <8 x i64> zeroinitializer, <8 x i64> [[TMP207]] +; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8) +; CHECK-NEXT: [[TMP208:%.*]] = icmp eq i32 [[INDEX_NEXT86]], 1000 +; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP209:%.*]] = extractelement <8 x i64> [[PREDPHI85]], i32 7 +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL88:%.*]] = phi i32 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL88]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @llvm.usub.sat.i32(i32 [[IV]], i32 1) +; CHECK-NEXT: [[UDIV:%.*]] = udiv i32 [[CALL]], [[D]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[UDIV]] to i64 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[MERGE:%.*]] = phi 
i64 [ [[ZEXT]], [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i64 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP164]], [[MIDDLE_BLOCK]] ], [ [[TMP209]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[MERGE_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + %call = tail call i32 @llvm.usub.sat.i32(i32 %iv, i32 1) + %udiv = udiv i32 %call, %d + %zext = zext i32 %udiv to i64 + br label %loop.latch + +loop.latch: + %merge = phi i64 [ %zext, %then ], [ 0, %loop.header ] + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret i64 %merge +} + attributes #0 = { "target-cpu"="znver4" } attributes #1 = { "target-features"="+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl" } attributes #2 = { "target-cpu"="znver3" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll index 8e3af54b770e8..4cff8753ba9b1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s ; This is a bugpoint reduction of a test from PR43582: @@ -12,31 +12,32 @@ target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 target triple = "x86_64-w64-windows-gnu" define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { -; CHECK-LABEL: @cff_index_load_offsets( 
-; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[FOR_BODY68:%.*]] -; CHECK: for.body68: -; CHECK-NEXT: [[P_359:%.*]] = phi ptr [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ null, [[IF_THEN]] ] -; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-LABEL: define void @cff_index_load_offsets( +; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[EXIT:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: br label %[[FOR_BODY68:.*]] +; CHECK: [[FOR_BODY68]]: +; CHECK-NEXT: [[P_359:%.*]] = phi ptr [ [[ADD_PTR86:%.*]], %[[FOR_BODY68]] ], [ null, %[[IF_THEN]] ] +; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32 ; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24 -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA1:![0-9]+]] ; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[CHAR_TBAA1]] ; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]] -; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[LONG_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4 ; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef -; CHECK-NEXT: br i1 [[CMP66]], label [[FOR_BODY68]], label [[SW_EPILOG:%.*]] -; CHECK: 
sw.epilog: +; CHECK-NEXT: br i1 [[CMP66]], label %[[FOR_BODY68]], label %[[SW_EPILOG:.*]] +; CHECK: [[SW_EPILOG]]: ; CHECK-NEXT: unreachable -; CHECK: Exit: +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -80,3 +81,10 @@ attributes #0 = { "use-soft-float"="false" } !3 = !{!"Simple C/C++ TBAA"} !4 = !{!5, !5, i64 0} !5 = !{!"long", !2, i64 0} +;. +; CHECK: [[CHAR_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[LONG_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"long", [[META2]], i64 0} +;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index d2f8f2203b724..5d16ce5346bbf 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -169,64 +169,140 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: entry: ; AVX1-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; AVX1-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; AVX1: for.body.preheader: +; AVX1: iter.check: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX1: vector.main.loop.iter.check: +; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; AVX1-NEXT: br label 
[[VECTOR_BODY:%.*]] ; AVX1: vector.body: -; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 +; AVX1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 12 ; AVX1-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDEX]], 1 ; AVX1-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; AVX1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP3]], 1 +; AVX1-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP8]], 1 ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP7]] ; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]] +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]] ; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2 ; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2 ; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2 +; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC9:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2 +; AVX1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <8 x i16> [[WIDE_VEC9]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: 
[[STRIDED_VEC11:%.*]] = shufflevector <8 x i16> [[WIDE_VEC9]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32> ; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32> +; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> +; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC12]] to <4 x i32> ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP7]] ; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]] +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]] ; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP22]], align 2 ; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2 ; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC18:%.*]] = load <8 x i16>, ptr [[TMP17]], align 2 +; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC18]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC18]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC21:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2 +; AVX1-NEXT: [[STRIDED_VEC24:%.*]] = shufflevector <8 x i16> [[WIDE_VEC21]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <8 x i16> [[WIDE_VEC21]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32> ; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] 
to <4 x i32> +; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32> +; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC24]] to <4 x i32> ; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]] ; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]] -; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> -; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> -; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32> -; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> ; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]] ; AVX1-NEXT: [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]] -; AVX1-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]] -; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]] +; AVX1-NEXT: [[TMP27:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> +; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> +; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> +; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC11]] to <4 x i32> +; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32> +; AVX1-NEXT: [[TMP32:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> +; AVX1-NEXT: [[TMP33:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32> +; AVX1-NEXT: [[TMP34:%.*]] = sext <4 x i16> [[STRIDED_VEC23]] to <4 x i32> +; AVX1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP27]] +; AVX1-NEXT: [[TMP60:%.*]] = mul nsw <4 x i32> [[TMP32]], [[TMP28]] +; AVX1-NEXT: [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP33]], [[TMP29]] +; AVX1-NEXT: [[TMP68:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP30]] +; AVX1-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP44]] +; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP60]], [[TMP45]] +; AVX1-NEXT: [[TMP69:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP46]] +; AVX1-NEXT: [[TMP70:%.*]] = 
add nsw <4 x i32> [[TMP68]], [[TMP47]] ; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 4 +; AVX1-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 8 +; AVX1-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 12 ; AVX1-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP21]], align 4 ; AVX1-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP26]], align 4 -; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; AVX1-NEXT: store <4 x i32> [[TMP69]], ptr [[TMP71]], align 4 +; AVX1-NEXT: store <4 x i32> [[TMP70]], ptr [[TMP72]], align 4 +; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX1: vec.epilog.iter.check: +; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX1: vec.epilog.ph: +; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; AVX1-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]] ; AVX1-NEXT: br label [[FOR_BODY1:%.*]] +; AVX1: vec.epilog.vector.body: +; AVX1-NEXT: [[INDEX26:%.*]] = phi i64 [ 
[[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[FOR_BODY1]] ] +; AVX1-NEXT: [[TMP48:%.*]] = shl nuw nsw i64 [[INDEX26]], 1 +; AVX1-NEXT: [[TMP49:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP48]] +; AVX1-NEXT: [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP49]], align 2 +; AVX1-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP50:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32> +; AVX1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP48]] +; AVX1-NEXT: [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP51]], align 2 +; AVX1-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP52:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32> +; AVX1-NEXT: [[TMP53:%.*]] = mul nsw <4 x i32> [[TMP52]], [[TMP50]] +; AVX1-NEXT: [[TMP54:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32> +; AVX1-NEXT: [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32> +; AVX1-NEXT: [[TMP56:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP54]] +; AVX1-NEXT: [[TMP57:%.*]] = add nsw <4 x i32> [[TMP56]], [[TMP53]] +; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDEX26]] +; AVX1-NEXT: store <4 x i32> [[TMP57]], ptr [[TMP58]], align 4 +; AVX1-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4 +; AVX1-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]] +; AVX1-NEXT: br i1 [[TMP59]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX1: vec.epilog.middle.block: +; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]] +; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]] +; 
AVX1: vec.epilog.scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]] ; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 @@ -248,7 +324,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll index 994cd331c4194..8a48f997052f0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=loop-vectorize -mcpu=skylake-avx512 -S %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13" @@ -7,35 +7,36 @@ target triple = 
"x86_64-unknown-linux-gnu" @jlplt_ijl_alloc_array_1d_10294_got = external dso_local local_unnamed_addr global ptr define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) local_unnamed_addr #0 { -; CHECK-LABEL: @japi1_vect_42283( -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1:%.*]] to i64 +; CHECK-LABEL: define ptr addrspace(10) @japi1_vect_42283( +; CHECK-SAME: ptr readonly captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = load atomic ptr, ptr @jlplt_ijl_alloc_array_1d_10294_got unordered, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr addrspace(10) [[TMP3]](ptr addrspace(10) null, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(10), ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(10), ptr [[TMP0]], align 8, !tbaa [[JTBAA_VALUE_TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(10) [[TMP4]] to ptr addrspace(11) -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(13), ptr addrspace(11) [[TMP6]], align 8, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(13), ptr addrspace(11) [[TMP6]], align 8, !tbaa [[JTBAA_ARRAYPTR_TBAA5:![0-9]+]] ; CHECK-NEXT: [[DOTELT:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 0 -; CHECK-NEXT: [[DOTUNPACK:%.*]] = load ptr addrspace(10), ptr addrspace(10) [[DOTELT]], align 8, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: [[DOTUNPACK:%.*]] = load ptr addrspace(10), ptr addrspace(10) [[DOTELT]], align 8, !tbaa [[JTBAA_IMMUT_TBAA8:![0-9]+]] ; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 1 -; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[TBAA8]] +; CHECK-NEXT: [[DOTUNPACK2:%.*]] = 
load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[JTBAA_IMMUT_TBAA8]] ; CHECK-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[TOP:%.*]] -; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[TOP:.*]] +; CHECK: [[TOP]]: ; CHECK-NEXT: [[TMP17:%.*]] = icmp ult i64 [[TMP8]], 16 -; CHECK-NEXT: br i1 [[TMP17]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: br i1 [[TMP17]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP8]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) 
; CHECK-NEXT: [[STEP_ADD4:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[STEP_ADD5:%.*]] = add <4 x i64> [[STEP_ADD4]], splat (i64 4) @@ -43,31 +44,31 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 0 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP18]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP19]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP20]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP21]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP18]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP19]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP20]], i32 8, 
<4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP21]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 1 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 1 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 1 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 
x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD5]], splat (i64 4) ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: middle.block: +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK1]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[L44:%.*]], label [[MIDDLE_BLOCK:%.*]] -; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[L44:.*]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP8]], [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TOP]] ] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[SCALAR_PH]], !prof [[PROF15:![0-9]+]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[TOP]] ] ; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP8]], 4 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF4]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 @@ -77,34 +78,34 @@ define ptr addrspace(10) 
@japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], -; CHECK-NEXT: br label [[L26:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT14:%.*]], [[L26]] ] -; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[L26]] ] +; CHECK-NEXT: br label %[[L26:.*]] +; CHECK: [[L26]]: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDEX_NEXT14:%.*]], %[[L26]] ] +; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[SCALAR_PH]] ], [ [[VEC_IND_NEXT9:%.*]], %[[L26]] ] ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND8]], i32 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT11]], <4 x ptr addrspace(13)> [[TMP28]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT11]], <4 x ptr addrspace(13)> [[TMP28]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND8]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT13]], <4 x ptr addrspace(13)> [[TMP29]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT13]], <4 x ptr addrspace(13)> [[TMP29]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; 
CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX7]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND8]], splat (i64 4) ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[L26]], !llvm.loop [[LOOP15:![0-9]+]] -; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 [[TMP30]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[L26]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[CMP_N15]], label [[L44]], label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: br label [[L27:%.*]] -; CHECK: L26: -; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP27:%.*]], [[L27]] ] +; CHECK-NEXT: br i1 [[CMP_N15]], label %[[L44]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[L27:.*]] +; CHECK: [[L27]]: +; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP27:%.*]], %[[L27]] ] ; CHECK-NEXT: [[DOTREPACK:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 0 -; CHECK-NEXT: store ptr addrspace(10) [[DOTUNPACK]], ptr addrspace(13) [[DOTREPACK]], align 8, !tbaa [[TBAA10]] +; CHECK-NEXT: store ptr addrspace(10) [[DOTUNPACK]], ptr addrspace(13) [[DOTREPACK]], align 8, !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[DOTREPACK4:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 1 -; CHECK-NEXT: store 
i64 [[DOTUNPACK2]], ptr addrspace(13) [[DOTREPACK4]], align 8, !tbaa [[TBAA10]] +; CHECK-NEXT: store i64 [[DOTUNPACK2]], ptr addrspace(13) [[DOTREPACK4]], align 8, !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[TMP27]] = add i64 [[VALUE_PHI5]], 1 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[VALUE_PHI5]], [[TMP2]] -; CHECK-NEXT: br i1 [[DOTNOT]], label [[L44]], label [[L27]], !llvm.loop [[LOOP16:![0-9]+]] -; CHECK: L44: +; CHECK-NEXT: br i1 [[DOTNOT]], label %[[L44]], label %[[L27]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: [[L44]]: ; CHECK-NEXT: ret ptr addrspace(10) null ; top: @@ -146,3 +147,23 @@ L44: ; preds = %L26 !9 = !{!"jtbaa_immut", !1, i64 0} !10 = !{!11, !11, i64 0} !11 = !{!"jtbaa_arraybuf", !2, i64 0} +;. +; CHECK: [[JTBAA_VALUE_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"jtbaa_value", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"jtbaa_data", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"jtbaa", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"jtbaa"} +; CHECK: [[JTBAA_ARRAYPTR_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[META6]] = !{!"jtbaa_arrayptr", [[META7:![0-9]+]], i64 0} +; CHECK: [[META7]] = !{!"jtbaa_array", [[META3]], i64 0} +; CHECK: [[JTBAA_IMMUT_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +; CHECK: [[META9]] = !{!"jtbaa_immut", [[META1]], i64 0} +; CHECK: [[JTBAA_ARRAYBUF_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0} +; CHECK: [[META11]] = !{!"jtbaa_arraybuf", [[META2]], i64 0} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]} +; CHECK: [[META13]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META14]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[PROF15]] = !{!"branch_weights", i32 4, i32 12} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META13]], [[META14]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META13]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll index d261827d4e111..439e1f181b5df 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll @@ -46,7 +46,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 { ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 99, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 1 ; CHECK-NEXT: [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0 -; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF6:![0-9]+]] ; CHECK: bb18: ; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 ; CHECK-NEXT: [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[OR]] @@ -55,7 +55,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90 -; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: bb6: ; CHECK-NEXT: ret void ; @@ -96,10 +96,12 @@ attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" } ;. 
; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 127} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 23} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 1, i32 1} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]} +; CHECK: [[META5]] = !{!"llvm.loop.estimated_trip_count", i32 24} +; CHECK: [[PROF6]] = !{!"branch_weights", i32 1, i32 1} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]], [[META3]], [[META9:![0-9]+]]} +; CHECK: [[META9]] = !{!"llvm.loop.estimated_trip_count", i32 0} ;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index 35f61b2aa838a..050243faa49f4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=loop-vectorize -S -o - | FileCheck %s ; RUN: opt < %s -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -o - | FileCheck --check-prefix=MAX-BW %s @@ -10,21 +10,22 @@ target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: norecurse nounwind readonly uwtable define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 { -; CHECK-LABEL: @matrix_row_col( -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64 -; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64 -; 
CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ] +; CHECK-LABEL: define i32 @matrix_row_col( +; CHECK-SAME: ptr readonly captures(none) [[DATA:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 +; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J]] to i64 +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] +; CHECK: [[VECTOR_PH1]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP144:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP145:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP146:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ 
zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP147:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -57,14 +58,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 ; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16 ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4, !tbaa [[TBAA1:![0-9]+]] -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4, !tbaa [[INT_TBAA1:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [100 x i32], 
ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]] @@ -97,14 +98,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP29]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP30]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP31]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0 ; CHECK-NEXT: [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1 ; CHECK-NEXT: 
[[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2 @@ -113,14 +114,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5 ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6 ; CHECK-NEXT: [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7 -; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0 ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1 ; CHECK-NEXT: [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 
[[TMP90]], i32 2 @@ -129,14 +130,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5 ; CHECK-NEXT: [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6 ; CHECK-NEXT: [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7 -; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0 ; CHECK-NEXT: [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1 ; CHECK-NEXT: [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2 @@ 
-145,14 +146,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5 ; CHECK-NEXT: [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6 ; CHECK-NEXT: [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7 -; CHECK-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0 ; CHECK-NEXT: [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1 ; CHECK-NEXT: [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2 @@ -175,37 +176,37 @@ 
define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP147]] = add <8 x i32> [[TMP143]], [[TMP139]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; CHECK-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: middle.block: +; CHECK-NEXT: br i1 [[TMP148]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP145]], [[TMP144]] ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]] ; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]]) -; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: br i1 false, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] ; CHECK-NEXT: [[TMP171:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ 
[[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], [[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], %[[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP172:%.*]] = add i64 [[INDEX9]], 0 ; CHECK-NEXT: [[TMP173:%.*]] = add i64 [[INDEX9]], 1 ; CHECK-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2 ; CHECK-NEXT: [[TMP175:%.*]] = add i64 [[INDEX9]], 3 ; CHECK-NEXT: [[TMP152:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP172]] -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP152]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP152]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP154:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP172]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP173]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[TMP156:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP174]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[TMP157:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP175]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP158:%.*]] = load i32, ptr [[TMP154]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP159:%.*]] = load i32, ptr [[TMP155]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP156]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP158:%.*]] = load i32, ptr [[TMP154]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP159:%.*]] = load i32, ptr [[TMP155]], align 4, !tbaa [[INT_TBAA1]] 
+; CHECK-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP156]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP162:%.*]] = insertelement <4 x i32> poison, i32 [[TMP158]], i32 0 ; CHECK-NEXT: [[TMP163:%.*]] = insertelement <4 x i32> [[TMP162]], i32 [[TMP159]], i32 1 ; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP163]], i32 [[TMP160]], i32 2 @@ -215,46 +216,47 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP168]] = add <4 x i32> [[TMP167]], [[TMP166]] ; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4 ; CHECK-NEXT: [[TMP169:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100 -; CHECK-NEXT: br i1 [[TMP169]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 [[TMP169]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]]) -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY1]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: br i1 true, label %[[FOR_COND_CLEANUP]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 
0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[FOR_BODY1:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], %[[FOR_BODY1]] ], [ [[TMP149]], %[[MIDDLE_BLOCK]] ], [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY1]] ] +; CHECK: [[FOR_BODY1]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY1]] ] +; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], %[[SCALAR_PH]] ], [ [[ADD7]], %[[FOR_BODY1]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4 ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 
[[EXITCOND]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] ; -; MAX-BW-LABEL: @matrix_row_col( -; MAX-BW-NEXT: iter.check: -; MAX-BW-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64 -; MAX-BW-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64 -; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; MAX-BW: vector.main.loop.iter.check: -; MAX-BW-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] -; MAX-BW: vector.ph: -; MAX-BW-NEXT: br label [[VECTOR_BODY:%.*]] -; MAX-BW: vector.body: -; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ] +; MAX-BW-LABEL: define i32 @matrix_row_col( +; MAX-BW-SAME: ptr readonly captures(none) [[DATA:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; MAX-BW-NEXT: [[ITER_CHECK:.*]]: +; MAX-BW-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 +; MAX-BW-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J]] to i64 +; MAX-BW-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MAX-BW: [[VECTOR_PH]]: +; MAX-BW-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] +; MAX-BW: [[VECTOR_PH1]]: +; MAX-BW-NEXT: br label %[[VECTOR_BODY:.*]] +; MAX-BW: [[VECTOR_BODY]]: +; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP144:%.*]], %[[VECTOR_BODY]] ] +; 
MAX-BW-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP145:%.*]], %[[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP146:%.*]], %[[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP147:%.*]], %[[VECTOR_BODY]] ] ; MAX-BW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; MAX-BW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; MAX-BW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -287,14 +289,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 ; MAX-BW-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 ; MAX-BW-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 -; MAX-BW-NEXT: [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]] +; MAX-BW-NEXT: [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP0]] ; MAX-BW-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8 ; MAX-BW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16 ; MAX-BW-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24 -; MAX-BW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4, !tbaa [[TBAA1:![0-9]+]] -; MAX-BW-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4, !tbaa [[INT_TBAA1:![0-9]+]] +; MAX-BW-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], 
align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[TMP40:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]] ; MAX-BW-NEXT: [[TMP41:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]] ; MAX-BW-NEXT: [[TMP42:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]] @@ -327,14 +329,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP69:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP29]], i64 [[IDXPROM5]] ; MAX-BW-NEXT: [[TMP70:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP30]], i64 [[IDXPROM5]] ; MAX-BW-NEXT: [[TMP71:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP31]], i64 [[IDXPROM5]] -; MAX-BW-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: 
[[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0 ; MAX-BW-NEXT: [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1 ; MAX-BW-NEXT: [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2 @@ -343,14 +345,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5 ; MAX-BW-NEXT: [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6 ; MAX-BW-NEXT: [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7 -; MAX-BW-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP94:%.*]] = load i32, ptr 
[[TMP54]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0 ; MAX-BW-NEXT: [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1 ; MAX-BW-NEXT: [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2 @@ -359,14 +361,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5 ; MAX-BW-NEXT: [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6 ; MAX-BW-NEXT: [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7 -; MAX-BW-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP62]], 
align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0 ; MAX-BW-NEXT: [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1 ; MAX-BW-NEXT: [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2 @@ -375,14 +377,14 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5 ; MAX-BW-NEXT: [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6 ; MAX-BW-NEXT: [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7 -; MAX-BW-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP70]], 
align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0 ; MAX-BW-NEXT: [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1 ; MAX-BW-NEXT: [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2 @@ -405,37 +407,37 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP147]] = add <8 x i32> [[TMP143]], [[TMP139]] ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; MAX-BW-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; MAX-BW-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; MAX-BW: middle.block: +; MAX-BW-NEXT: br i1 [[TMP148]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; MAX-BW: [[MIDDLE_BLOCK]]: ; MAX-BW-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP145]], [[TMP144]] ; MAX-BW-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]] ; MAX-BW-NEXT: [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]] ; MAX-BW-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]]) -; MAX-BW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; MAX-BW: vec.epilog.iter.check: -; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] -; MAX-BW: vec.epilog.ph: -; MAX-BW-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] -; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; MAX-BW-NEXT: br i1 false, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; MAX-BW: [[VEC_EPILOG_ITER_CHECK]]: +; MAX-BW-NEXT: br i1 false, label %[[SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; MAX-BW: [[VEC_EPILOG_PH]]: +; MAX-BW-NEXT: 
[[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] +; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] ; MAX-BW-NEXT: [[TMP171:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 -; MAX-BW-NEXT: br label [[FOR_BODY:%.*]] -; MAX-BW: vec.epilog.vector.body: -; MAX-BW-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[FOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], [[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], [[FOR_BODY]] ] +; MAX-BW-NEXT: br label %[[FOR_BODY:.*]] +; MAX-BW: [[FOR_BODY]]: +; MAX-BW-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], %[[FOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], %[[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], %[[FOR_BODY]] ] ; MAX-BW-NEXT: [[TMP172:%.*]] = add i64 [[INDEX9]], 0 ; MAX-BW-NEXT: [[TMP173:%.*]] = add i64 [[INDEX9]], 1 ; MAX-BW-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2 ; MAX-BW-NEXT: [[TMP175:%.*]] = add i64 [[INDEX9]], 3 ; MAX-BW-NEXT: [[TMP152:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP172]] -; MAX-BW-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP152]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP152]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[TMP154:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP172]], i64 [[IDXPROM5]] ; MAX-BW-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP173]], i64 [[IDXPROM5]] ; MAX-BW-NEXT: [[TMP156:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP174]], i64 [[IDXPROM5]] ; MAX-BW-NEXT: [[TMP157:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP175]], i64 [[IDXPROM5]] -; MAX-BW-NEXT: [[TMP158:%.*]] = load i32, ptr [[TMP154]], 
align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP159:%.*]] = load i32, ptr [[TMP155]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP156]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP158:%.*]] = load i32, ptr [[TMP154]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP159:%.*]] = load i32, ptr [[TMP155]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP156]], align 4, !tbaa [[INT_TBAA1]] +; MAX-BW-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[TMP162:%.*]] = insertelement <4 x i32> poison, i32 [[TMP158]], i32 0 ; MAX-BW-NEXT: [[TMP163:%.*]] = insertelement <4 x i32> [[TMP162]], i32 [[TMP159]], i32 1 ; MAX-BW-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP163]], i32 [[TMP160]], i32 2 @@ -445,30 +447,30 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP168]] = add <4 x i32> [[TMP167]], [[TMP166]] ; MAX-BW-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4 ; MAX-BW-NEXT: [[TMP169:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100 -; MAX-BW-NEXT: br i1 [[TMP169]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; MAX-BW: vec.epilog.middle.block: +; MAX-BW-NEXT: br i1 [[TMP169]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; MAX-BW: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; MAX-BW-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]]) -; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] -; MAX-BW: vec.epilog.scalar.ph: -; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; MAX-BW-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 
0, [[ITER_CHECK]] ] -; MAX-BW-NEXT: br label [[FOR_BODY1:%.*]] -; MAX-BW: for.cond.cleanup: -; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY1]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; MAX-BW-NEXT: br i1 true, label %[[FOR_COND_CLEANUP]], label %[[SCALAR_PH]] +; MAX-BW: [[SCALAR_PH]]: +; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; MAX-BW-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; MAX-BW-NEXT: br label %[[FOR_BODY1:.*]] +; MAX-BW: [[FOR_COND_CLEANUP]]: +; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], %[[FOR_BODY1]] ], [ [[TMP149]], %[[MIDDLE_BLOCK]] ], [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; MAX-BW-NEXT: ret i32 [[ADD7_LCSSA]] -; MAX-BW: for.body: -; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] -; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY1]] ] +; MAX-BW: [[FOR_BODY1]]: +; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY1]] ] +; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], %[[SCALAR_PH]] ], [ [[ADD7]], %[[FOR_BODY1]] ] ; MAX-BW-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] -; MAX-BW-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] -; MAX-BW-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP151:%.*]] = load 
i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]] ; MAX-BW-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4 ; MAX-BW-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; MAX-BW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; MAX-BW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; MAX-BW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] +; MAX-BW-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] ; entry: %idxprom = sext i32 %i to i64 @@ -496,13 +498,14 @@ entry: } define void @test(ptr %A, ptr noalias %B) #0 { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 @@ -520,13 +523,13 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP5]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 0 ; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP7]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP8]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x 
i32], ptr [[A]], i64 0, i64 [[TMP8]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8> -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP8]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP8]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP9]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP10]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP11]] @@ -552,13 +555,13 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: store i8 [[TMP35]], ptr [[TMP27]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 -; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0 ; 
CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]] @@ -571,17 +574,18 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] -; CHECK: for.cond.cleanup: +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; -; MAX-BW-LABEL: @test( -; MAX-BW-NEXT: entry: -; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; MAX-BW: vector.ph: -; MAX-BW-NEXT: br label [[VECTOR_BODY:%.*]] -; MAX-BW: vector.body: -; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; MAX-BW-LABEL: define void @test( +; MAX-BW-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; MAX-BW-NEXT: [[ENTRY:.*:]] +; MAX-BW-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MAX-BW: [[VECTOR_PH]]: +; MAX-BW-NEXT: br label %[[VECTOR_BODY:.*]] +; MAX-BW: [[VECTOR_BODY]]: +; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; MAX-BW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; MAX-BW-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; MAX-BW-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 @@ -615,13 +619,13 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: [[TMP29:%.*]] = add nuw nsw i64 [[TMP13]], 0 ; MAX-BW-NEXT: [[TMP30:%.*]] = add nuw nsw i64 [[TMP14]], 0 ; MAX-BW-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[TMP15]], 0 -; MAX-BW-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP16]] +; MAX-BW-NEXT: [[TMP32:%.*]] = 
getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[TMP16]] ; MAX-BW-NEXT: [[WIDE_VEC:%.*]] = load <32 x i32>, ptr [[TMP32]], align 4 ; MAX-BW-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> ; MAX-BW-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> ; MAX-BW-NEXT: [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; MAX-BW-NEXT: [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8> -; MAX-BW-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP16]] +; MAX-BW-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP16]] ; MAX-BW-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]] ; MAX-BW-NEXT: [[TMP38:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]] ; MAX-BW-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]] @@ -671,13 +675,13 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 -; MAX-BW-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; MAX-BW: middle.block: -; MAX-BW-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; MAX-BW: scalar.ph: -; MAX-BW-NEXT: br label [[FOR_BODY:%.*]] -; MAX-BW: for.body: -; MAX-BW-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; MAX-BW-NEXT: br i1 [[TMP68]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; MAX-BW: [[MIDDLE_BLOCK]]: +; MAX-BW-NEXT: br label %[[FOR_COND_CLEANUP:.*]] +; MAX-BW: [[SCALAR_PH]]: +; MAX-BW-NEXT: br label %[[FOR_BODY:.*]] +; MAX-BW: [[FOR_BODY]]: +; MAX-BW-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], 
%[[FOR_BODY]] ] ; MAX-BW-NEXT: [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0 ; MAX-BW-NEXT: [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1 ; MAX-BW-NEXT: [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]] @@ -690,8 +694,8 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1 ; MAX-BW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2 ; MAX-BW-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024 -; MAX-BW-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] -; MAX-BW: for.cond.cleanup: +; MAX-BW-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] +; MAX-BW: [[FOR_COND_CLEANUP]]: ; MAX-BW-NEXT: ret void ; entry: @@ -733,3 +737,28 @@ attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+s !2 = !{!"int", !3, i64 0} !3 = !{!"omnipotent char", !4, i64 0} !4 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} +; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]], [[META7]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]], [[META6]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META6]], [[META7]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META7]], [[META6]]} +;. 
+; MAX-BW: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; MAX-BW: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} +; MAX-BW: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; MAX-BW: [[META4]] = !{!"Simple C/C++ TBAA"} +; MAX-BW: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} +; MAX-BW: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} +; MAX-BW: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; MAX-BW: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]], [[META7]]} +; MAX-BW: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]], [[META6]]} +; MAX-BW: [[LOOP10]] = distinct !{[[LOOP10]], [[META6]], [[META7]]} +; MAX-BW: [[LOOP11]] = distinct !{[[LOOP11]], [[META7]], [[META6]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll index 62d08c8668235..9698c33d8e08c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll @@ -36,13 +36,10 @@ define void @foo(ptr nocapture noalias %A, i64 %N) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD3]] ; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD4]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i32 8 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i32 16 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i32 24 ; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[A]], align 4 -; CHECK-NEXT: store <8 x float> [[TMP8]], ptr [[TMP11]], align 4 -; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[TMP12]], align 4 -; CHECK-NEXT: store <8 x float> [[TMP10]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store <8 x float> [[TMP8]], ptr [[TMP4]], align 4 +; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <8 x float> 
[[TMP10]], ptr [[TMP6]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll index 7ae06953c5544..4445141549069 100644 --- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll +++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll @@ -27,23 +27,23 @@ define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 { ; MAINVF4IC1_EPI4: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]] ; MAINVF4IC1_EPI4: [[MIDDLE_BLOCK]]: ; MAINVF4IC1_EPI4: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] -; MAINVF4IC1_EPI4: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF7:![0-9]+]] +; MAINVF4IC1_EPI4: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF8:![0-9]+]] ; MAINVF4IC1_EPI4: [[VEC_EPILOG_ITER_CHECK]]: ; MAINVF4IC1_EPI4: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING:%.*]], 4 -; MAINVF4IC1_EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]] +; MAINVF4IC1_EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF9:![0-9]+]] ; MAINVF4IC1_EPI4: [[VEC_EPILOG_PH]]: ; MAINVF4IC1_EPI4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; MAINVF4IC1_EPI4: [[VEC_EPILOG_VECTOR_BODY]]: ; MAINVF4IC1_EPI4: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT6:%.*]], [[N_VEC3:%.*]] -; MAINVF4IC1_EPI4: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] +; MAINVF4IC1_EPI4: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]] ; MAINVF4IC1_EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; MAINVF4IC1_EPI4: 
[[CMP_N8:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC3]] -; MAINVF4IC1_EPI4: br i1 [[CMP_N8]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF7]] +; MAINVF4IC1_EPI4: br i1 [[CMP_N8]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8]] ; MAINVF4IC1_EPI4: [[VEC_EPILOG_SCALAR_PH]]: ; MAINVF4IC1_EPI4: br label %[[LOOP:.*]] ; MAINVF4IC1_EPI4: [[LOOP]]: ; MAINVF4IC1_EPI4: [[CMP_LOOP:%.*]] = icmp ult i32 [[I32:%.*]], [[LEN]] -; MAINVF4IC1_EPI4: br i1 [[CMP_LOOP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !prof [[PROF11:![0-9]+]], !llvm.loop [[LOOP12:![0-9]+]] +; MAINVF4IC1_EPI4: br i1 [[CMP_LOOP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !prof [[PROF13:![0-9]+]], !llvm.loop [[LOOP14:![0-9]+]] ; MAINVF4IC1_EPI4: [[EXIT_LOOPEXIT]]: ; MAINVF4IC1_EPI4: br label %[[EXIT]] ; MAINVF4IC1_EPI4: [[EXIT]]: @@ -70,23 +70,23 @@ define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 { ; MAINVF4IC2_EPI4: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]] ; MAINVF4IC2_EPI4: [[MIDDLE_BLOCK]]: ; MAINVF4IC2_EPI4: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] -; MAINVF4IC2_EPI4: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF7:![0-9]+]] +; MAINVF4IC2_EPI4: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF8:![0-9]+]] ; MAINVF4IC2_EPI4: [[VEC_EPILOG_ITER_CHECK]]: ; MAINVF4IC2_EPI4: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING:%.*]], 4 -; MAINVF4IC2_EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]] +; MAINVF4IC2_EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF9:![0-9]+]] ; MAINVF4IC2_EPI4: [[VEC_EPILOG_PH]]: ; MAINVF4IC2_EPI4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; MAINVF4IC2_EPI4: [[VEC_EPILOG_VECTOR_BODY]]: ; MAINVF4IC2_EPI4: 
[[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT6:%.*]], [[N_VEC3:%.*]] -; MAINVF4IC2_EPI4: br i1 [[TMP13]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] +; MAINVF4IC2_EPI4: br i1 [[TMP13]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]] ; MAINVF4IC2_EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; MAINVF4IC2_EPI4: [[CMP_N8:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC3]] -; MAINVF4IC2_EPI4: br i1 [[CMP_N8]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF11:![0-9]+]] +; MAINVF4IC2_EPI4: br i1 [[CMP_N8]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF13:![0-9]+]] ; MAINVF4IC2_EPI4: [[VEC_EPILOG_SCALAR_PH]]: ; MAINVF4IC2_EPI4: br label %[[LOOP:.*]] ; MAINVF4IC2_EPI4: [[LOOP]]: ; MAINVF4IC2_EPI4: [[CMP_LOOP:%.*]] = icmp ult i32 [[I32:%.*]], [[LEN]] -; MAINVF4IC2_EPI4: br i1 [[CMP_LOOP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !prof [[PROF12:![0-9]+]], !llvm.loop [[LOOP13:![0-9]+]] +; MAINVF4IC2_EPI4: br i1 [[CMP_LOOP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !prof [[PROF14:![0-9]+]], !llvm.loop [[LOOP15:![0-9]+]] ; MAINVF4IC2_EPI4: [[EXIT_LOOPEXIT]]: ; MAINVF4IC2_EPI4: br label %[[EXIT]] ; MAINVF4IC2_EPI4: [[EXIT]]: @@ -120,28 +120,34 @@ exit: ; MAINVF4IC1_EPI4: [[PROF1]] = !{!"branch_weights", i32 12, i32 1} ; MAINVF4IC1_EPI4: [[PROF2]] = !{!"branch_weights", i32 1, i32 127} ; MAINVF4IC1_EPI4: [[PROF3]] = !{!"branch_weights", i32 1, i32 307} -; MAINVF4IC1_EPI4: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]]} +; MAINVF4IC1_EPI4: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]]} ; MAINVF4IC1_EPI4: [[META5]] = !{!"llvm.loop.isvectorized", i32 1} ; MAINVF4IC1_EPI4: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"} -; MAINVF4IC1_EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 3} -; MAINVF4IC1_EPI4: [[PROF8]] = 
!{!"branch_weights", i32 4, i32 0} -; MAINVF4IC1_EPI4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} -; MAINVF4IC1_EPI4: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]} -; MAINVF4IC1_EPI4: [[PROF11]] = !{!"branch_weights", i32 2, i32 1} -; MAINVF4IC1_EPI4: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} +; MAINVF4IC1_EPI4: [[META7]] = !{!"llvm.loop.estimated_trip_count", i32 308} +; MAINVF4IC1_EPI4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3} +; MAINVF4IC1_EPI4: [[PROF9]] = !{!"branch_weights", i32 4, i32 0} +; MAINVF4IC1_EPI4: [[PROF10]] = !{!"branch_weights", i32 0, i32 0} +; MAINVF4IC1_EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]], [[META12:![0-9]+]]} +; MAINVF4IC1_EPI4: [[META12]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; MAINVF4IC1_EPI4: [[PROF13]] = !{!"branch_weights", i32 2, i32 1} +; MAINVF4IC1_EPI4: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]], [[META15:![0-9]+]]} +; MAINVF4IC1_EPI4: [[META15]] = !{!"llvm.loop.estimated_trip_count", i32 3} ;. 
; MAINVF4IC2_EPI4: [[PROF0]] = !{!"function_entry_count", i64 13} ; MAINVF4IC2_EPI4: [[PROF1]] = !{!"branch_weights", i32 12, i32 1} ; MAINVF4IC2_EPI4: [[PROF2]] = !{!"branch_weights", i32 1, i32 127} ; MAINVF4IC2_EPI4: [[PROF3]] = !{!"branch_weights", i32 1, i32 153} -; MAINVF4IC2_EPI4: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]]} +; MAINVF4IC2_EPI4: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]]} ; MAINVF4IC2_EPI4: [[META5]] = !{!"llvm.loop.isvectorized", i32 1} ; MAINVF4IC2_EPI4: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"} -; MAINVF4IC2_EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 7} -; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 4} -; MAINVF4IC2_EPI4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} -; MAINVF4IC2_EPI4: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]} -; MAINVF4IC2_EPI4: [[PROF11]] = !{!"branch_weights", i32 1, i32 3} -; MAINVF4IC2_EPI4: [[PROF12]] = !{!"branch_weights", i32 2, i32 1} -; MAINVF4IC2_EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]} +; MAINVF4IC2_EPI4: [[META7]] = !{!"llvm.loop.estimated_trip_count", i32 154} +; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 1, i32 7} +; MAINVF4IC2_EPI4: [[PROF9]] = !{!"branch_weights", i32 4, i32 4} +; MAINVF4IC2_EPI4: [[PROF10]] = !{!"branch_weights", i32 0, i32 0} +; MAINVF4IC2_EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]], [[META12:![0-9]+]]} +; MAINVF4IC2_EPI4: [[META12]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; MAINVF4IC2_EPI4: [[PROF13]] = !{!"branch_weights", i32 1, i32 3} +; MAINVF4IC2_EPI4: [[PROF14]] = !{!"branch_weights", i32 2, i32 1} +; MAINVF4IC2_EPI4: [[LOOP15]] = distinct !{[[LOOP15]], [[META5]], [[META16:![0-9]+]]} +; MAINVF4IC2_EPI4: [[META16]] = !{!"llvm.loop.estimated_trip_count", i32 3} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll b/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll index e629560354f2a..f86ad8fc88a01 100644 --- a/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll +++ b/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 ; RUN: opt -passes=loop-vectorize -force-vector-width=8 -S %s | FileCheck %s @postscale = external constant [64 x float] @@ -11,11 +11,11 @@ define void @test(ptr %data) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr getelementptr inbounds nuw (i8, ptr @postscale, i64 4), align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr getelementptr inbounds nuw (i8, ptr @postscale, i64 4), align 4, !tbaa [[FLOAT_TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.lrint.v8i64.v8f32(<8 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; CHECK-NEXT: store i16 [[TMP4]], ptr [[DATA]], align 2, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: store i16 [[TMP4]], ptr [[DATA]], align 2, !tbaa [[SHORT_TBAA4:![0-9]+]] ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[END:.*]] @@ -25,10 +25,10 @@ define void @test(ptr %data) { ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[OR_IV_1:%.*]] = or disjoint i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_POSTSCALE:%.*]] = getelementptr [64 x float], ptr @postscale, i64 0, i64 [[OR_IV_1]] -; 
CHECK-NEXT: [[LOAD_POSTSCALE:%.*]] = load float, ptr [[GEP_POSTSCALE]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[LOAD_POSTSCALE:%.*]] = load float, ptr [[GEP_POSTSCALE]], align 4, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: [[LRINT:%.*]] = tail call i64 @llvm.lrint.i64.f32(float [[LOAD_POSTSCALE]]) ; CHECK-NEXT: [[LRINT_TRUNC:%.*]] = trunc i64 [[LRINT]] to i16 -; CHECK-NEXT: store i16 [[LRINT_TRUNC]], ptr [[DATA]], align 2, !tbaa [[TBAA4]] +; CHECK-NEXT: store i16 [[LRINT_TRUNC]], ptr [[DATA]], align 2, !tbaa [[SHORT_TBAA4]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 8 ; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/cse-gep-source-element-type.ll b/llvm/test/Transforms/LoopVectorize/cse-gep-source-element-type.ll new file mode 100644 index 0000000000000..49eb8b349a274 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/cse-gep-source-element-type.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 5 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s + +; Verify that we check the source element of GEPs when performing a CSE. 
+ +define void @cse_replicate_gep(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %n) { +; CHECK-LABEL: define void @cse_replicate_gep( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[TMP0]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP8]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP8]], align 2 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 4 +; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP3]], align 4 +; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD1]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[TMP5]], i32 4 +; CHECK-NEXT: store <4 x i16> [[WIDE_LOAD2]], ptr [[TMP5]], align 2 +; CHECK-NEXT: store <4 x i16> [[WIDE_LOAD3]], ptr [[TMP6]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.A.32 = getelementptr i32, ptr %A, i64 %iv + %l.32 = load i32, ptr %gep.A.32 + %gep.A.16 = getelementptr i16, ptr %A, i64 %iv + %l.16 = load i16, ptr %gep.A.16 + %gep.B = getelementptr i32, ptr %B, i64 %iv + store i32 %l.32, ptr %gep.B + %gep.C = getelementptr i16, ptr %C, i64 %iv + store i16 %l.16, ptr %gep.C + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + +define void @cse_wide_gep(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %n) { +; CHECK-LABEL: define void @cse_wide_gep( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[A]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[A]], <4 x 
i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[A]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[TMP4]], i32 4 +; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8 +; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[C]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr ptr, ptr [[TMP6]], i32 4 +; CHECK-NEXT: store <4 x ptr> [[TMP2]], ptr [[TMP6]], align 8 +; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[TMP8]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.A.32 = getelementptr i32, ptr %A, i64 %iv + %gep.A.16 = getelementptr i16, ptr %A, i64 %iv + %gep.B = getelementptr i64, ptr %B, i64 %iv + store ptr %gep.A.32, ptr %gep.B + %gep.C = getelementptr i64, ptr %C, i64 %iv + store ptr %gep.A.16, ptr %gep.C + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll index 8ae404cf9571f..4e8b8e51df6c2 100644 --- a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll +++ b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll @@ -602,5 +602,61 @@ exit: ret void } +;; 
ICE was caused by assert for the load used in the uncountable exit condition +;; being guaranteed to execute. +@ee.global = external global [4 x i8] +define void @crash_conditional_load_for_uncountable_exit(ptr dereferenceable(40) noalias %store.area) { +; CHECK-LABEL: LV: Checking a loop in 'crash_conditional_load_for_uncountable_exit' +; CHECK: LV: Not vectorizing: Load for uncountable exit not guaranteed to execute. +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %ee.addr = getelementptr i8, ptr @ee.global, i64 %iv + br i1 false, label %ee.block, label %invalid.block + +ee.block: + %ee.val = load i8, ptr %ee.addr, align 1 + store i16 0, ptr %store.area, align 2 + %ee.cmp = icmp eq i8 %ee.val, 0 + br i1 %ee.cmp, label %for.inc, label %invalid.block + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %counted.cond = icmp eq i64 %iv.next, 10 + br i1 %counted.cond, label %invalid.block, label %for.body + +invalid.block: + unreachable +} + +define void @crash_conditional_load_for_uncountable_exit_argptr(ptr dereferenceable(40) noalias %store.area, ptr dereferenceable(4) %load.area, i1 %skip.cond) { +; CHECK-LABEL: LV: Checking a loop in 'crash_conditional_load_for_uncountable_exit_argptr' +; CHECK: LV: Not vectorizing: Loop has too many uncountable exits. 
+entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %ee.addr = getelementptr i8, ptr %load.area, i64 %iv + br i1 %skip.cond, label %ee.block, label %invalid.block + +ee.block: + %ee.val = load i8, ptr %ee.addr, align 1 + store i16 0, ptr %store.area, align 2 + %ee.cmp = icmp eq i8 %ee.val, 0 + br i1 %ee.cmp, label %for.inc, label %invalid.block + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %counted.cond = icmp eq i64 %iv.next, 10 + br i1 %counted.cond, label %invalid.block, label %for.body + +invalid.block: + unreachable +} + + declare void @init_mem(ptr, i64); declare i64 @get_an_unknown_offset(); diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll new file mode 100644 index 0000000000000..5ce6d68e05edd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll @@ -0,0 +1,248 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=UNROLL-NO-IC +; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=UNROLL-NO-VF +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s --check-prefix=SINK-AFTER + +; Test case for https://github.com/llvm/llvm-project/issues/95520. 
+define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst, i64 %n) { +; UNROLL-NO-IC-LABEL: define i32 @recurence_uniform_load( +; UNROLL-NO-IC-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) { +; UNROLL-NO-IC-NEXT: [[ENTRY:.*]]: +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; UNROLL-NO-IC: [[VECTOR_PH]]: +; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; UNROLL-NO-IC-NEXT: br label %[[VECTOR_BODY:.*]] +; UNROLL-NO-IC: [[VECTOR_BODY]]: +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC]], align 4 +; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NO-IC: [[MIDDLE_BLOCK]]: +; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; UNROLL-NO-IC: [[SCALAR_PH]]: +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]] +; UNROLL-NO-IC: [[LOOP]]: +; UNROLL-NO-IC-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[LOAD:%.*]], %[[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[ADD]] = add i64 [[PHI]], 1 +; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC]], 
align 4 +; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]] +; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; UNROLL-NO-IC: [[EXIT]]: +; UNROLL-NO-IC-NEXT: [[RECUR_LCSSA:%.*]] = phi i32 [ [[RECUR]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: ret i32 [[RECUR_LCSSA]] +; +; UNROLL-NO-VF-LABEL: define i32 @recurence_uniform_load( +; UNROLL-NO-VF-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) { +; UNROLL-NO-VF-NEXT: [[ENTRY:.*]]: +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[N]], 1 +; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2 +; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; UNROLL-NO-VF: [[VECTOR_PH]]: +; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2 +; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; UNROLL-NO-VF-NEXT: br label %[[VECTOR_BODY:.*]] +; UNROLL-NO-VF: [[VECTOR_BODY]]: +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 +; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NO-VF: [[MIDDLE_BLOCK]]: +; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; UNROLL-NO-VF: [[SCALAR_PH]]: +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; UNROLL-NO-VF-NEXT: br label %[[LOOP:.*]] +; UNROLL-NO-VF: [[LOOP]]: +; UNROLL-NO-VF-NEXT: [[PHI:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; UNROLL-NO-VF-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[LOAD:%.*]], %[[LOOP]] ] +; UNROLL-NO-VF-NEXT: [[ADD]] = add i64 [[PHI]], 1 +; UNROLL-NO-VF-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 +; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]] +; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; UNROLL-NO-VF: [[EXIT]]: +; UNROLL-NO-VF-NEXT: [[RECUR_LCSSA:%.*]] = phi i32 [ [[RECUR]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: ret i32 [[RECUR_LCSSA]] +; +; SINK-AFTER-LABEL: define i32 @recurence_uniform_load( +; SINK-AFTER-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) { +; SINK-AFTER-NEXT: [[ENTRY:.*]]: +; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SINK-AFTER: [[VECTOR_PH]]: +; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; SINK-AFTER-NEXT: br label %[[VECTOR_BODY:.*]] +; SINK-AFTER: [[VECTOR_BODY]]: +; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC]], align 4 +; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SINK-AFTER-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SINK-AFTER: [[MIDDLE_BLOCK]]: +; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; SINK-AFTER-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; SINK-AFTER: [[SCALAR_PH]]: +; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 
0, %[[ENTRY]] ] +; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SINK-AFTER-NEXT: br label %[[LOOP:.*]] +; SINK-AFTER: [[LOOP]]: +; SINK-AFTER-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; SINK-AFTER-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[LOAD:%.*]], %[[LOOP]] ] +; SINK-AFTER-NEXT: [[ADD]] = add i64 [[PHI]], 1 +; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 +; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]] +; SINK-AFTER-NEXT: br i1 [[ICMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; SINK-AFTER: [[EXIT]]: +; SINK-AFTER-NEXT: [[RECUR_LCSSA:%.*]] = phi i32 [ [[RECUR]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: ret i32 [[RECUR_LCSSA]] +; +entry: + br label %loop + +loop: + %phi = phi i64 [ 0, %entry ], [ %add, %loop ] + %recur = phi i32 [ 0, %entry ], [ %load, %loop ] + %add = add i64 %phi, 1 + %load = load i32, ptr %src, align 4 + %icmp = icmp ult i64 %phi, %n + br i1 %icmp, label %loop, label %exit + +exit: + ret i32 %recur +} + +; Test for https://github.com/llvm/llvm-project/issues/158319. The recurrence +; phi can be removed. 
+define i16 @for_phi_removed(ptr %src) { +; UNROLL-NO-IC-LABEL: define i16 @for_phi_removed( +; UNROLL-NO-IC-SAME: ptr [[SRC:%.*]]) { +; UNROLL-NO-IC-NEXT: [[ENTRY:.*]]: +; UNROLL-NO-IC-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; UNROLL-NO-IC: [[VECTOR_PH]]: +; UNROLL-NO-IC-NEXT: br label %[[VECTOR_BODY:.*]] +; UNROLL-NO-IC: [[VECTOR_BODY]]: +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 +; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 +; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer +; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 104 +; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; UNROLL-NO-IC: [[MIDDLE_BLOCK]]: +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; UNROLL-NO-IC-NEXT: br label %[[SCALAR_PH]] +; UNROLL-NO-IC: [[SCALAR_PH]]: +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 104, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]] +; UNROLL-NO-IC: [[LOOP]]: +; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; 
UNROLL-NO-IC-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 +; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 +; UNROLL-NO-IC-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0 +; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 111 +; UNROLL-NO-IC-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; UNROLL-NO-IC: [[EXIT]]: +; UNROLL-NO-IC-NEXT: [[P_LCSSA:%.*]] = phi i16 [ [[P]], %[[LOOP]] ] +; UNROLL-NO-IC-NEXT: ret i16 [[P_LCSSA]] +; +; UNROLL-NO-VF-LABEL: define i16 @for_phi_removed( +; UNROLL-NO-VF-SAME: ptr [[SRC:%.*]]) { +; UNROLL-NO-VF-NEXT: [[ENTRY:.*]]: +; UNROLL-NO-VF-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; UNROLL-NO-VF: [[VECTOR_PH]]: +; UNROLL-NO-VF-NEXT: br label %[[VECTOR_BODY:.*]] +; UNROLL-NO-VF: [[VECTOR_BODY]]: +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0 +; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 110 +; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; UNROLL-NO-VF: [[MIDDLE_BLOCK]]: +; UNROLL-NO-VF-NEXT: br label %[[SCALAR_PH]] +; UNROLL-NO-VF: [[SCALAR_PH]]: +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 110, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; UNROLL-NO-VF-NEXT: br label %[[LOOP:.*]] +; UNROLL-NO-VF: [[LOOP]]: +; UNROLL-NO-VF-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; UNROLL-NO-VF-NEXT: [[P:%.*]] = phi i16 [ 
[[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; UNROLL-NO-VF-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 +; UNROLL-NO-VF-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 +; UNROLL-NO-VF-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0 +; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; UNROLL-NO-VF-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 111 +; UNROLL-NO-VF-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; UNROLL-NO-VF: [[EXIT]]: +; UNROLL-NO-VF-NEXT: [[P_LCSSA:%.*]] = phi i16 [ [[P]], %[[LOOP]] ] +; UNROLL-NO-VF-NEXT: ret i16 [[P_LCSSA]] +; +; SINK-AFTER-LABEL: define i16 @for_phi_removed( +; SINK-AFTER-SAME: ptr [[SRC:%.*]]) { +; SINK-AFTER-NEXT: [[ENTRY:.*]]: +; SINK-AFTER-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SINK-AFTER: [[VECTOR_PH]]: +; SINK-AFTER-NEXT: br label %[[VECTOR_BODY:.*]] +; SINK-AFTER: [[VECTOR_BODY]]: +; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 +; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 +; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer +; SINK-AFTER-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer +; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 108 +; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SINK-AFTER: [[MIDDLE_BLOCK]]: +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SINK-AFTER-NEXT: br label %[[SCALAR_PH]] +; SINK-AFTER: [[SCALAR_PH]]: +; 
SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 108, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SINK-AFTER-NEXT: br label %[[LOOP:.*]] +; SINK-AFTER: [[LOOP]]: +; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; SINK-AFTER-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 +; SINK-AFTER-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 +; SINK-AFTER-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0 +; SINK-AFTER-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; SINK-AFTER-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 111 +; SINK-AFTER-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SINK-AFTER: [[EXIT]]: +; SINK-AFTER-NEXT: [[P_LCSSA:%.*]] = phi i16 [ [[P]], %[[LOOP]] ] +; SINK-AFTER-NEXT: ret i16 [[P_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ] + %p = phi i16 [ 0, %entry ] , [ %sel, %loop ] + %l = load i32, ptr %src, align 4 + %c = icmp eq i32 %l, 0 + %sel = select i1 %c, i16 1, i16 0 + %iv.next = add i16 %iv, 1 + %ec = icmp eq i16 %iv.next, 111 + br i1 %ec, label %exit, label %loop + +exit: + ret i16 %p +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 10cbf66c783db..7e288ab0eb76d 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -3619,117 +3619,4 @@ for.end: ret void } -; Test case for https://github.com/llvm/llvm-project/issues/95520. 
-define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst, i64 %n) { -; UNROLL-NO-IC-LABEL: @recurence_uniform_load( -; UNROLL-NO-IC-NEXT: entry: -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 -; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 -; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] -; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] -; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] -; UNROLL-NO-IC: loop: -; UNROLL-NO-IC-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[ADD]] = add i64 [[PHI]], 1 -; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 -; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]] -; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label 
[[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] -; UNROLL-NO-IC: exit: -; UNROLL-NO-IC-NEXT: ret i32 0 -; -; UNROLL-NO-VF-LABEL: @recurence_uniform_load( -; UNROLL-NO-VF-NEXT: entry: -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[N:%.*]], 1 -; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; UNROLL-NO-VF: vector.ph: -; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2 -; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] -; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] -; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; UNROLL-NO-VF-NEXT: br label [[LOOP:%.*]] -; UNROLL-NO-VF: loop: -; UNROLL-NO-VF-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] -; UNROLL-NO-VF-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD:%.*]], [[LOOP]] ] -; UNROLL-NO-VF-NEXT: [[ADD]] = add i64 [[PHI]], 1 -; UNROLL-NO-VF-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 -; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]] -; 
UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] -; UNROLL-NO-VF: exit: -; UNROLL-NO-VF-NEXT: ret i32 0 -; -; SINK-AFTER-LABEL: @recurence_uniform_load( -; SINK-AFTER-NEXT: entry: -; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 -; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; SINK-AFTER: vector.ph: -; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 -; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] -; SINK-AFTER: vector.body: -; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SINK-AFTER-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] -; SINK-AFTER: middle.block: -; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; SINK-AFTER: scalar.ph: -; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; SINK-AFTER-NEXT: br label [[LOOP:%.*]] -; SINK-AFTER: loop: -; SINK-AFTER-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] -; SINK-AFTER-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD:%.*]], [[LOOP]] ] -; SINK-AFTER-NEXT: [[ADD]] = add i64 [[PHI]], 1 -; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 -; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]] -; SINK-AFTER-NEXT: br 
i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] -; SINK-AFTER: exit: -; SINK-AFTER-NEXT: ret i32 0 -; -entry: - br label %loop - -loop: - %phi = phi i64 [ 0, %entry ], [ %add, %loop ] - %recur = phi i32 [ 0, %entry ], [ %load, %loop ] - %add = add i64 %phi, 1 - %load = load i32, ptr %src, align 4 - %icmp = icmp ult i64 %phi, %n - br i1 %icmp, label %loop, label %exit - -exit: - ret i32 0 -} - !2 = !{!"branch_weights", i32 1, i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll index 54779ed55cff8..e487eac3fee05 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s ; RUN: opt -S < %s -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=2 | FileCheck --check-prefix=INTERLEAVE %s @@ -18,12 +18,12 @@ define void @fp_math(ptr nocapture %a, ptr noalias %b, i64 %size) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4, !tbaa [[CHAR_TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[WIDE_LOAD]], splat (double 9.900000e+01), !fpmath [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp oge <2 x double> [[TMP3]], splat (double 1.000000e+01) ; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x 
double> [[WIDE_LOAD]], <2 x double> zeroinitializer, !fpmath [[META3]] ; CHECK-NEXT: [[TMP5:%.*]] = fptrunc <2 x double> [[TMP6]] to <2 x float>, !fpmath [[META3]] -; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP1]], align 4, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -37,12 +37,12 @@ define void @fp_math(ptr nocapture %a, ptr noalias %b, i64 %size) { ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[L_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[L_1]], 9.900000e+01, !fpmath [[META3]] ; CHECK-NEXT: [[C:%.*]] = fcmp oge double [[ADD]], 1.000000e+01 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C]], double [[L_1]], double 0.000000e+00, !fpmath [[META3]] ; CHECK-NEXT: [[T:%.*]] = fptrunc double [[SEL]] to float, !fpmath [[META3]] -; CHECK-NEXT: store float [[T]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store float [[T]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]] ; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] @@ -63,8 +63,8 @@ define void @fp_math(ptr nocapture %a, ptr noalias %b, i64 %size) { ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] ; 
INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 2 -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4, !tbaa [[TBAA0:![0-9]+]] -; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4, !tbaa [[CHAR_TBAA0:![0-9]+]] +; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP3]], align 4, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD]], splat (double 9.900000e+01), !fpmath [[META3:![0-9]+]] ; INTERLEAVE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD1]], splat (double 9.900000e+01), !fpmath [[META3]] ; INTERLEAVE-NEXT: [[TMP6:%.*]] = fcmp oge <2 x double> [[TMP4]], splat (double 1.000000e+01) @@ -74,8 +74,8 @@ define void @fp_math(ptr nocapture %a, ptr noalias %b, i64 %size) { ; INTERLEAVE-NEXT: [[TMP9:%.*]] = fptrunc <2 x double> [[TMP11]] to <2 x float>, !fpmath [[META3]] ; INTERLEAVE-NEXT: [[TMP10:%.*]] = fptrunc <2 x double> [[TMP8]] to <2 x float>, !fpmath [[META3]] ; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 2 -; INTERLEAVE-NEXT: store <2 x float> [[TMP9]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; INTERLEAVE-NEXT: store <2 x float> [[TMP10]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: store <2 x float> [[TMP9]], ptr [[TMP1]], align 4, !tbaa [[CHAR_TBAA0]] +; INTERLEAVE-NEXT: store <2 x float> [[TMP10]], ptr [[TMP13]], align 4, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -89,12 +89,12 @@ define void @fp_math(ptr nocapture %a, ptr noalias %b, i64 %size) { ; 
INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] ; INTERLEAVE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; INTERLEAVE-NEXT: [[L_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: [[L_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[ADD:%.*]] = fadd double [[L_1]], 9.900000e+01, !fpmath [[META3]] ; INTERLEAVE-NEXT: [[C:%.*]] = fcmp oge double [[ADD]], 1.000000e+01 ; INTERLEAVE-NEXT: [[SEL:%.*]] = select i1 [[C]], double [[L_1]], double 0.000000e+00, !fpmath [[META3]] ; INTERLEAVE-NEXT: [[T:%.*]] = fptrunc double [[SEL]] to float, !fpmath [[META3]] -; INTERLEAVE-NEXT: store float [[T]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: store float [[T]], ptr [[ARRAYIDX_2]], align 4, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]] ; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] @@ -133,7 +133,7 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @foo_vector_fixed2_nomask(<2 x i64> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 4 @@ -147,7 +147,7 @@ define void @widen_call_range(ptr noalias %a, 
ptr readonly %b) { ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4, !tbaa [[TBAA0]], !range [[RNG9:![0-9]+]] +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4, !tbaa [[CHAR_TBAA0]], !range [[RNG9:![0-9]+]] ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]], !range [[RNG9]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 @@ -167,8 +167,8 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP0]], i32 2 -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4, !tbaa [[CHAR_TBAA0]] +; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 4, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = call <2 x i64> @foo_vector_fixed2_nomask(<2 x i64> [[WIDE_LOAD]]) ; INTERLEAVE-NEXT: [[TMP4:%.*]] = call <2 x i64> @foo_vector_fixed2_nomask(<2 x i64> [[WIDE_LOAD1]]) ; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] @@ -185,7 +185,7 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE: [[LOOP]]: ; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], 
align 4, !tbaa [[TBAA0]], !range [[RNG9:![0-9]+]] +; INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4, !tbaa [[CHAR_TBAA0]], !range [[RNG9:![0-9]+]] ; INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]], !range [[RNG9]] ; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; INTERLEAVE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 @@ -223,7 +223,7 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @bar_vector_fixed2_nomask(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[TMP3]], align 8 @@ -237,7 +237,7 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[CALL:%.*]] = call double @bar(double [[LOAD]]) #[[ATTR2:[0-9]+]], !fpmath [[META3]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 @@ -257,8 +257,8 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP0]], i32 2 -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA0]] -; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[CHAR_TBAA0]] +; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = call <2 x double> @bar_vector_fixed2_nomask(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]] ; INTERLEAVE-NEXT: [[TMP4:%.*]] = call <2 x double> @bar_vector_fixed2_nomask(<2 x double> [[WIDE_LOAD1]]), !fpmath [[META3]] ; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] @@ -275,7 +275,7 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE: [[LOOP]]: ; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] -; INTERLEAVE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[CALL:%.*]] = call double @bar(double [[LOAD]]) #[[ATTR2:[0-9]+]], !fpmath [[META3]] ; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] ; INTERLEAVE-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 @@ -403,7 +403,7 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load 
<2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[TMP3]], align 8 @@ -417,7 +417,7 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[CALL:%.*]] = call double @llvm.sin.f64(double [[LOAD]]) #[[ATTR2]], !fpmath [[META3]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 @@ -437,8 +437,8 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP0]], i32 2 -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA0]] -; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[CHAR_TBAA0]] +; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]] ; INTERLEAVE-NEXT: 
[[TMP4:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[WIDE_LOAD1]]), !fpmath [[META3]] ; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] @@ -455,7 +455,7 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE: [[LOOP]]: ; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] -; INTERLEAVE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[TBAA0]] +; INTERLEAVE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[CHAR_TBAA0]] ; INTERLEAVE-NEXT: [[CALL:%.*]] = call double @llvm.sin.f64(double [[LOAD]]) #[[ATTR2]], !fpmath [[META3]] ; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] ; INTERLEAVE-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 @@ -608,7 +608,7 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ !3 = !{!"omnipotent char", !2, i64 0} ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} +; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} ; CHECK: [[META1]] = !{!"omnipotent char", [[META2]]} ; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"} ; CHECK: [[META3]] = !{float 2.500000e+00} @@ -628,7 +628,7 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META5]], [[META6]]} ; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META6]], [[META5]]} ;. 
-; INTERLEAVE: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} +; INTERLEAVE: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} ; INTERLEAVE: [[META1]] = !{!"omnipotent char", [[META2]]} ; INTERLEAVE: [[META2]] = !{!"Simple C/C++ TBAA"} ; INTERLEAVE: [[META3]] = !{float 2.500000e+00} diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index d2c53f47a6670..a633dfee066ed 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -33,6 +33,10 @@ define void @a(ptr readnone %b) { ; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]] ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP14]] ; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr null, i64 [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x ptr> [[TMP21]], ptr [[NEXT_GEP2]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x ptr> [[TMP22]], ptr [[NEXT_GEP3]], i32 2 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x ptr> [[TMP23]], ptr [[NEXT_GEP4]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 -1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3 @@ -649,9 +653,6 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) { ; STRIDED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], [[TMP8]] ; STRIDED-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP1]] ; STRIDED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], [[TMP10]] -; STRIDED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr null, i64 [[TMP5]] -; STRIDED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr null, i64 [[TMP7]] -; STRIDED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP9]] ; 
STRIDED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]] ; STRIDED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]] ; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll index 97f4542bfe67a..87447b63f4383 100644 --- a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll @@ -22,6 +22,8 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] ; IC1-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP2]] +; IC1-NEXT: [[TMP12:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> [[TMP12]], ptr [[NEXT_GEP3]], i32 1 ; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[NEXT_GEP]], align 1 ; IC1-NEXT: [[TMP7:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], splat (i8 -12) ; IC1-NEXT: [[TMP4:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], splat (i8 13) @@ -117,8 +119,12 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 3 ; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] ; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP2]] +; IC2-NEXT: [[TMP23:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x ptr> [[TMP23]], ptr [[NEXT_GEP3]], i32 1 ; IC2-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP3]] ; IC2-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: [[TMP30:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP4]], i32 0 +; IC2-NEXT: [[TMP31:%.*]] = insertelement <2 x ptr> [[TMP30]], ptr 
[[NEXT_GEP5]], i32 1 ; IC2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2 ; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[NEXT_GEP]], align 1 ; IC2-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i8>, ptr [[TMP6]], align 1 @@ -338,21 +344,21 @@ define void @switch_to_header(ptr %start) { ; IC1-NEXT: [[ENTRY:.*]]: ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN:.*]] ] +; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN1:.*]] ] ; IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC1-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [ -; IC1-NEXT: i64 120, label %[[IF_THEN]] +; IC1-NEXT: i64 120, label %[[IF_THEN1]] ; IC1-NEXT: i64 100, label %[[LOOP_LATCH]] ; IC1-NEXT: ] -; IC1: [[IF_THEN]]: +; IC1: [[IF_THEN1]]: ; IC1-NEXT: br label %[[LOOP_HEADER]] -; IC1: [[IF_THEN1:.*:]] +; IC1: [[IF_THEN:.*:]] ; IC1-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison ; IC1-NEXT: store i64 42, ptr [[GEP]], align 1 ; IC1-NEXT: unreachable ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN]] +; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN1]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; @@ -361,21 +367,21 @@ define void @switch_to_header(ptr %start) { ; IC2-NEXT: [[ENTRY:.*]]: ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN:.*]] ] +; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN1:.*]] ] ; IC2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC2-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [ -; IC2-NEXT: i64 120, label %[[IF_THEN]] +; IC2-NEXT: i64 120, label %[[IF_THEN1]] ; IC2-NEXT: i64 100, label %[[LOOP_LATCH]] ; IC2-NEXT: ] -; IC2: [[IF_THEN]]: +; IC2: [[IF_THEN1]]: ; IC2-NEXT: br label 
%[[LOOP_HEADER]] -; IC2: [[IF_THEN1:.*:]] +; IC2: [[IF_THEN:.*:]] ; IC2-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison ; IC2-NEXT: store i64 42, ptr [[GEP]], align 1 ; IC2-NEXT: unreachable ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN]] +; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN1]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll index 38dbbbb21583a..a9118da233e33 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll @@ -60,9 +60,8 @@ define void @test_tc_between_8_and_17(ptr %A, i64 range(i64 8, 17) %N) { ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 ; VF8UF2-NEXT: [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10) ; VF8UF2-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10) -; VF8UF2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i32 8 ; VF8UF2-NEXT: store <8 x i8> [[TMP3]], ptr [[A]], align 1 -; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1 +; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP2]], align 1 ; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index 27fa57928aa96..5329d9b42befa 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -66,9 +66,8 @@ define void @test_tc_less_than_16(ptr %A, 
i64 %N) { ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1 ; VF8UF2-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10) ; VF8UF2-NEXT: [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10) -; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i32 8 ; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[A]], align 1 -; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1 +; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP3]], align 1 ; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll index 3d05ee7f27b5c..cf85f26992c2f 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll @@ -22,7 +22,11 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, vector.ph ], [ vp<[[CAN_IV_NEXT:%.+]]>, default.2 ] ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<2> -; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: EMIT vp<[[STEP1:%.+]]> = extractelement vp<[[STEPS]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEP1]]> +; CHECK-NEXT: EMIT vp<[[STEP2:%.+]]> = extractelement vp<[[STEPS]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[PTR]]>.1 = ptradd ir<%start>, vp<[[STEP2]]> +; CHECK-NEXT: EMIT vp<[[PTR_VEC:%.+]]> = buildvector vp<[[PTR]]>, vp<[[PTR]]>.1 ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[PTR]]> ; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12> ; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13> @@ -36,7 +40,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): pred.store.if, 
pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store ir<0>, vp<[[PTR]]> +; CHECK-NEXT: REPLICATE store ir<0>, vp<[[PTR_VEC]]> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: @@ -53,7 +57,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store ir<42>, vp<[[PTR]]> +; CHECK-NEXT: REPLICATE store ir<42>, vp<[[PTR_VEC]]> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: @@ -70,7 +74,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store ir<2>, vp<[[PTR]]> +; CHECK-NEXT: REPLICATE store ir<2>, vp<[[PTR_VEC]]> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index 2a7ffec27c2f9..12c5950d3a171 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -82,9 +82,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]> ; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10> ; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10> -; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer ir<%A>, ir<1> ; CHECK-NEXT: WIDEN store ir<%A>, ir<%add> -; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1 +; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%add>.1 ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: diff --git 
a/llvm/test/Transforms/LoopVersioning/add-phi-update-users.ll b/llvm/test/Transforms/LoopVersioning/add-phi-update-users.ll index 16ad4bfed0fd3..9f77bbfe5ac35 100644 --- a/llvm/test/Transforms/LoopVersioning/add-phi-update-users.ll +++ b/llvm/test/Transforms/LoopVersioning/add-phi-update-users.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=loop-versioning -S -o - | FileCheck %s ; This test case used to end like this: @@ -22,48 +22,48 @@ define void @f1() { ; CHECK-LABEL: define void @f1() { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[T0:%.*]] = load ptr, ptr @c, align 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[T0]], i64 2 -; CHECK-NEXT: br label [[FOR_BODY_LVER_CHECK:%.*]] -; CHECK: for.body.lver.check: +; CHECK-NEXT: br label %[[FOR_BODY_LVER_CHECK:.*]] +; CHECK: [[FOR_BODY_LVER_CHECK]]: ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[T0]], getelementptr inbounds nuw (i8, ptr @b, i64 2) ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr @b, [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]] -; CHECK: for.body.ph.lver.orig: -; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]] -; CHECK: for.body.lver.orig: -; CHECK-NEXT: [[T1_LVER_ORIG:%.*]] = phi i64 [ 0, [[FOR_BODY_PH_LVER_ORIG]] ], [ [[INC_LVER_ORIG:%.*]], [[FOR_BODY_LVER_ORIG]] ] -; CHECK-NEXT: [[T2_LVER_ORIG:%.*]] = load i16, ptr @b, align 1, !tbaa [[TBAA2:![0-9]+]] -; CHECK-NEXT: store i16 [[T2_LVER_ORIG]], ptr [[T0]], align 1, !tbaa [[TBAA2]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[FOR_BODY_PH_LVER_ORIG:.*]], label %[[FOR_BODY_PH:.*]] +; CHECK: [[FOR_BODY_PH_LVER_ORIG]]: +; CHECK-NEXT: br label %[[FOR_BODY_LVER_ORIG:.*]] +; CHECK: [[FOR_BODY_LVER_ORIG]]: 
+; CHECK-NEXT: [[T1_LVER_ORIG:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH_LVER_ORIG]] ], [ [[INC_LVER_ORIG:%.*]], %[[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: [[T2_LVER_ORIG:%.*]] = load i16, ptr @b, align 1, !tbaa [[LONG_LONG_TBAA2:![0-9]+]] +; CHECK-NEXT: store i16 [[T2_LVER_ORIG]], ptr [[T0]], align 1, !tbaa [[LONG_LONG_TBAA2]] ; CHECK-NEXT: [[INC_LVER_ORIG]] = add nuw nsw i64 [[T1_LVER_ORIG]], 1 ; CHECK-NEXT: [[CMP_LVER_ORIG:%.*]] = icmp ult i64 [[INC_LVER_ORIG]], 3 -; CHECK-NEXT: br i1 [[CMP_LVER_ORIG]], label [[FOR_BODY_LVER_ORIG]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.body.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[T1:%.*]] = phi i64 [ 0, [[FOR_BODY_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T2:%.*]] = load i16, ptr @b, align 1, !tbaa [[TBAA2]], !alias.scope [[META6:![0-9]+]] -; CHECK-NEXT: store i16 [[T2]], ptr [[T0]], align 1, !tbaa [[TBAA2]], !alias.scope [[META9:![0-9]+]], !noalias [[META6]] +; CHECK-NEXT: br i1 [[CMP_LVER_ORIG]], label %[[FOR_BODY_LVER_ORIG]], label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[FOR_BODY_PH]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[T1:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[T2:%.*]] = load i16, ptr @b, align 1, !tbaa [[LONG_LONG_TBAA2]], !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: store i16 [[T2]], ptr [[T0]], align 1, !tbaa [[LONG_LONG_TBAA2]], !alias.scope [[META9:![0-9]+]], !noalias [[META6]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[T1]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INC]], 3 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT1:%.*]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: [[T2_LVER_PH:%.*]] = phi i16 [ [[T2_LVER_ORIG]], [[FOR_BODY_LVER_ORIG]] ] -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: for.end.loopexit1: -; CHECK-NEXT: [[T2_LVER_PH2:%.*]] = phi i16 [ [[T2]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; 
CHECK-NEXT: [[T2_LVER:%.*]] = phi i16 [ [[T2_LVER_PH]], [[FOR_END_LOOPEXIT]] ], [ [[T2_LVER_PH2]], [[FOR_END_LOOPEXIT1]] ] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END_LOOPEXIT1:.*]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[T2_LVER_PH:%.*]] = phi i16 [ [[T2_LVER_ORIG]], %[[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END_LOOPEXIT1]]: +; CHECK-NEXT: [[T2_LVER_PH2:%.*]] = phi i16 [ [[T2]], %[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[T2_LVER:%.*]] = phi i16 [ [[T2_LVER_PH]], %[[FOR_END_LOOPEXIT]] ], [ [[T2_LVER_PH2]], %[[FOR_END_LOOPEXIT1]] ] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i16 [[T2_LVER]], 0 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[FOR_COND_BACKEDGE:%.*]], label [[IF_THEN:%.*]] -; CHECK: for.cond.backedge: -; CHECK-NEXT: br label [[FOR_BODY_LVER_CHECK]] -; CHECK: if.then: -; CHECK-NEXT: store i16 [[T2_LVER]], ptr @a, align 1, !tbaa [[TBAA2]] -; CHECK-NEXT: br label [[FOR_COND_BACKEDGE]] +; CHECK-NEXT: br i1 [[TOBOOL]], label %[[FOR_COND_BACKEDGE:.*]], label %[[IF_THEN:.*]] +; CHECK: [[FOR_COND_BACKEDGE]]: +; CHECK-NEXT: br label %[[FOR_BODY_LVER_CHECK]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: store i16 [[T2_LVER]], ptr @a, align 1, !tbaa [[LONG_LONG_TBAA2]] +; CHECK-NEXT: br label %[[FOR_COND_BACKEDGE]] ; entry: %t0 = load ptr, ptr @c, align 1 @@ -101,3 +101,14 @@ if.then: ; preds = %for.end !3 = !{!"long long", !4, i64 0} !4 = !{!"omnipotent char", !5, i64 0} !5 = !{!"Simple C/C++ TBAA"} +;. 
+; CHECK: [[LONG_LONG_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +; CHECK: [[META3]] = !{!"long long", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK: [[META9]] = !{[[META10:![0-9]+]]} +; CHECK: [[META10]] = distinct !{[[META10]], [[META8]]} +;. diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll index 1784c2fd208c3..dda3d2e469c7b 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll @@ -134,122 +134,135 @@ attributes #6 = { builtin } ; DUMP: Callsite Context Graph: ; DUMP: Node [[BAR:0x[a-z0-9]+]] ; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: NodeId: 1 ; DUMP: AllocTypes: NotColdCold ; DUMP: ContextIds: 1 2 ; DUMP: CalleeEdges: ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 (Caller NodeId: 2) ; DUMP: Node [[BAZ]] ; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: NodeId: 2 ; DUMP: AllocTypes: NotColdCold ; DUMP: ContextIds: 1 2 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 (Callee NodeId: 1) ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 (Caller NodeId: 3) 
; DUMP: Node [[FOO]] ; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: NodeId: 3 ; DUMP: AllocTypes: NotColdCold ; DUMP: ContextIds: 1 2 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 (Callee NodeId: 2) ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 (Caller NodeId: 4) +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 (Caller NodeId: 5) ; DUMP: Node [[MAIN1]] ; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: NodeId: 4 ; DUMP: AllocTypes: NotCold ; DUMP: ContextIds: 1 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 (Callee NodeId: 3) ; DUMP: CallerEdges: ; DUMP: Node [[MAIN2]] ; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: NodeId: 5 ; DUMP: AllocTypes: Cold ; DUMP: ContextIds: 2 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 (Callee NodeId: 3) ; DUMP: CallerEdges: ; DUMP: CCG after cloning: ; DUMP: Callsite Context Graph: ; DUMP: Node [[BAR:0x[a-z0-9]+]] ; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: NodeId: 1 ; DUMP: AllocTypes: NotCold ; DUMP: ContextIds: 1 ; DUMP: CalleeEdges: ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Clones: 
[[BAR2:0x[a-z0-9]+]] +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 (Caller NodeId: 2) +; DUMP: Clones: [[BAR2:0x[a-z0-9]+]] NodeId: 8 ; DUMP: Node [[BAZ]] ; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: NodeId: 2 ; DUMP: AllocTypes: NotCold ; DUMP: ContextIds: 1 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotCold ContextIds: 1 (Callee NodeId: 1) ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Clones: [[BAZ2:0x[a-z0-9]+]] +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 (Caller NodeId: 3) +; DUMP: Clones: [[BAZ2:0x[a-z0-9]+]] NodeId: 7 ; DUMP: Node [[FOO]] ; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: NodeId: 3 ; DUMP: AllocTypes: NotCold ; DUMP: ContextIds: 1 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotCold ContextIds: 1 (Callee NodeId: 2) ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Clones: [[FOO2:0x[a-z0-9]+]] +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 (Caller NodeId: 4) +; DUMP: Clones: [[FOO2:0x[a-z0-9]+]] NodeId: 6 ; DUMP: Node [[MAIN1]] ; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: NodeId: 4 ; DUMP: AllocTypes: NotCold ; DUMP: ContextIds: 1 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 (Callee NodeId: 3) ; DUMP: CallerEdges: ; DUMP: Node [[MAIN2]] ; DUMP: %call1 = call noundef ptr 
@_Z3foov() (clone 0) +; DUMP: NodeId: 5 ; DUMP: AllocTypes: Cold ; DUMP: ContextIds: 2 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 (Callee NodeId: 6) ; DUMP: CallerEdges: ; DUMP: Node [[FOO2]] ; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: NodeId: 6 ; DUMP: AllocTypes: Cold ; DUMP: ContextIds: 2 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 (Callee NodeId: 7) ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 -; DUMP: Clone of [[FOO]] +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 (Caller NodeId: 5) +; DUMP: Clone of [[FOO]] NodeId: 3 ; DUMP: Node [[BAZ2]] ; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: NodeId: 7 ; DUMP: AllocTypes: Cold ; DUMP: ContextIds: 2 ; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 (Callee NodeId: 8) ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 -; DUMP: Clone of [[BAZ]] +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 (Caller NodeId: 6) +; DUMP: Clone of [[BAZ]] NodeId: 2 ; DUMP: Node [[BAR2]] ; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: NodeId: 8 ; DUMP: AllocTypes: Cold ; DUMP: ContextIds: 2 ; DUMP: CalleeEdges: ; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 -; DUMP: Clone of [[BAR]] +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: 
Cold ContextIds: 2 (Caller NodeId: 7) +; DUMP: Clone of [[BAR]] NodeId: 1 ; REMARKS: created clone _Z3barv.memprof.1 @@ -302,32 +315,32 @@ attributes #6 = { builtin } ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; -; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; -; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0 NodeId: 1\n_Z3barv -\> _Znam}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 12481870273128938184 NodeId: 2\n_Z3bazv -\> _Z3barv}"]; ; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1",color="mediumorchid1"]; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848 NodeId: 3\n_Z3foov -\> _Z3bazv}"]; ; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1",color="mediumorchid1"]; -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414 NodeId: 4\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1",color="brown1"]; 
-; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438 NodeId: 5\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan",color="cyan"]; ; DOT: } ; DOTCLONED: digraph "cloned" { ; DOTCLONED: label="cloned"; -; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; -; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: Alloc0 NodeId: 1\n_Z3barv -\> _Znam}"]; +; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 12481870273128938184 NodeId: 2\n_Z3bazv -\> _Z3barv}"]; ; DOTCLONED: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1",color="brown1"]; -; DOTCLONED: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 2732490490862098848 NodeId: 3\n_Z3foov -\> _Z3bazv}"]; ; DOTCLONED: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1",fillcolor="brown1",color="brown1"]; -; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: 
Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414 NodeId: 4\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1",color="brown1"]; -; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438 NodeId: 5\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN2]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan",color="cyan"]; -; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0 NodeId: 6\n_Z3foov -\> _Z3bazv}"]; ; DOTCLONED: Node[[FOO2]] -> Node[[BAZ2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan",color="cyan"]; -; DOTCLONED: Node[[BAZ2]] [shape=record,tooltip="N[[BAZ2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ2]] [shape=record,tooltip="N[[BAZ2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0 NodeId: 7\n_Z3bazv -\> _Z3barv}"]; ; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan",color="cyan"]; -; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 
2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0 NodeId: 8\n_Z3barv -\> _Znam}"]; ; DOTCLONED: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/dot.ll b/llvm/test/Transforms/MemProfContextDisambiguation/dot.ll index 6ffe5038afdbf..3e026faab9dbe 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/dot.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/dot.ll @@ -172,7 +172,7 @@ attributes #6 = { builtin } ; DOTALLOC0NONE-SAME: fillcolor="mediumorchid1", ; DOTALLOC0CONTEXT1-SAME: fontsize="30",fillcolor="magenta", ; DOTCONTEXT1-SAME: fillcolor="mediumorchid1", -; DOTCOMMON-SAME: style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCOMMON-SAME: style="filled",label="{OrigId: Alloc0{{.*}}\n_Z3barv -\> _Znam}"]; ; DOTCOMMON: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2 3 4", ;; This node is highlighted when dumping the whole graph and specifying @@ -183,7 +183,7 @@ attributes #6 = { builtin } ; DOTALLOC0NONE-SAME: fillcolor="mediumorchid1", ; DOTALLOC0CONTEXT1-SAME: fontsize="30",fillcolor="magenta", ; DOTCONTEXT1-SAME: fillcolor="mediumorchid1", -; DOTCOMMON-SAME: style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOTCOMMON-SAME: style="filled",label="{OrigId: 12481870273128938184{{.*}}\n_Z3bazv -\> _Z3barv}"]; ; DOTCOMMON: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2", ;; This edge is highlighted when dumping the whole graph and specifying @@ -208,7 +208,7 @@ attributes #6 = { builtin } ; DOTALLOC0NONE-SAME: fillcolor="mediumorchid1", ; DOTALLOC0CONTEXT1-SAME: fontsize="30",fillcolor="magenta", ; DOTCONTEXT1-SAME: fillcolor="mediumorchid1", -; DOTCOMMON-SAME: style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOTCOMMON-SAME: style="filled",label="{OrigId: 2732490490862098848{{.*}}\n_Z3foov -\> _Z3bazv}"]; ; DOTCOMMON: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2 3 4", ;; This edge is 
highlighted when dumping the whole graph and specifying @@ -227,7 +227,7 @@ attributes #6 = { builtin } ; DOTALLALLOC0-SAME: fontsize="30", ; DOTALLCONTEXT1-SAME: fontsize="30", ; DOTALLOC0CONTEXT1-SAME: fontsize="30", -; DOTCOMMON-SAME: fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCOMMON-SAME: fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOTCOMMON: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1 3",fillcolor="brown1",color="brown1" ;; This edge is highlighted when dumping the whole graph and specifying @@ -248,7 +248,7 @@ attributes #6 = { builtin } ; DOTALLCONTEXT1-SAME: fillcolor="lightskyblue", ; DOTALLOC0NONE-SAME: fillcolor="cyan", ; DOTALLOC0CONTEXT1-SAME: fillcolor="lightskyblue", -; DOTALLANDALLOC0-SAME: style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTALLANDALLOC0-SAME: style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOTALLANDALLOC0: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2 4", ;; This edge is highlighted when dumping the whole graph and specifying @@ -262,4 +262,4 @@ attributes #6 = { builtin } ;; This edge is not in alloc 0 or context 0, so only included when exporting ;; the whole graph (and never highlighted). 
-; DOTALL: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc2\n_Z3barv -\> _Znam}"]; +; DOTALL: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc2{{.*}}\n_Z3barv -\> _Znam}"]; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll index 18521c013d555..bc60430338086 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll @@ -309,38 +309,38 @@ attributes #6 = { builtin } ; DOTPRE: digraph "prestackupdate" { ; DOTPRE: label="prestackupdate"; -; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> _Znam}"]; -; DOTPRE: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",label="{OrigId: 12176601099670543485\nnull call (external)}"]; +; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z1Dv -\> _Znam}"]; +; DOTPRE: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",label="{OrigId: 12176601099670543485{{.*}}\nnull call (external)}"]; ; DOTPRE: Node[[C]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan" -; DOTPRE: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",label="{OrigId: 13543580133643026784\nnull call (external)}"]; +; DOTPRE: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",label="{OrigId: 13543580133643026784{{.*}}\nnull call (external)}"]; ; DOTPRE: Node[[F]] -> Node[[D]][tooltip="ContextIds: 
2",fillcolor="brown1" ; DOTPRE: } ; DOTPOST:digraph "post ; DOTPOST: label="post -; DOTPOST: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> _Znam}"]; -; DOTPOST: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"]; +; DOTPOST: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z1Dv -\> _Znam}"]; +; DOTPOST: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",label="{OrigId: 13543580133643026784{{.*}}\n_Z1Fv -\> _Z1Dv}"]; ; DOTPOST: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1" -; DOTPOST: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"]; +; DOTPOST: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",label="{OrigId: 0{{.*}}\n_Z1Cv -\> _Z1Dv}"]; ; DOTPOST: Node[[C]] -> Node[[D]][tooltip="ContextIds: 3",fillcolor="cyan -; DOTPOST: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"]; +; DOTPOST: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 0{{.*}}\n_Z1Bv -\> _Z1Dv}"]; ; DOTPOST: Node[[B]] -> Node[[D]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOTPOST: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"]; +; DOTPOST: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",label="{OrigId: 0{{.*}}\n_Z1Ev -\> _Z1Dv}"]; ; DOTPOST: Node[[E]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan" ; 
DOTPOST:} ; DOTCLONED: digraph "cloned" { ; DOTCLONED: label="cloned"; -; DOTCLONED: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 2",fillcolor="brown1",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> _Znam}"]; -; DOTCLONED: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"]; +; DOTCLONED: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 2",fillcolor="brown1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z1Dv -\> _Znam}"]; +; DOTCLONED: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",label="{OrigId: 13543580133643026784{{.*}}\n_Z1Fv -\> _Z1Dv}"]; ; DOTCLONED: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1" -; DOTCLONED: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"]; +; DOTCLONED: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",label="{OrigId: 0{{.*}}\n_Z1Cv -\> _Z1Dv}"]; ; DOTCLONED: Node[[C]] -> Node[[D2:0x[a-z0-9]+]][tooltip="ContextIds: 3",fillcolor="cyan" -; DOTCLONED: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"]; +; DOTCLONED: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 0{{.*}}\n_Z1Bv -\> _Z1Dv}"]; ; DOTCLONED: Node[[B]] -> Node[[D2]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOTCLONED: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"]; +; DOTCLONED: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",label="{OrigId: 0{{.*}}\n_Z1Ev -\> _Z1Dv}"]; ; DOTCLONED: Node[[E]] -> Node[[D2]][tooltip="ContextIds: 
1",fillcolor="cyan" -; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> _Znam}"]; +; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0{{.*}}\n_Z1Dv -\> _Znam}"]; ; DOTCLONED: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll index 471b364cd5f89..1765f98245dfd 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll @@ -382,50 +382,50 @@ attributes #7 = { builtin } ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"]; -; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z3foov -\> _Znam}"]; +; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 8256774051149711748{{.*}}\n_ZN1A1xEv -\> _Z3foov}"]; ; DOT: Node[[AX]] -> Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" -; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 
13626499562959447861{{.*}}\nnull call (external)}"]; ; DOT: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" ; DOT: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1" -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 748269490701775343{{.*}}\nmain -\> _Z3barP1A}"]; ; DOT: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1" -; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 12699492813229484831{{.*}}\nmain -\> _Z3barP1A}"]; ; DOT: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan" -; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1" -; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; +; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13614864978754796978{{.*}}\n_ZN1B1xEv -\> _Z3foov}"]; ; DOT: Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1" -; DOT: 
Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 6792096022461663180{{.*}}\nmain -\> _Z3barP1A}"]; ; DOT: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",label="{OrigId: 15737101490731057601{{.*}}\nmain -\> _Z3barP1A}"]; ; DOT: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1" -; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan" ; DOT: } ; DOTCLONED: digraph "cloned" { ; DOTCLONED: label="cloned"; -; DOTCLONED: Node[[FOO2:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 2 3 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"]; -; DOTCLONED: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; +; DOTCLONED: Node[[FOO2:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 2 3 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z3foov -\> _Znam}"]; +; DOTCLONED: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] 
ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 8256774051149711748{{.*}}\n_ZN1A1xEv -\> _Z3foov}"]; ; DOTCLONED: Node[[AX]] -> Node[[FOO2]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" -; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13626499562959447861{{.*}}\nnull call (external)}"]; ; DOTCLONED: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" ; DOTCLONED: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1" -; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 748269490701775343{{.*}}\nmain -\> _Z3barP1A}"]; ; DOTCLONED: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1" -; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",label="{OrigId: 12699492813229484831{{.*}}\nmain -\> _Z3barP1A}"]; ; DOTCLONED: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan" -; DOTCLONED: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 
3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN3]] -> Node[[FOO2]][tooltip="ContextIds: 3",fillcolor="brown1" -; DOTCLONED: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; +; DOTCLONED: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",label="{OrigId: 13614864978754796978{{.*}}\n_ZN1B1xEv -\> _Z3foov}"]; ; DOTCLONED: Node[[BX]] -> Node[[FOO2]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1" -; DOTCLONED: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; +; DOTCLONED: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",label="{OrigId: 6792096022461663180{{.*}}\nmain -\> _Z3barP1A}"]; ; DOTCLONED: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan" -; DOTCLONED: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; +; DOTCLONED: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",label="{OrigId: 15737101490731057601{{.*}}\nmain -\> _Z3barP1A}"]; ; DOTCLONED: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1" -; DOTCLONED: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN6]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 
6",fillcolor="cyan" -; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 6",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"]; +; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 6",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0{{.*}}\n_Z3foov -\> _Znam}"]; ; DOTCLONED: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll index a1a478e88ebbf..f80a418392655 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll @@ -295,36 +295,36 @@ attributes #7 = { builtin } ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; -; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z3barv -\> _Znam}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848{{.*}}\nnull call (external)}"]; ; DOT: Node[[FOO]] -> Node[[BAZ:0x[a-z0-9]+]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1" -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOT: 
Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1" ; DOT: Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 1",fillcolor="brown1" -; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 4",fillcolor="cyan" ; DOT: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 2",fillcolor="cyan" -; DOT: Node[[BAZ]] [shape=record,tooltip="N[[BAZ]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc2\n_Z3bazv -\> _Znam}"]; -; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOT: Node[[BAZ]] [shape=record,tooltip="N[[BAZ]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc2{{.*}}\n_Z3bazv -\> _Znam}"]; +; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",label="{OrigId: 0{{.*}}\n_Z3foov -\> _Z3barv}"]; ; DOT: Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1" ; DOT: } ; DOTCLONED: digraph "cloned" { ; DOTCLONED: label="cloned"; -; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; -; DOTCLONED: Node[[FOO2:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: Alloc0{{.*}}\n_Z3barv -\> _Znam}"]; +; 
DOTCLONED: Node[[FOO2:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: 2732490490862098848{{.*}}\nnull call (external)}"]; ; DOTCLONED: Node[[FOO2]] -> Node[[BAZ:0x[a-z0-9]+]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1" -; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",label="{OrigId: 8632435727821051414{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN1]] -> Node[[FOO2]][tooltip="ContextIds: 3",fillcolor="brown1" ; DOTCLONED: Node[[MAIN1]] -> Node[[FOO:0x[a-z0-9]+]][tooltip="ContextIds: 1",fillcolor="brown1" -; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",label="{OrigId: 15025054523792398438{{.*}}\nmain -\> _Z3foov}"]; ; DOTCLONED: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 4",fillcolor="cyan" ; DOTCLONED: Node[[MAIN2]] -> Node[[FOO3:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan" -; DOTCLONED: Node[[BAZ]] [shape=record,tooltip="N[[BAZ]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc2\n_Z3bazv -\> _Znam}"]; -; DOTCLONED: Node[[FOO]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ]] [shape=record,tooltip="N[[BAZ]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",label="{OrigId: Alloc2{{.*}}\n_Z3bazv -\> _Znam}"]; +; DOTCLONED: Node[[FOO]] [shape=record,tooltip="N[[FOO]] ContextIds: 
1",fillcolor="brown1",style="filled",label="{OrigId: 0{{.*}}\n_Z3foov -\> _Z3barv}"]; ; DOTCLONED: Node[[FOO]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1" -; DOTCLONED: Node[[FOO3]] [shape=record,tooltip="N[[FOO3]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOTCLONED: Node[[FOO3]] [shape=record,tooltip="N[[FOO3]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: 0{{.*}}\n_Z3foov -\> _Z3barv}"]; ; DOTCLONED: Node[[FOO3]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan" -; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0{{.*}}\n_Z3barv -\> _Znam}"]; ; DOTCLONED: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll index 97e433ccefa99..d293b2a6a4bc4 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll @@ -16,9 +16,9 @@ ;; resolved via tail call fixup. That happens in between the initial ;; stack update and the end of graph building. 
; RUN: cat %t.ccg.poststackupdate.dot | FileCheck %s --check-prefix=DOTPOSTSTACKUPDATE -; DOTPOSTSTACKUPDATE: {OrigId: 15025054523792398438\nmain -\> xyz} +; DOTPOSTSTACKUPDATE: {OrigId: 15025054523792398438{{.*}}\nmain -\> xyz} ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOSTBUILD -; DOTPOSTBUILD: {OrigId: 15025054523792398438\nnull call (external)} +; DOTPOSTBUILD: {OrigId: 15025054523792398438{{.*}}\nnull call (external)} ;; Check that all calls in the IR are to the original functions, leading to a ;; non-cold operator new call. diff --git a/llvm/test/Transforms/MergedLoadStoreMotion/preserve-store-metadata.ll b/llvm/test/Transforms/MergedLoadStoreMotion/preserve-store-metadata.ll index 33e37c97b7a0e..1dfdf09a26999 100644 --- a/llvm/test/Transforms/MergedLoadStoreMotion/preserve-store-metadata.ll +++ b/llvm/test/Transforms/MergedLoadStoreMotion/preserve-store-metadata.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=mldst-motion -S %s | FileCheck %s target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" @@ -10,7 +10,7 @@ define void @perserve_common_metadata(i1 %c, ptr %dst, ptr %min) { ; CHECK-NEXT: [[GEP_DST_16:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 16 ; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: store ptr [[DST]], ptr [[MIN]], align 8, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: store ptr [[DST]], ptr [[MIN]], align 8, !tbaa [[INT_TBAA0:![0-9]+]] ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[GEP_DST_24:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 24 @@ -18,7 +18,7 @@ define void @perserve_common_metadata(i1 %c, ptr %dst, ptr %min) { ; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: [[DOTSINK:%.*]] 
= phi ptr [ [[DST]], %[[THEN]] ], [ null, %[[ELSE]] ] -; CHECK-NEXT: store ptr [[DOTSINK]], ptr [[GEP_DST_16]], align 8, !tbaa [[TBAA4:![0-9]+]], !alias.scope [[META6:![0-9]+]], !noalias [[META6]], !llvm.access.group [[ACC_GRP9:![0-9]+]] +; CHECK-NEXT: store ptr [[DOTSINK]], ptr [[GEP_DST_16]], align 8, !tbaa [[LONG_TBAA4:![0-9]+]], !alias.scope [[META6:![0-9]+]], !noalias [[META6]], !llvm.access.group [[ACC_GRP9:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -47,7 +47,7 @@ define void @clear_different_metadata(i1 %c, ptr %dst, ptr %min) { ; CHECK-NEXT: [[GEP_DST_16:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 16 ; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: store ptr [[DST]], ptr [[MIN]], align 8, !tbaa [[TBAA10:![0-9]+]] +; CHECK-NEXT: store ptr [[DST]], ptr [[MIN]], align 8, !tbaa [[_FOOPTR_TBAA10:![0-9]+]] ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[GEP_DST_24:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 24 @@ -55,7 +55,7 @@ define void @clear_different_metadata(i1 %c, ptr %dst, ptr %min) { ; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: [[DOTSINK:%.*]] = phi ptr [ [[DST]], %[[THEN]] ], [ null, %[[ELSE]] ] -; CHECK-NEXT: store ptr [[DOTSINK]], ptr [[GEP_DST_16]], align 8 +; CHECK-NEXT: store ptr [[DOTSINK]], ptr [[GEP_DST_16]], align 8, !tbaa [[CHAR_TBAA13:![0-9]+]], !alias.scope [[META6]], !noalias [[META6]] ; CHECK-NEXT: ret void ; entry: @@ -93,17 +93,18 @@ return: !13 = distinct !{} !14 = distinct !{} ;. 
-; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]]} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"Simple C++ TBAA"} -; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0, i64 0} +; CHECK: [[LONG_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0, i64 0} ; CHECK: [[META5]] = !{!"long", [[META2]]} ; CHECK: [[META6]] = !{[[META7:![0-9]+]]} ; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} ; CHECK: [[META8]] = distinct !{[[META8]]} ; CHECK: [[ACC_GRP9]] = distinct !{} -; CHECK: [[TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0, i64 0} +; CHECK: [[_FOOPTR_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0, i64 0} ; CHECK: [[META11]] = !{!"p2 _Foo", [[META12:![0-9]+]]} ; CHECK: [[META12]] = !{!"any pointer", [[META2]], i64 0} +; CHECK: [[CHAR_TBAA13]] = !{[[META2]], [[META2]], i64 0} ;. diff --git a/llvm/test/Transforms/NewGVN/memory-handling.ll b/llvm/test/Transforms/NewGVN/memory-handling.ll index bf07edf91f2ba..f83d145167c75 100644 --- a/llvm/test/Transforms/NewGVN/memory-handling.ll +++ b/llvm/test/Transforms/NewGVN/memory-handling.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ;; This test is really dependent on propagating a lot of memory info around, but in the end, not ;; screwing up a single add. 
; RUN: opt < %s -passes=newgvn -S | FileCheck %s @@ -26,114 +26,114 @@ define void @BuildMask(ptr nocapture readonly) local_unnamed_addr #0 { ; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 16 @alPhrase, i8 0, i64 416, i1 false) ; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainMask, i8 0, i64 16, i1 false) ; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainSign, i8 0, i64 16, i1 false) -; CHECK-NEXT: br label [[DOTSINK_SPLIT:%.*]] -; CHECK: .sink.split: -; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ [[TMP0]], [[TMP1:%.*]] ], [ [[TMP3:%.*]], [[TMP14:%.*]] ] -; CHECK-NEXT: [[DOTSINK:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP22:%.*]], [[TMP14]] ] -; CHECK-NEXT: store i32 [[DOTSINK]], ptr @cchPhraseLength, align 4, !tbaa [[TBAA1:![0-9]+]] -; CHECK-NEXT: br label [[TMP2:%.*]] -; CHECK: 2: -; CHECK-NEXT: [[DOT1:%.*]] = phi ptr [ [[DOT0]], [[DOTSINK_SPLIT]] ], [ [[TMP3]], [[TMP6:%.*]] ] +; CHECK-NEXT: br label %[[DOTSINK_SPLIT:.*]] +; CHECK: [[_SINK_SPLIT:.*:]] +; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ [[TMP0]], [[TMP1:%.*]] ], [ [[TMP3:%.*]], %[[TMP14:.*]] ] +; CHECK-NEXT: [[DOTSINK:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP22:%.*]], %[[TMP14]] ] +; CHECK-NEXT: store i32 [[DOTSINK]], ptr @cchPhraseLength, align 4, !tbaa [[INT_TBAA1:![0-9]+]] +; CHECK-NEXT: br label %[[BB2:.*]] +; CHECK: [[BB2]]: +; CHECK-NEXT: [[DOT1:%.*]] = phi ptr [ [[DOT0]], %[[DOTSINK_SPLIT]] ], [ [[TMP3]], %[[TMP6:.*]] ] ; CHECK-NEXT: [[TMP3]] = getelementptr inbounds i8, ptr [[DOT1]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[DOT1]], align 1, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[DOT1]], align 1, !tbaa [[CHAR_TBAA5:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[DOTPREHEADER_PREHEADER:%.*]], label [[TMP6]] -; CHECK: .preheader.preheader: -; CHECK-NEXT: br label [[DOTPREHEADER:%.*]] -; CHECK: 6: +; CHECK-NEXT: br i1 [[TMP5]], label %[[DOTPREHEADER_PREHEADER:.*]], label 
%[[TMP6]] +; CHECK: [[_PREHEADER_PREHEADER:.*:]] +; CHECK-NEXT: br [[DOTPREHEADER:label %.*]] +; CHECK: [[TMP6]]: ; CHECK-NEXT: [[TMP7:%.*]] = tail call ptr @__ctype_b_loc() #[[ATTR4:[0-9]+]] -; CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA6:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[ANYPTR_TBAA6:![0-9]+]] ; CHECK-NEXT: [[TMP9:%.*]] = sext i8 [[TMP4]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[TMP10]], align 2, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[TMP10]], align 2, !tbaa [[SHORT_TBAA8:![0-9]+]] ; CHECK-NEXT: [[TMP12:%.*]] = and i16 [[TMP11]], 1024 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i16 [[TMP12]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label [[TMP2]], label [[TMP14]] -; CHECK: 14: +; CHECK-NEXT: br i1 [[TMP13]], label %[[BB2]], label %[[TMP14]] +; CHECK: [[TMP14]]: ; CHECK-NEXT: [[TMP15:%.*]] = sext i8 [[TMP4]] to i32 ; CHECK-NEXT: [[TMP16:%.*]] = tail call i32 @tolower(i32 [[TMP15]]) #[[ATTR5:[0-9]+]] ; CHECK-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], -97 ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[TMP18]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 16, !tbaa [[TBAA10:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 16, !tbaa [[INT_TBAA10:![0-9]+]] ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 1 -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 16, !tbaa [[TBAA10]] +; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 16, !tbaa [[INT_TBAA10]] ; CHECK-NEXT: [[TMP22]] = add nsw i32 [[DOTSINK]], 1 -; CHECK-NEXT: br label [[DOTSINK_SPLIT]] -; CHECK: .preheader: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[DOTPREHEADER_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[TMP57:%.*]] ] -; CHECK-NEXT: 
[[DOT04961:%.*]] = phi i32 [ [[DOT2:%.*]], [[TMP57]] ], [ 0, [[DOTPREHEADER_PREHEADER]] ] -; CHECK-NEXT: [[DOT05160:%.*]] = phi i32 [ [[DOT253:%.*]], [[TMP57]] ], [ 0, [[DOTPREHEADER_PREHEADER]] ] +; CHECK-NEXT: br label %[[DOTSINK_SPLIT]] +; CHECK: [[_PREHEADER:.*:]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[DOTPREHEADER_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[TMP57:.*]] ] +; CHECK-NEXT: [[DOT04961:%.*]] = phi i32 [ [[DOT2:%.*]], %[[TMP57]] ], [ 0, %[[DOTPREHEADER_PREHEADER]] ] +; CHECK-NEXT: [[DOT05160:%.*]] = phi i32 [ [[DOT253:%.*]], %[[TMP57]] ], [ 0, %[[DOTPREHEADER_PREHEADER]] ] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 16, !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 16, !tbaa [[INT_TBAA10]] ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP24]], 0 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [26 x i32], ptr @auGlobalFrequency, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: br i1 [[TMP25]], label [[TMP27:%.*]], label [[TMP28:%.*]] -; CHECK: 27: -; CHECK-NEXT: store i32 -1, ptr [[TMP26]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: br label [[TMP57]] -; CHECK: 28: -; CHECK-NEXT: store i32 0, ptr [[TMP26]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: br i1 [[TMP25]], label %[[TMP27:.*]], label %[[TMP28:.*]] +; CHECK: [[TMP27]]: +; CHECK-NEXT: store i32 -1, ptr [[TMP26]], align 4, !tbaa [[INT_TBAA1]] +; CHECK-NEXT: br label %[[TMP57]] +; CHECK: [[TMP28]]: +; CHECK-NEXT: store i32 0, ptr [[TMP26]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[TMP24]] to i64 -; CHECK-NEXT: br i1 false, label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]] -; CHECK: .lr.ph.preheader: -; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: [[DOT04658:%.*]] = phi i64 [ [[TMP31:%.*]], [[DOTLR_PH]] ], [ 1, [[DOTLR_PH_PREHEADER]] ] -; CHECK-NEXT: 
[[DOT04857:%.*]] = phi i32 [ [[TMP30:%.*]], [[DOTLR_PH]] ], [ 1, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: br i1 false, label %[[DOT_CRIT_EDGE:.*]], label %[[DOTLR_PH_PREHEADER:.*]] +; CHECK: [[_LR_PH_PREHEADER:.*:]] +; CHECK-NEXT: br label %[[DOTLR_PH:.*]] +; CHECK: [[_LR_PH:.*:]] +; CHECK-NEXT: [[DOT04658:%.*]] = phi i64 [ [[TMP31:%.*]], %[[DOTLR_PH]] ], [ 1, %[[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: [[DOT04857:%.*]] = phi i32 [ [[TMP30:%.*]], %[[DOTLR_PH]] ], [ 1, %[[DOTLR_PH_PREHEADER]] ] ; CHECK-NEXT: [[TMP30]] = add nuw nsw i32 [[DOT04857]], 1 ; CHECK-NEXT: [[TMP31]] = shl i64 [[DOT04658]], 1 ; CHECK-NEXT: [[TMP32:%.*]] = icmp ult i64 [[TMP29]], [[TMP31]] -; CHECK-NEXT: br i1 [[TMP32]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]] -; CHECK: ._crit_edge.loopexit: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] -; CHECK: ._crit_edge: -; CHECK-NEXT: [[DOT048_LCSSA:%.*]] = phi i32 [ poison, [[TMP28]] ], [ [[TMP30]], [[DOT_CRIT_EDGE_LOOPEXIT]] ] -; CHECK-NEXT: [[DOT046_LCSSA:%.*]] = phi i64 [ poison, [[TMP28]] ], [ [[TMP31]], [[DOT_CRIT_EDGE_LOOPEXIT]] ] +; CHECK-NEXT: br i1 [[TMP32]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]], label %[[DOTLR_PH]] +; CHECK: [[__CRIT_EDGE_LOOPEXIT:.*:]] +; CHECK-NEXT: br label %[[DOT_CRIT_EDGE]] +; CHECK: [[__CRIT_EDGE:.*:]] +; CHECK-NEXT: [[DOT048_LCSSA:%.*]] = phi i32 [ poison, %[[TMP28]] ], [ [[TMP30]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ] +; CHECK-NEXT: [[DOT046_LCSSA:%.*]] = phi i64 [ poison, %[[TMP28]] ], [ [[TMP31]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ] ; CHECK-NEXT: [[TMP33:%.*]] = add nsw i32 [[DOT048_LCSSA]], [[DOT04961]] ; CHECK-NEXT: [[TMP34:%.*]] = icmp ugt i32 [[TMP33]], 64 -; CHECK-NEXT: br i1 [[TMP34]], label [[TMP35:%.*]], label [[TMP39:%.*]] -; CHECK: 35: +; CHECK-NEXT: br i1 [[TMP34]], label %[[TMP35:.*]], label %[[TMP39:.*]] +; CHECK: [[TMP35]]: ; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[DOT05160]], 1 ; CHECK-NEXT: [[TMP37:%.*]] = icmp ugt i32 [[TMP36]], 1 -; CHECK-NEXT: br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39]] -; 
CHECK: 38: +; CHECK-NEXT: br i1 [[TMP37]], label %[[TMP38:.*]], label %[[TMP39]] +; CHECK: [[TMP38]]: ; CHECK-NEXT: tail call void @Fatal(ptr @.str.7, i32 0) -; CHECK-NEXT: br label [[TMP39]] -; CHECK: 39: -; CHECK-NEXT: [[DOT152:%.*]] = phi i32 [ [[DOT05160]], [[DOT_CRIT_EDGE]] ], [ [[TMP36]], [[TMP38]] ], [ [[TMP36]], [[TMP35]] ] -; CHECK-NEXT: [[DOT150:%.*]] = phi i32 [ [[DOT04961]], [[DOT_CRIT_EDGE]] ], [ 0, [[TMP38]] ], [ 0, [[TMP35]] ] +; CHECK-NEXT: br label %[[TMP39]] +; CHECK: [[TMP39]]: +; CHECK-NEXT: [[DOT152:%.*]] = phi i32 [ [[DOT05160]], %[[DOT_CRIT_EDGE]] ], [ [[TMP36]], %[[TMP38]] ], [ [[TMP36]], %[[TMP35]] ] +; CHECK-NEXT: [[DOT150:%.*]] = phi i32 [ [[DOT04961]], %[[DOT_CRIT_EDGE]] ], [ 0, %[[TMP38]] ], [ 0, %[[TMP35]] ] ; CHECK-NEXT: [[TMP40:%.*]] = add i64 [[DOT046_LCSSA]], 4294967295 ; CHECK-NEXT: [[TMP41:%.*]] = trunc i64 [[TMP40]] to i32 ; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 2 -; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP42]], align 8, !tbaa [[TBAA12:![0-9]+]] +; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP42]], align 8, !tbaa [[INT_TBAA12:![0-9]+]] ; CHECK-NEXT: [[TMP43:%.*]] = zext i32 [[DOT150]] to i64 ; CHECK-NEXT: [[DOT046_:%.*]] = shl i64 [[DOT046_LCSSA]], [[TMP43]] ; CHECK-NEXT: [[TMP44:%.*]] = zext i32 [[DOT152]] to i64 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x i64], ptr @aqMainSign, i64 0, i64 [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = load i64, ptr [[TMP45]], align 8, !tbaa [[TBAA13:![0-9]+]] +; CHECK-NEXT: [[TMP46:%.*]] = load i64, ptr [[TMP45]], align 8, !tbaa [[LONG_TBAA13:![0-9]+]] ; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[DOT046_]] -; CHECK-NEXT: store i64 [[TMP47]], ptr [[TMP45]], align 8, !tbaa [[TBAA13]] -; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP23]], align 16, !tbaa [[TBAA10]] +; CHECK-NEXT: store i64 [[TMP47]], ptr [[TMP45]], align 8, !tbaa [[LONG_TBAA13]] +; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP23]], 
align 16, !tbaa [[INT_TBAA10]] ; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP48]] to i64 ; CHECK-NEXT: [[TMP50:%.*]] = shl i64 [[TMP49]], [[TMP43]] ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i64], ptr @aqMainMask, i64 0, i64 [[TMP44]] -; CHECK-NEXT: [[TMP52:%.*]] = load i64, ptr [[TMP51]], align 8, !tbaa [[TBAA13]] +; CHECK-NEXT: [[TMP52:%.*]] = load i64, ptr [[TMP51]], align 8, !tbaa [[LONG_TBAA13]] ; CHECK-NEXT: [[TMP53:%.*]] = or i64 [[TMP50]], [[TMP52]] -; CHECK-NEXT: store i64 [[TMP53]], ptr [[TMP51]], align 8, !tbaa [[TBAA13]] +; CHECK-NEXT: store i64 [[TMP53]], ptr [[TMP51]], align 8, !tbaa [[LONG_TBAA13]] ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 1 -; CHECK-NEXT: store i32 [[DOT150]], ptr [[TMP54]], align 4, !tbaa [[TBAA15:![0-9]+]] +; CHECK-NEXT: store i32 [[DOT150]], ptr [[TMP54]], align 4, !tbaa [[INT_TBAA15:![0-9]+]] ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 3 -; CHECK-NEXT: store i32 [[DOT152]], ptr [[TMP55]], align 4, !tbaa [[TBAA16:![0-9]+]] +; CHECK-NEXT: store i32 [[DOT152]], ptr [[TMP55]], align 4, !tbaa [[INT_TBAA16:![0-9]+]] ; CHECK-NEXT: [[TMP56:%.*]] = add nsw i32 [[DOT150]], [[DOT048_LCSSA]] -; CHECK-NEXT: br label [[TMP57]] -; CHECK: 57: -; CHECK-NEXT: [[DOT253]] = phi i32 [ [[DOT05160]], [[TMP27]] ], [ [[DOT152]], [[TMP39]] ] -; CHECK-NEXT: [[DOT2]] = phi i32 [ [[DOT04961]], [[TMP27]] ], [ [[TMP56]], [[TMP39]] ] +; CHECK-NEXT: br label %[[TMP57]] +; CHECK: [[TMP57]]: +; CHECK-NEXT: [[DOT253]] = phi i32 [ [[DOT05160]], %[[TMP27]] ], [ [[DOT152]], %[[TMP39]] ] +; CHECK-NEXT: [[DOT2]] = phi i32 [ [[DOT04961]], %[[TMP27]] ], [ [[TMP56]], %[[TMP39]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 26 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOTPREHEADER]], label [[TMP58:%.*]] -; CHECK: 
58: +; CHECK-NEXT: br i1 [[EXITCOND]], [[DOTPREHEADER]], label %[[BB58:.*]] +; CHECK: [[BB58]]: ; CHECK-NEXT: ret void ; tail call void @llvm.memset.p0.i64(ptr align 16 @alPhrase, i8 0, i64 416, i1 false) @@ -309,20 +309,20 @@ attributes #5 = { nounwind readonly } !15 = !{!11, !2, i64 4} !16 = !{!11, !2, i64 12} ;. -; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} ; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} ; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} -; CHECK: [[TBAA5]] = !{[[META3]], [[META3]], i64 0} -; CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +; CHECK: [[CHAR_TBAA5]] = !{[[META3]], [[META3]], i64 0} +; CHECK: [[ANYPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} ; CHECK: [[META7]] = !{!"any pointer", [[META3]], i64 0} -; CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +; CHECK: [[SHORT_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} ; CHECK: [[META9]] = !{!"short", [[META3]], i64 0} -; CHECK: [[TBAA10]] = !{[[META11:![0-9]+]], [[META2]], i64 0} +; CHECK: [[INT_TBAA10]] = !{[[META11:![0-9]+]], [[META2]], i64 0} ; CHECK: [[META11]] = !{!"", [[META2]], i64 0, [[META2]], i64 4, [[META2]], i64 8, [[META2]], i64 12} -; CHECK: [[TBAA12]] = !{[[META11]], [[META2]], i64 8} -; CHECK: [[TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} +; CHECK: [[INT_TBAA12]] = !{[[META11]], [[META2]], i64 8} +; CHECK: [[LONG_TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} ; CHECK: [[META14]] = !{!"long", [[META3]], i64 0} -; CHECK: [[TBAA15]] = !{[[META11]], [[META2]], i64 4} -; CHECK: [[TBAA16]] = !{[[META11]], [[META2]], i64 12} +; CHECK: [[INT_TBAA15]] = !{[[META11]], [[META2]], i64 4} +; CHECK: [[INT_TBAA16]] = !{[[META11]], [[META2]], i64 12} ;. 
diff --git a/llvm/test/Transforms/NewGVN/pr31501.ll b/llvm/test/Transforms/NewGVN/pr31501.ll index 18bfcd1b9ca09..353c693f2a29b 100644 --- a/llvm/test/Transforms/NewGVN/pr31501.ll +++ b/llvm/test/Transforms/NewGVN/pr31501.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @@ -50,32 +50,33 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: norecurse nounwind ssp uwtable define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr #0 align 2 { -; CHECK-LABEL: @quux( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_BARNEY:%.*]], ptr [[ARG:%.*]], i64 0, i32 3, i32 0, i32 0, i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa [[TBAA2:![0-9]+]] +; CHECK-LABEL: define weak_odr hidden ptr @quux( +; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_BARNEY:%.*]], ptr [[ARG]], i64 0, i32 3, i32 0, i32 0, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_BARNEY]], ptr [[ARG]], i64 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa [[TBAA7:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa [[ANYPTR_TBAA7:![0-9]+]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq ptr [[TMP3]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label [[BB21:%.*]], label [[BB8:%.*]] -; CHECK: bb8: -; CHECK-NEXT: br label [[BB11:%.*]] -; CHECK: bb9: +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[BB8:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: br label 
%[[BB11:.*]] +; CHECK: [[BB9:.*]]: ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq ptr [[TMP18:%.*]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP10]], label [[BB19:%.*]], label [[BB11]] -; CHECK: bb11: -; CHECK-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP17:%.*]], [[BB9:%.*]] ], [ undef, [[BB8]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi ptr [ [[TMP18]], [[BB9]] ], [ [[TMP3]], [[BB8]] ] -; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8, !tbaa [[TBAA8:![0-9]+]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq ptr [[TMP15]], [[ARG1:%.*]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[BB19:.*]], label %[[BB11]] +; CHECK: [[BB11]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP17:%.*]], %[[BB9]] ], [ undef, %[[BB8]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi ptr [ [[TMP18]], %[[BB9]] ], [ [[TMP3]], %[[BB8]] ] +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8, !tbaa [[ANYPTR_TBAA8:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq ptr [[TMP15]], [[ARG1]] ; CHECK-NEXT: [[TMP17]] = select i1 [[TMP16]], ptr [[TMP13]], ptr [[TMP12]] ; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[TMP13]], i64 1 -; CHECK-NEXT: br i1 [[TMP16]], label [[BB19]], label [[BB9]] -; CHECK: bb19: -; CHECK-NEXT: [[TMP20:%.*]] = phi ptr [ null, [[BB9]] ], [ [[TMP17]], [[BB11]] ] -; CHECK-NEXT: br label [[BB21]] -; CHECK: bb21: -; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ null, [[BB:%.*]] ], [ [[TMP20]], [[BB19]] ] +; CHECK-NEXT: br i1 [[TMP16]], label %[[BB19]], label %[[BB9]] +; CHECK: [[BB19]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi ptr [ null, %[[BB9]] ], [ [[TMP17]], %[[BB11]] ] +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ null, %[[BB]] ], [ [[TMP20]], %[[BB19]] ] ; CHECK-NEXT: ret ptr [[TMP22]] ; bb: @@ -128,3 +129,15 @@ attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt- !9 = !{!"_ZTSN4llvm9RecordValE", !4, i64 0, !10, i64 8, !4, i64 16} !10 = 
!{!"_ZTSN4llvm14PointerIntPairIPNS_5RecTyELj1EbNS_21PointerLikeTypeTraitsIS2_EENS_18PointerIntPairInfoIS2_Lj1ES4_EEEE", !11, i64 0} !11 = !{!"long", !5, i64 0} +;. +; CHECK: [[ANYPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"_ZTSN4llvm15SmallVectorBaseE", [[META4]], i64 0, [[META4]], i64 8, [[META4]], i64 16} +; CHECK: [[META4]] = !{!"any pointer", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +; CHECK: [[META6]] = !{!"Simple C++ TBAA"} +; CHECK: [[ANYPTR_TBAA7]] = !{[[META3]], [[META4]], i64 8} +; CHECK: [[ANYPTR_TBAA8]] = !{[[META9:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META9]] = !{!"_ZTSN4llvm9RecordValE", [[META4]], i64 0, [[META10:![0-9]+]], i64 8, [[META4]], i64 16} +; CHECK: [[META10]] = !{!"_ZTSN4llvm14PointerIntPairIPNS_5RecTyELj1EbNS_21PointerLikeTypeTraitsIS2_EENS_18PointerIntPairInfoIS2_Lj1ES4_EEEE", [[META11:![0-9]+]], i64 0} +; CHECK: [[META11]] = !{!"long", [[META5]], i64 0} +;. diff --git a/llvm/test/Transforms/NewGVN/pr33305.ll b/llvm/test/Transforms/NewGVN/pr33305.ll index 3a19f610defcd..e742f14249c7c 100644 --- a/llvm/test/Transforms/NewGVN/pr33305.ll +++ b/llvm/test/Transforms/NewGVN/pr33305.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=newgvn -S %s | FileCheck %s ; Ensure we do not incorrect do phi of ops source_filename = "/Users/dannyb/sources/llvm-clean/debug-build/pr33305.c" @@ -17,68 +17,69 @@ target triple = "x86_64-apple-macosx10.12.0" ; Function Attrs: nounwind optsize ssp uwtable define i32 @main() local_unnamed_addr #0 { -; CHECK-LABEL: @main( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-LABEL: define i32 @main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: 
[[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa [[INT_TBAA3:![0-9]+]] ; CHECK-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[DOTPR_I]], 1 -; CHECK-NEXT: br i1 [[CMP13_I]], label [[FOR_COND1_PREHEADER_LR_PH_I:%.*]], label [[ENTRY_FOR_END9_I_CRIT_EDGE:%.*]] -; CHECK: entry.for.end9.i_crit_edge: -; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr @h, align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: br label [[FOR_END9_I:%.*]] -; CHECK: for.cond1.preheader.lr.ph.i: -; CHECK-NEXT: [[G_PROMOTED14_I:%.*]] = load i32, ptr @g, align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_I:%.*]] -; CHECK: for.cond1.preheader.i: -; CHECK-NEXT: [[INC816_I:%.*]] = phi i32 [ [[DOTPR_I]], [[FOR_COND1_PREHEADER_LR_PH_I]] ], [ [[INC8_I:%.*]], [[FOR_INC7_I:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[G_PROMOTED14_I]], [[FOR_COND1_PREHEADER_LR_PH_I]] ], [ 0, [[FOR_INC7_I]] ] -; CHECK-NEXT: br label [[FOR_BODY3_I:%.*]] -; CHECK: for.body3.i: -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, [[FOR_COND1_PREHEADER_I]] ], [ true, [[LOR_END_I:%.*]] ] -; CHECK-NEXT: [[INC12_I:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_I]] ], [ [[INC_I:%.*]], [[LOR_END_I]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND1_PREHEADER_I]] ], [ 0, [[LOR_END_I]] ] +; CHECK-NEXT: br i1 [[CMP13_I]], label %[[FOR_COND1_PREHEADER_LR_PH_I:.*]], label %[[ENTRY_FOR_END9_I_CRIT_EDGE:.*]] +; CHECK: [[ENTRY_FOR_END9_I_CRIT_EDGE]]: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr @h, align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: br label %[[FOR_END9_I:.*]] +; CHECK: [[FOR_COND1_PREHEADER_LR_PH_I]]: +; CHECK-NEXT: [[G_PROMOTED14_I:%.*]] = load i32, ptr @g, align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_I:.*]] +; CHECK: [[FOR_COND1_PREHEADER_I]]: +; CHECK-NEXT: [[INC816_I:%.*]] = phi i32 [ [[DOTPR_I]], %[[FOR_COND1_PREHEADER_LR_PH_I]] ], [ [[INC8_I:%.*]], %[[FOR_INC7_I:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[G_PROMOTED14_I]], %[[FOR_COND1_PREHEADER_LR_PH_I]] ], [ 0, %[[FOR_INC7_I]] ] +; 
CHECK-NEXT: br label %[[FOR_BODY3_I:.*]] +; CHECK: [[FOR_BODY3_I]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FOR_COND1_PREHEADER_I]] ], [ true, %[[LOR_END_I:.*]] ] +; CHECK-NEXT: [[INC12_I:%.*]] = phi i32 [ 0, %[[FOR_COND1_PREHEADER_I]] ], [ [[INC_I:%.*]], %[[LOR_END_I]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP0]], %[[FOR_COND1_PREHEADER_I]] ], [ 0, %[[LOR_END_I]] ] ; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: [[OR_COND_I:%.*]] = and i1 [[TMP1]], [[TOBOOL_I]] -; CHECK-NEXT: br i1 [[OR_COND_I]], label [[LOR_END_I]], label [[LOR_RHS_I:%.*]] -; CHECK: lor.rhs.i: +; CHECK-NEXT: br i1 [[OR_COND_I]], label %[[LOR_END_I]], label %[[LOR_RHS_I:.*]] +; CHECK: [[LOR_RHS_I]]: ; CHECK-NEXT: [[LNOT_I:%.*]] = xor i1 [[TOBOOL_I]], true ; CHECK-NEXT: [[LNOT_EXT_I:%.*]] = zext i1 [[LNOT_I]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @e, align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @e, align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[TMP3]], [[LNOT_EXT_I]] -; CHECK-NEXT: store i32 [[XOR_I]], ptr @e, align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: br label [[LOR_END_I]] -; CHECK: lor.end.i: +; CHECK-NEXT: store i32 [[XOR_I]], ptr @e, align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: br label %[[LOR_END_I]] +; CHECK: [[LOR_END_I]]: ; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[INC12_I]], 1 ; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i32 [[INC_I]], 2 -; CHECK-NEXT: br i1 [[EXITCOND_I]], label [[FOR_INC7_I]], label [[FOR_BODY3_I]] -; CHECK: for.inc7.i: +; CHECK-NEXT: br i1 [[EXITCOND_I]], label %[[FOR_INC7_I]], label %[[FOR_BODY3_I]] +; CHECK: [[FOR_INC7_I]]: ; CHECK-NEXT: [[INC8_I]] = add nsw i32 [[INC816_I]], 1 ; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[INC816_I]], 0 -; CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_COND1_PREHEADER_I]], label [[FOR_COND_FOR_END9_CRIT_EDGE_I:%.*]] -; CHECK: for.cond.for.end9_crit_edge.i: -; CHECK-NEXT: store i32 0, ptr @g, align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 2, ptr @h, 
align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 [[INC8_I]], ptr @c, align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: br label [[FOR_END9_I]] -; CHECK: for.end9.i: -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[DOTPRE]], [[ENTRY_FOR_END9_I_CRIT_EDGE]] ], [ 2, [[FOR_COND_FOR_END9_CRIT_EDGE_I]] ] -; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr @b, align 8, !tbaa [[TBAA7:![0-9]+]] -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr @e, align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: br i1 [[CMP_I]], label %[[FOR_COND1_PREHEADER_I]], label %[[FOR_COND_FOR_END9_CRIT_EDGE_I:.*]] +; CHECK: [[FOR_COND_FOR_END9_CRIT_EDGE_I]]: +; CHECK-NEXT: store i32 0, ptr @g, align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: store i32 2, ptr @h, align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: store i32 [[INC8_I]], ptr @c, align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: br label %[[FOR_END9_I]] +; CHECK: [[FOR_END9_I]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[DOTPRE]], %[[ENTRY_FOR_END9_I_CRIT_EDGE]] ], [ 2, %[[FOR_COND_FOR_END9_CRIT_EDGE_I]] ] +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr @b, align 8, !tbaa [[ANYPTR_TBAA7:![0-9]+]] +; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr @e, align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[CMP10_I:%.*]] = icmp slt i32 [[TMP6]], -1 -; CHECK-NEXT: br i1 [[CMP10_I]], label [[IF_THEN_I:%.*]], label [[FN1_EXIT:%.*]] -; CHECK: if.then.i: -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr @f, align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP5]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: br label [[FN1_EXIT]] -; CHECK: fn1.exit: -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr @a, align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: br i1 [[CMP10_I]], label %[[IF_THEN_I:.*]], label %[[FN1_EXIT:.*]] +; CHECK: [[IF_THEN_I]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr @f, align 4, !tbaa [[INT_TBAA3]] +; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP5]], align 4, !tbaa 
[[INT_TBAA3]] +; CHECK-NEXT: br label %[[FN1_EXIT]] +; CHECK: [[FN1_EXIT]]: +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr @a, align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: +; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] +; CHECK: [[IF_THEN]]: ; CHECK-NEXT: [[PUTS2:%.*]] = tail call i32 @puts(ptr @str.2) ; CHECK-NEXT: tail call void @abort() #[[ATTR3:[0-9]+]] ; CHECK-NEXT: unreachable -; CHECK: if.end: +; CHECK: [[IF_END]]: ; CHECK-NEXT: [[PUTS:%.*]] = tail call i32 @puts(ptr @str) ; CHECK-NEXT: ret i32 0 ; @@ -183,3 +184,11 @@ attributes #3 = { noreturn nounwind optsize } !6 = !{!"Simple C/C++ TBAA"} !7 = !{!8, !8, i64 0} !8 = !{!"any pointer", !5, i64 0} +;. +; CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +; CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[ANYPTR_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; CHECK: [[META8]] = !{!"any pointer", [[META5]], i64 0} +;. 
diff --git a/llvm/test/Transforms/NewGVN/pr33367.ll b/llvm/test/Transforms/NewGVN/pr33367.ll index 597caa2b34ef2..428a053bcc894 100644 --- a/llvm/test/Transforms/NewGVN/pr33367.ll +++ b/llvm/test/Transforms/NewGVN/pr33367.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -aa-pipeline=basic-aa -passes=newgvn -S %s | FileCheck %s ; Verify that we don't accidentally delete intrinsics that aren't SSA copies %DS_struct = type { [32 x ptr], i8, [32 x i16] } @@ -7,47 +7,48 @@ declare i64 @llvm.x86.bmi.bextr.64(i64, i64) #3 define %MNR_struct @f000316011717_2(ptr %pDS, ptr %pCG) #2 { -; CHECK-LABEL: @f000316011717_2( -; CHECK-NEXT: Entry: +; CHECK-LABEL: define %MNR_struct @f000316011717_2( +; CHECK-SAME: ptr [[PDS:%.*]], ptr [[PCG:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[RESTART:%.*]] = alloca [[MNR_STRUCT:%.*]], align 8 -; CHECK-NEXT: [[PCARRY:%.*]] = getelementptr [[DS_STRUCT:%.*]], ptr [[PDS:%.*]], i32 0, i32 1 -; CHECK-NEXT: [[BASE:%.*]] = load ptr, ptr [[PDS]], align 8, !tbaa [[TBAA14:![0-9]+]] +; CHECK-NEXT: [[PCARRY:%.*]] = getelementptr [[DS_STRUCT:%.*]], ptr [[PDS]], i32 0, i32 1 +; CHECK-NEXT: [[BASE:%.*]] = load ptr, ptr [[PDS]], align 8, !tbaa [[BREG_TBAA14:![0-9]+]] ; CHECK-NEXT: [[ABSADDR:%.*]] = getelementptr i64, ptr [[BASE]], i64 9 -; CHECK-NEXT: [[EXTARGET:%.*]] = load i64, ptr [[ABSADDR]], align 8, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[EXTARGET:%.*]] = load i64, ptr [[ABSADDR]], align 8, !tbaa [[MEM_TBAA4:![0-9]+]] ; CHECK-NEXT: [[TEMPLATE:%.*]] = icmp eq i64 [[EXTARGET]], 8593987412 -; CHECK-NEXT: br i1 [[TEMPLATE]], label %"BB3.000316011731#1", label [[BB2_000316011731_5:%.*]] +; CHECK-NEXT: br i1 [[TEMPLATE]], label %"BB3.000316011731#1", label %[[BB2_000316011731_5:.*]] ; CHECK: "BB3.000316011731#1": ; CHECK-NEXT: [[PBASE8:%.*]] = getelementptr [32 x ptr], ptr 
[[PDS]], i64 0, i64 29 -; CHECK-NEXT: [[BASE9:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa [[TBAA14]] +; CHECK-NEXT: [[BASE9:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa [[BREG_TBAA14]] ; CHECK-NEXT: [[ABSADDR1:%.*]] = getelementptr i64, ptr [[BASE9]], i64 7 -; CHECK-NEXT: [[RMEM:%.*]] = load i64, ptr [[ABSADDR1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[RMEM:%.*]] = load i64, ptr [[ABSADDR1]], align 8, !tbaa [[MEM_TBAA4]] ; CHECK-NEXT: [[PWT:%.*]] = getelementptr [[DS_STRUCT]], ptr [[PDS]], i32 0, i32 2 ; CHECK-NEXT: [[PWTE:%.*]] = getelementptr [32 x i16], ptr [[PWT]], i64 0, i64 8593987412 -; CHECK-NEXT: [[SHIFTS:%.*]] = load i16, ptr [[PWTE]], align 2, !tbaa [[TBAA18:![0-9]+]], !invariant.load [[META20:![0-9]+]] +; CHECK-NEXT: [[SHIFTS:%.*]] = load i16, ptr [[PWTE]], align 2, !tbaa [[CONST_TBAA18:![0-9]+]], !invariant.load [[META20:![0-9]+]] ; CHECK-NEXT: [[SLOWJ:%.*]] = icmp eq i16 [[SHIFTS]], 0 -; CHECK-NEXT: br i1 [[SLOWJ]], label [[BB2_000316011731_5]], label %"BB3.000316011731#1.1" -; CHECK: BB2.000316011731.5: +; CHECK-NEXT: br i1 [[SLOWJ]], label %[[BB2_000316011731_5]], label %"BB3.000316011731#1.1" +; CHECK: [[BB2_000316011731_5]]: ; CHECK-NEXT: [[EXTARGET1:%.*]] = and i64 [[EXTARGET]], 137438953471 -; CHECK-NEXT: switch i64 [[EXTARGET1]], label [[EXIT:%.*]] [ +; CHECK-NEXT: switch i64 [[EXTARGET1]], label %[[EXIT:.*]] [ ; CHECK-NEXT: ] ; CHECK: "BB3.000316011731#1.1": ; CHECK-NEXT: [[SHIFTS1:%.*]] = zext i16 [[SHIFTS]] to i64 ; CHECK-NEXT: [[VAL:%.*]] = call i64 @llvm.x86.bmi.bextr.64(i64 [[RMEM]], i64 [[SHIFTS1]]) -; CHECK-NEXT: [[PREG:%.*]] = getelementptr [64 x i64], ptr [[PCG:%.*]], i64 0, i64 12 -; CHECK-NEXT: store i64 [[VAL]], ptr [[PREG]], align 32, !tbaa [[TBAA10:![0-9]+]] +; CHECK-NEXT: [[PREG:%.*]] = getelementptr [64 x i64], ptr [[PCG]], i64 0, i64 12 +; CHECK-NEXT: store i64 [[VAL]], ptr [[PREG]], align 32, !tbaa [[A0_TBAA10:![0-9]+]] ; CHECK-NEXT: [[PREG2:%.*]] = getelementptr [64 x i64], ptr [[PCG]], i64 0, i64 14 -; 
CHECK-NEXT: [[REG:%.*]] = load i64, ptr [[PREG2]], align 16, !tbaa [[TBAA12:![0-9]+]] -; CHECK-NEXT: [[BASE2:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa [[TBAA14]] +; CHECK-NEXT: [[REG:%.*]] = load i64, ptr [[PREG2]], align 16, !tbaa [[A2_TBAA12:![0-9]+]] +; CHECK-NEXT: [[BASE2:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa [[BREG_TBAA14]] ; CHECK-NEXT: [[ABSADDR2:%.*]] = getelementptr i64, ptr [[BASE2]], i64 [[REG]] -; CHECK-NEXT: [[RMEM2:%.*]] = load i64, ptr [[ABSADDR2]], align 8, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: [[RMEM2:%.*]] = load i64, ptr [[ABSADDR2]], align 8, !tbaa [[MEM_TBAA4]] ; CHECK-NEXT: [[PREG7:%.*]] = getelementptr [64 x i64], ptr [[PCG]], i64 0, i64 9 -; CHECK-NEXT: store i64 [[RMEM2]], ptr [[PREG7]], align 8, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: store i64 [[RMEM2]], ptr [[PREG7]], align 8, !tbaa [[X9_TBAA8:![0-9]+]] ; CHECK-NEXT: [[ADD2C279:%.*]] = add i64 [[RMEM2]], [[VAL]] ; CHECK-NEXT: [[CCHK:%.*]] = icmp sge i64 [[ADD2C279]], 0 ; CHECK-NEXT: [[CFL:%.*]] = zext i1 [[CCHK]] to i8 -; CHECK-NEXT: store i8 [[CFL]], ptr [[PCARRY]], align 1, !tbaa [[TBAA16:![0-9]+]] -; CHECK-NEXT: br label [[EXIT]] -; CHECK: Exit: +; CHECK-NEXT: store i8 [[CFL]], ptr [[PCARRY]], align 1, !tbaa [[CARRY_TBAA16:![0-9]+]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RESTART378:%.*]] = load [[MNR_STRUCT]], ptr [[RESTART]], align 8 ; CHECK-NEXT: ret [[MNR_STRUCT]] [[RESTART378]] ; @@ -129,3 +130,24 @@ attributes #3 = { nounwind readnone } !175 = !{!176, !176, i64 0, i32 1} !176 = !{!"const", !3} !181 = !{} +;. 
+; CHECK: [[META0:![0-9]+]] = !{!"tbaa2200"} +; CHECK: [[META2:![0-9]+]] = !{!"data", [[META0]]} +; CHECK: [[META3:![0-9]+]] = !{!"ctrl", [[META0]]} +; CHECK: [[MEM_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"mem", [[META2]]} +; CHECK: [[META7:![0-9]+]] = !{!"grs", [[META2]]} +; CHECK: [[X9_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +; CHECK: [[META9]] = !{!"X9", [[META7]]} +; CHECK: [[A0_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0} +; CHECK: [[META11]] = !{!"A0", [[META7]]} +; CHECK: [[A2_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; CHECK: [[META13]] = !{!"A2", [[META7]]} +; CHECK: [[BREG_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} +; CHECK: [[META15]] = !{!"breg", [[META3]]} +; CHECK: [[CARRY_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +; CHECK: [[META17]] = !{!"carry", [[META3]]} +; CHECK: [[CONST_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0, i32 1} +; CHECK: [[META19]] = !{!"const", [[META3]]} +; CHECK: [[META20]] = !{} +;. 
diff --git a/llvm/test/Transforms/NewGVN/pr34452.ll b/llvm/test/Transforms/NewGVN/pr34452.ll index 9e65349a1b47b..48bdd88e9591a 100644 --- a/llvm/test/Transforms/NewGVN/pr34452.ll +++ b/llvm/test/Transforms/NewGVN/pr34452.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s ;; Ensure we don't crash when simplifying aggregate value expressions source_filename = "bugpoint-output-09f7a24.bc" @@ -7,17 +7,18 @@ source_filename = "bugpoint-output-09f7a24.bc" ; Function Attrs: nounwind uwtable define void @sgrep() local_unnamed_addr #0 { -; CHECK-LABEL: @sgrep( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa [[TBAA1:![0-9]+]] +; CHECK-LABEL: define void @sgrep( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa [[INT_TBAA1:![0-9]+]] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2048, i32 2047 -; CHECK-NEXT: br label [[WHILE_BODY_US:%.*]] -; CHECK: while.body.us: -; CHECK-NEXT: [[START_1230_US:%.*]] = phi i32 [ [[DOT]], [[ENTRY:%.*]] ], [ 0, [[WHILE_BODY_US]] ] +; CHECK-NEXT: br label %[[WHILE_BODY_US:.*]] +; CHECK: [[WHILE_BODY_US]]: +; CHECK-NEXT: [[START_1230_US:%.*]] = phi i32 [ [[DOT]], %[[ENTRY]] ], [ 0, %[[WHILE_BODY_US]] ] ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[START_1230_US]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 [[TMP1]]) -; CHECK-NEXT: br label [[WHILE_BODY_US]] +; CHECK-NEXT: br label %[[WHILE_BODY_US]] ; entry: %0 = load i32, ptr @WHOLELINE, align 4, !tbaa !1 @@ -47,3 +48,9 @@ attributes #1 = { nounwind readnone speculatable } !2 = !{!"int", !3, i64 0} !3 = !{!"omnipotent char", !4, i64 0} !4 = !{!"Simple C/C++ 
TBAA"} +;. +; CHECK: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +;. diff --git a/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll b/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll index a63ca131b5c0d..c1e52b89ea620 100644 --- a/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll +++ b/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=newgvn %s -S | FileCheck %s declare void @use(i32) @@ -7,25 +7,26 @@ declare void @use(i32) ; PredicateInfo are replaced. define i32 @test(ptr %p1, ptr %p2, i1 %c) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[P1:%.*]], align 8, !tbaa [[TBAA0:![0-9]+]] +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[P1]], align 8, !tbaa [[INT_TBAA0:![0-9]+]] ; CHECK-NEXT: [[CMP_1:%.*]] = icmp slt i32 [[LV]], 1 -; CHECK-NEXT: br i1 [[CMP_1]], label [[EXIT:%.*]], label [[IF_FALSE:%.*]] -; CHECK: if.false: -; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT]], label [[FOR_CHECK:%.*]] -; CHECK: for.check: +; CHECK-NEXT: br i1 [[CMP_1]], label %[[EXIT:.*]], label %[[IF_FALSE:.*]] +; CHECK: [[IF_FALSE]]: +; CHECK-NEXT: br i1 [[C]], label %[[EXIT]], label %[[FOR_CHECK:.*]] +; CHECK: [[FOR_CHECK]]: ; CHECK-NEXT: [[CMP_2:%.*]] = icmp sgt i32 [[LV]], 0 -; CHECK-NEXT: br i1 [[CMP_2]], label [[FOR_PH:%.*]], label [[EXIT]] -; CHECK: for.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: 
[[IV:%.*]] = phi i32 [ 0, [[FOR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br i1 [[CMP_2]], label %[[FOR_PH:.*]], label %[[EXIT]] +; CHECK: [[FOR_PH]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[FOR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: call void @use(i32 [[IV]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[CMP_3:%.*]] = icmp ne i32 [[IV_NEXT]], [[LV]] -; CHECK-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[EXIT]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[CMP_3]], label %[[FOR_BODY]], label %[[EXIT]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 [[LV]] ; entry: @@ -59,3 +60,10 @@ exit: ; preds = %for.body, %for.check !2 = !{!"int", !3, i64 0} !3 = !{!"omnipotent char", !4, i64 0} !4 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0} +; CHECK: [[META1]] = !{!"FULL", [[META2]], i64 0, [[META2]], i64 4, [[META3:![0-9]+]], i64 8} +; CHECK: [[META2]] = !{!"int", [[META3]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +;. 
diff --git a/llvm/test/Transforms/NewGVN/tbaa.ll b/llvm/test/Transforms/NewGVN/tbaa.ll index 20c09aa68726a..a90660349f2f4 100644 --- a/llvm/test/Transforms/NewGVN/tbaa.ll +++ b/llvm/test/Transforms/NewGVN/tbaa.ll @@ -1,10 +1,10 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s define i32 @test1(ptr %p, ptr %q) { ; CHECK-LABEL: define i32 @test1( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -17,7 +17,7 @@ define i32 @test1(ptr %p, ptr %q) { define i32 @test2(ptr %p, ptr %q) { ; CHECK-LABEL: define i32 @test2( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -30,7 +30,7 @@ define i32 @test2(ptr %p, ptr %q) { define i32 @test3(ptr %p, ptr %q) { ; CHECK-LABEL: define i32 @test3( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[B_TBAA4:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -43,7 +43,7 @@ define i32 @test3(ptr %p, ptr %q) { define i32 @test4(ptr %p, ptr %q) { ; CHECK-LABEL: define i32 @test4( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[A_TBAA6:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: 
ret i32 [[C]] ; @@ -56,7 +56,7 @@ define i32 @test4(ptr %p, ptr %q) { define i32 @test5(ptr %p, ptr %q) { ; CHECK-LABEL: define i32 @test5( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -69,7 +69,7 @@ define i32 @test5(ptr %p, ptr %q) { define i32 @test6(ptr %p, ptr %q) { ; CHECK-LABEL: define i32 @test6( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[C_TBAA0]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -82,7 +82,7 @@ define i32 @test6(ptr %p, ptr %q) { define i32 @test7(ptr %p, ptr %q) { ; CHECK-LABEL: define i32 @test7( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[SCALAR_TYPE_TBAA7:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -129,7 +129,7 @@ define i32 @test10(ptr %p, ptr %q) { ; and not just the common final access type. ; CHECK-LABEL: define i32 @test10( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA10:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[INT_TBAA10:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -165,17 +165,17 @@ declare i32 @foo(ptr) readonly !9 = !{!"yet another root"} !10 = !{!"node", !9, i64 1} ;. 
-; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[C_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"C", [[META2:![0-9]+]]} ; CHECK: [[META2]] = !{!"A", [[META3:![0-9]+]]} ; CHECK: [[META3]] = !{!"tbaa root"} -; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[B_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} ; CHECK: [[META5]] = !{!"B", [[META2]]} -; CHECK: [[TBAA6]] = !{[[META2]], [[META2]], i64 0} -; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; CHECK: [[A_TBAA6]] = !{[[META2]], [[META2]], i64 0} +; CHECK: [[SCALAR_TYPE_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} ; CHECK: [[META8]] = !{!"scalar type", [[META9:![0-9]+]]} ; CHECK: [[META9]] = !{!"another root"} -; CHECK: [[TBAA10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], i64 0} +; CHECK: [[INT_TBAA10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], i64 0} ; CHECK: [[META11]] = !{!"struct X", [[META12]], i64 0} ; CHECK: [[META12]] = !{!"int", [[META13:![0-9]+]], i64 0} ; CHECK: [[META13]] = !{!"char", [[META3]], i64 0} diff --git a/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll b/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll index d8b28d73f24ee..68f7ee5c64e38 100644 --- a/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll +++ b/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s %struct.t = type { ptr } @@ -8,10 +8,10 @@ define void @test1(ptr nocapture readonly %p, i32 %v) #0 { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ptr readonly captures(none) [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP0]], 
align 4, !tbaa [[TBAA5:![0-9]+]] -; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[ANYPTR_TBAA0:![0-9]+]] +; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] +; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[INT_TBAA5]] ; CHECK-NEXT: ret void ; entry: @@ -27,11 +27,11 @@ entry: define void @test2(ptr nocapture readonly %p, i32 %v) #0 { ; CHECK-LABEL: define void @test2( ; CHECK-SAME: ptr readonly captures(none) [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[ANYPTR_TBAA0]] +; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[ANYPTR_TBAA0]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[ANYPTR_TBAA0]] +; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP1]], align 4, !tbaa [[ANYPTR_TBAA0]] ; CHECK-NEXT: ret void ; entry: @@ -47,11 +47,11 @@ entry: define void @test3(ptr nocapture readonly %p, i32 %v) #0 { ; CHECK-LABEL: define void @test3( ; CHECK-SAME: ptr readonly captures(none) [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load atomic ptr, ptr [[P]] acquire, align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA5]] -; CHECK-NEXT: [[TMP1:%.*]] = load atomic ptr, ptr [[P]] acquire, align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP1]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: 
[[TMP0:%.*]] = load atomic ptr, ptr [[P]] acquire, align 4, !tbaa [[ANYPTR_TBAA0]] +; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[INT_TBAA5]] +; CHECK-NEXT: [[TMP1:%.*]] = load atomic ptr, ptr [[P]] acquire, align 4, !tbaa [[ANYPTR_TBAA0]] +; CHECK-NEXT: store volatile i32 [[V]], ptr [[TMP1]], align 4, !tbaa [[INT_TBAA5]] ; CHECK-NEXT: ret void ; entry: @@ -73,11 +73,11 @@ attributes #0 = { norecurse nounwind } !7 = !{!"int", !4, i64 0} ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0} +; CHECK: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0} ; CHECK: [[META1]] = !{!"", [[META2]], i64 0} ; CHECK: [[META2]] = !{!"any pointer", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} ; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} -; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} ; CHECK: [[META6]] = !{!"int", [[META3]], i64 0} ;. 
diff --git a/llvm/test/Transforms/OpenMP/dead_use.ll b/llvm/test/Transforms/OpenMP/dead_use.ll index b3f5194b10fc3..1c4b2c6fe27a6 100644 --- a/llvm/test/Transforms/OpenMP/dead_use.ll +++ b/llvm/test/Transforms/OpenMP/dead_use.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=openmp-opt-cgscc < %s | FileCheck %s %struct.ident_t = type { i32, i32, i32, i32, ptr } @@ -7,8 +7,8 @@ ; Function Attrs: nounwind uwtable define dso_local i32 @b() #0 { -; CHECK-LABEL: define {{[^@]+}}@b -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-LABEL: define dso_local i32 @b( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @a() ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 @@ -22,8 +22,8 @@ define dso_local i32 @b() #0 { ; Function Attrs: nounwind uwtable define internal i32 @a() #0 { -; CHECK-LABEL: define {{[^@]+}}@a -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-LABEL: define internal i32 @a( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @b() ; CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB0:[0-9]+]], i32 0, ptr @.omp_outlined.) @@ -39,12 +39,12 @@ define internal i32 @a() #0 { ; Function Attrs: norecurse nounwind uwtable define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 { -; CHECK-LABEL: define {{[^@]+}}@.omp_outlined. 
-; CHECK-SAME: (ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-LABEL: define internal void @.omp_outlined.( +; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[TMP3:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = alloca ptr, align 8 -; CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8, !tbaa [[TBAA2:![0-9]+]] -; CHECK-NEXT: store ptr [[TMP1]], ptr [[TMP4]], align 8, !tbaa [[TBAA2]] +; CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]] +; CHECK-NEXT: store ptr [[TMP1]], ptr [[TMP4]], align 8, !tbaa [[ANYPTR_TBAA2]] ; CHECK-NEXT: ret void ; %3 = alloca ptr, align 8 @@ -72,3 +72,9 @@ attributes #2 = { nounwind } !5 = !{!"Simple C/C++ TBAA"} !6 = !{!7} !7 = !{i64 2, i64 -1, i64 -1, i1 true} +;. +; CHECK: [[ANYPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +; CHECK: [[META3]] = !{!"any pointer", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +;. 
diff --git a/llvm/test/Transforms/OpenMP/global_constructor.ll b/llvm/test/Transforms/OpenMP/global_constructor.ll index 1d18e527e1466..ad3955e2b9dd9 100644 --- a/llvm/test/Transforms/OpenMP/global_constructor.ll +++ b/llvm/test/Transforms/OpenMP/global_constructor.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --include-generated-funcs +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --include-generated-funcs --version 6 ; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s %struct.ident_t = type { i32, i32, i32, i32, ptr } @@ -74,34 +74,40 @@ attributes #1 = { convergent nounwind } !12 = !{!"double", !13, i64 0} !13 = !{!"omnipotent char", !14, i64 0} !14 = !{!"Simple C++ TBAA"} -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_85283c04_main_l11 -; CHECK-SAME: (ptr [[DYN:%.*]], ptr nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_85283c04_main_l11( +; CHECK-SAME: ptr [[DYN:%.*]], ptr nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_85283c04_main_l11_kernel_environment, ptr [[DYN]]) #[[ATTR1:[0-9]+]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; CHECK: common.ret: +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; CHECK: [[COMMON_RET]]: ; CHECK-NEXT: ret void -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @_ZL6Device, align 8, !tbaa [[TBAA9:![0-9]+]] +; CHECK: [[USER_CODE_ENTRY]]: +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @_ZL6Device, align 
8, !tbaa [[DOUBLE_TBAA9:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #[[ATTR1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] -; CHECK: region.guarded: -; CHECK-NEXT: store double [[TMP1]], ptr [[X]], align 8, !tbaa [[TBAA9]] -; CHECK-NEXT: br label [[REGION_BARRIER]] -; CHECK: region.barrier: +; CHECK-NEXT: br i1 [[TMP3]], label %[[REGION_GUARDED:.*]], label %[[REGION_BARRIER:.*]] +; CHECK: [[REGION_GUARDED]]: +; CHECK-NEXT: store double [[TMP1]], ptr [[X]], align 8, !tbaa [[DOUBLE_TBAA9]] +; CHECK-NEXT: br label %[[REGION_BARRIER]] +; CHECK: [[REGION_BARRIER]]: ; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1:[0-9]+]], i32 [[TMP2]]) #[[ATTR1]] ; CHECK-NEXT: tail call void @__kmpc_target_deinit() #[[ATTR1]] -; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK-NEXT: br label %[[COMMON_RET]] ; ; -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading__fd02_85283c04_Device_l6_ctor -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define weak ptx_kernel void @__omp_offloading__fd02_85283c04_Device_l6_ctor( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL_I:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[CALL_I2:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR2]] ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[CALL_I]], [[CALL_I2]] -; CHECK-NEXT: store double [[DIV]], ptr @_ZL6Device, align 8, !tbaa [[TBAA9]] +; CHECK-NEXT: store double [[DIV]], ptr @_ZL6Device, align 8, !tbaa [[DOUBLE_TBAA9]] ; CHECK-NEXT: ret void ; +;. +; CHECK: [[DOUBLE_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +; CHECK: [[META10]] = !{!"double", [[META11:![0-9]+]], i64 0} +; CHECK: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0} +; CHECK: [[META12]] = !{!"Simple C++ TBAA"} +;. 
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll index 0272c41d9d1fc..19d447449dee4 100644 --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU ; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX ; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=AMDGPU-DISABLED1 @@ -186,33 +186,33 @@ ; NVPTX-DISABLED2: @x_shared1 = internal addrspace(3) global [4 x i8] poison, align 4 ;. define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] { +; AMDGPU-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5( +; AMDGPU-SAME: ) #[[ATTR0:[0-9]+]] { ; AMDGPU-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; NVPTX-SAME: () #[[ATTR0:[0-9]+]] { +; NVPTX-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5( +; NVPTX-SAME: ) #[[ATTR0:[0-9]+]] { ; NVPTX-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; AMDGPU-DISABLED1-SAME: () #[[ATTR0:[0-9]+]] { +; AMDGPU-DISABLED1-LABEL: define weak ptx_kernel 
void @__omp_offloading_fd02_2044372e_sequential_loop_l5( +; AMDGPU-DISABLED1-SAME: ) #[[ATTR0:[0-9]+]] { ; AMDGPU-DISABLED1-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; AMDGPU-DISABLED2-SAME: () #[[ATTR0:[0-9]+]] { +; AMDGPU-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5( +; AMDGPU-DISABLED2-SAME: ) #[[ATTR0:[0-9]+]] { ; AMDGPU-DISABLED2-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; NVPTX-DISABLED1-SAME: () #[[ATTR0:[0-9]+]] { +; NVPTX-DISABLED1-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5( +; NVPTX-DISABLED1-SAME: ) #[[ATTR0:[0-9]+]] { ; NVPTX-DISABLED1-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; NVPTX-DISABLED2-SAME: () #[[ATTR0:[0-9]+]] { +; NVPTX-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5( +; NVPTX-DISABLED2-SAME: ) #[[ATTR0:[0-9]+]] { ; NVPTX-DISABLED2-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED2-NEXT: ret void ; @@ -221,47 +221,47 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5() } define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; AMDGPU-SAME: () #[[ATTR1:[0-9]+]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug( +; AMDGPU-SAME: ) #[[ATTR1:[0-9]+]] { +; 
AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; AMDGPU-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; NVPTX-SAME: () #[[ATTR1:[0-9]+]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug( +; NVPTX-SAME: ) #[[ATTR1:[0-9]+]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-NEXT: 
[[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; NVPTX-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; AMDGPU-DISABLED1-SAME: () #[[ATTR1:[0-9]+]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug( +; AMDGPU-DISABLED1-SAME: ) #[[ATTR1:[0-9]+]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -269,71 +269,71 @@ define internal void 
@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null) ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED1: is_worker_check: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU-DISABLED1: [[IS_WORKER_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.begin: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp 
eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.finished: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: worker_state_machine.is_active.check: -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 true, label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check1: -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU-DISABLED1: 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1]]: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED1: worker_state_machine.done.barrier: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED1: thread.user_code.check: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED1: common.ret: +; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED1: [[COMMON_RET]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: user_code.entry: +; AMDGPU-DISABLED1: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED2-LABEL: define 
{{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; AMDGPU-DISABLED2-SAME: () #[[ATTR1:[0-9]+]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug( +; AMDGPU-DISABLED2-SAME: ) #[[ATTR1:[0-9]+]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null) ; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED2: common.ret: +; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED2: [[COMMON_RET]]: ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: user_code.entry: +; AMDGPU-DISABLED2: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED2-NEXT: br label 
%[[COMMON_RET]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; NVPTX-DISABLED1-SAME: () #[[ATTR1:[0-9]+]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug( +; NVPTX-DISABLED1-SAME: ) #[[ATTR1:[0-9]+]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -341,66 +341,66 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null) ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED1: is_worker_check: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX-DISABLED1: [[IS_WORKER_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED1: worker_state_machine.begin: +; 
NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED1: worker_state_machine.finished: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: worker_state_machine.is_active.check: -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 true, label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:.*]] +; NVPTX-DISABLED1: 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check1: -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1]]: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED1: worker_state_machine.done.barrier: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED1: thread.user_code.check: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED1: common.ret: +; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED1: [[COMMON_RET]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: user_code.entry: +; NVPTX-DISABLED1: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) 
[[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; NVPTX-DISABLED2-SAME: () #[[ATTR1:[0-9]+]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug( +; NVPTX-DISABLED2-SAME: ) #[[ATTR1:[0-9]+]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null) ; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED2: common.ret: +; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED2: [[COMMON_RET]]: ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: user_code.entry: +; NVPTX-DISABLED2: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], 
ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED2-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -424,125 +424,125 @@ user_code.entry: ; preds = %entry } define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.bound_tid.) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__ -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-NEXT: [[ENTRY:.*]]: ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU: [[FOR_COND]]: +; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: +; AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU: [[FOR_COND_CLEANUP]]: ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-NEXT: ret void -; AMDGPU: for.body: +; AMDGPU: [[FOR_BODY]]: ; AMDGPU-NEXT: [[TMP0:%.*]] = 
addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__ -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-NEXT: [[ENTRY:.*]]: ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-NEXT: br label [[FOR_COND:%.*]] -; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-NEXT: br label %[[FOR_COND:.*]] +; NVPTX: [[FOR_COND]]: +; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: +; NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX: [[FOR_COND_CLEANUP]]: ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-NEXT: ret void -; NVPTX: for.body: +; NVPTX: [[FOR_BODY]]: ; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-NEXT: 
[[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__ -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*]]: ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED1: for.cond: -; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-DISABLED1-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU-DISABLED1: [[FOR_COND]]: +; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED1: for.cond.cleanup: +; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU-DISABLED1: [[FOR_COND_CLEANUP]]: ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-DISABLED1-NEXT: ret void 
-; AMDGPU-DISABLED1: for.body: +; AMDGPU-DISABLED1: [[FOR_BODY]]: ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__ -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*]]: ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED2: for.cond: -; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-DISABLED2-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU-DISABLED2: [[FOR_COND]]: +; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED2: 
for.cond.cleanup: +; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU-DISABLED2: [[FOR_COND_CLEANUP]]: ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: for.body: +; AMDGPU-DISABLED2: [[FOR_BODY]]: ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__ -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*]]: ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED1: for.cond: -; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-DISABLED1-NEXT: br label %[[FOR_COND:.*]] +; NVPTX-DISABLED1: [[FOR_COND]]: +; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 
0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED1: for.cond.cleanup: +; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX-DISABLED1: [[FOR_COND_CLEANUP]]: ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: for.body: +; NVPTX-DISABLED1: [[FOR_BODY]]: ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__ -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*]]: ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED2: for.cond: -; 
NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-DISABLED2-NEXT: br label %[[FOR_COND:.*]] +; NVPTX-DISABLED2: [[FOR_COND]]: +; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED2: for.cond.cleanup: +; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX-DISABLED2: [[FOR_COND_CLEANUP]]: ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: for.body: +; NVPTX-DISABLED2: [[FOR_BODY]]: ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; entry: %captured_vars_addrs = alloca ptr, align 8, addrspace(5) @@ -566,39 +566,39 @@ for.body: ; preds = %for.cond } define internal void @__omp_outlined__1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__1( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__1( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__1 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__1( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__1 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__1( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__1 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) 
{ -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__1( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__1 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__1( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -609,9 +609,9 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__1_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -622,9 +622,9 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void 
@__omp_outlined__1_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -635,9 +635,9 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__1_wrapper( +; AMDGPU-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -648,9 +648,9 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__1_wrapper( +; AMDGPU-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: 
[[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -661,9 +661,9 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__1_wrapper( +; NVPTX-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -674,9 +674,9 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__1_wrapper( +; NVPTX-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -703,47 +703,47 @@ entry: ; Function Attrs: 
alwaysinline convergent norecurse nounwind define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; AMDGPU-SAME: () #[[ATTR0]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20( +; AMDGPU-SAME: ) #[[ATTR0]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define 
{{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; NVPTX-SAME: () #[[ATTR0]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20( +; NVPTX-SAME: ) #[[ATTR0]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define weak ptx_kernel void 
@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20( +; AMDGPU-DISABLED1-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -751,71 +751,71 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null) ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED1: is_worker_check: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU-DISABLED1: [[IS_WORKER_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.begin: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; 
AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.finished: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: worker_state_machine.is_active.check: -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 true, label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; 
AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check1: -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1]]: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED1: worker_state_machine.done.barrier: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED1: thread.user_code.check: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED1: common.ret: +; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED1: [[COMMON_RET]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: user_code.entry: +; AMDGPU-DISABLED1: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, 
!tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20( +; AMDGPU-DISABLED2-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null) ; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED2: common.ret: +; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED2: [[COMMON_RET]]: ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: user_code.entry: +; AMDGPU-DISABLED2: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr 
addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED2-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; NVPTX-DISABLED1-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20( +; NVPTX-DISABLED1-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -823,66 +823,66 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null) ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED1: is_worker_check: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX-DISABLED1: [[IS_WORKER_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; 
NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED1: worker_state_machine.begin: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED1: worker_state_machine.finished: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: worker_state_machine.is_active.check: -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED1: 
[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 true, label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check1: -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1]]: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED1: worker_state_machine.done.barrier: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED1: thread.user_code.check: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; 
NVPTX-DISABLED1: common.ret: +; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED1: [[COMMON_RET]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: user_code.entry: +; NVPTX-DISABLED1: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; NVPTX-DISABLED2-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20( +; NVPTX-DISABLED2-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null) ; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label 
[[COMMON_RET:%.*]] -; NVPTX-DISABLED2: common.ret: +; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED2: [[COMMON_RET]]: ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: user_code.entry: +; NVPTX-DISABLED2: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED2-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -906,140 +906,140 @@ user_code.entry: ; preds = %entry } define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__2( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-NEXT: [[ENTRY:.*]]: ; AMDGPU-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; AMDGPU-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr ; AMDGPU-NEXT: call void @use(ptr captures(none) [[MALLOC_CAST]]) #[[ATTR7]] -; AMDGPU-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU: [[FOR_COND]]: +; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: +; AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU: [[FOR_COND_CLEANUP]]: ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void -; AMDGPU: for.body: +; AMDGPU: [[FOR_BODY]]: ; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-NEXT: 
[[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +; AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__2( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-NEXT: [[ENTRY:.*]]: ; NVPTX-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; NVPTX-NEXT: call void @use(ptr captures(none) [[X_H2S]]) #[[ATTR7]] -; NVPTX-NEXT: br label [[FOR_COND:%.*]] -; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-NEXT: br label %[[FOR_COND:.*]] +; NVPTX: [[FOR_COND]]: +; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: +; NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX: [[FOR_COND_CLEANUP]]: ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void -; NVPTX: for.body: +; NVPTX: [[FOR_BODY]]: ; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr 
[[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +; NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__2 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__2( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*]]: ; AMDGPU-DISABLED1-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; AMDGPU-DISABLED1-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr ; AMDGPU-DISABLED1-NEXT: call void @use(ptr captures(none) [[MALLOC_CAST]]) #[[ATTR7]] -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED1: for.cond: -; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-DISABLED1-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU-DISABLED1: [[FOR_COND]]: +; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED1: for.cond.cleanup: +; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU-DISABLED1: [[FOR_COND_CLEANUP]]: ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: for.body: +; AMDGPU-DISABLED1: [[FOR_BODY]]: ; 
AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__2 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__2( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*]]: ; AMDGPU-DISABLED2-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; AMDGPU-DISABLED2-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr ; AMDGPU-DISABLED2-NEXT: call void @use(ptr captures(none) [[MALLOC_CAST]]) #[[ATTR7]] -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED2: for.cond: -; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-DISABLED2-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU-DISABLED2: [[FOR_COND]]: +; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], 
%[[FOR_BODY:.*]] ] ; AMDGPU-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED2: for.cond.cleanup: +; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU-DISABLED2: [[FOR_COND_CLEANUP]]: ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: for.body: +; AMDGPU-DISABLED2: [[FOR_BODY]]: ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__2 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__2( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*]]: ; NVPTX-DISABLED1-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4 ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; NVPTX-DISABLED1-NEXT: call void @use(ptr captures(none) 
[[X_H2S]]) #[[ATTR7]] -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED1: for.cond: -; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-DISABLED1-NEXT: br label %[[FOR_COND:.*]] +; NVPTX-DISABLED1: [[FOR_COND]]: +; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED1: for.cond.cleanup: +; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX-DISABLED1: [[FOR_COND_CLEANUP]]: ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: for.body: +; NVPTX-DISABLED1: [[FOR_BODY]]: ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__2 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__2( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; 
NVPTX-DISABLED2-NEXT: [[ENTRY:.*]]: ; NVPTX-DISABLED2-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4 ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; NVPTX-DISABLED2-NEXT: call void @use(ptr captures(none) [[X_H2S]]) #[[ATTR7]] -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED2: for.cond: -; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-DISABLED2-NEXT: br label %[[FOR_COND:.*]] +; NVPTX-DISABLED2: [[FOR_COND]]: +; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED2: for.cond.cleanup: +; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX-DISABLED2: [[FOR_COND_CLEANUP]]: ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: for.body: +; NVPTX-DISABLED2: [[FOR_BODY]]: ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; entry: %captured_vars_addrs 
= alloca ptr, align 8, addrspace(5) @@ -1066,39 +1066,39 @@ for.body: ; preds = %for.cond } define internal void @__omp_outlined__3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__3( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__3( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__3 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__3( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__3 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__3( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; 
-; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__3 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__3( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__3 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__3( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -1109,9 +1109,9 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__3_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1122,9 +1122,9 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; NVPTX-SAME: (i16 
zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__3_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1135,9 +1135,9 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__3_wrapper( +; AMDGPU-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1148,9 +1148,9 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__3_wrapper( +; AMDGPU-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; 
AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1161,9 +1161,9 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__3_wrapper( +; NVPTX-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1174,9 +1174,9 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__3_wrapper( +; NVPTX-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = 
alloca ptr, align 8, addrspace(5) @@ -1203,47 +1203,47 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; AMDGPU-SAME: () #[[ATTR0]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35( +; AMDGPU-SAME: ) #[[ATTR0]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label 
%[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; NVPTX-SAME: () #[[ATTR0]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35( +; NVPTX-SAME: ) #[[ATTR0]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define weak 
ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35( +; AMDGPU-DISABLED1-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -1251,71 +1251,71 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null) ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED1: is_worker_check: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU-DISABLED1: [[IS_WORKER_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.begin: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: 
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.finished: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: worker_state_machine.is_active.check: -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 true, label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; 
AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check1: -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1]]: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED1: worker_state_machine.done.barrier: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED1: thread.user_code.check: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED1: common.ret: +; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED1: [[COMMON_RET]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: user_code.entry: +; AMDGPU-DISABLED1: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, 
!tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35( +; AMDGPU-DISABLED2-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null) ; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED2: common.ret: +; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED2: [[COMMON_RET]]: ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: user_code.entry: +; AMDGPU-DISABLED2: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr 
addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED2-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; NVPTX-DISABLED1-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35( +; NVPTX-DISABLED1-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -1323,66 +1323,66 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null) ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED1: is_worker_check: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX-DISABLED1: [[IS_WORKER_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; 
NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED1: worker_state_machine.begin: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED1: worker_state_machine.finished: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: worker_state_machine.is_active.check: -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED1: 
[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 true, label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check1: -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1]]: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED1: worker_state_machine.done.barrier: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED1: thread.user_code.check: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; 
NVPTX-DISABLED1: common.ret: +; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED1: [[COMMON_RET]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: user_code.entry: +; NVPTX-DISABLED1: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; NVPTX-DISABLED2-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35( +; NVPTX-DISABLED2-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null) ; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label 
[[COMMON_RET:%.*]] -; NVPTX-DISABLED2: common.ret: +; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED2: [[COMMON_RET]]: ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: user_code.entry: +; NVPTX-DISABLED2: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED2-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -1406,131 +1406,131 @@ user_code.entry: ; preds = %entry } define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__4 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__4( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-NEXT: [[ENTRY:.*]]: ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU: [[FOR_COND]]: +; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: +; AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU: [[FOR_COND_CLEANUP]]: ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void -; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU: [[FOR_BODY]]: +; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]] ; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr 
@__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__4( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-NEXT: [[ENTRY:.*]]: ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-NEXT: br label [[FOR_COND:%.*]] -; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-NEXT: br label %[[FOR_COND:.*]] +; NVPTX: [[FOR_COND]]: +; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: +; NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX: [[FOR_COND_CLEANUP]]: ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void -; NVPTX: for.body: -; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX: [[FOR_BODY]]: +; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]] ; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] 
+; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__4 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__4( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*]]: ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED1: for.cond: -; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-DISABLED1-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU-DISABLED1: [[FOR_COND]]: +; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED1: for.cond.cleanup: +; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU-DISABLED1: [[FOR_COND_CLEANUP]]: ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast 
(ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-DISABLED1: [[FOR_BODY]]: +; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]] ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__4 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__4( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*]]: ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED2: for.cond: -; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-DISABLED2-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU-DISABLED2: [[FOR_COND]]: +; AMDGPU-DISABLED2-NEXT: 
[[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED2: for.cond.cleanup: +; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU-DISABLED2: [[FOR_COND_CLEANUP]]: ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-DISABLED2: [[FOR_BODY]]: +; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]] ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__4 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__4( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias 
[[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*]]: ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED1: for.cond: -; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-DISABLED1-NEXT: br label %[[FOR_COND:.*]] +; NVPTX-DISABLED1: [[FOR_COND]]: +; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED1: for.cond.cleanup: +; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX-DISABLED1: [[FOR_COND_CLEANUP]]: ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-DISABLED1: [[FOR_BODY]]: +; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]] ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; 
NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__4 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__4( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*]]: ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED2: for.cond: -; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-DISABLED2-NEXT: br label %[[FOR_COND:.*]] +; NVPTX-DISABLED2: [[FOR_COND]]: +; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED2: for.cond.cleanup: +; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX-DISABLED2: [[FOR_COND_CLEANUP]]: ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-DISABLED2: [[FOR_BODY]]: +; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, 
!tbaa [[ANYPTR_TBAA20:![0-9]+]] ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; entry: %captured_vars_addrs = alloca ptr, align 8, addrspace(5) @@ -1557,57 +1557,57 @@ for.body: ; preds = %for.cond } define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-LABEL: define internal void @__omp_outlined__5( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr 
noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-LABEL: define internal void @__omp_outlined__5( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-DISABLED1-NEXT: entry: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__5( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-DISABLED2-NEXT: 
entry: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__5( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-DISABLED1-NEXT: entry: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__5( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-DISABLED2-NEXT: 
entry: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__5( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -1621,9 +1621,9 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__5_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1632,13 +1632,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; 
AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__5_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1647,13 +1647,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__5_wrapper( +; AMDGPU-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, 
addrspace(5) @@ -1662,13 +1662,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__5_wrapper( +; AMDGPU-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1677,13 +1677,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; 
AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__5_wrapper( +; NVPTX-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1692,13 +1692,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__5_wrapper( +; NVPTX-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; 
NVPTX-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -1707,7 +1707,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -1729,47 +1729,47 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; AMDGPU-SAME: () #[[ATTR0]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50( +; AMDGPU-SAME: ) #[[ATTR0]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null) ; AMDGPU-NEXT: 
[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; NVPTX-SAME: () #[[ATTR0]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50( +; NVPTX-SAME: ) #[[ATTR0]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 
[[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50( +; AMDGPU-DISABLED1-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -1777,71 +1777,71 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null) ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label 
[[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED1: is_worker_check: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU-DISABLED1: [[IS_WORKER_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.begin: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.finished: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: 
worker_state_machine.is_active.check: -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 true, label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check1: -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1]]: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED1: worker_state_machine.done.barrier: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED1: 
[[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED1: thread.user_code.check: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED1: common.ret: +; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED1: [[COMMON_RET]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: user_code.entry: +; AMDGPU-DISABLED1: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50( +; AMDGPU-DISABLED2-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = 
addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null) ; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED2: common.ret: +; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED2: [[COMMON_RET]]: ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: user_code.entry: +; AMDGPU-DISABLED2: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED2-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; NVPTX-DISABLED1-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50( +; NVPTX-DISABLED1-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -1849,66 +1849,66 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null) ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED1: is_worker_check: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX-DISABLED1: [[IS_WORKER_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED1: worker_state_machine.begin: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load 
ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED1: worker_state_machine.finished: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: worker_state_machine.is_active.check: -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 true, label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check1: -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED1-NEXT: br label 
%[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1]]: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED1: worker_state_machine.done.barrier: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED1: thread.user_code.check: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED1: common.ret: +; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED1: [[COMMON_RET]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: user_code.entry: +; NVPTX-DISABLED1: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; 
NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; NVPTX-DISABLED2-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50( +; NVPTX-DISABLED2-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null) ; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED2: common.ret: +; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED2: [[COMMON_RET]]: ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: user_code.entry: +; NVPTX-DISABLED2: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] +; 
NVPTX-DISABLED2-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -1932,163 +1932,163 @@ user_code.entry: ; preds = %entry } define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias %.bound_tid.) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__6 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__6( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-NEXT: br label [[REGION_CHECK_TID:%.*]] -; AMDGPU: region.check.tid: +; AMDGPU-NEXT: br label %[[REGION_CHECK_TID:.*]] +; AMDGPU: [[REGION_CHECK_TID]]: ; AMDGPU-NEXT: [[TMP0:%.*]] = call fastcc i32 @__kmpc_get_hardware_thread_id_in_block() ; AMDGPU-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; AMDGPU-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] -; AMDGPU: region.guarded: -; AMDGPU-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] -; AMDGPU-NEXT: br label [[REGION_GUARDED_END:%.*]] -; AMDGPU: region.guarded.end: -; AMDGPU-NEXT: br label [[REGION_BARRIER]] -; AMDGPU: region.barrier: +; AMDGPU-NEXT: br i1 [[TMP1]], label %[[REGION_GUARDED:.*]], label %[[REGION_BARRIER:.*]] +; AMDGPU: [[REGION_GUARDED]]: +; AMDGPU-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[INT_TBAA12]] +; AMDGPU-NEXT: br label %[[REGION_GUARDED_END:.*]] +; AMDGPU: [[REGION_GUARDED_END]]: +; AMDGPU-NEXT: br label %[[REGION_BARRIER]] +; AMDGPU: [[REGION_BARRIER]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP0]]) -; 
AMDGPU-NEXT: br label [[REGION_EXIT:%.*]] -; AMDGPU: region.exit: -; AMDGPU-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[REGION_EXIT]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-NEXT: br label %[[REGION_EXIT:.*]] +; AMDGPU: [[REGION_EXIT]]: +; AMDGPU-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU: [[FOR_COND]]: +; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[REGION_EXIT]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: +; AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU: [[FOR_COND_CLEANUP]]: ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void -; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU: [[FOR_BODY]]: +; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20]] ; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +; AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-NEXT: 
entry: +; NVPTX-LABEL: define internal void @__omp_outlined__6( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-NEXT: br label [[REGION_CHECK_TID:%.*]] -; NVPTX: region.check.tid: +; NVPTX-NEXT: br label %[[REGION_CHECK_TID:.*]] +; NVPTX: [[REGION_CHECK_TID]]: ; NVPTX-NEXT: [[TMP0:%.*]] = call fastcc i32 @__kmpc_get_hardware_thread_id_in_block() ; NVPTX-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; NVPTX-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] -; NVPTX: region.guarded: -; NVPTX-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] -; NVPTX-NEXT: br label [[REGION_GUARDED_END:%.*]] -; NVPTX: region.guarded.end: -; NVPTX-NEXT: br label [[REGION_BARRIER]] -; NVPTX: region.barrier: +; NVPTX-NEXT: br i1 [[TMP1]], label %[[REGION_GUARDED:.*]], label %[[REGION_BARRIER:.*]] +; NVPTX: [[REGION_GUARDED]]: +; NVPTX-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[INT_TBAA12]] +; NVPTX-NEXT: br label %[[REGION_GUARDED_END:.*]] +; NVPTX: [[REGION_GUARDED_END]]: +; NVPTX-NEXT: br label %[[REGION_BARRIER]] +; NVPTX: [[REGION_BARRIER]]: ; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[REGION_EXIT:%.*]] -; NVPTX: region.exit: -; NVPTX-NEXT: br label [[FOR_COND:%.*]] -; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[REGION_EXIT]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-NEXT: br label %[[REGION_EXIT:.*]] +; NVPTX: [[REGION_EXIT]]: +; NVPTX-NEXT: br label %[[FOR_COND:.*]] +; NVPTX: [[FOR_COND]]: +; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[REGION_EXIT]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; 
NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: +; NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX: [[FOR_COND_CLEANUP]]: ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void -; NVPTX: for.body: -; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX: [[FOR_BODY]]: +; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +; NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__6 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__6( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*]]: ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; 
AMDGPU-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED1: for.cond: -; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[INT_TBAA12]] +; AMDGPU-DISABLED1-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU-DISABLED1: [[FOR_COND]]: +; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED1: for.cond.cleanup: +; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU-DISABLED1: [[FOR_COND_CLEANUP]]: ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED1: [[FOR_BODY]]: +; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20]] ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw 
i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__6 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__6( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*]]: ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED2: for.cond: -; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[INT_TBAA12]] +; AMDGPU-DISABLED2-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU-DISABLED2: [[FOR_COND]]: +; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED2: for.cond.cleanup: +; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU-DISABLED2: [[FOR_COND_CLEANUP]]: ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr 
addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED2: [[FOR_BODY]]: +; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20]] ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__6 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__6( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*]]: ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED1: for.cond: -; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr 
addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[INT_TBAA12]] +; NVPTX-DISABLED1-NEXT: br label %[[FOR_COND:.*]] +; NVPTX-DISABLED1: [[FOR_COND]]: +; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED1: for.cond.cleanup: +; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX-DISABLED1: [[FOR_COND_CLEANUP]]: ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED1: [[FOR_BODY]]: +; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__6 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED2-NEXT: entry: +; 
NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__6( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*]]: ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED2: for.cond: -; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[INT_TBAA12]] +; NVPTX-DISABLED2-NEXT: br label %[[FOR_COND:.*]] +; NVPTX-DISABLED2: [[FOR_COND]]: +; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED2: for.cond.cleanup: +; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX-DISABLED2: [[FOR_COND_CLEANUP]]: ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED2: [[FOR_BODY]]: +; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) -; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr 
addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; entry: %captured_vars_addrs = alloca ptr, align 8, addrspace(5) @@ -2116,57 +2116,57 @@ for.body: ; preds = %for.cond } define internal void @__omp_outlined__7(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-LABEL: define internal void @__omp_outlined__7( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, 
!tbaa [[TBAA12]] +; NVPTX-LABEL: define internal void @__omp_outlined__7( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-DISABLED1-NEXT: entry: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__7( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-DISABLED2-NEXT: entry: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-LABEL: define internal 
void @__omp_outlined__7( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-DISABLED1-NEXT: entry: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__7( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-DISABLED2-NEXT: entry: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; 
NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__7( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -2180,9 +2180,9 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__7_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -2191,13 +2191,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; AMDGPU-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1_CAST]], ptr 
[[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__7_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -2206,13 +2206,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__7_wrapper( +; AMDGPU-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -2221,13 +2221,13 @@ define internal void 
@__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__7_wrapper( +; AMDGPU-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -2236,13 +2236,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1_CAST]], 
ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__7_wrapper( +; NVPTX-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -2251,13 +2251,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__7_wrapper( +; NVPTX-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) 
[[DOTADDR1]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -2266,7 +2266,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -2288,9 +2288,9 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; AMDGPU-SAME: () #[[ATTR0]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65( +; AMDGPU-SAME: ) #[[ATTR0]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -2298,47 +2298,47 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null) ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 
[[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: is_worker_check: +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU: [[IS_WORKER_CHECK]]: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_BEGIN]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; 
AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; NVPTX-SAME: () #[[ATTR0]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65( +; NVPTX-SAME: ) #[[ATTR0]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -2346,46 +2346,46 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; NVPTX-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null) ; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: is_worker_check: +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX: [[IS_WORKER_CHECK]]: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_BEGIN]]: 
; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: +; NVPTX: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label 
[[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65( +; AMDGPU-DISABLED1-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -2393,65 +2393,65 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null) ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; 
AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED1: is_worker_check: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU-DISABLED1: [[IS_WORKER_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.begin: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.finished: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU-DISABLED1: 
[[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: worker_state_machine.is_active.check: -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; AMDGPU-DISABLED1-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED1: worker_state_machine.done.barrier: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED1: thread.user_code.check: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED1: common.ret: +; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label 
%[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED1: [[COMMON_RET]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: user_code.entry: +; AMDGPU-DISABLED1: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65( +; AMDGPU-DISABLED2-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null) ; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED2: common.ret: +; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED2: [[COMMON_RET]]: ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: user_code.entry: +; AMDGPU-DISABLED2: [[USER_CODE_ENTRY]]: ; 
AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED2-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; NVPTX-DISABLED1-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65( +; NVPTX-DISABLED1-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -2459,60 +2459,60 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null) ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED1: is_worker_check: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX-DISABLED1: [[IS_WORKER_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-DISABLED1-NEXT: 
[[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED1: worker_state_machine.begin: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED1: worker_state_machine.finished: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: worker_state_machine.is_active.check: -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; 
NVPTX-DISABLED1-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED1: worker_state_machine.done.barrier: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED1: thread.user_code.check: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED1: common.ret: +; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED1: [[COMMON_RET]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: user_code.entry: +; NVPTX-DISABLED1: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; NVPTX-DISABLED2-SAME: () #[[ATTR0]] { 
-; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65( +; NVPTX-DISABLED2-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null) ; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED2: common.ret: +; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED2: [[COMMON_RET]]: ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: user_code.entry: +; NVPTX-DISABLED2: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED2-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -2536,39 +2536,39 @@ user_code.entry: ; preds = %entry } define internal void @__omp_outlined__8(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__8 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__8( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__8 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__8( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__8 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__8( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__8 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__8( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__8 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED1-NEXT: 
entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__8( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__8 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__8( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -2579,255 +2579,255 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; AMDGPU-SAME: () #[[ATTR0]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74( +; AMDGPU-SAME: ) #[[ATTR0]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null) ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: is_worker_check: +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label 
%[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU: [[IS_WORKER_CHECK]]: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_BEGIN]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check: +; AMDGPU: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label 
%[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute: +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; AMDGPU-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-NEXT: 
[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; AMDGPU-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] ; AMDGPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; NVPTX-SAME: () #[[ATTR0]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74( +; NVPTX-SAME: ) #[[ATTR0]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null) ; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: is_worker_check: +; NVPTX-NEXT: br i1 
[[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX: [[IS_WORKER_CHECK]]: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.check: +; NVPTX: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX: 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute: +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; NVPTX-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label 
[[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; NVPTX-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] ; NVPTX-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74( +; AMDGPU-DISABLED1-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null) ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED1: is_worker_check: +; 
AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU-DISABLED1: [[IS_WORKER_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.begin: +; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.finished: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: worker_state_machine.is_active.check: -; AMDGPU-DISABLED1-NEXT: br i1 
[[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check: +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID -; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; AMDGPU-DISABLED1-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; 
AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED1: worker_state_machine.done.barrier: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED1: thread.user_code.check: +; AMDGPU-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED1: common.ret: +; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED1: [[COMMON_RET]]: ; AMDGPU-DISABLED1-NEXT: ret void -; AMDGPU-DISABLED1: user_code.entry: +; AMDGPU-DISABLED1: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED2-NEXT: entry: +; 
AMDGPU-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74( +; AMDGPU-DISABLED2-SAME: ) #[[ATTR0]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null) ; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED2: common.ret: +; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU-DISABLED2: [[COMMON_RET]]: ; AMDGPU-DISABLED2-NEXT: ret void -; AMDGPU-DISABLED2: user_code.entry: +; AMDGPU-DISABLED2: [[USER_CODE_ENTRY]]: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED2-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; NVPTX-DISABLED1-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define weak 
ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74( +; NVPTX-DISABLED1-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null) ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED1: is_worker_check: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX-DISABLED1: [[IS_WORKER_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED1: worker_state_machine.begin: +; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = 
load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED1: worker_state_machine.finished: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: worker_state_machine.is_active.check: -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.check: +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID -; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE]]: ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED1: 
worker_state_machine.parallel_region.fallback.execute: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; NVPTX-DISABLED1-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED1: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED1: worker_state_machine.done.barrier: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED1: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED1: thread.user_code.check: +; NVPTX-DISABLED1-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED1: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED1: common.ret: +; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED1: [[COMMON_RET]]: ; NVPTX-DISABLED1-NEXT: ret void -; NVPTX-DISABLED1: user_code.entry: +; NVPTX-DISABLED1: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = call i32 
@__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED1-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; NVPTX-DISABLED2-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74( +; NVPTX-DISABLED2-SAME: ) #[[ATTR0]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null) ; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED2: common.ret: +; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX-DISABLED2: [[COMMON_RET]]: ; NVPTX-DISABLED2-NEXT: ret void -; NVPTX-DISABLED2: user_code.entry: +; NVPTX-DISABLED2: [[USER_CODE_ENTRY]]: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr 
[[TMP2]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED2-NEXT: br label %[[COMMON_RET]] ; entry: %captured_vars_addrs = alloca ptr, align 8, addrspace(5) @@ -2850,39 +2850,39 @@ user_code.entry: ; preds = %entry ; Function Attrs: alwaysinline convergent nounwind define internal void @.omp_outlined.(i32 %.global_tid., ptr noalias %.part_id., ptr noalias %.privates., ptr noalias %.copy_fn., ptr %.task_t., ptr noalias %__context) #2 { -; AMDGPU-LABEL: define {{[^@]+}}@.omp_outlined. -; AMDGPU-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @.omp_outlined.( +; AMDGPU-SAME: i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@.omp_outlined. 
-; NVPTX-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @.omp_outlined.( +; NVPTX-SAME: i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@.omp_outlined. -; AMDGPU-DISABLED1-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @.omp_outlined.( +; AMDGPU-DISABLED1-SAME: i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@.omp_outlined. 
-; AMDGPU-DISABLED2-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @.omp_outlined.( +; AMDGPU-DISABLED2-SAME: i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@.omp_outlined. -; NVPTX-DISABLED1-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @.omp_outlined.( +; NVPTX-DISABLED1-SAME: i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@.omp_outlined. 
-; NVPTX-DISABLED2-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @.omp_outlined.( +; NVPTX-DISABLED2-SAME: i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -2925,28 +2925,28 @@ declare void @unknowni32p(ptr) #7 declare void @llvm.lifetime.start.p0(ptr captures(none)) #8 define weak i32 @__kmpc_target_init(ptr %0, ptr %1) { -; AMDGPU-LABEL: define {{[^@]+}}@__kmpc_target_init -; AMDGPU-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; AMDGPU-LABEL: define weak i32 @__kmpc_target_init( +; AMDGPU-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; AMDGPU-NEXT: ret i32 0 ; -; NVPTX-LABEL: define {{[^@]+}}@__kmpc_target_init -; NVPTX-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; NVPTX-LABEL: define weak i32 @__kmpc_target_init( +; NVPTX-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; NVPTX-NEXT: ret i32 0 ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__kmpc_target_init -; AMDGPU-DISABLED1-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; AMDGPU-DISABLED1-LABEL: define weak i32 @__kmpc_target_init( +; AMDGPU-DISABLED1-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; AMDGPU-DISABLED1-NEXT: ret i32 0 ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__kmpc_target_init -; AMDGPU-DISABLED2-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; AMDGPU-DISABLED2-LABEL: define weak i32 @__kmpc_target_init( +; AMDGPU-DISABLED2-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; AMDGPU-DISABLED2-NEXT: ret i32 0 ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__kmpc_target_init -; 
NVPTX-DISABLED1-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; NVPTX-DISABLED1-LABEL: define weak i32 @__kmpc_target_init( +; NVPTX-DISABLED1-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; NVPTX-DISABLED1-NEXT: ret i32 0 ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__kmpc_target_init -; NVPTX-DISABLED2-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; NVPTX-DISABLED2-LABEL: define weak i32 @__kmpc_target_init( +; NVPTX-DISABLED2-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; NVPTX-DISABLED2-NEXT: ret i32 0 ; ret i32 0 @@ -2969,39 +2969,39 @@ declare i32 @__kmpc_global_thread_num(ptr) #3 declare void @__kmpc_target_deinit() define internal void @__omp_outlined__9(ptr noalias %.global_tid., ptr noalias %.bound_tid.) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__9( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__9( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__9 -; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__9( +; AMDGPU-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: call void @unknown() 
#[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__9 -; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__9( +; AMDGPU-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__9 -; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__9( +; NVPTX-DISABLED1-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__9 -; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__9( +; NVPTX-DISABLED2-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void ; @@ -3012,9 +3012,9 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__9_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; 
AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -3025,9 +3025,9 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__9_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -3038,9 +3038,9 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED1-NEXT: entry: +; AMDGPU-DISABLED1-LABEL: define internal void @__omp_outlined__9_wrapper( +; AMDGPU-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-DISABLED1-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -3051,9 +3051,9 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #1 { ; 
AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; -; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED2-NEXT: entry: +; AMDGPU-DISABLED2-LABEL: define internal void @__omp_outlined__9_wrapper( +; AMDGPU-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-DISABLED2-NEXT: [[ENTRY:.*:]] ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -3064,9 +3064,9 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; -; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED1-NEXT: entry: +; NVPTX-DISABLED1-LABEL: define internal void @__omp_outlined__9_wrapper( +; NVPTX-DISABLED1-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-DISABLED1-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED1-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -3077,9 +3077,9 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; -; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper 
-; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED2-NEXT: entry: +; NVPTX-DISABLED2-LABEL: define internal void @__omp_outlined__9_wrapper( +; NVPTX-DISABLED2-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-DISABLED2-NEXT: [[ENTRY:.*:]] ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-DISABLED2-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -3237,7 +3237,7 @@ attributes #9 = { alwaysinline } ; AMDGPU: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; AMDGPU: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} ; AMDGPU: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} ; AMDGPU: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} ; AMDGPU: [[META15]] = !{!"Simple C/C++ TBAA"} @@ -3245,7 +3245,7 @@ attributes #9 = { alwaysinline } ; AMDGPU: [[META17]] = !{!"llvm.loop.mustprogress"} ; AMDGPU: [[META18]] = !{!"llvm.loop.unroll.disable"} ; AMDGPU: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} -; AMDGPU: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; AMDGPU: [[META21]] = !{!"any pointer", [[META14]], i64 0} ; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} ; AMDGPU: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} @@ -3262,7 +3262,7 @@ attributes #9 = { alwaysinline } ; NVPTX: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; NVPTX: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} 
+; NVPTX: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} ; NVPTX: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} ; NVPTX: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} ; NVPTX: [[META15]] = !{!"Simple C/C++ TBAA"} @@ -3270,7 +3270,7 @@ attributes #9 = { alwaysinline } ; NVPTX: [[META17]] = !{!"llvm.loop.mustprogress"} ; NVPTX: [[META18]] = !{!"llvm.loop.unroll.disable"} ; NVPTX: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} -; NVPTX: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; NVPTX: [[META21]] = !{!"any pointer", [[META14]], i64 0} ; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} ; NVPTX: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} @@ -3287,7 +3287,7 @@ attributes #9 = { alwaysinline } ; AMDGPU-DISABLED1: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; AMDGPU-DISABLED1: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; AMDGPU-DISABLED1: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU-DISABLED1: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU-DISABLED1: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} ; AMDGPU-DISABLED1: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} ; AMDGPU-DISABLED1: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} ; AMDGPU-DISABLED1: [[META15]] = !{!"Simple C/C++ TBAA"} @@ -3295,7 +3295,7 @@ attributes #9 = { alwaysinline } ; AMDGPU-DISABLED1: [[META17]] = !{!"llvm.loop.mustprogress"} ; AMDGPU-DISABLED1: [[META18]] = !{!"llvm.loop.unroll.disable"} ; AMDGPU-DISABLED1: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} -; AMDGPU-DISABLED1: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU-DISABLED1: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; AMDGPU-DISABLED1: [[META21]] = !{!"any pointer", [[META14]], i64 0} ; AMDGPU-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], 
[[META18]]} ; AMDGPU-DISABLED1: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} @@ -3312,7 +3312,7 @@ attributes #9 = { alwaysinline } ; AMDGPU-DISABLED2: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; AMDGPU-DISABLED2: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; AMDGPU-DISABLED2: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU-DISABLED2: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU-DISABLED2: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} ; AMDGPU-DISABLED2: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} ; AMDGPU-DISABLED2: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} ; AMDGPU-DISABLED2: [[META15]] = !{!"Simple C/C++ TBAA"} @@ -3320,7 +3320,7 @@ attributes #9 = { alwaysinline } ; AMDGPU-DISABLED2: [[META17]] = !{!"llvm.loop.mustprogress"} ; AMDGPU-DISABLED2: [[META18]] = !{!"llvm.loop.unroll.disable"} ; AMDGPU-DISABLED2: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} -; AMDGPU-DISABLED2: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU-DISABLED2: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; AMDGPU-DISABLED2: [[META21]] = !{!"any pointer", [[META14]], i64 0} ; AMDGPU-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} ; AMDGPU-DISABLED2: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} @@ -3337,7 +3337,7 @@ attributes #9 = { alwaysinline } ; NVPTX-DISABLED1: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; NVPTX-DISABLED1: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; NVPTX-DISABLED1: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX-DISABLED1: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX-DISABLED1: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} ; NVPTX-DISABLED1: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} ; NVPTX-DISABLED1: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} ; NVPTX-DISABLED1: [[META15]] = !{!"Simple C/C++ 
TBAA"} @@ -3345,7 +3345,7 @@ attributes #9 = { alwaysinline } ; NVPTX-DISABLED1: [[META17]] = !{!"llvm.loop.mustprogress"} ; NVPTX-DISABLED1: [[META18]] = !{!"llvm.loop.unroll.disable"} ; NVPTX-DISABLED1: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} -; NVPTX-DISABLED1: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX-DISABLED1: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; NVPTX-DISABLED1: [[META21]] = !{!"any pointer", [[META14]], i64 0} ; NVPTX-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} ; NVPTX-DISABLED1: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} @@ -3362,7 +3362,7 @@ attributes #9 = { alwaysinline } ; NVPTX-DISABLED2: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; NVPTX-DISABLED2: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; NVPTX-DISABLED2: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX-DISABLED2: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX-DISABLED2: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} ; NVPTX-DISABLED2: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} ; NVPTX-DISABLED2: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} ; NVPTX-DISABLED2: [[META15]] = !{!"Simple C/C++ TBAA"} @@ -3370,7 +3370,7 @@ attributes #9 = { alwaysinline } ; NVPTX-DISABLED2: [[META17]] = !{!"llvm.loop.mustprogress"} ; NVPTX-DISABLED2: [[META18]] = !{!"llvm.loop.unroll.disable"} ; NVPTX-DISABLED2: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} -; NVPTX-DISABLED2: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX-DISABLED2: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; NVPTX-DISABLED2: [[META21]] = !{!"any pointer", [[META14]], i64 0} ; NVPTX-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} ; NVPTX-DISABLED2: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} diff --git a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll 
b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll index 59e2499ead2ad..60d42ed931e76 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s ; void foo(double x) { @@ -29,35 +29,35 @@ target triple = "nvptx64" ; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 ;. define weak ptx_kernel void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_404433c2_main_l5 -; CHECK-SAME: (ptr [[DYN:%.*]], ptr nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define weak ptx_kernel void @__omp_offloading_fd02_404433c2_main_l5( +; CHECK-SAME: ptr [[DYN:%.*]], ptr nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr nonnull @__omp_offloading_fd02_404433c2_main_l5_kernel_environment, ptr [[DYN]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; CHECK: common.ret: +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; CHECK: [[COMMON_RET]]: ; CHECK-NEXT: ret void -; CHECK: user_code.entry: +; CHECK: [[USER_CODE_ENTRY]]: ; CHECK-NEXT: [[TMP1:%.*]] = call i32 
@__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR3]] ; CHECK-NEXT: [[CALL_I:%.*]] = call double @__nv_sin(double 0x400921FB54442D18) #[[ATTR7:[0-9]+]] -; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]] -; CHECK: region.check.tid: +; CHECK-NEXT: br label %[[REGION_CHECK_TID:.*]] +; CHECK: [[REGION_CHECK_TID]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] -; CHECK: region.guarded: -; CHECK-NEXT: store double [[CALL_I]], ptr [[X]], align 8, !tbaa [[TBAA7:![0-9]+]] -; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] -; CHECK: region.guarded.end: -; CHECK-NEXT: br label [[REGION_BARRIER]] -; CHECK: region.barrier: +; CHECK-NEXT: br i1 [[TMP3]], label %[[REGION_GUARDED:.*]], label %[[REGION_BARRIER:.*]] +; CHECK: [[REGION_GUARDED]]: +; CHECK-NEXT: store double [[CALL_I]], ptr [[X]], align 8, !tbaa [[DOUBLE_TBAA7:![0-9]+]] +; CHECK-NEXT: br label %[[REGION_GUARDED_END:.*]] +; CHECK: [[REGION_GUARDED_END]]: +; CHECK-NEXT: br label %[[REGION_BARRIER]] +; CHECK: [[REGION_BARRIER]]: ; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP2]]) -; CHECK-NEXT: br label [[REGION_EXIT:%.*]] -; CHECK: region.exit: +; CHECK-NEXT: br label %[[REGION_EXIT:.*]] +; CHECK: [[REGION_EXIT]]: ; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 0) #[[ATTR3]] ; CHECK-NEXT: call void @__kmpc_target_deinit() #[[ATTR3]] -; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK-NEXT: br label %[[COMMON_RET]] ; entry: %captured_vars_addrs = alloca [0 x ptr], align 8 @@ -81,9 +81,9 @@ declare i32 @__kmpc_target_init(ptr, ptr) local_unnamed_addr ; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind readnone willreturn define internal void 
@__omp_outlined__(ptr noalias nocapture %.global_tid., ptr noalias nocapture %.bound_tid.) #1 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ -; CHECK-SAME: (ptr noalias captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias captures(none) [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define internal void @__omp_outlined__( +; CHECK-SAME: ptr noalias captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias captures(none) [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: ret void ; entry: @@ -92,9 +92,9 @@ entry: ; Function Attrs: norecurse nounwind define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define internal void @__omp_outlined___wrapper( +; CHECK-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]]) #[[ATTR3]] ; CHECK-NEXT: ret void @@ -158,7 +158,7 @@ attributes #6 = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" } ; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; CHECK: [[DOUBLE_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} ; CHECK: [[META8]] = !{!"double", [[META9:![0-9]+]], i64 0} ; CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0} ; CHECK: [[META10]] = !{!"Simple C/C++ TBAA"} diff --git a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll index d1e006a704441..dec6a68478f09 100644 --- 
a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU ; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX @@ -30,13 +30,13 @@ ; NVPTX: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ;. define weak ptx_kernel void @spmd_callees(i1 %c) #0 { -; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees -; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; AMDGPU-LABEL: define weak ptx_kernel void @spmd_callees( +; AMDGPU-SAME: i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] { ; AMDGPU-NEXT: call void @spmd_callees__debug(i1 [[C]]) ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@spmd_callees -; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; NVPTX-LABEL: define weak ptx_kernel void @spmd_callees( +; NVPTX-SAME: i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] { ; NVPTX-NEXT: call void @spmd_callees__debug(i1 [[C]]) ; NVPTX-NEXT: ret void ; @@ -45,71 +45,71 @@ define weak ptx_kernel void @spmd_callees(i1 %c) #0 { } define internal void @spmd_callees__debug(i1 %c) { -; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees__debug -; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @spmd_callees__debug( +; AMDGPU-SAME: i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast 
ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment, ptr null) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]] ; AMDGPU-NEXT: store i32 0, ptr addrspace(5) [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; AMDGPU-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2 ; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2 -; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] -; AMDGPU: 3: +; AMDGPU-NEXT: br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB4:.*]] +; AMDGPU: [[BB3]]: ; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] -; AMDGPU-NEXT: br label [[TMP7:%.*]] -; AMDGPU: 4: -; AMDGPU-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]] -; AMDGPU: 5: +; AMDGPU-NEXT: br label %[[BB7:.*]] +; AMDGPU: [[BB4]]: +; AMDGPU-NEXT: br i1 true, label %[[BB5:.*]], label %[[BB6:.*]] +; AMDGPU: [[BB5]]: ; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP__CAST]], ptr 
[[DOTZERO_ADDR_CAST]]) #[[ATTR10]] -; AMDGPU-NEXT: br label [[TMP7]] -; AMDGPU: 6: +; AMDGPU-NEXT: br label %[[BB7]] +; AMDGPU: [[BB6]]: ; AMDGPU-NEXT: unreachable -; AMDGPU: 7: +; AMDGPU: [[BB7]]: ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@spmd_callees__debug -; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @spmd_callees__debug( +; NVPTX-SAME: i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment, ptr null) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]] ; NVPTX-NEXT: store i32 0, ptr addrspace(5) [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] ; NVPTX-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2 ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], 
@__omp_outlined_spmd_amenable2 -; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] -; NVPTX: 3: +; NVPTX-NEXT: br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB4:.*]] +; NVPTX: [[BB3]]: ; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] -; NVPTX-NEXT: br label [[TMP7:%.*]] -; NVPTX: 4: -; NVPTX-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]] -; NVPTX: 5: +; NVPTX-NEXT: br label %[[BB7:.*]] +; NVPTX: [[BB4]]: +; NVPTX-NEXT: br i1 true, label %[[BB5:.*]], label %[[BB6:.*]] +; NVPTX: [[BB5]]: ; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] -; NVPTX-NEXT: br label [[TMP7]] -; NVPTX: 6: +; NVPTX-NEXT: br label %[[BB7]] +; NVPTX: [[BB6]]: ; NVPTX-NEXT: unreachable -; NVPTX: 7: +; NVPTX: [[BB7]]: ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -134,43 +134,43 @@ user_code.entry: ; preds = %entry } define internal void @__omp_outlined_spmd_amenable1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable1 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined_spmd_amenable1( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*]]: ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; AMDGPU-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU: [[FOR_COND]]: +; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: +; AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU: [[FOR_COND_CLEANUP]]: ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] ; AMDGPU-NEXT: ret void -; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] +; AMDGPU: [[FOR_BODY]]: +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable1 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias 
[[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined_spmd_amenable1( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*]]: ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr -; NVPTX-NEXT: br label [[FOR_COND:%.*]] -; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-NEXT: br label %[[FOR_COND:.*]] +; NVPTX: [[FOR_COND]]: +; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: +; NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX: [[FOR_COND_CLEANUP]]: ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] ; NVPTX-NEXT: ret void -; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] +; NVPTX: [[FOR_BODY]]: +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; entry: %captured_vars_addrs = alloca ptr, align 8, addrspace(5) @@ -194,15 +194,15 @@ for.body: ; preds = %for.cond } define internal void @__omp_outlined__1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__1( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__1( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] ; NVPTX-NEXT: ret void ; @@ -213,9 +213,9 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__1_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -226,9 +226,9 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void 
@__omp_outlined__1_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -254,48 +254,48 @@ entry: } define internal void @__omp_outlined_spmd_amenable2(ptr noalias %.global_tid., ptr noalias %.bound_tid.) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable2 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined_spmd_amenable2( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*]]: ; AMDGPU-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; AMDGPU-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr ; AMDGPU-NEXT: call void @use(ptr captures(none) [[MALLOC_CAST]]) #[[ATTR6]] -; AMDGPU-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU: [[FOR_COND]]: +; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: +; AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU: [[FOR_COND_CLEANUP]]: ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] ; AMDGPU-NEXT: ret void -; AMDGPU: for.body: -; 
AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] +; AMDGPU: [[FOR_BODY]]: +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +; AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable2 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined_spmd_amenable2( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*]]: ; NVPTX-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; NVPTX-NEXT: call void @use(ptr captures(none) [[X_H2S]]) #[[ATTR6]] -; NVPTX-NEXT: br label [[FOR_COND:%.*]] -; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-NEXT: br label %[[FOR_COND:.*]] +; NVPTX: [[FOR_COND]]: +; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: +; NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX: [[FOR_COND_CLEANUP]]: ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] ; NVPTX-NEXT: ret void -; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr 
[[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] +; NVPTX: [[FOR_BODY]]: +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +; NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; entry: %captured_vars_addrs = alloca ptr, align 8, addrspace(5) @@ -322,15 +322,15 @@ for.body: ; preds = %for.cond } define internal void @__omp_outlined__3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__3( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__3( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: call void @unknown() #[[ATTR7]] ; NVPTX-NEXT: ret void ; @@ -341,9 +341,9 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__3_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 
[[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -354,9 +354,9 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__3_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -383,9 +383,9 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 { -; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callee -; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define weak ptx_kernel void @spmd_and_non_spmd_callee( +; AMDGPU-SAME: i1 [[C:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -393,62 +393,62 @@ define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 { ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr 
@spmd_and_non_spmd_callee_kernel_environment, ptr null) ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: is_worker_check: +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU: [[IS_WORKER_CHECK]]: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_BEGIN]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-NEXT: ret void -; AMDGPU: 
worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr addrspace(5) [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: 
store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable ; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable -; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] -; AMDGPU: 3: +; AMDGPU-NEXT: br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB4:.*]] +; AMDGPU: [[BB3]]: ; AMDGPU-NEXT: call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] -; AMDGPU-NEXT: br label [[TMP7:%.*]] -; AMDGPU: 4: -; AMDGPU-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]] -; AMDGPU: 5: +; AMDGPU-NEXT: br label %[[BB7:.*]] +; AMDGPU: [[BB4]]: +; AMDGPU-NEXT: br i1 true, label %[[BB5:.*]], label %[[BB6:.*]] +; AMDGPU: [[BB5]]: ; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] -; AMDGPU-NEXT: br label [[TMP7]] -; AMDGPU: 6: +; AMDGPU-NEXT: br label %[[BB7]] +; AMDGPU: [[BB6]]: ; AMDGPU-NEXT: unreachable -; AMDGPU: 7: +; AMDGPU: [[BB7]]: ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callee -; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define weak ptx_kernel void @spmd_and_non_spmd_callee( +; NVPTX-SAME: i1 [[C:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -456,57 +456,57 @@ define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 { ; NVPTX-NEXT: 
[[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callee_kernel_environment, ptr null) ; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: is_worker_check: +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX: [[IS_WORKER_CHECK]]: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-NEXT: ret void -; NVPTX: 
worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: +; NVPTX: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, ptr addrspace(5) [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr 
addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable -; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] -; NVPTX: 3: +; NVPTX-NEXT: br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB4:.*]] +; NVPTX: [[BB3]]: ; NVPTX-NEXT: call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] -; NVPTX-NEXT: br label [[TMP7:%.*]] -; NVPTX: 4: -; NVPTX-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]] -; NVPTX: 5: +; NVPTX-NEXT: br label %[[BB7:.*]] +; NVPTX: [[BB4]]: +; NVPTX-NEXT: br i1 true, label %[[BB5:.*]], label %[[BB6:.*]] +; NVPTX: [[BB5]]: ; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) #[[ATTR10]] -; NVPTX-NEXT: br label [[TMP7]] -; NVPTX: 6: +; NVPTX-NEXT: br label %[[BB7]] +; NVPTX: [[BB6]]: ; NVPTX-NEXT: unreachable -; NVPTX: 7: +; NVPTX: [[BB7]]: ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -531,49 +531,49 @@ user_code.entry: ; preds = %entry } define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable3 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined_spmd_amenable3( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*]]: ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; AMDGPU-NEXT: [[X:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR10]] -; AMDGPU-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU: [[FOR_COND]]: +; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: +; AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU: [[FOR_COND_CLEANUP]]: ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] ; AMDGPU-NEXT: call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]] ; AMDGPU-NEXT: ret void -; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr [[X]], ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] +; AMDGPU: [[FOR_BODY]]: +; AMDGPU-NEXT: store ptr [[X]], ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr 
@__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable3 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined_spmd_amenable3( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*]]: ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr ; NVPTX-NEXT: [[X:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR10]] -; NVPTX-NEXT: br label [[FOR_COND:%.*]] -; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-NEXT: br label %[[FOR_COND:.*]] +; NVPTX: [[FOR_COND]]: +; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: +; NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX: [[FOR_COND_CLEANUP]]: ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] ; NVPTX-NEXT: call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]] ; NVPTX-NEXT: ret void -; NVPTX: for.body: -; NVPTX-NEXT: store ptr [[X]], ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] +; NVPTX: [[FOR_BODY]]: +; NVPTX-NEXT: store ptr [[X]], ptr addrspace(5) [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]] +; NVPTX-NEXT: 
[[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS_CAST]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; entry: %captured_vars_addrs = alloca ptr, align 8, addrspace(5) @@ -600,21 +600,21 @@ for.body: ; preds = %for.cond } define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-LABEL: define internal void @__omp_outlined__5( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-LABEL: define internal void @__omp_outlined__5( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias 
[[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @unknown() #[[ATTR7]] ; NVPTX-NEXT: ret void ; @@ -628,9 +628,9 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__5_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -639,13 +639,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { ; AMDGPU-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR10]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-NEXT: entry: +; 
NVPTX-LABEL: define internal void @__omp_outlined__5_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTADDR1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -654,7 +654,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #1 { ; NVPTX-NEXT: [[GLOBAL_ARGS_CAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_CAST]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1_CAST]], ptr [[DOTZERO_ADDR_CAST]], ptr [[TMP3]]) #[[ATTR10]] ; NVPTX-NEXT: ret void ; @@ -676,45 +676,45 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind define weak ptx_kernel void @spmd_callees_metadata(ptr %fp) #0 { -; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees_metadata -; AMDGPU-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define weak ptx_kernel void @spmd_callees_metadata( +; AMDGPU-SAME: ptr [[FP:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment, ptr null) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 
[[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr addrspace(5) [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@spmd_callees_metadata -; NVPTX-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define weak ptx_kernel void @spmd_callees_metadata( +; NVPTX-SAME: ptr [[FP:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment, ptr null) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: 
[[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, ptr addrspace(5) [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -739,9 +739,9 @@ user_code.entry: ; preds = %entry ; Function Attrs: alwaysinline convergent norecurse nounwind define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { -; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callees_metadata -; AMDGPU-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata( +; AMDGPU-SAME: ptr [[FP:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -749,61 +749,61 @@ define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { ; AMDGPU-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callees_metadata_kernel_environment, ptr null) ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label 
[[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: is_worker_check: +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; AMDGPU: [[IS_WORKER_CHECK]]: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_BEGIN]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_FINISHED]]: ; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: 
worker_state_machine.parallel_region.fallback.execute: +; AMDGPU: [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; AMDGPU: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: [[THREAD_USER_CODE_CHECK]]: ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; AMDGPU: [[COMMON_RET]]: ; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: +; AMDGPU: [[USER_CODE_ENTRY]]: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr addrspace(5) [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: 
[[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external -; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] -; AMDGPU: 3: +; AMDGPU-NEXT: br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB4:.*]] +; AMDGPU: [[BB3]]: ; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) -; AMDGPU-NEXT: br label [[TMP7:%.*]] -; AMDGPU: 4: -; AMDGPU-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]] -; AMDGPU: 5: +; AMDGPU-NEXT: br label %[[BB7:.*]] +; AMDGPU: [[BB4]]: +; AMDGPU-NEXT: br i1 true, label %[[BB5:.*]], label %[[BB6:.*]] +; AMDGPU: [[BB5]]: ; AMDGPU-NEXT: call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) -; AMDGPU-NEXT: br label [[TMP7]] -; AMDGPU: 6: +; AMDGPU-NEXT: br label %[[BB7]] +; AMDGPU: [[BB6]]: ; AMDGPU-NEXT: unreachable -; AMDGPU: 7: +; AMDGPU: [[BB7]]: ; AMDGPU-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: br label %[[COMMON_RET]] ; -; NVPTX-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callees_metadata -; NVPTX-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata( +; NVPTX-SAME: ptr [[FP:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; NVPTX-NEXT: [[DOTZERO_ADDR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr @@ -811,56 +811,56 @@ define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { ; NVPTX-NEXT: [[DOTTHREADID_TEMP__CAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callees_metadata_kernel_environment, ptr null) ; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; 
NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: is_worker_check: +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label %[[IS_WORKER_CHECK:.*]], label %[[THREAD_USER_CODE_CHECK:.*]] +; NVPTX: [[IS_WORKER_CHECK]]: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] ; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label %[[WORKER_STATE_MACHINE_BEGIN:.*]], label %[[WORKER_STATE_MACHINE_FINISHED:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_BEGIN]]: ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label %[[WORKER_STATE_MACHINE_FINISHED]], label %[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_FINISHED]]: ; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: +; NVPTX: 
[[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK]]: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:.*]], label %[[WORKER_STATE_MACHINE_DONE_BARRIER:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE]]: ; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_PARALLEL_REGION_END:.*]] +; NVPTX: [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]: ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: [[WORKER_STATE_MACHINE_DONE_BARRIER]]: ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: br label %[[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: [[THREAD_USER_CODE_CHECK]]: ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[COMMON_RET:.*]] +; NVPTX: [[COMMON_RET]]: ; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: +; NVPTX: [[USER_CODE_ENTRY]]: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, ptr addrspace(5) [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external -; NVPTX-NEXT: br i1 
[[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] -; NVPTX: 3: +; NVPTX-NEXT: br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB4:.*]] +; NVPTX: [[BB3]]: ; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) -; NVPTX-NEXT: br label [[TMP7:%.*]] -; NVPTX: 4: -; NVPTX-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]] -; NVPTX: 5: +; NVPTX-NEXT: br label %[[BB7:.*]] +; NVPTX: [[BB4]]: +; NVPTX-NEXT: br i1 true, label %[[BB5:.*]], label %[[BB6:.*]] +; NVPTX: [[BB5]]: ; NVPTX-NEXT: call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP__CAST]], ptr [[DOTZERO_ADDR_CAST]]) -; NVPTX-NEXT: br label [[TMP7]] -; NVPTX: 6: +; NVPTX-NEXT: br label %[[BB7]] +; NVPTX: [[BB6]]: ; NVPTX-NEXT: unreachable -; NVPTX: 7: +; NVPTX: [[BB7]]: ; NVPTX-NEXT: call void @__kmpc_target_deinit() -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: br label %[[COMMON_RET]] ; entry: %.zero.addr = alloca ptr, align 8, addrspace(5) @@ -884,39 +884,39 @@ user_code.entry: ; preds = %entry } define void @__omp_outlined_spmd_amenable_external(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable_external -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; AMDGPU-LABEL: define void @__omp_outlined_spmd_amenable_external( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*]]: +; AMDGPU-NEXT: br label %[[FOR_COND:.*]] +; AMDGPU: [[FOR_COND]]: +; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: +; AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; AMDGPU: [[FOR_COND_CLEANUP]]: ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] ; AMDGPU-NEXT: ret void -; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] +; AMDGPU: [[FOR_BODY]]: +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +; AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable_external -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: -; NVPTX-NEXT: br label [[FOR_COND:%.*]] -; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; NVPTX-LABEL: 
define void @__omp_outlined_spmd_amenable_external( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*]]: +; NVPTX-NEXT: br label %[[FOR_COND:.*]] +; NVPTX: [[FOR_COND]]: +; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: +; NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP:.*]] +; NVPTX: [[FOR_COND_CLEANUP]]: ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] ; NVPTX-NEXT: ret void -; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] +; NVPTX: [[FOR_BODY]]: +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +; NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; entry: br label %for.cond @@ -938,14 +938,14 @@ for.body: ; preds = %for.cond } define internal void @__omp_outlined__7(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7 -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__7( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7 
-; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__7( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: ret void ; entry: @@ -954,14 +954,14 @@ entry: ; Function Attrs: convergent norecurse nounwind define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #1 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined__7_wrapper( +; AMDGPU-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined__7_wrapper( +; NVPTX-SAME: i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: ret void ; entry: @@ -969,13 +969,13 @@ entry: } define void @__omp_outlined_not_spmd_amenable_external(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable_external -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-LABEL: define void @__omp_outlined_not_spmd_amenable_external( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { ; AMDGPU-NEXT: call void @__omp_outlined_not_spmd_amenable(ptr [[DOTGLOBAL_TID_]], ptr [[DOTBOUND_TID_]]) ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable_external -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-LABEL: define void @__omp_outlined_not_spmd_amenable_external( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { ; NVPTX-NEXT: call void @__omp_outlined_not_spmd_amenable(ptr [[DOTGLOBAL_TID_]], ptr [[DOTBOUND_TID_]]) ; NVPTX-NEXT: ret void ; @@ -984,15 +984,15 @@ define void @__omp_outlined_not_spmd_amenable_external(ptr noalias %.global_tid. } define internal void @__omp_outlined_not_spmd_amenable(ptr noalias %.global_tid., ptr noalias %.bound_tid.) 
{ -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable -; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-NEXT: entry: +; AMDGPU-LABEL: define internal void @__omp_outlined_not_spmd_amenable( +; AMDGPU-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; AMDGPU-NEXT: [[ENTRY:.*:]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable -; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-NEXT: entry: +; NVPTX-LABEL: define internal void @__omp_outlined_not_spmd_amenable( +; NVPTX-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { +; NVPTX-NEXT: [[ENTRY:.*:]] ; NVPTX-NEXT: call void @unknown() #[[ATTR7]] ; NVPTX-NEXT: ret void ; @@ -1020,12 +1020,12 @@ declare void @unknowni32p(ptr) #5 declare void @llvm.lifetime.start.p0(ptr captures(none)) #6 define weak i32 @__kmpc_target_init(ptr %0, ptr %1) { -; AMDGPU-LABEL: define {{[^@]+}}@__kmpc_target_init -; AMDGPU-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; AMDGPU-LABEL: define weak i32 @__kmpc_target_init( +; AMDGPU-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; AMDGPU-NEXT: ret i32 0 ; -; NVPTX-LABEL: define {{[^@]+}}@__kmpc_target_init -; NVPTX-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; NVPTX-LABEL: define weak i32 @__kmpc_target_init( +; NVPTX-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; NVPTX-NEXT: ret i32 0 ; ret i32 0 @@ -1150,7 +1150,7 @@ attributes #8 = { nounwind } ; AMDGPU: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; AMDGPU: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} ; AMDGPU: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} ; AMDGPU: 
[[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} ; AMDGPU: [[META15]] = !{!"Simple C/C++ TBAA"} @@ -1158,7 +1158,7 @@ attributes #8 = { nounwind } ; AMDGPU: [[META17]] = !{!"llvm.loop.mustprogress"} ; AMDGPU: [[META18]] = !{!"llvm.loop.unroll.disable"} ; AMDGPU: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} -; AMDGPU: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; AMDGPU: [[META21]] = !{!"any pointer", [[META14]], i64 0} ; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} ; AMDGPU: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} @@ -1175,7 +1175,7 @@ attributes #8 = { nounwind } ; NVPTX: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} ; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; NVPTX: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} ; NVPTX: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} ; NVPTX: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} ; NVPTX: [[META15]] = !{!"Simple C/C++ TBAA"} @@ -1183,7 +1183,7 @@ attributes #8 = { nounwind } ; NVPTX: [[META17]] = !{!"llvm.loop.mustprogress"} ; NVPTX: [[META18]] = !{!"llvm.loop.unroll.disable"} ; NVPTX: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} -; NVPTX: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX: [[ANYPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} ; NVPTX: [[META21]] = !{!"any pointer", [[META14]], i64 0} ; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} ; NVPTX: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll index 
e74bf592e1525..670c2d9108d4e 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -168,9 +168,8 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) { ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[WIDE_LOAD11]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]] ; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4, !alias.scope [[META9]], !noalias [[META11]] -; CHECK-NEXT: store <4 x float> [[PREDPHI12]], ptr [[TMP12]], align 4, !alias.scope [[META9]], !noalias [[META11]] +; CHECK-NEXT: store <4 x float> [[PREDPHI12]], ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META11]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; CHECK-NEXT: br i1 [[TMP13]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/infer-align-from-assumption.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/infer-align-from-assumption.ll index 889f25c79c10f..4196625e6bd21 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/infer-align-from-assumption.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/infer-align-from-assumption.ll @@ -50,7 +50,7 @@ define i32 @earlycse_fn1(ptr %p) { define i32 @load_assume_aligned(ptr %p) { ; CHECK-LABEL: define i32 @load_assume_aligned( -; CHECK-SAME: ptr [[P:%.*]]) local_unnamed_addr { +; CHECK-SAME: ptr readonly captures(none) [[P:%.*]]) local_unnamed_addr { ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 4) ] ; CHECK-NEXT: [[DOT0_COPYLOAD:%.*]] = load i32, ptr 
[[P]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @swap(i32 [[DOT0_COPYLOAD]]) @@ -66,7 +66,7 @@ declare i32 @swap(i32) define void @sroa_align_entry(ptr %p) { ; CHECK-LABEL: define void @sroa_align_entry( -; CHECK-SAME: ptr [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-SAME: ptr readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 8) ] ; CHECK-NEXT: [[DOT0_COPYLOAD_I_I_I:%.*]] = load i64, ptr [[P]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[DOT0_COPYLOAD_I_I_I]] to ptr @@ -96,7 +96,7 @@ define ptr @sroa_fn1(ptr %p) { define ptr @sroa_fn2(ptr %p) { ; CHECK-LABEL: define ptr @sroa_fn2( -; CHECK-SAME: ptr [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK-SAME: ptr readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 8) ] ; CHECK-NEXT: [[DOT0_COPYLOAD_I_I:%.*]] = load i64, ptr [[P]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[DOT0_COPYLOAD_I_I]] to ptr @@ -109,7 +109,7 @@ define ptr @sroa_fn2(ptr %p) { define i64 @sroa_fn3(ptr %0) { ; CHECK-LABEL: define i64 @sroa_fn3( -; CHECK-SAME: ptr [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CHECK-SAME: ptr readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP0]], i64 8) ] ; CHECK-NEXT: [[DOT0_COPYLOAD_I:%.*]] = load i64, ptr [[TMP0]], align 8 ; CHECK-NEXT: ret i64 [[DOT0_COPYLOAD_I]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 1fe3fde61f410..92e625deb11b1 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have 
been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -O3 < %s | FileCheck %s ; Check unrolling / SLP vectorization where the order of lanes is important for @@ -11,9 +11,9 @@ target triple = "aarch64" ; Function Attrs: nounwind uwtable define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) #0 { -; CHECK-LABEL: define range(i32 0, 65536) i32 @slpordering -; CHECK-SAME: (ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[IP1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[IP2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define range(i32 0, 65536) i32 @slpordering( +; CHECK-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[IP1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[IP2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IP1]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64 ; CHECK-NEXT: [[RRRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 4 @@ -30,26 +30,26 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[RDD_PTR_2]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[RDD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP5:%.*]] = load 
<4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[CHAR_TBAA0]] +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], 
<16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> @@ -57,14 +57,14 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; 
CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> @@ -482,3 +482,8 @@ attributes #2 = { nounwind } !11 = distinct !{!11, !12} !12 = !{!"llvm.loop.mustprogress"} !13 = distinct !{!13, !12} +;. +; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"} +;. diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll index 0967736b6740a..4c7e39d31b5c6 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -O3 < %s | FileCheck %s --check-prefixes=CHECK-O3 ; RUN: opt -S -passes="default,default" < %s | FileCheck %s --check-prefixes=CHECK-LTO @@ -11,9 +11,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[ENTRY:.*:]] ; CHECK-O3-NEXT: [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64 ; CHECK-O3-NEXT: [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64 -; CHECK-O3-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-O3-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]] ; CHECK-O3-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP2:%.*]] = load <16 x 
i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]] ; CHECK-O3-NEXT: [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 false) @@ -21,9 +21,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[TMP7:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]]) ; CHECK-O3-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP12:%.*]] = sub nsw <16 x i16> [[TMP9]], [[TMP11]] ; CHECK-O3-NEXT: [[TMP13:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP12]], i1 false) @@ -32,9 +32,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_1:%.*]] = add i32 [[TMP15]], [[TMP7]] ; CHECK-O3-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP16:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP16:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]] 
; CHECK-O3-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP20:%.*]] = sub nsw <16 x i16> [[TMP17]], [[TMP19]] ; CHECK-O3-NEXT: [[TMP21:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP20]], i1 false) @@ -43,9 +43,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_2:%.*]] = add i32 [[TMP23]], [[OP_RDX_1]] ; CHECK-O3-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]] ; CHECK-O3-NEXT: [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 false) @@ -54,9 +54,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_3:%.*]] = add i32 [[TMP31]], [[OP_RDX_2]] ; CHECK-O3-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP32:%.*]] = load <16 
x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP34:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP34:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[TMP34]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP36:%.*]] = sub nsw <16 x i16> [[TMP33]], [[TMP35]] ; CHECK-O3-NEXT: [[TMP37:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP36]], i1 false) @@ -65,9 +65,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_4:%.*]] = add i32 [[TMP39]], [[OP_RDX_3]] ; CHECK-O3-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP41:%.*]] = zext <16 x i8> [[TMP40]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP44:%.*]] = sub nsw <16 x i16> [[TMP41]], [[TMP43]] ; CHECK-O3-NEXT: [[TMP45:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP44]], i1 false) @@ -76,9 +76,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_5:%.*]] = add i32 [[TMP47]], [[OP_RDX_4]] ; CHECK-O3-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 
[[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP48:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP48:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP51:%.*]] = zext <16 x i8> [[TMP50]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP52:%.*]] = sub nsw <16 x i16> [[TMP49]], [[TMP51]] ; CHECK-O3-NEXT: [[TMP53:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP52]], i1 false) @@ -87,9 +87,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_6:%.*]] = add i32 [[TMP55]], [[OP_RDX_5]] ; CHECK-O3-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP57:%.*]] = zext <16 x i8> [[TMP56]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP58:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP58:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP59:%.*]] = zext <16 x i8> [[TMP58]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP60:%.*]] = sub nsw <16 x i16> [[TMP57]], [[TMP59]] ; CHECK-O3-NEXT: [[TMP61:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP60]], i1 false) @@ -98,9 +98,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; 
CHECK-O3-NEXT: [[OP_RDX_7:%.*]] = add i32 [[TMP63]], [[OP_RDX_6]] ; CHECK-O3-NEXT: [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP65:%.*]] = zext <16 x i8> [[TMP64]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP66:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP66:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP67:%.*]] = zext <16 x i8> [[TMP66]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP68:%.*]] = sub nsw <16 x i16> [[TMP65]], [[TMP67]] ; CHECK-O3-NEXT: [[TMP69:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP68]], i1 false) @@ -109,9 +109,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_8:%.*]] = add i32 [[TMP71]], [[OP_RDX_7]] ; CHECK-O3-NEXT: [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP72:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP72:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP73:%.*]] = zext <16 x i8> [[TMP72]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP74:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP74:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP75:%.*]] = zext <16 x i8> [[TMP74]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP76:%.*]] = sub nsw <16 x i16> [[TMP73]], [[TMP75]] ; CHECK-O3-NEXT: [[TMP77:%.*]] = tail call <16 x 
i16> @llvm.abs.v16i16(<16 x i16> [[TMP76]], i1 false) @@ -120,9 +120,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_9:%.*]] = add i32 [[TMP79]], [[OP_RDX_8]] ; CHECK-O3-NEXT: [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP80:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP80:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP81:%.*]] = zext <16 x i8> [[TMP80]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP82:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP82:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP83:%.*]] = zext <16 x i8> [[TMP82]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP84:%.*]] = sub nsw <16 x i16> [[TMP81]], [[TMP83]] ; CHECK-O3-NEXT: [[TMP85:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP84]], i1 false) @@ -131,9 +131,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_10:%.*]] = add i32 [[TMP87]], [[OP_RDX_9]] ; CHECK-O3-NEXT: [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP88:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP88:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP89:%.*]] = zext <16 x i8> [[TMP88]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP90:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP90:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP91:%.*]] = 
zext <16 x i8> [[TMP90]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP92:%.*]] = sub nsw <16 x i16> [[TMP89]], [[TMP91]] ; CHECK-O3-NEXT: [[TMP93:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP92]], i1 false) @@ -142,9 +142,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_11:%.*]] = add i32 [[TMP95]], [[OP_RDX_10]] ; CHECK-O3-NEXT: [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP96:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP96:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP97:%.*]] = zext <16 x i8> [[TMP96]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP99:%.*]] = zext <16 x i8> [[TMP98]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP100:%.*]] = sub nsw <16 x i16> [[TMP97]], [[TMP99]] ; CHECK-O3-NEXT: [[TMP101:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP100]], i1 false) @@ -153,9 +153,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_12:%.*]] = add i32 [[TMP103]], [[OP_RDX_11]] ; CHECK-O3-NEXT: [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP104:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP104:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP105:%.*]] = zext <16 x i8> [[TMP104]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP106:%.*]] = load <16 x 
i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP106:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP107:%.*]] = zext <16 x i8> [[TMP106]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP108:%.*]] = sub nsw <16 x i16> [[TMP105]], [[TMP107]] ; CHECK-O3-NEXT: [[TMP109:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP108]], i1 false) @@ -164,9 +164,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_13:%.*]] = add i32 [[TMP111]], [[OP_RDX_12]] ; CHECK-O3-NEXT: [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP112:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP112:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP113:%.*]] = zext <16 x i8> [[TMP112]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP114:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP114:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP115:%.*]] = zext <16 x i8> [[TMP114]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP116:%.*]] = sub nsw <16 x i16> [[TMP113]], [[TMP115]] ; CHECK-O3-NEXT: [[TMP117:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP116]], i1 false) @@ -175,9 +175,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-O3-NEXT: [[OP_RDX_14:%.*]] = add i32 [[TMP119]], [[OP_RDX_13]] ; CHECK-O3-NEXT: [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]] ; CHECK-O3-NEXT: [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]] -; CHECK-O3-NEXT: [[TMP120:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: 
[[TMP120:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP121:%.*]] = zext <16 x i8> [[TMP120]] to <16 x i16> -; CHECK-O3-NEXT: [[TMP122:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP122:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-O3-NEXT: [[TMP123:%.*]] = zext <16 x i8> [[TMP122]] to <16 x i16> ; CHECK-O3-NEXT: [[TMP124:%.*]] = sub nsw <16 x i16> [[TMP121]], [[TMP123]] ; CHECK-O3-NEXT: [[TMP125:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP124]], i1 false) @@ -191,9 +191,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[ENTRY:.*:]] ; CHECK-LTO-NEXT: [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64 ; CHECK-LTO-NEXT: [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64 -; CHECK-LTO-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-LTO-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]] ; CHECK-LTO-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]] ; CHECK-LTO-NEXT: [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 true) @@ -201,9 +201,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[TMP44:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP36]]) ; CHECK-LTO-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr 
[[ADD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP10:%.*]] = sub nsw <16 x i16> [[TMP7]], [[TMP9]] ; CHECK-LTO-NEXT: [[TMP11:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP10]], i1 true) @@ -212,9 +212,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_1:%.*]] = add i32 [[TMP60]], [[TMP44]] ; CHECK-LTO-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP16:%.*]] = sub nsw <16 x i16> [[TMP13]], [[TMP15]] ; CHECK-LTO-NEXT: [[TMP17:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP16]], i1 true) @@ -223,9 +223,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_2:%.*]] = add i32 [[OP_RDX_1]], [[TMP76]] ; CHECK-LTO-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; 
CHECK-LTO-NEXT: [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP20:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP20:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP22:%.*]] = sub nsw <16 x i16> [[TMP19]], [[TMP21]] ; CHECK-LTO-NEXT: [[TMP23:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP22]], i1 true) @@ -234,9 +234,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_3:%.*]] = add i32 [[OP_RDX_2]], [[TMP92]] ; CHECK-LTO-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]] ; CHECK-LTO-NEXT: [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 true) @@ -245,9 +245,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef 
%p2, ; CHECK-LTO-NEXT: [[OP_RDX_4:%.*]] = add i32 [[OP_RDX_3]], [[TMP108]] ; CHECK-LTO-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP34:%.*]] = sub nsw <16 x i16> [[TMP31]], [[TMP33]] ; CHECK-LTO-NEXT: [[TMP35:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP34]], i1 true) @@ -256,9 +256,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_5:%.*]] = add i32 [[OP_RDX_4]], [[TMP117]] ; CHECK-LTO-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP37:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP37:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP39:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP39:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP41:%.*]] = sub nsw <16 x i16> [[TMP38]], [[TMP40]] ; CHECK-LTO-NEXT: 
[[TMP42:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP41]], i1 true) @@ -267,9 +267,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_6:%.*]] = add i32 [[OP_RDX_5]], [[TMP118]] ; CHECK-LTO-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP45:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP45:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP46:%.*]] = zext <16 x i8> [[TMP45]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP47:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP47:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP49:%.*]] = sub nsw <16 x i16> [[TMP46]], [[TMP48]] ; CHECK-LTO-NEXT: [[TMP50:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP49]], i1 true) @@ -278,9 +278,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_7:%.*]] = add i32 [[OP_RDX_6]], [[TMP120]] ; CHECK-LTO-NEXT: [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP53:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP53:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP54:%.*]] = zext <16 x i8> [[TMP53]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP55:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP55:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa 
[[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP57:%.*]] = sub nsw <16 x i16> [[TMP54]], [[TMP56]] ; CHECK-LTO-NEXT: [[TMP58:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP57]], i1 true) @@ -289,9 +289,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_8:%.*]] = add i32 [[OP_RDX_7]], [[TMP121]] ; CHECK-LTO-NEXT: [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP61:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP61:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP62:%.*]] = zext <16 x i8> [[TMP61]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP63:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP63:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP64:%.*]] = zext <16 x i8> [[TMP63]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP65:%.*]] = sub nsw <16 x i16> [[TMP62]], [[TMP64]] ; CHECK-LTO-NEXT: [[TMP66:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP65]], i1 true) @@ -300,9 +300,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_9:%.*]] = add i32 [[OP_RDX_8]], [[TMP122]] ; CHECK-LTO-NEXT: [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP69:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP69:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP70:%.*]] = zext <16 x i8> [[TMP69]] to <16 x i16> -; 
CHECK-LTO-NEXT: [[TMP71:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP71:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP72:%.*]] = zext <16 x i8> [[TMP71]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP73:%.*]] = sub nsw <16 x i16> [[TMP70]], [[TMP72]] ; CHECK-LTO-NEXT: [[TMP74:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP73]], i1 true) @@ -311,9 +311,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_10:%.*]] = add i32 [[OP_RDX_9]], [[TMP123]] ; CHECK-LTO-NEXT: [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP77:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP77:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP78:%.*]] = zext <16 x i8> [[TMP77]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP79:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP79:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP80:%.*]] = zext <16 x i8> [[TMP79]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP81:%.*]] = sub nsw <16 x i16> [[TMP78]], [[TMP80]] ; CHECK-LTO-NEXT: [[TMP82:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP81]], i1 true) @@ -322,9 +322,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_11:%.*]] = add i32 [[OP_RDX_10]], [[TMP124]] ; CHECK-LTO-NEXT: [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, 
!tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP86:%.*]] = zext <16 x i8> [[TMP85]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP87:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP87:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP88:%.*]] = zext <16 x i8> [[TMP87]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP89:%.*]] = sub nsw <16 x i16> [[TMP86]], [[TMP88]] ; CHECK-LTO-NEXT: [[TMP90:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP89]], i1 true) @@ -333,9 +333,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_12:%.*]] = add i32 [[OP_RDX_11]], [[TMP125]] ; CHECK-LTO-NEXT: [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP93:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP93:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP94:%.*]] = zext <16 x i8> [[TMP93]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP95:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP95:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP96:%.*]] = zext <16 x i8> [[TMP95]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP97:%.*]] = sub nsw <16 x i16> [[TMP94]], [[TMP96]] ; CHECK-LTO-NEXT: [[TMP98:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP97]], i1 true) @@ -344,9 +344,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_13:%.*]] = add i32 [[OP_RDX_12]], [[TMP126]] ; CHECK-LTO-NEXT: [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 
[[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP101:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP101:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP102:%.*]] = zext <16 x i8> [[TMP101]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP103:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP103:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP104:%.*]] = zext <16 x i8> [[TMP103]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP105:%.*]] = sub nsw <16 x i16> [[TMP102]], [[TMP104]] ; CHECK-LTO-NEXT: [[TMP106:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP105]], i1 true) @@ -355,9 +355,9 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, ; CHECK-LTO-NEXT: [[OP_RDX_14:%.*]] = add i32 [[OP_RDX_13]], [[TMP119]] ; CHECK-LTO-NEXT: [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]] ; CHECK-LTO-NEXT: [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]] -; CHECK-LTO-NEXT: [[TMP109:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP109:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP110:%.*]] = zext <16 x i8> [[TMP109]] to <16 x i16> -; CHECK-LTO-NEXT: [[TMP111:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP111:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-LTO-NEXT: [[TMP112:%.*]] = zext <16 x i8> [[TMP111]] to <16 x i16> ; CHECK-LTO-NEXT: [[TMP113:%.*]] = sub nsw <16 x i16> [[TMP110]], [[TMP112]] ; CHECK-LTO-NEXT: [[TMP114:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP113]], i1 true) @@ -489,11 +489,11 @@ attributes #3 = { 
nounwind } !13 = !{!"llvm.loop.mustprogress"} !14 = distinct !{!14, !13} ;. -; CHECK-O3: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-O3: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK-O3: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} ; CHECK-O3: [[META2]] = !{!"Simple C/C++ TBAA"} ;. -; CHECK-LTO: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-LTO: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK-LTO: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} ; CHECK-LTO: [[META2]] = !{!"Simple C/C++ TBAA"} ;. diff --git a/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll b/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll index 5386bf939918a..13eed2e918aa0 100644 --- a/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll +++ b/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes='default' -S %s | FileCheck %s target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64" @@ -12,134 +12,135 @@ target triple = "systemz" ; that transform to produce optimal asm. 
define dso_local zeroext i32 @foo(ptr noundef %a) #0 { -; CHECK-LABEL: @foo( -; CHECK-NEXT: entry: +; CHECK-LABEL: define dso_local zeroext i32 @foo( +; CHECK-SAME: ptr noundef readnone captures(none) [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: tail call void @populate(ptr noundef nonnull @ARR) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: br label [[FOR_BODY4:%.*]] -; CHECK: for.body4: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY4]] ] -; CHECK-NEXT: [[SUM_11:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_7:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: br label %[[FOR_BODY4:.*]] +; CHECK: [[FOR_BODY4]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY4]] ] +; CHECK-NEXT: [[SUM_11:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD_7:%.*]], %[[FOR_BODY4]] ] ; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i64 0, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[IDX_NEG]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[SUM_11]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_NEG:%.*]] = xor i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: [[ADD_PTR_110:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_NEG]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ADD_PTR_110]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ADD_PTR_110]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[ADD_111:%.*]] = add i32 [[TMP1]], [[ADD]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_112_NEG:%.*]] = sub nuw nsw i64 -2, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_217:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 
[[INDVARS_IV_NEXT_112_NEG]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ADD_PTR_217]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ADD_PTR_217]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[ADD_218:%.*]] = add i32 [[TMP2]], [[ADD_111]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_219_NEG:%.*]] = sub nuw nsw i64 -3, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_219_NEG]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ADD_PTR_3]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ADD_PTR_3]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[TMP3]], [[ADD_218]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_3_NEG:%.*]] = sub nuw nsw i64 -4, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_3_NEG]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ADD_PTR_4]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ADD_PTR_4]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[TMP4]], [[ADD_3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_4_NEG:%.*]] = sub nuw nsw i64 -5, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_4_NEG]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ADD_PTR_5]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ADD_PTR_5]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[TMP5]], [[ADD_4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_5_NEG:%.*]] = sub nuw nsw i64 -6, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_5_NEG]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ADD_PTR_6]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP6:%.*]] = load 
i32, ptr [[ADD_PTR_6]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[TMP6]], [[ADD_5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_6_NEG:%.*]] = sub nuw nsw i64 -7, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_6_NEG]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ADD_PTR_7]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ADD_PTR_7]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[ADD_7]] = add i32 [[TMP7]], [[ADD_6]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], 32 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_BODY4_1:%.*]], label [[FOR_BODY4]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: for.body4.1: -; CHECK-NEXT: [[INDVARS_IV_1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1_7:%.*]], [[FOR_BODY4_1]] ], [ 0, [[FOR_BODY4]] ] -; CHECK-NEXT: [[SUM_11_1:%.*]] = phi i32 [ [[ADD_1_7:%.*]], [[FOR_BODY4_1]] ], [ [[ADD_7]], [[FOR_BODY4]] ] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label %[[FOR_BODY4_1:.*]], label %[[FOR_BODY4]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[FOR_BODY4_1]]: +; CHECK-NEXT: [[INDVARS_IV_1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1_7:%.*]], %[[FOR_BODY4_1]] ], [ 0, %[[FOR_BODY4]] ] +; CHECK-NEXT: [[SUM_11_1:%.*]] = phi i32 [ [[ADD_1_7:%.*]], %[[FOR_BODY4_1]] ], [ [[ADD_7]], %[[FOR_BODY4]] ] ; CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i64 0, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[IDX_NEG_1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ADD_PTR_1]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ADD_PTR_1]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_NEG:%.*]] = xor i64 [[INDVARS_IV_1]], -1 ; CHECK-NEXT: [[ADD_PTR_1_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds 
nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_NEG]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ADD_PTR_1_1]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ADD_PTR_1_1]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_1_NEG:%.*]] = sub nuw nsw i64 -2, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_1_NEG]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ADD_PTR_1_2]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ADD_PTR_1_2]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_2_NEG:%.*]] = sub nuw nsw i64 -3, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_2_NEG]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ADD_PTR_1_3]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ADD_PTR_1_3]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_3_NEG:%.*]] = sub nuw nsw i64 -4, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_3_NEG]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ADD_PTR_1_4]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ADD_PTR_1_4]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_4_NEG:%.*]] = sub nuw nsw i64 -5, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_4_NEG]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr 
[[ADD_PTR_1_5]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ADD_PTR_1_5]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_5_NEG:%.*]] = sub nuw nsw i64 -6, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_5_NEG]] -; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ADD_PTR_1_6]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ADD_PTR_1_6]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_6_NEG:%.*]] = sub nuw nsw i64 -7, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_6_NEG]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ADD_PTR_1_7]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ADD_PTR_1_7]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 1 ; CHECK-NEXT: [[ADD_1_7]] = add i32 [[TMP23]], [[SUM_11_1]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_7]] = add nuw nsw i64 [[INDVARS_IV_1]], 8 ; CHECK-NEXT: [[EXITCOND_1_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1_7]], 32 -; CHECK-NEXT: br i1 [[EXITCOND_1_NOT_7]], label [[FOR_BODY4_2:%.*]], label [[FOR_BODY4_1]], !llvm.loop [[LOOP7]] -; CHECK: for.body4.2: -; CHECK-NEXT: [[INDVARS_IV_2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_2_7:%.*]], [[FOR_BODY4_2]] ], [ 0, [[FOR_BODY4_1]] ] -; CHECK-NEXT: [[SUM_11_2:%.*]] = phi i32 [ [[ADD_2_7:%.*]], [[FOR_BODY4_2]] ], [ [[ADD_1_7]], [[FOR_BODY4_1]] ] +; CHECK-NEXT: br i1 [[EXITCOND_1_NOT_7]], label %[[FOR_BODY4_2:.*]], label %[[FOR_BODY4_1]], !llvm.loop [[LOOP7]] +; CHECK: [[FOR_BODY4_2]]: +; CHECK-NEXT: [[INDVARS_IV_2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_2_7:%.*]], 
%[[FOR_BODY4_2]] ], [ 0, %[[FOR_BODY4_1]] ] +; CHECK-NEXT: [[SUM_11_2:%.*]] = phi i32 [ [[ADD_2_7:%.*]], %[[FOR_BODY4_2]] ], [ [[ADD_1_7]], %[[FOR_BODY4_1]] ] ; CHECK-NEXT: [[IDX_NEG_2:%.*]] = sub nsw i64 0, [[INDVARS_IV_2]] ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[IDX_NEG_2]] -; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ADD_PTR_2]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ADD_PTR_2]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP24]], 3 ; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], [[SUM_11_2]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2_NEG:%.*]] = xor i64 [[INDVARS_IV_2]], -1 ; CHECK-NEXT: [[ADD_PTR_2_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_NEG]] -; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ADD_PTR_2_1]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ADD_PTR_2_1]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[MUL_2_1:%.*]] = mul i32 [[TMP25]], 3 ; CHECK-NEXT: [[ADD_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD_2]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2_1_NEG:%.*]] = sub nuw nsw i64 -2, [[INDVARS_IV_2]] ; CHECK-NEXT: [[ADD_PTR_2_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_1_NEG]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ADD_PTR_2_2]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ADD_PTR_2_2]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[MUL_2_2:%.*]] = mul i32 [[TMP26]], 3 ; CHECK-NEXT: [[ADD_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD_2_1]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2_2_NEG:%.*]] = sub nuw nsw i64 -3, [[INDVARS_IV_2]] ; CHECK-NEXT: [[ADD_PTR_2_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_2_NEG]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ADD_PTR_2_3]], 
align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ADD_PTR_2_3]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[MUL_2_3:%.*]] = mul i32 [[TMP27]], 3 ; CHECK-NEXT: [[ADD_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD_2_2]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2_3_NEG:%.*]] = sub nuw nsw i64 -4, [[INDVARS_IV_2]] ; CHECK-NEXT: [[ADD_PTR_2_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_3_NEG]] -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ADD_PTR_2_4]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ADD_PTR_2_4]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[MUL_2_4:%.*]] = mul i32 [[TMP28]], 3 ; CHECK-NEXT: [[ADD_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD_2_3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2_4_NEG:%.*]] = sub nuw nsw i64 -5, [[INDVARS_IV_2]] ; CHECK-NEXT: [[ADD_PTR_2_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_4_NEG]] -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[ADD_PTR_2_5]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[ADD_PTR_2_5]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[MUL_2_5:%.*]] = mul i32 [[TMP29]], 3 ; CHECK-NEXT: [[ADD_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD_2_4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2_5_NEG:%.*]] = sub nuw nsw i64 -6, [[INDVARS_IV_2]] ; CHECK-NEXT: [[ADD_PTR_2_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_5_NEG]] -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ADD_PTR_2_6]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ADD_PTR_2_6]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[MUL_2_6:%.*]] = mul i32 [[TMP30]], 3 ; CHECK-NEXT: [[ADD_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD_2_5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2_6_NEG:%.*]] = sub nuw nsw i64 -7, [[INDVARS_IV_2]] ; CHECK-NEXT: [[ADD_PTR_2_7:%.*]] = getelementptr inbounds i32, ptr getelementptr 
inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_6_NEG]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ADD_PTR_2_7]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ADD_PTR_2_7]], align 4, !tbaa [[INT_TBAA3]] ; CHECK-NEXT: [[MUL_2_7:%.*]] = mul i32 [[TMP31]], 3 ; CHECK-NEXT: [[ADD_2_7]] = add i32 [[MUL_2_7]], [[ADD_2_6]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2_7]] = add nuw nsw i64 [[INDVARS_IV_2]], 8 ; CHECK-NEXT: [[EXITCOND_2_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2_7]], 32 -; CHECK-NEXT: br i1 [[EXITCOND_2_NOT_7]], label [[FOR_INC5_2:%.*]], label [[FOR_BODY4_2]], !llvm.loop [[LOOP7]] -; CHECK: for.inc5.2: +; CHECK-NEXT: br i1 [[EXITCOND_2_NOT_7]], label %[[FOR_INC5_2:.*]], label %[[FOR_BODY4_2]], !llvm.loop [[LOOP7]] +; CHECK: [[FOR_INC5_2]]: ; CHECK-NEXT: ret i32 [[ADD_2_7]] ; entry: @@ -210,3 +211,11 @@ attributes #2 = { argmemonly nocallback nofree nosync nounwind willreturn } !7 = distinct !{!7, !8} !8 = !{!"llvm.loop.mustprogress"} !9 = distinct !{!9, !8} +;. +; CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +; CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = !{!"llvm.loop.mustprogress"} +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-final-loop-unrolling-2.ll b/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-final-loop-unrolling-2.ll index 7fe3f33430234..f42101ffe89aa 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-final-loop-unrolling-2.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-final-loop-unrolling-2.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -O3 -S | FileCheck %s ; RUN: opt < %s -passes="default" -S | FileCheck %s @@ -20,27 +20,28 @@ $_ZNSt14__array_traitsIiLm2EE6_S_refERA2_Kim = comdat any ; Function Attrs: mustprogress nounwind uwtable define dso_local void @foo(i32 noundef %arg, ptr noundef nonnull align 4 dereferenceable(8) %arg1) #0 { -; CHECK-LABEL: @foo( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[I9:%.*]] = sdiv i32 [[ARG:%.*]], 128 +; CHECK-LABEL: define dso_local void @foo( +; CHECK-SAME: i32 noundef [[ARG:%.*]], ptr noundef nonnull writeonly align 4 captures(none) dereferenceable(8) [[ARG1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[I9:%.*]] = sdiv i32 [[ARG]], 128 ; CHECK-NEXT: [[I10:%.*]] = shl nsw i32 [[I9]], 7 ; CHECK-NEXT: [[ARG_OFF:%.*]] = add i32 [[ARG]], 127 ; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[ARG_OFF]], 255 -; CHECK-NEXT: br i1 [[TMP0]], label [[BB12:%.*]], label [[BB13:%.*]] -; CHECK: bb12.loopexit: +; CHECK-NEXT: br i1 [[TMP0]], label %[[BB12:.*]], label %[[BB13:.*]] +; CHECK: [[BB12_LOOPEXIT:.*]]: ; CHECK-NEXT: [[I3_SROA_8_0_INSERT_EXT:%.*]] = zext i32 [[I21_3:%.*]] to i64 ; CHECK-NEXT: [[I3_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[I3_SROA_8_0_INSERT_EXT]], 32 ; CHECK-NEXT: [[I3_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[I21_2:%.*]] to i64 ; CHECK-NEXT: [[I3_SROA_0_0_INSERT_INSERT:%.*]] = or disjoint i64 [[I3_SROA_8_0_INSERT_SHIFT]], [[I3_SROA_0_0_INSERT_EXT]] -; 
CHECK-NEXT: br label [[BB12]] -; CHECK: bb12: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ [[I3_SROA_0_0_INSERT_INSERT]], [[BB12_LOOPEXIT:%.*]] ], [ 180388626456, [[BB:%.*]] ] -; CHECK-NEXT: store i64 [[TMP1]], ptr [[ARG1:%.*]], align 4, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: br label %[[BB12]] +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ [[I3_SROA_0_0_INSERT_INSERT]], %[[BB12_LOOPEXIT]] ], [ 180388626456, %[[BB]] ] +; CHECK-NEXT: store i64 [[TMP1]], ptr [[ARG1]], align 4, !tbaa [[CHAR_TBAA5:![0-9]+]] ; CHECK-NEXT: ret void -; CHECK: bb13: -; CHECK-NEXT: [[I3_SROA_8_0:%.*]] = phi i32 [ [[I21_3]], [[BB13]] ], [ 42, [[BB]] ] -; CHECK-NEXT: [[I3_SROA_0_0:%.*]] = phi i32 [ [[I21_2]], [[BB13]] ], [ 24, [[BB]] ] -; CHECK-NEXT: [[I4_05:%.*]] = phi i32 [ [[I24_3:%.*]], [[BB13]] ], [ 0, [[BB]] ] +; CHECK: [[BB13]]: +; CHECK-NEXT: [[I3_SROA_8_0:%.*]] = phi i32 [ [[I21_3]], %[[BB13]] ], [ 42, %[[BB]] ] +; CHECK-NEXT: [[I3_SROA_0_0:%.*]] = phi i32 [ [[I21_2]], %[[BB13]] ], [ 24, %[[BB]] ] +; CHECK-NEXT: [[I4_05:%.*]] = phi i32 [ [[I24_3:%.*]], %[[BB13]] ], [ 0, %[[BB]] ] ; CHECK-NEXT: [[I21:%.*]] = mul nsw i32 [[I3_SROA_0_0]], [[I4_05]] ; CHECK-NEXT: [[I24:%.*]] = or disjoint i32 [[I4_05]], 1 ; CHECK-NEXT: [[I21_1:%.*]] = mul nsw i32 [[I3_SROA_8_0]], [[I24]] @@ -50,7 +51,7 @@ define dso_local void @foo(i32 noundef %arg, ptr noundef nonnull align 4 derefer ; CHECK-NEXT: [[I21_3]] = mul nsw i32 [[I21_1]], [[I24_2]] ; CHECK-NEXT: [[I24_3]] = add nuw nsw i32 [[I4_05]], 4 ; CHECK-NEXT: [[I11_NOT_3:%.*]] = icmp eq i32 [[I24_3]], [[I10]] -; CHECK-NEXT: br i1 [[I11_NOT_3]], label [[BB12_LOOPEXIT]], label [[BB13]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[I11_NOT_3]], label %[[BB12_LOOPEXIT]], label %[[BB13]], !llvm.loop [[LOOP8:![0-9]+]] ; bb: %i = alloca i32, align 4 @@ -166,3 +167,11 @@ attributes #3 = { nounwind } !14 = !{!7, !7, i64 0} !15 = !{!16, !16, i64 0} !16 = !{!"long", !7, i64 0} +;. 
+; CHECK: [[CHAR_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +; CHECK: [[META7]] = !{!"Simple C++ TBAA"} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]} +; CHECK: [[META9]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META10]] = !{!"llvm.loop.isvectorized", i32 1} +;. diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll index 00453e701ee51..7954ff051a33d 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O1 %s ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O2 %s ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O3 %s @@ -14,125 +14,125 @@ target triple = "x86_64-unknown-linux-gnu" $_ZNSt6vectorIiSaIiEEixEm = comdat any define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 dereferenceable(24) %data, i64 noundef %numElems) { -; O1-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy -; O1-SAME: (ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; O1-NEXT: entry: +; O1-LABEL: define dso_local void @_Z7computeRSt6vectorIiSaIiEEy( +; O1-SAME: ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O1-NEXT: [[ENTRY:.*]]: ; O1-NEXT: [[CMP24_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 ; O1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DATA]], align 8 
-; O1-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] -; O1: for.cond1.preheader: -; O1-NEXT: [[I_06:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; O1-NEXT: br i1 [[CMP24_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4:%.*]] -; O1: for.cond.cleanup: +; O1-NEXT: br label %[[FOR_COND1_PREHEADER:.*]] +; O1: [[FOR_COND1_PREHEADER]]: +; O1-NEXT: [[I_06:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC7:%.*]], %[[FOR_COND_CLEANUP3:.*]] ] +; O1-NEXT: br i1 [[CMP24_NOT]], label %[[FOR_COND_CLEANUP3]], label %[[FOR_BODY4:.*]] +; O1: [[FOR_COND_CLEANUP:.*]]: ; O1-NEXT: ret void -; O1: for.cond.cleanup3: +; O1: [[FOR_COND_CLEANUP3]]: ; O1-NEXT: [[INC7]] = add nuw nsw i64 [[I_06]], 1 ; O1-NEXT: [[EXITCOND7_NOT:%.*]] = icmp eq i64 [[INC7]], 100 -; O1-NEXT: br i1 [[EXITCOND7_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP0:![0-9]+]] -; O1: for.body4: -; O1-NEXT: [[J_05:%.*]] = phi i64 [ [[INC5:%.*]], [[FOR_BODY4]] ], [ 0, [[FOR_COND1_PREHEADER]] ] +; O1-NEXT: br i1 [[EXITCOND7_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP0:![0-9]+]] +; O1: [[FOR_BODY4]]: +; O1-NEXT: [[J_05:%.*]] = phi i64 [ [[INC5:%.*]], %[[FOR_BODY4]] ], [ 0, %[[FOR_COND1_PREHEADER]] ] ; O1-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[J_05]] -; O1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4, !tbaa [[TBAA2:![0-9]+]] +; O1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] ; O1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -; O1-NEXT: store i32 [[INC]], ptr [[ADD_PTR_I]], align 4, !tbaa [[TBAA2]] +; O1-NEXT: store i32 [[INC]], ptr [[ADD_PTR_I]], align 4, !tbaa [[INT_TBAA2]] ; O1-NEXT: [[INC5]] = add nuw i64 [[J_05]], 1 ; O1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5]], [[NUMELEMS]] -; O1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP6:![0-9]+]] +; O1-NEXT: br i1 
[[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP3]], label %[[FOR_BODY4]], !llvm.loop [[LOOP6:![0-9]+]] ; -; O2-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy -; O2-SAME: (ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; O2-NEXT: entry: +; O2-LABEL: define dso_local void @_Z7computeRSt6vectorIiSaIiEEy( +; O2-SAME: ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O2-NEXT: [[ENTRY:.*]]: ; O2-NEXT: [[CMP24_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 ; O2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DATA]], align 8 ; O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 ; O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 ; O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUMELEMS]], [[N_VEC]] -; O2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] -; O2: for.cond1.preheader: -; O2-NEXT: [[I_06:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; O2-NEXT: br i1 [[CMP24_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER:%.*]] -; O2: for.body4.preheader: -; O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_PREHEADER9:%.*]], label [[VECTOR_BODY:%.*]] -; O2: vector.body: -; O2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_BODY4_PREHEADER]] ] +; O2-NEXT: br label %[[FOR_COND1_PREHEADER:.*]] +; O2: [[FOR_COND1_PREHEADER]]: +; O2-NEXT: [[I_06:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC7:%.*]], %[[FOR_COND_CLEANUP3:.*]] ] +; O2-NEXT: br i1 [[CMP24_NOT]], label %[[FOR_COND_CLEANUP3]], label %[[FOR_BODY4_PREHEADER:.*]] +; O2: [[FOR_BODY4_PREHEADER]]: +; O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY4_PREHEADER9:.*]], label %[[VECTOR_BODY:.*]] +; O2: [[VECTOR_BODY]]: +; O2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], [ 0, %[[FOR_BODY4_PREHEADER]] ] ; O2-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[INDEX]] ; O2-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; O2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]] +; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] +; O2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA0]] ; O2-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1) ; O2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD8]], splat (i32 1) -; O2-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; O2-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP2]], align 4, !tbaa [[TBAA0]] +; O2-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[INT_TBAA0]] +; O2-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP2]], align 4, !tbaa [[INT_TBAA0]] ; O2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; O2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; O2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; O2: middle.block: -; O2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER9]] -; O2: for.body4.preheader9: -; O2-NEXT: [[J_05_PH:%.*]] = phi i64 [ 0, [[FOR_BODY4_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; O2-NEXT: br label [[FOR_BODY4:%.*]] -; O2: for.cond.cleanup: +; O2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; O2: [[MIDDLE_BLOCK]]: +; O2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP3]], label %[[FOR_BODY4_PREHEADER9]] +; O2: [[FOR_BODY4_PREHEADER9]]: +; O2-NEXT: [[J_05_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY4_PREHEADER]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] +; O2-NEXT: br label %[[FOR_BODY4:.*]] +; O2: [[FOR_COND_CLEANUP:.*]]: ; O2-NEXT: ret void -; O2: for.cond.cleanup3: +; 
O2: [[FOR_COND_CLEANUP3]]: ; O2-NEXT: [[INC7]] = add nuw nsw i64 [[I_06]], 1 ; O2-NEXT: [[EXITCOND7_NOT:%.*]] = icmp eq i64 [[INC7]], 100 -; O2-NEXT: br i1 [[EXITCOND7_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP8:![0-9]+]] -; O2: for.body4: -; O2-NEXT: [[J_05:%.*]] = phi i64 [ [[INC5:%.*]], [[FOR_BODY4]] ], [ [[J_05_PH]], [[FOR_BODY4_PREHEADER9]] ] +; O2-NEXT: br i1 [[EXITCOND7_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP8:![0-9]+]] +; O2: [[FOR_BODY4]]: +; O2-NEXT: [[J_05:%.*]] = phi i64 [ [[INC5:%.*]], %[[FOR_BODY4]] ], [ [[J_05_PH]], %[[FOR_BODY4_PREHEADER9]] ] ; O2-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[J_05]] -; O2-NEXT: [[TMP6:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4, !tbaa [[TBAA0]] +; O2-NEXT: [[TMP6:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4, !tbaa [[INT_TBAA0]] ; O2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 -; O2-NEXT: store i32 [[INC]], ptr [[ADD_PTR_I]], align 4, !tbaa [[TBAA0]] +; O2-NEXT: store i32 [[INC]], ptr [[ADD_PTR_I]], align 4, !tbaa [[INT_TBAA0]] ; O2-NEXT: [[INC5]] = add nuw i64 [[J_05]], 1 ; O2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5]], [[NUMELEMS]] -; O2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP9:![0-9]+]] +; O2-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP3]], label %[[FOR_BODY4]], !llvm.loop [[LOOP9:![0-9]+]] ; -; O3-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy -; O3-SAME: (ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; O3-NEXT: entry: +; O3-LABEL: define dso_local void @_Z7computeRSt6vectorIiSaIiEEy( +; O3-SAME: ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O3-NEXT: [[ENTRY:.*:]] ; O3-NEXT: [[CMP24_NOT:%.*]] = 
icmp eq i64 [[NUMELEMS]], 0 ; O3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DATA]], align 8 -; O3-NEXT: br i1 [[CMP24_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] -; O3: for.cond1.preheader.us.preheader: +; O3-NEXT: br i1 [[CMP24_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_COND1_PREHEADER_US_PREHEADER:.*]] +; O3: [[FOR_COND1_PREHEADER_US_PREHEADER]]: ; O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 ; O3-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 ; O3-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUMELEMS]], [[N_VEC]] -; O3-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] -; O3: for.cond1.preheader.us: -; O3-NEXT: [[I_06_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; O3: vector.body: -; O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ] +; O3-NEXT: br label %[[FOR_COND1_PREHEADER_US:.*]] +; O3: [[FOR_COND1_PREHEADER_US]]: +; O3-NEXT: [[I_06_US:%.*]] = phi i64 [ [[INC7_US:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY4_US_PREHEADER:.*]], label %[[VECTOR_BODY:.*]] +; O3: [[VECTOR_BODY]]: +; O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], [ 0, %[[FOR_COND1_PREHEADER_US]] ] ; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[INDEX]] ; O3-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; O3-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]] +; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] +; O3-NEXT: 
[[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA0]] ; O3-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1) ; O3-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], splat (i32 1) -; O3-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; O3-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP2]], align 4, !tbaa [[TBAA0]] +; O3-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[INT_TBAA0]] +; O3-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP2]], align 4, !tbaa [[INT_TBAA0]] ; O3-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; O3-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; O3-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; O3: middle.block: -; O3-NEXT: br i1 [[CMP_N]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US_PREHEADER]] -; O3: for.body4.us.preheader: -; O3-NEXT: [[J_05_US_PH:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; O3-NEXT: br label [[FOR_BODY4_US:%.*]] -; O3: for.body4.us: -; O3-NEXT: [[J_05_US:%.*]] = phi i64 [ [[INC5_US:%.*]], [[FOR_BODY4_US]] ], [ [[J_05_US_PH]], [[FOR_BODY4_US_PREHEADER]] ] +; O3-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; O3: [[MIDDLE_BLOCK]]: +; O3-NEXT: br i1 [[CMP_N]], label %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label %[[FOR_BODY4_US_PREHEADER]] +; O3: [[FOR_BODY4_US_PREHEADER]]: +; O3-NEXT: [[J_05_US_PH:%.*]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER_US]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] +; O3-NEXT: br label %[[FOR_BODY4_US:.*]] +; O3: [[FOR_BODY4_US]]: +; O3-NEXT: [[J_05_US:%.*]] = phi i64 [ [[INC5_US:%.*]], %[[FOR_BODY4_US]] ], [ [[J_05_US_PH]], %[[FOR_BODY4_US_PREHEADER]] ] ; O3-NEXT: [[ADD_PTR_I_US:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[J_05_US]] -; O3-NEXT: [[TMP6:%.*]] = load i32, ptr [[ADD_PTR_I_US]], align 4, !tbaa [[TBAA0]] +; 
O3-NEXT: [[TMP6:%.*]] = load i32, ptr [[ADD_PTR_I_US]], align 4, !tbaa [[INT_TBAA0]] ; O3-NEXT: [[INC_US:%.*]] = add nsw i32 [[TMP6]], 1 -; O3-NEXT: store i32 [[INC_US]], ptr [[ADD_PTR_I_US]], align 4, !tbaa [[TBAA0]] +; O3-NEXT: store i32 [[INC_US]], ptr [[ADD_PTR_I_US]], align 4, !tbaa [[INT_TBAA0]] ; O3-NEXT: [[INC5_US]] = add nuw i64 [[J_05_US]], 1 ; O3-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5_US]], [[NUMELEMS]] -; O3-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP8:![0-9]+]] -; O3: for.cond1.for.cond.cleanup3_crit_edge.us: +; O3-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label %[[FOR_BODY4_US]], !llvm.loop [[LOOP8:![0-9]+]] +; O3: [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]: ; O3-NEXT: [[INC7_US]] = add nuw nsw i64 [[I_06_US]], 1 ; O3-NEXT: [[EXITCOND8_NOT:%.*]] = icmp eq i64 [[INC7_US]], 100 -; O3-NEXT: br i1 [[EXITCOND8_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]], !llvm.loop [[LOOP9:![0-9]+]] -; O3: for.cond.cleanup: +; O3-NEXT: br i1 [[EXITCOND8_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_COND1_PREHEADER_US]], !llvm.loop [[LOOP9:![0-9]+]] +; O3: [[FOR_COND_CLEANUP]]: ; O3-NEXT: ret void ; entry: @@ -237,3 +237,34 @@ declare void @llvm.lifetime.end.p0(ptr nocapture) !15 = !{!"long", !5, i64 0} !16 = !{!17, !4, i64 0} !17 = !{!"_ZTSNSt12_Vector_baseIiSaIiEE17_Vector_impl_dataE", !4, i64 0, !4, i64 8, !4, i64 16} +;. +; O1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; O1: [[META1]] = !{!"llvm.loop.mustprogress"} +; O1: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +; O1: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +; O1: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +; O1: [[META5]] = !{!"Simple C++ TBAA"} +; O1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]} +;. 
+; O2: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; O2: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; O2: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; O2: [[META3]] = !{!"Simple C++ TBAA"} +; O2: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]]} +; O2: [[META5]] = !{!"llvm.loop.mustprogress"} +; O2: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} +; O2: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; O2: [[LOOP8]] = distinct !{[[LOOP8]], [[META5]]} +; O2: [[LOOP9]] = distinct !{[[LOOP9]], [[META5]], [[META7]], [[META6]]} +;. +; O3: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; O3: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; O3: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; O3: [[META3]] = !{!"Simple C++ TBAA"} +; O3: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]]} +; O3: [[META5]] = !{!"llvm.loop.mustprogress"} +; O3: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} +; O3: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; O3: [[LOOP8]] = distinct !{[[LOOP8]], [[META5]], [[META7]], [[META6]]} +; O3: [[LOOP9]] = distinct !{[[LOOP9]], [[META5]]} +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll index cb378465e30ec..ac736518c0cbd 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes="default" -S %s | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" @@ -23,18 +23,18 @@ define void @test(i32 noundef %nface, i32 noundef %ncell, ptr noalias noundef %f ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_EPIL]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !tbaa [[TBAA0:![0-9]+]], !llvm.access.group [[ACC_GRP4:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !tbaa [[INT_TBAA0:![0-9]+]], !llvm.access.group [[ACC_GRP4:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_EPIL]] -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA0]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[Y]], <4 x i64> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD12]] to <4 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[X]], <4 x i64> 
[[TMP5]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[TBAA5:![0-9]+]], !llvm.access.group [[ACC_GRP4]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP6]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[DOUBLE_TBAA5:![0-9]+]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP6]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast olt <4 x double> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER13]] ; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x double> [[WIDE_MASKED_GATHER13]], <4 x double> [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: tail call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP8]], <4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: tail call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP8]], <4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV_EPIL]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[UNROLL_ITER]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] @@ -49,18 +49,18 @@ define void @test(i32 noundef %nface, i32 noundef %ncell, ptr noalias noundef %f ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], 
%[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER14]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_NEXT_2]] -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_NEXT_2]] -; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[GEP]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[GEP]], align 4, !tbaa [[INT_TBAA0]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[IDXPROM3_3:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[ARRAYIDX4_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_3]] ; CHECK-NEXT: [[IDXPROM5_3:%.*]] = sext i32 [[TMP23]] to i64 ; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_3]] -; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[ARRAYIDX4_3]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] -; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[ARRAYIDX6_3]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[ARRAYIDX4_3]], align 8, !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[ARRAYIDX6_3]], align 8, !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[CMP_I_3:%.*]] = fcmp fast olt double [[TMP24]], [[TMP25]] ; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[CMP_I_3]], double [[TMP25]], double [[TMP24]] -; CHECK-NEXT: store double [[TMP26]], ptr [[ARRAYIDX4_3]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: store double [[TMP26]], ptr [[ARRAYIDX4_3]], align 8, !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: 
[[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -190,12 +190,12 @@ attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !18 = !{!"llvm.loop.vectorize.enable", i1 true} ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"Simple C++ TBAA"} ; CHECK: [[ACC_GRP4]] = distinct !{} -; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[DOUBLE_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} ; CHECK: [[META6]] = !{!"double", [[META2]], i64 0} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META11:![0-9]+]]} ; CHECK: [[META8]] = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll index ec387d6ae44f2..fcdb68353311d 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O1 %s ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O23 %s ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O23 %s @@ -9,57 +9,59 @@ target triple = "x86_64-unknown-linux-gnu" ; We should retain the TBAA on the load here, not lose it. 
define void @licm(ptr align 8 dereferenceable(8) %_M_start.i, i64 %numElem) { -; O1-LABEL: @licm( -; O1-NEXT: entry: -; O1-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] -; O1: for.body.lr.ph: -; O1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] -; O1-NEXT: br label [[FOR_BODY:%.*]] -; O1: for.body: -; O1-NEXT: [[K_02:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; O1-LABEL: define void @licm( +; O1-SAME: ptr readonly align 8 captures(none) dereferenceable(8) [[_M_START_I:%.*]], i64 [[NUMELEM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O1-NEXT: [[ENTRY:.*:]] +; O1-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM]], 0 +; O1-NEXT: br i1 [[CMP1_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_LR_PH:.*]] +; O1: [[FOR_BODY_LR_PH]]: +; O1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_START_I]], align 8, !tbaa [[ANYPTR_TBAA3:![0-9]+]] +; O1-NEXT: br label %[[FOR_BODY:.*]] +; O1: [[FOR_BODY]]: +; O1-NEXT: [[K_02:%.*]] = phi i64 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] ; O1-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 [[K_02]] -; O1-NEXT: store double 2.000000e+00, ptr [[ADD_PTR_I]], align 8, !tbaa [[TBAA8:![0-9]+]] +; O1-NEXT: store double 2.000000e+00, ptr [[ADD_PTR_I]], align 8, !tbaa [[DOUBLE_TBAA8:![0-9]+]] ; O1-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; O1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; O1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] -; O1: for.cond.cleanup: +; O1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; O1: [[FOR_COND_CLEANUP]]: ; O1-NEXT: ret void ; -; O23-LABEL: @licm( -; O23-NEXT: entry: -; O23-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label 
[[FOR_BODY_LR_PH:%.*]] -; O23: for.body.lr.ph: -; O23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] +; O23-LABEL: define void @licm( +; O23-SAME: ptr readonly align 8 captures(none) dereferenceable(8) [[_M_START_I:%.*]], i64 [[NUMELEM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O23-NEXT: [[ENTRY:.*:]] +; O23-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM]], 0 +; O23-NEXT: br i1 [[CMP1_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_LR_PH:.*]] +; O23: [[FOR_BODY_LR_PH]]: +; O23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_START_I]], align 8, !tbaa [[ANYPTR_TBAA3:![0-9]+]] ; O23-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] -; O23: vector.ph: +; O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]] +; O23: [[VECTOR_PH]]: ; O23-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 -; O23-NEXT: br label [[VECTOR_BODY:%.*]] -; O23: vector.body: -; O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; O23-NEXT: br label %[[VECTOR_BODY:.*]] +; O23: [[VECTOR_BODY]]: +; O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 [[INDEX]] ; O23-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; O23-NEXT: store <2 x double> splat (double 2.000000e+00), ptr [[TMP1]], align 8, !tbaa [[TBAA8:![0-9]+]] -; O23-NEXT: store <2 x double> splat (double 2.000000e+00), ptr [[TMP2]], align 8, !tbaa [[TBAA8]] +; O23-NEXT: store <2 x double> splat (double 2.000000e+00), ptr [[TMP1]], align 8, !tbaa [[DOUBLE_TBAA8:![0-9]+]] +; O23-NEXT: store <2 x double> splat (double 2.000000e+00), ptr [[TMP2]], align 8, !tbaa [[DOUBLE_TBAA8]] ; O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; O23-NEXT: [[TMP3:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], [[N_VEC]] -; O23-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; O23: middle.block: +; O23-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; O23: [[MIDDLE_BLOCK]]: ; O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUMELEM]], [[N_VEC]] -; O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER]] -; O23: for.body.preheader: -; O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; O23-NEXT: br label [[FOR_BODY:%.*]] -; O23: for.body: -; O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER]] ] +; O23-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY_PREHEADER]] +; O23: [[FOR_BODY_PREHEADER]]: +; O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] +; O23-NEXT: br label %[[FOR_BODY:.*]] +; O23: [[FOR_BODY]]: +; O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[K_02_PH]], %[[FOR_BODY_PREHEADER]] ] ; O23-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 [[K_02]] -; O23-NEXT: store double 2.000000e+00, ptr [[ADD_PTR_I]], align 8, !tbaa [[TBAA8]] +; O23-NEXT: store double 2.000000e+00, ptr [[ADD_PTR_I]], align 8, !tbaa [[DOUBLE_TBAA8]] ; O23-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; O23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] -; O23: for.cond.cleanup: +; O23-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; O23: [[FOR_COND_CLEANUP]]: ; O23-NEXT: ret void ; entry: @@ -94,3 +96,24 @@ for.cond.cleanup: ; preds = %for.cond !7 = !{!"Simple C++ TBAA"} !8 = !{!9, !9, i64 0} !9 = !{!"double", !6, i64 0} +;. 
+; O1: [[ANYPTR_TBAA3]] = !{[[META4:![0-9]+]], [[META5:![0-9]+]], i64 0} +; O1: [[META4]] = !{!"_ZTSNSt12_Vector_baseIdSaIdEE17_Vector_impl_dataE", [[META5]], i64 0, [[META5]], i64 8, [[META5]], i64 16} +; O1: [[META5]] = !{!"any pointer", [[META6:![0-9]+]], i64 0} +; O1: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +; O1: [[META7]] = !{!"Simple C++ TBAA"} +; O1: [[DOUBLE_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +; O1: [[META9]] = !{!"double", [[META6]], i64 0} +;. +; O23: [[ANYPTR_TBAA3]] = !{[[META4:![0-9]+]], [[META5:![0-9]+]], i64 0} +; O23: [[META4]] = !{!"_ZTSNSt12_Vector_baseIdSaIdEE17_Vector_impl_dataE", [[META5]], i64 0, [[META5]], i64 8, [[META5]], i64 16} +; O23: [[META5]] = !{!"any pointer", [[META6:![0-9]+]], i64 0} +; O23: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +; O23: [[META7]] = !{!"Simple C++ TBAA"} +; O23: [[DOUBLE_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +; O23: [[META9]] = !{!"double", [[META6]], i64 0} +; O23: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]], [[META12:![0-9]+]]} +; O23: [[META11]] = !{!"llvm.loop.isvectorized", i32 1} +; O23: [[META12]] = !{!"llvm.loop.unroll.runtime.disable"} +; O23: [[LOOP13]] = distinct !{[[LOOP13]], [[META12]], [[META11]]} +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll index 438a93c735796..574132c18d263 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O1 %s ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O23 %s ; RUN: opt -passes="default" -S < %s | FileCheck --check-prefixes=O23 %s @@ -13,65 +13,65 @@ target triple = "x86_64-unknown-linux-gnu" $_ZN12FloatVecPair6vecIncEv = comdat any define dso_local void @_Z13vecIncFromPtrP12FloatVecPair(ptr %FVP) { -; O1-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair -; O1-SAME: (ptr readonly captures(none) [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; O1-NEXT: entry: +; O1-LABEL: define dso_local void @_Z13vecIncFromPtrP12FloatVecPair( +; O1-SAME: ptr readonly captures(none) [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O1-NEXT: [[ENTRY:.*:]] ; O1-NEXT: [[VSRC23_I:%.*]] = getelementptr inbounds nuw i8, ptr [[FVP]], i64 16 -; O1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VSRC23_I]], align 8, !tbaa [[TBAA0:![0-9]+]] +; O1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VSRC23_I]], align 8, !tbaa [[ANYPTR_TBAA0:![0-9]+]] ; O1-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], ptr [[TMP0]], i64 undef ; O1-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_I_I]], i64 8 -; O1-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]] +; O1-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIZE4_I]], align 8, !tbaa [[INT_TBAA6:![0-9]+]] ; O1-NEXT: [[CMP56_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 -; O1-NEXT: br i1 
[[CMP56_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] -; O1: for.body7.lr.ph.i: -; O1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX_I_I]], align 8, !tbaa [[TBAA8:![0-9]+]] +; O1-NEXT: br i1 [[CMP56_NOT_I]], label %[[_ZN12FLOATVECPAIR6VECINCEV_EXIT:.*]], label %[[FOR_BODY7_LR_PH_I:.*]] +; O1: [[FOR_BODY7_LR_PH_I]]: +; O1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX_I_I]], align 8, !tbaa [[ANYPTR_TBAA8:![0-9]+]] ; O1-NEXT: [[ARRAYIDX_I3_I:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 undef -; O1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[FVP]], align 8, !tbaa [[TBAA0]] +; O1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[FVP]], align 8, !tbaa [[ANYPTR_TBAA0]] ; O1-NEXT: [[ARRAYIDX_I4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], ptr [[TMP3]], i64 undef -; O1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX_I4_I]], align 8, !tbaa [[TBAA8]] +; O1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX_I4_I]], align 8, !tbaa [[ANYPTR_TBAA8]] ; O1-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 undef -; O1-NEXT: br label [[FOR_BODY7_I:%.*]] -; O1: for.body7.i: -; O1-NEXT: [[J_07_I:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], [[FOR_BODY7_I]] ] -; O1-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX_I3_I]], align 4, !tbaa [[TBAA9:![0-9]+]] -; O1-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_I5_I]], align 4, !tbaa [[TBAA9]] +; O1-NEXT: br label %[[FOR_BODY7_I:.*]] +; O1: [[FOR_BODY7_I]]: +; O1-NEXT: [[J_07_I:%.*]] = phi i32 [ 0, %[[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], %[[FOR_BODY7_I]] ] +; O1-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX_I3_I]], align 4, !tbaa [[FLOAT_TBAA9:![0-9]+]] +; O1-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_I5_I]], align 4, !tbaa [[FLOAT_TBAA9]] ; O1-NEXT: [[ADD_I:%.*]] = fadd float [[TMP5]], [[TMP6]] -; O1-NEXT: store float [[ADD_I]], ptr [[ARRAYIDX_I5_I]], align 4, !tbaa [[TBAA9]] +; O1-NEXT: store float [[ADD_I]], ptr [[ARRAYIDX_I5_I]], align 4, !tbaa 
[[FLOAT_TBAA9]] ; O1-NEXT: [[INC_I]] = add nuw i32 [[J_07_I]], 1 ; O1-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[TMP1]] -; O1-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label [[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] -; O1: _ZN12FloatVecPair6vecIncEv.exit: +; O1-NEXT: br i1 [[EXITCOND_NOT_I]], label %[[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label %[[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] +; O1: [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]]: ; O1-NEXT: ret void ; -; O23-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair -; O23-SAME: (ptr readonly captures(none) [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; O23-NEXT: entry: +; O23-LABEL: define dso_local void @_Z13vecIncFromPtrP12FloatVecPair( +; O23-SAME: ptr readonly captures(none) [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O23-NEXT: [[ENTRY:.*:]] ; O23-NEXT: [[VSRC23_I:%.*]] = getelementptr inbounds nuw i8, ptr [[FVP]], i64 16 -; O23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VSRC23_I]], align 8, !tbaa [[TBAA0:![0-9]+]] +; O23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VSRC23_I]], align 8, !tbaa [[ANYPTR_TBAA0:![0-9]+]] ; O23-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], ptr [[TMP0]], i64 undef ; O23-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_I_I]], i64 8 -; O23-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]] +; O23-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIZE4_I]], align 8, !tbaa [[INT_TBAA6:![0-9]+]] ; O23-NEXT: [[CMP56_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 -; O23-NEXT: br i1 [[CMP56_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] -; O23: for.body7.lr.ph.i: -; O23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX_I_I]], align 8, !tbaa [[TBAA8:![0-9]+]] +; O23-NEXT: br i1 [[CMP56_NOT_I]], label %[[_ZN12FLOATVECPAIR6VECINCEV_EXIT:.*]], label %[[FOR_BODY7_LR_PH_I:.*]] +; O23: [[FOR_BODY7_LR_PH_I]]: +; O23-NEXT: [[TMP2:%.*]] = load 
ptr, ptr [[ARRAYIDX_I_I]], align 8, !tbaa [[ANYPTR_TBAA8:![0-9]+]] ; O23-NEXT: [[ARRAYIDX_I3_I:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 undef -; O23-NEXT: [[TMP3:%.*]] = load ptr, ptr [[FVP]], align 8, !tbaa [[TBAA0]] +; O23-NEXT: [[TMP3:%.*]] = load ptr, ptr [[FVP]], align 8, !tbaa [[ANYPTR_TBAA0]] ; O23-NEXT: [[ARRAYIDX_I4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], ptr [[TMP3]], i64 undef -; O23-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX_I4_I]], align 8, !tbaa [[TBAA8]] +; O23-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX_I4_I]], align 8, !tbaa [[ANYPTR_TBAA8]] ; O23-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 undef -; O23-NEXT: [[DOTPRE_I:%.*]] = load float, ptr [[ARRAYIDX_I5_I]], align 4, !tbaa [[TBAA9:![0-9]+]] -; O23-NEXT: br label [[FOR_BODY7_I:%.*]] -; O23: for.body7.i: -; O23-NEXT: [[TMP5:%.*]] = phi float [ [[DOTPRE_I]], [[FOR_BODY7_LR_PH_I]] ], [ [[ADD_I:%.*]], [[FOR_BODY7_I]] ] -; O23-NEXT: [[J_07_I:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], [[FOR_BODY7_I]] ] -; O23-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_I3_I]], align 4, !tbaa [[TBAA9]] +; O23-NEXT: [[DOTPRE_I:%.*]] = load float, ptr [[ARRAYIDX_I5_I]], align 4, !tbaa [[FLOAT_TBAA9:![0-9]+]] +; O23-NEXT: br label %[[FOR_BODY7_I:.*]] +; O23: [[FOR_BODY7_I]]: +; O23-NEXT: [[TMP5:%.*]] = phi float [ [[DOTPRE_I]], %[[FOR_BODY7_LR_PH_I]] ], [ [[ADD_I:%.*]], %[[FOR_BODY7_I]] ] +; O23-NEXT: [[J_07_I:%.*]] = phi i32 [ 0, %[[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], %[[FOR_BODY7_I]] ] +; O23-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_I3_I]], align 4, !tbaa [[FLOAT_TBAA9]] ; O23-NEXT: [[ADD_I]] = fadd float [[TMP5]], [[TMP6]] -; O23-NEXT: store float [[ADD_I]], ptr [[ARRAYIDX_I5_I]], align 4, !tbaa [[TBAA9]] +; O23-NEXT: store float [[ADD_I]], ptr [[ARRAYIDX_I5_I]], align 4, !tbaa [[FLOAT_TBAA9]] ; O23-NEXT: [[INC_I]] = add nuw i32 [[J_07_I]], 1 ; O23-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[TMP1]] 
-; O23-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label [[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] -; O23: _ZN12FloatVecPair6vecIncEv.exit: +; O23-NEXT: br i1 [[EXITCOND_NOT_I]], label %[[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label %[[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] +; O23: [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]]: ; O23-NEXT: ret void ; entry: @@ -163,3 +163,32 @@ entry: !12 = !{!13, !1, i64 0} !13 = !{!"_ZTS14HomemadeVectorIS_IfLj8EELj8EE", !1, i64 0, !5, i64 8} !14 = !{!7, !1, i64 0} +;. +; O1: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0} +; O1: [[META1]] = !{!"_ZTS14HomemadeVectorIS_IfLj8EELj8EE", [[META2]], i64 0, [[META5:![0-9]+]], i64 8} +; O1: [[META2]] = !{!"any pointer", [[META3:![0-9]+]], i64 0} +; O1: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; O1: [[META4]] = !{!"Simple C++ TBAA"} +; O1: [[META5]] = !{!"int", [[META3]], i64 0} +; O1: [[INT_TBAA6]] = !{[[META7:![0-9]+]], [[META5]], i64 8} +; O1: [[META7]] = !{!"_ZTS14HomemadeVectorIfLj8EE", [[META2]], i64 0, [[META5]], i64 8} +; O1: [[ANYPTR_TBAA8]] = !{[[META7]], [[META2]], i64 0} +; O1: [[FLOAT_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +; O1: [[META10]] = !{!"float", [[META3]], i64 0} +; O1: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]} +; O1: [[META12]] = !{!"llvm.loop.mustprogress"} +;. 
+; O23: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0} +; O23: [[META1]] = !{!"_ZTS14HomemadeVectorIS_IfLj8EELj8EE", [[META2]], i64 0, [[META5:![0-9]+]], i64 8} +; O23: [[META2]] = !{!"any pointer", [[META3:![0-9]+]], i64 0} +; O23: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; O23: [[META4]] = !{!"Simple C++ TBAA"} +; O23: [[META5]] = !{!"int", [[META3]], i64 0} +; O23: [[INT_TBAA6]] = !{[[META7:![0-9]+]], [[META5]], i64 8} +; O23: [[META7]] = !{!"_ZTS14HomemadeVectorIfLj8EE", [[META2]], i64 0, [[META5]], i64 8} +; O23: [[ANYPTR_TBAA8]] = !{[[META7]], [[META2]], i64 0} +; O23: [[FLOAT_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +; O23: [[META10]] = !{!"float", [[META3]], i64 0} +; O23: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]} +; O23: [[META12]] = !{!"llvm.loop.mustprogress"} +;. diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll index 69a46b26decb2..ae6f4a7b76ab8 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -O3 -S | FileCheck %s target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -15,22 +15,23 @@ target triple = "x86_64-apple-macosx11.0.0" ; } define void @vdiv(ptr %a, float %b) #0 { -; CHECK-LABEL: @vdiv( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0 +; CHECK-LABEL: define void @vdiv( +; CHECK-SAME: ptr captures(none) [[A:%.*]], float [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B]], i64 0 ; CHECK-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[BROADCAST_SPLAT]] -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[FLOAT_TBAA3:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP0]] -; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[FLOAT_TBAA3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: for.cond.cleanup: +; CHECK-NEXT: br i1 [[TMP5]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; entry: @@ -97,3 +98,14 @@ attributes #2 = { nounwind } !11 = distinct !{!11, !12, !13} !12 = !{!"llvm.loop.mustprogress"} !13 = !{!"llvm.loop.unroll.disable"} +;. 
+; CHECK: [[FLOAT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"float", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +; CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META11:![0-9]+]]} +; CHECK: [[META8]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META9]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[META10]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META11]] = !{!"llvm.loop.unroll.runtime.disable"} +;. diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll index 7817c23e6a3ec..f7bc01e0e8af1 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -O3 -S | FileCheck %s ; RUN: opt < %s -passes="default" -S | FileCheck %s @@ -12,41 +12,42 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 target triple = "x86_64-apple-macosx10.15.0" define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { -; CHECK-LABEL: @vdiv( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: iter.check: -; CHECK-NEXT: [[X4:%.*]] = ptrtoint ptr [[X:%.*]] to i64 -; CHECK-NEXT: [[Y5:%.*]] = ptrtoint ptr [[Y:%.*]] to i64 +; CHECK-LABEL: define void @vdiv( +; CHECK-SAME: ptr writeonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]], double [[A:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label 
%[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[X4:%.*]] = ptrtoint ptr [[X]] to i64 +; CHECK-NEXT: [[Y5:%.*]] = ptrtoint ptr [[Y]] to i64 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X4]], [[Y5]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_BODY_PREHEADER9:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 [[OR_COND]], label %[[FOR_BODY_PREHEADER9:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK6:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK6]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK6]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] +; CHECK: [[VECTOR_PH1]]: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483632 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT]] -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 32 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 64 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 96 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]] -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP6]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP7]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP8]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP5]], align 8, !tbaa [[DOUBLE_TBAA3:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP6]], align 8, !tbaa [[DOUBLE_TBAA3]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP7]], align 8, !tbaa [[DOUBLE_TBAA3]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP8]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP1]] ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP2]] ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP3]] @@ -55,67 +56,67 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP13]], i64 32 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP13]], i64 64 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP13]], i64 96 -; CHECK-NEXT: store <4 x double> [[TMP9]], ptr [[TMP13]], align 
8, !tbaa [[TBAA3]] -; CHECK-NEXT: store <4 x double> [[TMP10]], ptr [[TMP14]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: store <4 x double> [[TMP11]], ptr [[TMP15]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: store <4 x double> [[TMP12]], ptr [[TMP16]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store <4 x double> [[TMP9]], ptr [[TMP13]], align 8, !tbaa [[DOUBLE_TBAA3]] +; CHECK-NEXT: store <4 x double> [[TMP10]], ptr [[TMP14]], align 8, !tbaa [[DOUBLE_TBAA3]] +; CHECK-NEXT: store <4 x double> [[TMP11]], ptr [[TMP15]], align 8, !tbaa [[DOUBLE_TBAA3]] +; CHECK-NEXT: store <4 x double> [[TMP12]], ptr [[TMP16]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: middle.block: +; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY_PREHEADER9]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER9]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
%[[VECTOR_PH]] ] ; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT15]] -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDEX12]] -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP40:%.*]] = fmul fast <4 x double> [[WIDE_LOAD13]], [[TMP38]] ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDEX12]] -; CHECK-NEXT: store <4 x double> [[TMP40]], ptr [[TMP41]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store <4 x double> [[TMP40]], ptr [[TMP41]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX12]], 4 ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC11]] -; CHECK-NEXT: br i1 [[TMP42]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; 
CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[N_VEC11]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: br i1 [[CMP_N17]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER9]] +; CHECK: [[FOR_BODY_PREHEADER9]]: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[TMP43:%.*]] = sub nsw i64 [[WIDE_TRIP_COUNT]], [[INDVARS_IV_PH]] ; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP43]], 7 ; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]] -; CHECK: for.body.prol.preheader: +; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT:.*]], label %[[FOR_BODY_PROL_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PROL_PREHEADER]]: ; CHECK-NEXT: [[TMP18:%.*]] = fdiv fast double 1.000000e+00, [[A]] -; CHECK-NEXT: br label [[FOR_BODY_PROL:%.*]] -; CHECK: for.body.prol: -; CHECK-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ] -; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], [[FOR_BODY_PROL]] ], [ 0, [[FOR_BODY_PROL_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY_PROL:.*]] +; CHECK: [[FOR_BODY_PROL]]: +; CHECK-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], %[[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PROL_PREHEADER]] ] +; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], %[[FOR_BODY_PROL]] ], [ 0, %[[FOR_BODY_PROL_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_PROL:%.*]] = 
getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV_PROL]] -; CHECK-NEXT: [[T0_PROL:%.*]] = load double, ptr [[ARRAYIDX_PROL]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[T0_PROL:%.*]] = load double, ptr [[ARRAYIDX_PROL]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP19:%.*]] = fmul fast double [[T0_PROL]], [[TMP18]] ; CHECK-NEXT: [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV_PROL]] -; CHECK-NEXT: store double [[TMP19]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP19]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1 ; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] -; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP11:![0-9]+]] -; CHECK: for.body.prol.loopexit: -; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER9]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] +; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT]], label %[[FOR_BODY_PROL]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[FOR_BODY_PROL_LOOPEXIT]]: +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER9]] ], [ [[INDVARS_IV_NEXT_PROL]], %[[FOR_BODY_PROL]] ] ; CHECK-NEXT: [[TMP20:%.*]] = sub nsw i64 [[INDVARS_IV_PH]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[TMP20]], -8 -; CHECK-NEXT: br i1 [[TMP21]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9_NEW:%.*]] -; CHECK: for.body.preheader.new: +; CHECK-NEXT: br i1 [[TMP21]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER9_NEW:.*]] +; CHECK: [[FOR_BODY_PREHEADER9_NEW]]: ; CHECK-NEXT: [[TMP22:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP23:%.*]] = fdiv fast double 
1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP24:%.*]] = fdiv fast double 1.000000e+00, [[A]] @@ -124,60 +125,60 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP27:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP28:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP29:%.*]] = fdiv fast double 1.000000e+00, [[A]] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER9_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_PREHEADER9_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP30:%.*]] = fmul fast double [[T0]], [[TMP22]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store double [[TMP30]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP30]], ptr [[ARRAYIDX2]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[T0_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[T0_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP31:%.*]] = fmul fast double [[T0_1]], [[TMP23]] ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: store double [[TMP31]], ptr [[ARRAYIDX2_1]], align 8, !tbaa 
[[TBAA3]] +; CHECK-NEXT: store double [[TMP31]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_1]] -; CHECK-NEXT: [[T0_2:%.*]] = load double, ptr [[ARRAYIDX_2]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[T0_2:%.*]] = load double, ptr [[ARRAYIDX_2]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP32:%.*]] = fmul fast double [[T0_2]], [[TMP24]] ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV_NEXT_1]] -; CHECK-NEXT: store double [[TMP32]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP32]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_2]] -; CHECK-NEXT: [[T0_3:%.*]] = load double, ptr [[ARRAYIDX_3]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[T0_3:%.*]] = load double, ptr [[ARRAYIDX_3]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP33:%.*]] = fmul fast double [[T0_3]], [[TMP25]] ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV_NEXT_2]] -; CHECK-NEXT: store double [[TMP33]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP33]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_3]] -; CHECK-NEXT: [[T0_4:%.*]] = load double, ptr [[ARRAYIDX_4]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[T0_4:%.*]] = load double, ptr [[ARRAYIDX_4]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP34:%.*]] = fmul fast double [[T0_4]], [[TMP26]] ; CHECK-NEXT: 
[[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV_NEXT_3]] -; CHECK-NEXT: store double [[TMP34]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP34]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_4]] -; CHECK-NEXT: [[T0_5:%.*]] = load double, ptr [[ARRAYIDX_5]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[T0_5:%.*]] = load double, ptr [[ARRAYIDX_5]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP35:%.*]] = fmul fast double [[T0_5]], [[TMP27]] ; CHECK-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV_NEXT_4]] -; CHECK-NEXT: store double [[TMP35]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP35]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6 ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_5]] -; CHECK-NEXT: [[T0_6:%.*]] = load double, ptr [[ARRAYIDX_6]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[T0_6:%.*]] = load double, ptr [[ARRAYIDX_6]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP36:%.*]] = fmul fast double [[T0_6]], [[TMP28]] ; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV_NEXT_5]] -; CHECK-NEXT: store double [[TMP36]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP36]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_6]] -; CHECK-NEXT: [[T0_7:%.*]] = load double, ptr [[ARRAYIDX_7]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: 
[[T0_7:%.*]] = load double, ptr [[ARRAYIDX_7]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP37:%.*]] = fmul fast double [[T0_7]], [[TMP29]] ; CHECK-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw double, ptr [[X]], i64 [[INDVARS_IV_NEXT_6]] -; CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] -; CHECK: for.end: +; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; entry: @@ -223,3 +224,16 @@ attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"=" !5 = !{!"omnipotent char", !6, i64 0} !6 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[DOUBLE_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"double", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +; CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +; CHECK: [[META8]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META9]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]} +; CHECK: [[META12]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META8]]} +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll b/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll index ae0e59169d3e5..5253c42d9c6d2 100644 --- a/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll +++ b/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes='default' -S %s | FileCheck %s ; Slightly reduced test case for a loop iterating over a std::span with libc++ hardening. @@ -18,30 +18,30 @@ %"struct.std::__1::__bounded_iter" = type { ptr, ptr, ptr } define void @test_fill_with_foreach([2 x i64] %elems.coerce) { -; CHECK-LABEL: define void @test_fill_with_foreach -; CHECK-SAME: ([2 x i64] [[ELEMS_COERCE:%.*]]) local_unnamed_addr { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @test_fill_with_foreach( +; CHECK-SAME: [2 x i64] [[ELEMS_COERCE:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[ELEMS_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[ELEMS_COERCE]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[ELEMS_COERCE_FCA_0_EXTRACT]] to ptr ; CHECK-NEXT: [[ELEMS_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[ELEMS_COERCE]], 1 ; CHECK-NEXT: [[ADD_PTR_I_IDX:%.*]] = shl nsw i64 [[ELEMS_COERCE_FCA_1_EXTRACT]], 2 ; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[ADD_PTR_I_IDX]] ; CHECK-NEXT: [[CMP_NOT_I_I_I_I:%.*]] = icmp slt i64 [[ELEMS_COERCE_FCA_1_EXTRACT]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT_I_I_I_I]], label [[ERROR:%.*]], label [[FOR_COND_PREHEADER_SPLIT:%.*]] -; CHECK: for.cond.preheader.split: +; CHECK-NEXT: br i1 [[CMP_NOT_I_I_I_I]], label %[[ERROR:.*]], label %[[FOR_COND_PREHEADER_SPLIT:.*]] +; CHECK: [[FOR_COND_PREHEADER_SPLIT]]: ; CHECK-NEXT: [[CMP_I_NOT2:%.*]] = icmp eq i64 [[ELEMS_COERCE_FCA_1_EXTRACT]], 0 -; CHECK-NEXT: br 
i1 [[CMP_I_NOT2]], label [[COMMON_RET:%.*]], label [[FOR_BODY:%.*]] -; CHECK: common.ret: +; CHECK-NEXT: br i1 [[CMP_I_NOT2]], label %[[COMMON_RET:.*]], label %[[FOR_BODY:.*]] +; CHECK: [[COMMON_RET]]: ; CHECK-NEXT: ret void -; CHECK: error: +; CHECK: [[ERROR]]: ; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: br label [[COMMON_RET]] -; CHECK: for.body: -; CHECK-NEXT: [[__BEGIN1_SROA_0_03:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_SPLIT]] ] +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[__BEGIN1_SROA_0_03:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], %[[FOR_BODY]] ], [ [[TMP0]], %[[FOR_COND_PREHEADER_SPLIT]] ] ; CHECK-NEXT: tail call void @use(ptr noundef nonnull align 4 dereferenceable(4) [[__BEGIN1_SROA_0_03]]) ; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__BEGIN1_SROA_0_03]], i64 4 ; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR_I]], [[ADD_PTR_I]] -; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[COMMON_RET]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label %[[COMMON_RET]], label %[[FOR_BODY]] ; entry: %elems = alloca %"class.std::__1::span", align 8 @@ -131,29 +131,29 @@ declare void @llvm.lifetime.end.p0(ptr nocapture) %Vector_impl_data = type { ptr, ptr, ptr } define void @foo(ptr noundef nonnull align 8 dereferenceable(24) noalias %vec) #0 { -; CHECK-LABEL: define void @foo -; CHECK-SAME: (ptr noalias noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[VEC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ptr noalias noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[VEC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[_M_FINISH_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8 -; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8, !tbaa [[TBAA0:![0-9]+]] -; 
CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8, !tbaa [[ANYPTR_TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8, !tbaa [[ANYPTR_TBAA5:![0-9]+]] ; CHECK-NEXT: [[SUB_PTR_LHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP0]] to i64 ; CHECK-NEXT: [[SUB_PTR_RHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: [[SUB_PTR_SUB_I_I:%.*]] = sub i64 [[SUB_PTR_LHS_CAST_I_I]], [[SUB_PTR_RHS_CAST_I_I]] ; CHECK-NEXT: [[SUB_PTR_DIV_I_I:%.*]] = ashr exact i64 [[SUB_PTR_SUB_I_I]], 3 ; CHECK-NEXT: [[CMP_NOT9:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: +; CHECK-NEXT: br i1 [[CMP_NOT9]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 [[I_010]] ; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ADD_PTR_I]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], 1.000000e+00 ; CHECK-NEXT: store double [[ADD]], ptr [[ADD_PTR_I]], align 8 ; CHECK-NEXT: [[INC]] = add nuw i64 [[I_010]], 1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], [[SUB_PTR_DIV_I_I]] -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; entry: %vec.addr = alloca ptr, align 8 @@ -270,29 +270,29 @@ declare void @abort() ; https://github.com/llvm/llvm-project/issues/63126 define void @loop_with_signed_induction(ptr noundef nonnull align 8 dereferenceable(24) %vec) { -; CHECK-LABEL: define void 
@loop_with_signed_induction -; CHECK-SAME: (ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @loop_with_signed_induction( +; CHECK-SAME: ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[_M_FINISH_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8 -; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8, !tbaa [[ANYPTR_TBAA0]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8, !tbaa [[ANYPTR_TBAA5]] ; CHECK-NEXT: [[SUB_PTR_LHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP0]] to i64 ; CHECK-NEXT: [[SUB_PTR_RHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: [[SUB_PTR_SUB_I_I:%.*]] = sub i64 [[SUB_PTR_LHS_CAST_I_I]], [[SUB_PTR_RHS_CAST_I_I]] ; CHECK-NEXT: [[SUB_PTR_DIV_I_I:%.*]] = ashr exact i64 [[SUB_PTR_SUB_I_I]], 3 ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i64 [[SUB_PTR_DIV_I_I]], 0 -; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.cond.cleanup: +; CHECK-NEXT: br i1 [[CMP9]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i64 [[I_010]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ADD_PTR_I]], align 8, !tbaa [[TBAA6:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ADD_PTR_I]], align 8, !tbaa [[DOUBLE_TBAA6:![0-9]+]] ; 
CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], 1.000000e+00 -; CHECK-NEXT: store double [[ADD]], ptr [[ADD_PTR_I]], align 8, !tbaa [[TBAA6]] +; CHECK-NEXT: store double [[ADD]], ptr [[ADD_PTR_I]], align 8, !tbaa [[DOUBLE_TBAA6]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_010]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC]], [[SUB_PTR_DIV_I_I]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] ; entry: %vec.addr = alloca ptr, align 8 @@ -343,22 +343,22 @@ for.end: define void @monkey(ptr noundef %arr, i32 noundef %len) { -; CHECK-LABEL: define void @monkey -; CHECK-SAME: (ptr noundef captures(none) [[ARR:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @monkey( +; CHECK-SAME: ptr noundef captures(none) [[ARR:%.*]], i32 noundef [[LEN:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[LEN]], 1 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY4_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body4.preheader: -; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY4:%.*]] -; CHECK: for.cond.cleanup: +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY4_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY4_PREHEADER]]: +; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_COND_CLEANUP3:.*]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY4:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void -; CHECK: for.cond.cleanup3: +; CHECK: [[FOR_COND_CLEANUP3]]: ; CHECK-NEXT: [[INC]] = add nuw i32 [[I_09]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[LEN]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY4_PREHEADER]], label [[FOR_COND_CLEANUP]] -; CHECK: for.body4: -; CHECK-NEXT: [[K_07:%.*]] = phi i32 
[ [[DEC:%.*]], [[FOR_BODY4]] ], [ [[I_09]], [[FOR_BODY4_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY4_PREHEADER]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_BODY4]]: +; CHECK-NEXT: [[K_07:%.*]] = phi i32 [ [[DEC:%.*]], %[[FOR_BODY4]] ], [ [[I_09]], %[[FOR_BODY4_PREHEADER]] ] ; CHECK-NEXT: [[IDX_EXT_I:%.*]] = zext i32 [[K_07]] to i64 ; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds nuw i32, ptr [[ARR]], i64 [[IDX_EXT_I]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4 @@ -366,7 +366,7 @@ define void @monkey(ptr noundef %arr, i32 noundef %len) { ; CHECK-NEXT: store i32 [[ADD]], ptr [[ADD_PTR_I]], align 4 ; CHECK-NEXT: [[DEC]] = add i32 [[K_07]], -1 ; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]] +; CHECK-NEXT: br i1 [[CMP2_NOT]], label %[[FOR_COND_CLEANUP3]], label %[[FOR_BODY4]] ; entry: %arr.addr = alloca ptr, align 8 @@ -472,3 +472,13 @@ if.end: ; preds = %entry !7 = !{!1, !2, i64 8} !8 = !{!9, !9, i64 0} !9 = !{!"double", !3, i64 0} +;. +; CHECK: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 8} +; CHECK: [[META1]] = !{!"_ZTSNSt12_Vector_baseIdSaIdEE17_Vector_impl_dataE", [[META2]], i64 0, [[META2]], i64 8, [[META2]], i64 16} +; CHECK: [[META2]] = !{!"any pointer", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C++ TBAA"} +; CHECK: [[ANYPTR_TBAA5]] = !{[[META1]], [[META2]], i64 0} +; CHECK: [[DOUBLE_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +; CHECK: [[META7]] = !{!"double", [[META3]], i64 0} +;. 
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll index aaca5a6c87b4f..1a1fe20350885 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -mtriple=x86_64-apple-darwin10.0.0 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s ;. @@ -131,7 +131,7 @@ define void @memset_pattern_i64_x(ptr %a, i64 %x) nounwind { define void @memset_pattern_i64_128_tbaa(ptr %a) nounwind { ; CHECK-LABEL: define void @memset_pattern_i64_128_tbaa( ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.2, i64 1024), !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.2, i64 1024), !tbaa [[DOUBLE_TBAA0:![0-9]+]] ; CHECK-NEXT: ret void ; tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0x400921fb54442d18, i64 128, i1 false), !tbaa !5 @@ -216,7 +216,7 @@ define void @memset_pattern_i64_x_fromnonconstptr(ptr %a, i64 %x, ptr %p) nounwi ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. 
-; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[DOUBLE_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"double", [[META2:![0-9]+]], i64 0} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"Simple C++ TBAA"} diff --git a/llvm/test/Transforms/SCCP/overdefined-ext.ll b/llvm/test/Transforms/SCCP/overdefined-ext.ll index e08acd20cc2ac..16eecba60d58c 100644 --- a/llvm/test/Transforms/SCCP/overdefined-ext.ll +++ b/llvm/test/Transforms/SCCP/overdefined-ext.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=sccp -S | FileCheck %s +; RUN: opt < %s -passes=sccp -use-constant-int-for-fixed-length-splat=false -S | FileCheck %s --check-prefixes=CHECK,CHECK-CV +; RUN: opt < %s -passes=sccp -use-constant-int-for-fixed-length-splat -S | FileCheck %s --check-prefixes=CHECK,CHECK-CI define i32 @zext_lshr(i1 %t0) { ; CHECK-LABEL: @zext_lshr( @@ -24,10 +25,14 @@ define i1 @zext_icmp(i1 %t0) { ; TODO: SCCP operates poorly with vector ranges define <2 x i1> @zext_vector(<2 x i1> %t0) { -; CHECK-LABEL: @zext_vector( -; CHECK-NEXT: [[T1:%.*]] = zext <2 x i1> [[T0:%.*]] to <2 x i32> -; CHECK-NEXT: [[T2:%.*]] = icmp eq <2 x i32> [[T1]], splat (i32 2) -; CHECK-NEXT: ret <2 x i1> [[T2]] +; CHECK-CV-LABEL: @zext_vector( +; CHECK-CV-NEXT: [[T1:%.*]] = zext <2 x i1> [[T0:%.*]] to <2 x i32> +; CHECK-CV-NEXT: [[T2:%.*]] = icmp eq <2 x i32> [[T1]], splat (i32 2) +; CHECK-CV-NEXT: ret <2 x i1> [[T2]] +; +; CHECK-CI-LABEL: @zext_vector( +; CHECK-CI-NEXT: [[T1:%.*]] = zext <2 x i1> [[T0:%.*]] to <2 x i32> +; CHECK-CI-NEXT: ret <2 x i1> zeroinitializer ; %t1 = zext <2 x i1> %t0 to <2 x i32> %t2 = icmp eq <2 x i32> %t1, @@ -72,10 +77,14 @@ define i1 @sext_icmp(i1 %t0) { ; TODO: SCCP operates poorly with vector ranges define <2 x i1> @sext_vector(<2 x i1> %t0) { -; CHECK-LABEL: @sext_vector( -; CHECK-NEXT: [[T1:%.*]] = sext <2 x i1> [[T0:%.*]] to 
<2 x i32> -; CHECK-NEXT: [[T2:%.*]] = icmp eq <2 x i32> [[T1]], splat (i32 2) -; CHECK-NEXT: ret <2 x i1> [[T2]] +; CHECK-CV-LABEL: @sext_vector( +; CHECK-CV-NEXT: [[T1:%.*]] = sext <2 x i1> [[T0:%.*]] to <2 x i32> +; CHECK-CV-NEXT: [[T2:%.*]] = icmp eq <2 x i32> [[T1]], splat (i32 2) +; CHECK-CV-NEXT: ret <2 x i1> [[T2]] +; +; CHECK-CI-LABEL: @sext_vector( +; CHECK-CI-NEXT: [[T1:%.*]] = sext <2 x i1> [[T0:%.*]] to <2 x i32> +; CHECK-CI-NEXT: ret <2 x i1> zeroinitializer ; %t1 = sext <2 x i1> %t0 to <2 x i32> %t2 = icmp eq <2 x i32> %t1, diff --git a/llvm/test/Transforms/SCCP/relax-range-checks.ll b/llvm/test/Transforms/SCCP/relax-range-checks.ll new file mode 100644 index 0000000000000..90722f350aa9e --- /dev/null +++ b/llvm/test/Transforms/SCCP/relax-range-checks.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=sccp -S | FileCheck %s + +define i1 @relax_range_check(i8 range(i8 0, 5) %x) { +; CHECK-LABEL: define i1 @relax_range_check( +; CHECK-SAME: i8 range(i8 0, 5) [[X:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add nsw i8 [[X]], -3 +; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[X]], 3 +; CHECK-NEXT: ret i1 [[RET]] +; + %add = add i8 %x, -3 + %ret = icmp ult i8 %add, 2 + ret i1 %ret +} + +define i1 @relax_range_check_highbits_check(i8 range(i8 2, 0) %x) { +; CHECK-LABEL: define i1 @relax_range_check_highbits_check( +; CHECK-SAME: i8 range(i8 2, 0) [[X:%.*]]) { +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], -2 +; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[X]], 4 +; CHECK-NEXT: ret i1 [[RET]] +; + %and = and i8 %x, -2 + %ret = icmp eq i8 %and, 2 + ret i1 %ret +} + +; Negative tests. 
+ +define i1 @relax_range_check_one_instruction(i8 range(i8 0, 5) %x) { +; CHECK-LABEL: define i1 @relax_range_check_one_instruction( +; CHECK-SAME: i8 range(i8 0, 5) [[X:%.*]]) { +; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: ret i1 [[RET]] +; + %ret = icmp ult i8 %x, 2 + ret i1 %ret +} + +define i1 @relax_range_check_not_profitable(i8 range(i8 0, 6) %x) { +; CHECK-LABEL: define i1 @relax_range_check_not_profitable( +; CHECK-SAME: i8 range(i8 0, 6) [[X:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add nsw i8 [[X]], -3 +; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[ADD]], 2 +; CHECK-NEXT: ret i1 [[RET]] +; + %add = add i8 %x, -3 + %ret = icmp ult i8 %add, 2 + ret i1 %ret +} + +define i1 @relax_range_check_unknown_range(i64 %x) { +; CHECK-LABEL: define i1 @relax_range_check_unknown_range( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[AND:%.*]] = and i64 [[X]], -67108864 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[AND]], 0 +; CHECK-NEXT: ret i1 [[TMP1]] +; + %and = and i64 %x, -67108864 + %test = icmp eq i64 %and, 0 + ret i1 %test +} + +define i1 @relax_range_check_highbits_check_multiuse(i8 range(i8 2, 0) %x) { +; CHECK-LABEL: define i1 @relax_range_check_highbits_check_multiuse( +; CHECK-SAME: i8 range(i8 2, 0) [[X:%.*]]) { +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], -2 +; CHECK-NEXT: call void @use(i8 [[AND]]) +; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[AND]], 2 +; CHECK-NEXT: ret i1 [[RET]] +; + %and = and i8 %x, -2 + call void @use(i8 %and) + %ret = icmp eq i8 %and, 2 + ret i1 %ret +} + +define i1 @relax_range_check_multiuse(i8 range(i8 0, 5) %x) { +; CHECK-LABEL: define i1 @relax_range_check_multiuse( +; CHECK-SAME: i8 range(i8 0, 5) [[X:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add nsw i8 [[X]], -3 +; CHECK-NEXT: call void @use(i8 [[ADD]]) +; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[ADD]], 2 +; CHECK-NEXT: ret i1 [[RET]] +; + %add = add i8 %x, -3 + call void @use(i8 %add) + %ret = icmp ult i8 %add, 2 + ret i1 %ret +} + +declare void @use(i8) diff --git 
a/llvm/test/Transforms/SLPVectorizer/AArch64/32-bit.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/32-bit.ll index 5d91e03559dea..bfa18f88a2467 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/32-bit.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/32-bit.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=slp-vectorizer -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -7,13 +7,13 @@ target triple = "aarch64-unknown-linux-gnu" %S = type { i8, i8, i8, i8 } define ptr @foo(ptr %this, ptr %rhs) { -; CHECK-LABEL: define ptr @foo -; CHECK-SAME: (ptr [[THIS:%.*]], ptr [[RHS:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[RHS]], align 1, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[THIS]], align 1, !tbaa [[TBAA0]] +; CHECK-LABEL: define ptr @foo( +; CHECK-SAME: ptr [[THIS:%.*]], ptr [[RHS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[RHS]], align 1, !tbaa [[BOOL_TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[THIS]], align 1, !tbaa [[BOOL_TBAA0]] ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[TMP0]], [[TMP1]] -; CHECK-NEXT: store <4 x i8> [[TMP2]], ptr [[THIS]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: store <4 x i8> [[TMP2]], ptr [[THIS]], align 1, !tbaa [[BOOL_TBAA0]] ; CHECK-NEXT: ret ptr [[THIS]] ; entry: @@ -54,3 +54,9 @@ entry: !14 = !{!7, !8, i64 2} !15 = !{!7, !8, i64 3} +;. +; CHECK: [[BOOL_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"bool", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll index 9cb2badc25fb2..76b1d18fdc0a8 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; Debug informations shouldn't affect spill cost. ; RUN: opt -S -passes=slp-vectorizer %s -o - | FileCheck %s @@ -7,17 +7,18 @@ target triple = "aarch64" %struct.S = type { i64, i64 } define void @patatino(i64 %n, i64 %i, ptr %p) !dbg !7 { -; CHECK-LABEL: @patatino( -; CHECK-NEXT: entry: -; CHECK-NEXT: #dbg_value(i64 [[N:%.*]], [[META18:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) -; CHECK-NEXT: #dbg_value(i64 [[I:%.*]], [[META19:![0-9]+]], !DIExpression(), [[META24:![0-9]+]]) -; CHECK-NEXT: #dbg_value(ptr [[P:%.*]], [[META20:![0-9]+]], !DIExpression(), [[META25:![0-9]+]]) +; CHECK-LABEL: define void @patatino( +; CHECK-SAME: i64 [[N:%.*]], i64 [[I:%.*]], ptr [[P:%.*]]) !dbg [[DBG7:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: #dbg_value(i64 [[N]], [[META18:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i64 [[I]], [[META19:![0-9]+]], !DIExpression(), [[META24:![0-9]+]]) +; CHECK-NEXT: #dbg_value(ptr [[P]], [[META20:![0-9]+]], !DIExpression(), [[META25:![0-9]+]]) ; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: #dbg_value(i64 poison, [[META21:![0-9]+]], !DIExpression(), [[META27:![0-9]+]]) ; CHECK-NEXT: #dbg_value(i64 poison, [[META22:![0-9]+]], !DIExpression(), [[META28:![0-9]+]]) ; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 [[I]], i32 0, !dbg [[DBG29:![0-9]+]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X1]], align 
8, !dbg [[DBG26]], !tbaa [[TBAA30:![0-9]+]] -; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[X5]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA30]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X1]], align 8, !dbg [[DBG26]], !tbaa [[LONG_TBAA30:![0-9]+]] +; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[X5]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[LONG_TBAA30]] ; CHECK-NEXT: ret void, !dbg [[DBG35:![0-9]+]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/store-ptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/store-ptr.ll index e32e5f82991d9..2b6a41403fb48 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/store-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/store-ptr.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -7,28 +7,29 @@ target triple = "aarch64" %struct.node = type { i64, i64, ptr, ptr } define void @copy(ptr nocapture noundef writeonly %x, ptr nocapture noundef readonly %y, i32 noundef %n) { -; CHECK-LABEL: @copy( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP34:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP34]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: +; CHECK-LABEL: define void @copy( +; CHECK-SAME: ptr noundef writeonly captures(none) [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]], i32 noundef [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP34:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP34]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: +; CHECK-NEXT: br label 
%[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_NODE:%.*]], ptr [[Y:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_NODE]], ptr [[X:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA0]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_NODE:%.*]], ptr [[Y]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_NODE]], ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 8, !tbaa [[LONG_TBAA0:![0-9]+]] +; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[ARRAYIDX2]], align 8, !tbaa [[LONG_TBAA0]] ; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_NODE]], ptr [[Y]], i64 [[INDVARS_IV]], i32 2 ; CHECK-NEXT: [[C13:%.*]] = getelementptr inbounds [[STRUCT_NODE]], ptr [[X]], i64 [[INDVARS_IV]], i32 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[C]], align 8, !tbaa [[TBAA4:![0-9]+]] -; CHECK-NEXT: store <2 x ptr> [[TMP1]], ptr [[C13]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[C]], align 8, !tbaa [[ANYPTR_TBAA4:![0-9]+]] +; CHECK-NEXT: store <2 x ptr> [[TMP1]], ptr [[C13]], align 8, !tbaa [[ANYPTR_TBAA4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 
[[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; entry: %cmp34 = icmp sgt i32 %n, 0 @@ -74,3 +75,11 @@ for.body: !11 = !{!5, !9, i64 16} !12 = !{!5, !9, i64 24} +;. +; CHECK: [[LONG_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"long", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[ANYPTR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"any pointer", [[META2]], i64 0} +;. diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-load.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-load.ll new file mode 100644 index 0000000000000..77d3ac1fb2322 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-load.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=riscv64 -mattr=+m,+v \ +; RUN: -passes=slp-vectorizer \ +; RUN: -slp-disable-tree-reorder=true -slp-force-strided-loads=true \ +; RUN: -S < %s | FileCheck %s + +define void @const_stride_reversed(ptr %pl, ptr %ps) { +; CHECK-LABEL: define void @const_stride_reversed( +; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[GEP_L15:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 15 +; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 16 [[GEP_L15]], i64 -1, <16 x i1> splat (i1 true), i32 16) +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: ret void +; + %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 + %gep_l1 = getelementptr inbounds i8, ptr %pl, i64 1 + %gep_l2 = getelementptr inbounds i8, ptr %pl, i64 2 + %gep_l3 = getelementptr inbounds i8, ptr %pl, i64 3 + %gep_l4 = getelementptr inbounds i8, ptr %pl, i64 4 + 
%gep_l5 = getelementptr inbounds i8, ptr %pl, i64 5 + %gep_l6 = getelementptr inbounds i8, ptr %pl, i64 6 + %gep_l7 = getelementptr inbounds i8, ptr %pl, i64 7 + %gep_l8 = getelementptr inbounds i8, ptr %pl, i64 8 + %gep_l9 = getelementptr inbounds i8, ptr %pl, i64 9 + %gep_l10 = getelementptr inbounds i8, ptr %pl, i64 10 + %gep_l11 = getelementptr inbounds i8, ptr %pl, i64 11 + %gep_l12 = getelementptr inbounds i8, ptr %pl, i64 12 + %gep_l13 = getelementptr inbounds i8, ptr %pl, i64 13 + %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 14 + %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 15 + + %load0 = load i8, ptr %gep_l0 , align 16 + %load1 = load i8, ptr %gep_l1 , align 16 + %load2 = load i8, ptr %gep_l2 , align 16 + %load3 = load i8, ptr %gep_l3 , align 16 + %load4 = load i8, ptr %gep_l4 , align 16 + %load5 = load i8, ptr %gep_l5 , align 16 + %load6 = load i8, ptr %gep_l6 , align 16 + %load7 = load i8, ptr %gep_l7 , align 16 + %load8 = load i8, ptr %gep_l8 , align 16 + %load9 = load i8, ptr %gep_l9 , align 16 + %load10 = load i8, ptr %gep_l10, align 16 + %load11 = load i8, ptr %gep_l11, align 16 + %load12 = load i8, ptr %gep_l12, align 16 + %load13 = load i8, ptr %gep_l13, align 16 + %load14 = load i8, ptr %gep_l14, align 16 + %load15 = load i8, ptr %gep_l15, align 16 + + %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 + %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 + %gep_s2 = getelementptr inbounds i8, ptr %ps, i64 2 + %gep_s3 = getelementptr inbounds i8, ptr %ps, i64 3 + %gep_s4 = getelementptr inbounds i8, ptr %ps, i64 4 + %gep_s5 = getelementptr inbounds i8, ptr %ps, i64 5 + %gep_s6 = getelementptr inbounds i8, ptr %ps, i64 6 + %gep_s7 = getelementptr inbounds i8, ptr %ps, i64 7 + %gep_s8 = getelementptr inbounds i8, ptr %ps, i64 8 + %gep_s9 = getelementptr inbounds i8, ptr %ps, i64 9 + %gep_s10 = getelementptr inbounds i8, ptr %ps, i64 10 + %gep_s11 = getelementptr inbounds i8, ptr %ps, i64 11 + %gep_s12 = getelementptr inbounds i8, ptr 
%ps, i64 12 + %gep_s13 = getelementptr inbounds i8, ptr %ps, i64 13 + %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 + %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 + + store i8 %load0, ptr %gep_s15, align 16 + store i8 %load1, ptr %gep_s14, align 16 + store i8 %load2, ptr %gep_s13, align 16 + store i8 %load3, ptr %gep_s12, align 16 + store i8 %load4, ptr %gep_s11, align 16 + store i8 %load5, ptr %gep_s10, align 16 + store i8 %load6, ptr %gep_s9, align 16 + store i8 %load7, ptr %gep_s8, align 16 + store i8 %load8, ptr %gep_s7, align 16 + store i8 %load9, ptr %gep_s6, align 16 + store i8 %load10, ptr %gep_s5, align 16 + store i8 %load11, ptr %gep_s4, align 16 + store i8 %load12, ptr %gep_s3, align 16 + store i8 %load13, ptr %gep_s2, align 16 + store i8 %load14, ptr %gep_s1, align 16 + store i8 %load15, ptr %gep_s0, align 16 + + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll index 85b8157c949f1..541e76138e373 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \ ; RUN: -pass-remarks-output=%t | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=REMARK %s @@ -147,8 +147,8 @@ define void @fun3(ptr %0) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 48 ; CHECK-NEXT: br label %[[BB5:.*]] ; CHECK: [[BB5]]: -; CHECK-NEXT: store ptr null, ptr [[TMP3]], align 8, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr inttoptr (i64 64 to ptr), align 8, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: store ptr null, ptr [[TMP3]], align 8, 
!tbaa [[ANYPTR_TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr inttoptr (i64 64 to ptr), align 8, !tbaa [[ANYPTR_TBAA8:![0-9]+]] ; CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 [[TMP0]](ptr noundef poison, i64 noundef poison) ; CHECK-NEXT: br label %[[BB5]] @@ -177,7 +177,7 @@ define void @fun3(ptr %0) { !9 = !{!10, !7, i64 64} !10 = !{!"node", !6, i64 0, !3, i64 8, !7, i64 16, !7, i64 24, !7, i64 32, !7, i64 40, !7, i64 48, !7, i64 56, !7, i64 64, !7, i64 72, !6, i64 80, !6, i64 88, !3, i64 96, !3, i64 100} ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META6:![0-9]+]], i64 40} +; CHECK: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META6:![0-9]+]], i64 40} ; CHECK: [[META1]] = !{!"arc", [[META2:![0-9]+]], i64 0, [[META5:![0-9]+]], i64 8, [[META6]], i64 16, [[META6]], i64 24, [[META7:![0-9]+]], i64 32, [[META6]], i64 40, [[META6]], i64 48, [[META5]], i64 56, [[META5]], i64 64} ; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} @@ -185,6 +185,6 @@ define void @fun3(ptr %0) { ; CHECK: [[META5]] = !{!"long", [[META3]], i64 0} ; CHECK: [[META6]] = !{!"any pointer", [[META3]], i64 0} ; CHECK: [[META7]] = !{!"short", [[META3]], i64 0} -; CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META6]], i64 64} +; CHECK: [[ANYPTR_TBAA8]] = !{[[META9:![0-9]+]], [[META6]], i64 64} ; CHECK: [[META9]] = !{!"node", [[META5]], i64 0, [[META2]], i64 8, [[META6]], i64 16, [[META6]], i64 24, [[META6]], i64 32, [[META6]], i64 40, [[META6]], i64 48, [[META6]], i64 56, [[META6]], i64 64, [[META6]], i64 72, [[META5]], i64 80, [[META5]], i64 88, [[META2]], i64 96, [[META2]], i64 100} ;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutable-member-in-non-commutable-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutable-member-in-non-commutable-node.ll new file mode 100644 index 0000000000000..adceef172864d --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutable-member-in-non-commutable-node.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt --passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s + +define i64 @test(i32 %arg) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 896), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 896), align 4 +; CHECK-NEXT: ret i64 0 +; +bb: + %load = load i32, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 900), align 4 + %add = add i32 0, %load + store i32 %add, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 900), align 4 + %load1 = load i32, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 896), align 4 + %add2 = add i32 %load1, 0 + %sub = sub i32 %add2, %arg + store i32 %sub, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 896), align 4 + ret i64 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll index 95ae544e2c62f..6f0521066f0d8 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll @@ -1,38 +1,39 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-darwin13.3.0" define void @_foo(double %p1, double %p2, double %p3) #0 { -; CHECK-LABEL: @_foo( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @_foo( +; CHECK-SAME: double [[P1:%.*]], double [[P2:%.*]], double [[P3:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TAB1:%.*]] = alloca [256 x i32], align 16 ; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16 -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3]], 1.638400e+04 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P2:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P1:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.638400e+04) ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[ADD]], i32 0 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ 
[[TMP6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, %[[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], %[[BB1]] ], [ [[TMP6:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP5]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], ptr [[TAB1]], i64 0, i64 [[INDVARS_IV266]] -; CHECK-NEXT: store i32 [[X13]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: store i32 [[X13]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] ; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP4]]) ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], ptr [[TAB2]], i64 0, i64 [[INDVARS_IV266]] -; CHECK-NEXT: store i32 [[X14]], ptr [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[X14]], ptr [[ARRAYIDX26]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] -; CHECK: return: +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[RETURN:.*]], label %[[FOR_BODY]] +; CHECK: [[RETURN]]: ; CHECK-NEXT: ret void ; entry: @@ -78,3 +79,9 @@ declare i32 @_xfn(<2 x double>) #4 !4 = !{!3, !3, i64 0} !5 = !{!"omnipotent char", !6, i64 0} !6 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll index 1e31772b8e49e..2d9e1f79e827c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll @@ -1,38 +1,39 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-darwin13.3.0" define void @_foo(double %p1, double %p2, double %p3) #0 { -; CHECK-LABEL: @_foo( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @_foo( +; CHECK-SAME: double [[P1:%.*]], double [[P2:%.*]], double [[P3:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TAB1:%.*]] = alloca [256 x i32], align 16 ; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16 -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3]], 1.638400e+04 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P2:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P1:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.638400e+04) ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[ADD]], i32 0 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: 
[[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, %[[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], %[[BB1]] ], [ [[TMP6:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP5]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], ptr [[TAB1]], i64 0, i64 [[INDVARS_IV266]] -; CHECK-NEXT: store i32 [[X13]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: store i32 [[X13]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] ; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP4]]) ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], ptr [[TAB2]], i64 0, i64 [[INDVARS_IV266]] -; CHECK-NEXT: store i32 [[X14]], ptr [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[X14]], ptr [[ARRAYIDX26]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] -; CHECK: return: +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[RETURN:.*]], label %[[FOR_BODY]] +; CHECK: [[RETURN]]: ; CHECK-NEXT: ret void ; entry: @@ -78,3 +79,9 @@ declare i32 @_xfn(<2 x double>) #4 !4 = !{!3, !3, i64 0} !5 = !{!"omnipotent char", !6, i64 0} !6 = !{!"Simple C/C++ TBAA"} +;. 
+; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +;. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll index c4bdfa804868e..635ec32ca055b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll @@ -1,16 +1,17 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer,dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" define void @test1(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]], !fpmath !4 -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[C:%.*]], align 8, !tbaa [[TBAA0]] +; CHECK-LABEL: define void @test1( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8, !tbaa [[DOUBLE_TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8, !tbaa [[DOUBLE_TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]], !fpmath [[META4:![0-9]+]] +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[C]], align 8, !tbaa [[DOUBLE_TBAA0]] ; 
CHECK-NEXT: ret void ; entry: @@ -29,12 +30,13 @@ entry: } define void @test2(ptr %a, ptr %b, ptr %e) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]], !fpmath !5 -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[E:%.*]], align 8, !tbaa [[TBAA0]] +; CHECK-LABEL: define void @test2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[E:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8, !tbaa [[DOUBLE_TBAA0]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8, !tbaa [[DOUBLE_TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]], !fpmath [[META5:![0-9]+]] +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[E]], align 8, !tbaa [[DOUBLE_TBAA0]] ; CHECK-NEXT: ret void ; entry: @@ -52,10 +54,16 @@ entry: ret void } -;CHECK-DAG: !4 = !{float 5.000000e+00} -;CHECK-DAG: !5 = !{float 2.500000e+00} !0 = !{ float 5.0 } !1 = !{ float 2.5 } !2 = !{!"Simple C/C++ TBAA"} !3 = !{!"omnipotent char", !2} !4 = !{!"double", !3} +;. +; CHECK: [[DOUBLE_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"double", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[META4]] = !{float 5.000000e+00} +; CHECK: [[META5]] = !{float 2.500000e+00} +;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll index ff4ef6086d42a..1b76ee970e6d8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=i386--netbsd -mcpu=i486 | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" target triple = "i386--netbsd" @@ -7,19 +7,20 @@ target triple = "i386--netbsd" ; Function Attrs: noreturn nounwind readonly define i32 @fn1() #0 { -; CHECK-LABEL: @fn1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a, align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-LABEL: define i32 @fn1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a, align 4, !tbaa [[ANYPTR_TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: br label [[DO_BODY:%.*]] -; CHECK: do.body: -; CHECK-NEXT: [[C_0:%.*]] = phi i32 [ [[TMP2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[DO_BODY]] ] -; CHECK-NEXT: [[B_0:%.*]] = phi i32 [ [[TMP1]], [[ENTRY]] ], [ [[ADD:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[INT_TBAA4]] +; CHECK-NEXT: br label %[[DO_BODY:.*]] +; CHECK: [[DO_BODY]]: +; CHECK-NEXT: [[C_0:%.*]] = phi i32 [ [[TMP2]], %[[ENTRY]] ], [ [[ADD2:%.*]], %[[DO_BODY]] ] +; 
CHECK-NEXT: [[B_0:%.*]] = phi i32 [ [[TMP1]], %[[ENTRY]] ], [ [[ADD:%.*]], %[[DO_BODY]] ] ; CHECK-NEXT: [[ADD]] = add nsw i32 [[B_0]], [[C_0]] ; CHECK-NEXT: [[ADD2]] = add nsw i32 [[ADD]], 1 -; CHECK-NEXT: br label [[DO_BODY]] +; CHECK-NEXT: br label %[[DO_BODY]] ; entry: %0 = load ptr, ptr @a, align 4, !tbaa !4 @@ -44,3 +45,11 @@ attributes #0 = { noreturn nounwind readonly "less-precise-fpmad"="false" "frame !3 = !{!"int", !1} !4 = !{!0, !0, i64 0} !5 = !{!3, !3, i64 0} +;. +; CHECK: [[ANYPTR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"any pointer", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} +; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"int", [[META2]]} +;. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll index 6fd2de8ad8ab5..618c316c6f2fa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll @@ -1,15 +1,16 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s define void @test1(float %a, float %b, float %c, float %d, ptr nocapture %p) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3 +; CHECK-LABEL: define void @test1( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]], ptr captures(none) 
[[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -28,14 +29,15 @@ entry: } define void @test1_vec(float %a, float %b, float %c, float %d, ptr nocapture %p) { -; CHECK-LABEL: @test1_vec( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3 +; CHECK-LABEL: define void @test1_vec( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]], ptr captures(none) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 16, !tbaa [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P]], align 16, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: ret void ; entry: @@ 
-52,14 +54,15 @@ entry: } define void @test2(i32 %a, i32 %b, i32 %c, i32 %d, ptr nocapture %p) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i32 3 +; CHECK-LABEL: define void @test2( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], ptr captures(none) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 1) -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: ret void ; entry: @@ -78,13 +81,14 @@ entry: } define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, ptr nocapture %4) { -; CHECK-LABEL: @test2_vec( -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3 +; CHECK-LABEL: define void @test2_vec( +; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], ptr captures(none) [[TMP4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; 
CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], splat (i32 1) -; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP4:%.*]], align 16, !tbaa [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP4]], align 16, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: ret void ; %6 = add nsw i32 %0, 1 @@ -103,3 +107,9 @@ define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, ptr nocapture %4) { !3 = !{!"int", !4, i64 0} !4 = !{!"omnipotent char", !5, i64 0} !5 = !{!"Simple C++ TBAA"} +;. +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +;. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll index 9e4f10ec7b349..9c8ba07734b87 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -slp-threshold=-1 | FileCheck %s ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s @@ -6,15 +6,16 @@ ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) { -; CHECK-LABEL: @store_i32( -; CHECK-NEXT: 
[[TMP4:%.*]] = load <4 x i32>, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 +; CHECK-LABEL: define void @store_i32( +; CHECK-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP4]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], splat (i32 15) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x i32> [[TMP8]], splat (i32 255) ; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> splat (i32 255) -; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[INT_TBAA0]] ; CHECK-NEXT: ret void ; %4 = load i32, ptr %0, align 4, !tbaa !2 @@ -48,17 +49,18 @@ define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) { } define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) { -; CHECK-LABEL: @store_i8( -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[TMP0:%.*]], align 1, !tbaa [[TBAA4:![0-9]+]] +; CHECK-LABEL: define void @store_i8( +; CHECK-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1, !tbaa [[CHAR_TBAA4:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] 
= mul <4 x i32> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], splat (i32 15) ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], splat (i32 255) ; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> splat (i32 255) ; CHECK-NEXT: [[TMP12:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i8> -; CHECK-NEXT: store <4 x i8> [[TMP12]], ptr [[TMP0]], align 1, !tbaa [[TBAA4]] +; CHECK-NEXT: store <4 x i8> [[TMP12]], ptr [[TMP0]], align 1, !tbaa [[CHAR_TBAA4]] ; CHECK-NEXT: ret void ; %4 = load i8, ptr %0, align 1, !tbaa !6 @@ -100,9 +102,10 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) { } define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { -; CHECK-LABEL: @store_i64( -; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; CHECK-LABEL: define void @store_i64( +; CHECK-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] @@ -111,7 +114,7 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { ; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255) ; CHECK-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295) ; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255) -; CHECK-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] +; CHECK-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]] ; CHECK-NEXT: ret void ; %4 = zext i32 %1 to i64 @@ 
-160,3 +163,12 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { !6 = !{!4, !4, i64 0} !7 = !{!8, !8, i64 0} !8 = !{!"long", !4, i64 0} +;. +; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +; CHECK: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[META6]] = !{!"long", [[META2]], i64 0} +;. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index db38a62017391..fde76f8b0e8b9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 @@ -6,99 +6,104 @@ ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { -; SSE-LABEL: @gather_load( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; SSE-LABEL: define void @gather_load( +; 
SSE-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; SSE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; SSE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 
+; AVX-LABEL: define void @gather_load( +; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; AVX-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX2-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX2-LABEL: define void @gather_load( +; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; AVX2-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = load 
i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX512F-LABEL: define void @gather_load( +; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; 
AVX512VL-LABEL: @gather_load( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX512VL-LABEL: define void @gather_load( +; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512VL-NEXT: store <4 x 
i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -121,78 +126,83 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl } define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { -; SSE-LABEL: @gather_load_2( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-LABEL: define void @gather_load_2( +; SSE-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; SSE-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], 
align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; SSE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; SSE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_2( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-LABEL: define void @gather_load_2( +; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; 
AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 ; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load_2( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-LABEL: define void @gather_load_2( +; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 ; AVX2-NEXT: [[TMP13:%.*]] = 
insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load_2( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-LABEL: define void @gather_load_2( +; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> ; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-LABEL: define void @gather_load_2( +; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, 
ptr [[TMP1]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> ; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -219,63 +229,65 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { -; SSE-LABEL: @gather_load_3( -; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-LABEL: define void @gather_load_3( +; SSE-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; SSE-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 ; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa 
[[TBAA0]] +; SSE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 ; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 -; SSE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 ; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 -; SSE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 -; SSE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 ; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5 -; SSE-NEXT: store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 -; SSE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 ; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr 
[[TMP0]], i64 6 -; SSE-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 -; SSE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 ; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7 -; SSE-NEXT: store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 -; SSE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; SSE-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-LABEL: define void @gather_load_3( +; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 -; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 -; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 -; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 -; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 ; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 ; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 @@ -285,31 +297,34 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], -; AVX-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: 
[[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX2-LABEL: define void @gather_load_3( +; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> ; AVX2-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], ; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-LABEL: define void @gather_load_3( +; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> ; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], ; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; 
AVX512VL-LABEL: define void @gather_load_3( +; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], ; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -354,9 +369,10 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado } define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) { -; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1 -; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 +; SSE-LABEL: define void @gather_load_4( +; SSE-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 1 +; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 11 ; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2 ; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 ; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3 @@ -369,14 +385,14 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 ; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7 ; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds 
i32, ptr [[T1]], i64 21 -; SSE-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 ; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 ; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 @@ -385,32 +401,33 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; SSE-NEXT: store i32 [[T4]], ptr [[T0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T8]], ptr [[T5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T12]], ptr [[T9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T16]], ptr [[T13]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T20]], ptr [[T17]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store 
i32 [[T24]], ptr [[T21]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T28]], ptr [[T25]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T32]], ptr [[T29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T4]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T8]], ptr [[T5]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T12]], ptr [[T9]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T16]], ptr [[T13]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T20]], ptr [[T17]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T24]], ptr [[T21]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T28]], ptr [[T25]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T32]], ptr [[T29]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 +; AVX-LABEL: define void @gather_load_4( +; AVX-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 11 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 ; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 -; AVX-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T23:%.*]] = load i32, ptr 
[[T22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 @@ -420,31 +437,34 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 ; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 ; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <8 x i32> [[TMP9]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX2-LABEL: define void @gather_load_4( +; AVX2-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> , 
<22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> ; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-LABEL: define void @gather_load_4( +; AVX512F-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> ; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-LABEL: define void @gather_load_4( +; AVX512VL-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> ; 
AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -494,17 +514,18 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { -; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-LABEL: define void @gather_load_div( +; SSE-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 ; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 ; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], 
align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 ; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> @@ -516,23 +537,23 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> ; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i32 3 ; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] -; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 -; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; SSE-NEXT: 
[[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 -; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i32 0 ; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i32 1 ; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i32 2 @@ -542,35 +563,36 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i32 2 ; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i32 3 ; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] -; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-LABEL: define void @gather_load_div( +; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP3:%.*]] = 
load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 -; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 -; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa 
[[SHORT_TBAA0]] ; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> @@ -590,35 +612,36 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 ; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7 ; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] -; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-LABEL: define 
void @gather_load_div( +; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr 
inbounds float, ptr [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> @@ -638,27 +661,29 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 ; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7 ; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] -; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <8 x float> 
[[TMP46]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-LABEL: define void @gather_load_div( +; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] ; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-LABEL: define void @gather_load_div( +; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: 
[[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] ; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 @@ -722,3 +747,29 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea !3 = !{!"short", !4, i64 0} !4 = !{!"omnipotent char", !5, i64 0} !5 = !{!"Simple C++ TBAA"} +;. +; SSE: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; SSE: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; SSE: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; SSE: [[META3]] = !{!"Simple C++ TBAA"} +;. +; AVX: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; AVX: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; AVX: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; AVX: [[META3]] = !{!"Simple C++ TBAA"} +;. +; AVX2: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; AVX2: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; AVX2: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; AVX2: [[META3]] = !{!"Simple C++ TBAA"} +;. +; AVX512F: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; AVX512F: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; AVX512F: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; AVX512F: [[META3]] = !{!"Simple C++ TBAA"} +;. +; AVX512VL: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; AVX512VL: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; AVX512VL: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; AVX512VL: [[META3]] = !{!"Simple C++ TBAA"} +;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index bfa3610804967..cf380f04a6939 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 @@ -6,99 +6,104 @@ ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { -; SSE-LABEL: @gather_load( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; SSE-LABEL: define void @gather_load( +; SSE-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; SSE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; SSE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX-LABEL: define void @gather_load( +; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa 
[[SHORT_TBAA0]] ; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; AVX-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX2-LABEL: define void @gather_load( +; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: 
[[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; AVX2-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX512F-LABEL: define void @gather_load( +; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], 
i64 11 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX512VL-LABEL: define void @gather_load( +; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa 
[[SHORT_TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 ; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 ; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 ; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -121,78 +126,83 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl } define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { -; SSE-LABEL: @gather_load_2( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-LABEL: 
define void @gather_load_2( +; SSE-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; SSE-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; SSE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; 
SSE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_2( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-LABEL: define void @gather_load_2( +; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 ; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load_2( -; 
AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-LABEL: define void @gather_load_2( +; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 ; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load_2( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 
x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-LABEL: define void @gather_load_2( +; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> ; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-LABEL: define void @gather_load_2( +; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> ; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; 
AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -219,63 +229,65 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { -; SSE-LABEL: @gather_load_3( -; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-LABEL: define void @gather_load_3( +; SSE-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; SSE-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 ; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 ; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; 
SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 -; SSE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 ; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 -; SSE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 -; SSE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 ; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5 -; SSE-NEXT: store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 -; SSE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 ; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6 -; SSE-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 -; SSE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 ; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7 -; SSE-NEXT: store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]] +; 
SSE-NEXT: store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 -; SSE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; SSE-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-LABEL: define void @gather_load_3( +; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 -; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 -; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] 
+; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 -; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 -; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 ; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 ; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 @@ -285,31 +297,34 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], -; AVX-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX2-LABEL: define void @gather_load_3( +; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> ; AVX2-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], ; AVX2-NEXT: [[TMP6:%.*]] = shufflevector 
<8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-LABEL: define void @gather_load_3( +; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> ; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], ; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-LABEL: define void @gather_load_3( +; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], ; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], 
align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -354,9 +369,10 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado } define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) { -; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1 -; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 +; SSE-LABEL: define void @gather_load_4( +; SSE-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 1 +; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 11 ; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2 ; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 ; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3 @@ -369,14 +385,14 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 ; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7 ; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 -; SSE-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] +; 
SSE-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 ; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 ; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 @@ -385,32 +401,33 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; SSE-NEXT: store i32 [[T4]], ptr [[T0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T8]], ptr [[T5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T12]], ptr [[T9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T16]], ptr [[T13]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T20]], ptr [[T17]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T24]], ptr [[T21]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T28]], ptr [[T25]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T32]], ptr [[T29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T4]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T8]], ptr [[T5]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T12]], ptr [[T9]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T16]], ptr [[T13]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T20]], ptr [[T17]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T24]], ptr [[T21]], align 4, !tbaa 
[[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T28]], ptr [[T25]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: store i32 [[T32]], ptr [[T29]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 +; AVX-LABEL: define void @gather_load_4( +; AVX-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 11 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 ; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 -; AVX-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T23:%.*]] = load i32, ptr 
[[T22]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 @@ -420,31 +437,34 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 ; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 ; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <8 x i32> [[TMP9]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX2-LABEL: define void @gather_load_4( +; AVX2-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> ; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> 
poison), !tbaa [[TBAA0]] +; AVX512F-LABEL: define void @gather_load_4( +; AVX512F-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> ; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-LABEL: define void @gather_load_4( +; AVX512VL-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -494,17 +514,18 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { -; SSE-LABEL: @gather_load_div( -; 
SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-LABEL: define void @gather_load_div( +; SSE-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 ; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 ; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 ; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> @@ -516,23 +537,23 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> ; SSE-NEXT: [[TMP22:%.*]] 
= insertelement <4 x float> [[TMP21]], float [[TMP8]], i32 3 ; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] -; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 -; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 -; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, 
!tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i32 0 ; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i32 1 ; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i32 2 @@ -542,35 +563,36 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i32 2 ; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i32 3 ; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] -; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-LABEL: define void @gather_load_div( +; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; AVX-NEXT: 
[[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 -; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 -; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load 
<2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> @@ -590,35 +612,36 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 ; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7 ; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] -; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; -; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-LABEL: define void @gather_load_div( +; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; 
AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] -; 
AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> @@ -638,27 +661,29 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 ; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7 ; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] -; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-LABEL: define void @gather_load_div( +; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP4:%.*]] = 
shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] ; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; -; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-LABEL: define void @gather_load_div( +; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { +; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] ; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 @@ -722,3 +747,29 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea !3 = !{!"short", 
!4, i64 0} !4 = !{!"omnipotent char", !5, i64 0} !5 = !{!"Simple C++ TBAA"} +;. +; SSE: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; SSE: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; SSE: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; SSE: [[META3]] = !{!"Simple C++ TBAA"} +;. +; AVX: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; AVX: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; AVX: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; AVX: [[META3]] = !{!"Simple C++ TBAA"} +;. +; AVX2: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; AVX2: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; AVX2: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; AVX2: [[META3]] = !{!"Simple C++ TBAA"} +;. +; AVX512F: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; AVX512F: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; AVX512F: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; AVX512F: [[META3]] = !{!"Simple C++ TBAA"} +;. +; AVX512VL: [[SHORT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; AVX512VL: [[META1]] = !{!"short", [[META2:![0-9]+]], i64 0} +; AVX512VL: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; AVX512VL: [[META3]] = !{!"Simple C++ TBAA"} +;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll index 26258402b9781..253f08450a2b7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll @@ -1,13 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-- -mcpu=skylake-avx512 | FileCheck %s define void @foo(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[T1:%.*]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[T1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <8 x i8> [[TMP2]], splat (i8 64) ; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP3]], <8 x i8> [[TMP2]], <8 x i8> [[TMP4]] -; CHECK-NEXT: store <8 x i8> [[TMP5]], ptr [[T0:%.*]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: store <8 x i8> [[TMP5]], ptr [[T0]], align 1, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: ret void ; %t3 = load i8, ptr %t1, align 1, !tbaa !3 @@ -70,3 +71,8 @@ define void @foo(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) !3 = !{!4, !4, i64 0} !4 = !{!"omnipotent char", !5, i64 0} !5 = !{!"Simple C++ TBAA"} +;. +; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"Simple C++ TBAA"} +;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_listcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_listcost.ll index 2cd7adaad969f..b409aa74acd48 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_listcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_listcost.ll @@ -1,26 +1,27 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -passes=slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s define void @vsub2_test(ptr %pin1, ptr %pin2, ptr %pout) #0 { -; CHECK-LABEL: @vsub2_test( -; CHECK-NEXT: br label [[TMP1:%.*]] -; CHECK: 1: -; CHECK-NEXT: [[IDX_04:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[PO_03:%.*]] = phi ptr [ [[POUT:%.*]], [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[PTMPI2_02:%.*]] = phi ptr [ [[PIN2:%.*]], [[TMP0]] ], [ [[TMP4:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[PTMPI1_01:%.*]] = phi ptr [ [[PIN1:%.*]], [[TMP0]] ], [ [[TMP2:%.*]], [[TMP1]] ] +; CHECK-LABEL: define void @vsub2_test( +; CHECK-SAME: ptr [[PIN1:%.*]], ptr [[PIN2:%.*]], ptr [[POUT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: br label %[[TMP1:.*]] +; CHECK: [[TMP1]]: +; CHECK-NEXT: [[IDX_04:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], %[[TMP1]] ] +; CHECK-NEXT: [[PO_03:%.*]] = phi ptr [ [[POUT]], [[TMP0]] ], [ [[TMP7:%.*]], %[[TMP1]] ] +; CHECK-NEXT: [[PTMPI2_02:%.*]] = phi ptr [ [[PIN2]], [[TMP0]] ], [ [[TMP4:%.*]], %[[TMP1]] ] +; CHECK-NEXT: [[PTMPI1_01:%.*]] = phi ptr [ [[PIN1]], [[TMP0]] ], [ [[TMP2:%.*]], %[[TMP1]] ] ; CHECK-NEXT: [[TMP2]] = getelementptr inbounds i32, ptr [[PTMPI1_01]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[PTMPI1_01]], align 4, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[PTMPI1_01]], align 4, !tbaa 
[[INT_TBAA1:![0-9]+]] ; CHECK-NEXT: [[TMP4]] = getelementptr inbounds i32, ptr [[PTMPI2_02]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[PTMPI2_02]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[PTMPI2_02]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw i32 [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7]] = getelementptr inbounds i32, ptr [[PO_03]], i64 1 -; CHECK-NEXT: store i32 [[TMP6]], ptr [[PO_03]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: store i32 [[TMP6]], ptr [[PO_03]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[TMP8]] = add nuw nsw i32 [[IDX_04]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP8]], 64 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[TMP9:%.*]], label [[TMP1]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: 9: +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[BB9:.*]], label %[[TMP1]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[BB9]]: ; CHECK-NEXT: ret void ; br label %1 @@ -61,3 +62,12 @@ define void @vsub2_test(ptr %pin1, ptr %pin2, ptr %pout) #0 { !5 = distinct !{!5, !6, !7} !6 = !{!"llvm.loop.vectorize.width", i32 1} !7 = !{!"llvm.loop.interleave.count", i32 1} +;. +; CHECK: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} +; CHECK: [[META6]] = !{!"llvm.loop.vectorize.width", i32 1} +; CHECK: [[META7]] = !{!"llvm.loop.interleave.count", i32 1} +;. 
diff --git a/llvm/test/Transforms/SROA/tbaa-struct2.ll b/llvm/test/Transforms/SROA/tbaa-struct2.ll index 545fa47eecb2c..be91a87b6175d 100644 --- a/llvm/test/Transforms/SROA/tbaa-struct2.ll +++ b/llvm/test/Transforms/SROA/tbaa-struct2.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -passes='sroa' %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt -S -passes='sroa' %s | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG @@ -9,11 +9,12 @@ declare void @llvm.memcpy.p0.p0.i64(ptr writeonly, ptr readonly, i64, i1 immarg) declare double @subcall(double %g, i32 %m) define double @bar(ptr %wishart) { -; CHECK-LABEL: @bar( +; CHECK-LABEL: define double @bar( +; CHECK-SAME: ptr [[WISHART:%.*]]) { ; CHECK-NEXT: [[TMP_SROA_3:%.*]] = alloca [4 x i8], align 4 -; CHECK-NEXT: [[TMP_SROA_0_0_COPYLOAD:%.*]] = load double, ptr [[WISHART:%.*]], align 8, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP_SROA_0_0_COPYLOAD:%.*]] = load double, ptr [[WISHART]], align 8, !tbaa [[DOUBLE_TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP_SROA_2_0_WISHART_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[WISHART]], i64 8 -; CHECK-NEXT: [[TMP_SROA_2_0_COPYLOAD:%.*]] = load i32, ptr [[TMP_SROA_2_0_WISHART_SROA_IDX]], align 8, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP_SROA_2_0_COPYLOAD:%.*]] = load i32, ptr [[TMP_SROA_2_0_WISHART_SROA_IDX]], align 8, !tbaa [[INT_TBAA4:![0-9]+]] ; CHECK-NEXT: [[TMP_SROA_3_0_WISHART_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[WISHART]], i64 12 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_SROA_3]], ptr align 4 [[TMP_SROA_3_0_WISHART_SROA_IDX]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT6:![0-9]+]] ; CHECK-NEXT: [[CALL:%.*]] = call double @subcall(double [[TMP_SROA_0_0_COPYLOAD]], i32 [[TMP_SROA_2_0_COPYLOAD]]) @@ -38,11 +39,11 @@ 
define double @bar(ptr %wishart) { ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[DOUBLE_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"double", [[META2:![0-9]+]], i64 0} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"Simple C++ TBAA"} -; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} ; CHECK: [[META5]] = !{!"int", [[META2]], i64 0} ; CHECK: [[TBAA_STRUCT6]] = !{} ;. diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll index 5326b9802ec6d..6a0cacc7016f7 100644 --- a/llvm/test/Transforms/SROA/tbaa-struct3.ll +++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -p sroa -S %s | FileCheck %s @@ -7,12 +7,12 @@ target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32- define void @load_store_transfer_split_struct_tbaa_2_float(ptr dereferenceable(24) %res, float %a, float %b) { ; CHECK-LABEL: define void @load_store_transfer_split_struct_tbaa_2_float( ; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[B]] to i32 -; CHECK-NEXT: store i32 [[TMP0]], ptr [[RES]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: store i32 [[TMP0]], ptr [[RES]], align 4, !tbaa [[FLOAT_TBAA0:![0-9]+]] ; CHECK-NEXT: [[RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 4 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES_SROA_IDX]], align 4, 
!tbaa [[TBAA0]] +; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES_SROA_IDX]], align 4, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[RES]], align 8 ; CHECK-NEXT: ret void ; @@ -30,11 +30,11 @@ entry: define void @memcpy_transfer(ptr dereferenceable(24) %res, float %a, float %b) { ; CHECK-LABEL: define void @memcpy_transfer( ; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[RES]], align 8 -; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4 -; CHECK-NEXT: store float [[B]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: store float [[B]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: ret void ; entry: @@ -50,9 +50,9 @@ entry: define void @memcpy_transfer_tbaa_field_and_size_do_not_align(ptr dereferenceable(24) %res, float %a, float %b) { ; CHECK-LABEL: define void @memcpy_transfer_tbaa_field_and_size_do_not_align( ; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[RES]], align 8 -; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B]] to i32 ; CHECK-NEXT: [[TMP_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 @@ -72,7 +72,7 @@ entry: define void @load_store_transfer_split_struct_tbaa_2_i31(ptr dereferenceable(24) %res, i31 %a, i31 %b) { ; CHECK-LABEL: define void 
@load_store_transfer_split_struct_tbaa_2_i31( ; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], i31 [[A:%.*]], i31 [[B:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP:%.*]] = alloca { i31, i31 }, align 4 ; CHECK-NEXT: store i31 [[A]], ptr [[TMP]], align 4 ; CHECK-NEXT: [[TMP_4_TMP_4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 4 @@ -98,9 +98,9 @@ define void @store_vector_part_first(ptr %y2, float %f) { ; CHECK-LABEL: define void @store_vector_part_first( ; CHECK-SAME: ptr [[Y2:%.*]], float [[F:%.*]]) { ; CHECK-NEXT: [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]]) -; CHECK-NEXT: store <2 x float> [[V_1]], ptr [[Y2]], align 8, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: store <2 x float> [[V_1]], ptr [[Y2]], align 8, !tbaa [[V2F32_TBAA5:![0-9]+]] ; CHECK-NEXT: [[X7_SROA_2_0_Y2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[Y2]], i64 8 -; CHECK-NEXT: store float [[F]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: store float [[F]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 8, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: ret void ; %x7 = alloca { float, float, float, float } @@ -116,9 +116,9 @@ define void @store_vector_part_second(ptr %y2, float %f) { ; CHECK-LABEL: define void @store_vector_part_second( ; CHECK-SAME: ptr [[Y2:%.*]], float [[F:%.*]]) { ; CHECK-NEXT: [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]]) -; CHECK-NEXT: store float [[F]], ptr [[Y2]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: store float [[F]], ptr [[Y2]], align 8, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: [[X7_SROA_2_0_Y2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[Y2]], i64 4 -; CHECK-NEXT: store <2 x float> [[V_1]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: store <2 x float> [[V_1]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 4, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: ret void ; %x7 = alloca { float, float, float, float } @@ -134,7 +134,7 @@ define void @store_vector_single(ptr %y2, float %f) { ; 
CHECK-LABEL: define void @store_vector_single( ; CHECK-SAME: ptr [[Y2:%.*]], float [[F:%.*]]) { ; CHECK-NEXT: [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]]) -; CHECK-NEXT: store <2 x float> [[V_1]], ptr [[Y2]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: store <2 x float> [[V_1]], ptr [[Y2]], align 4, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: ret void ; %x7 = alloca { float, float } @@ -149,7 +149,7 @@ declare void @llvm.memset.p0.i8(ptr nocapture, i8, i32, i1) nounwind define void @memset(ptr %dst, ptr align 8 %src) { ; CHECK-LABEL: define void @memset( ; CHECK-SAME: ptr [[DST:%.*]], ptr align 8 [[SRC:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [7 x i8], align 1 ; CHECK-NEXT: [[A_SROA_3:%.*]] = alloca i16, align 2 ; CHECK-NEXT: [[A_SROA_4:%.*]] = alloca [10 x i8], align 1 @@ -162,7 +162,7 @@ define void @memset(ptr %dst, ptr align 8 %src) { ; CHECK-NEXT: store i16 1, ptr [[A_SROA_3]], align 2 ; CHECK-NEXT: [[A_SROA_0_1_A_1_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1 ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_1_A_1_SROA_IDX2]], i8 42, i32 6, i1 false) -; CHECK-NEXT: store i16 10794, ptr [[A_SROA_3]], align 2, !tbaa [[TBAA0]] +; CHECK-NEXT: store i16 10794, ptr [[A_SROA_3]], align 2, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[A_SROA_0]], i32 7, i1 true) ; CHECK-NEXT: [[A_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 7 ; CHECK-NEXT: [[A_SROA_3_0_A_SROA_3_0_COPYLOAD1:%.*]] = load volatile i16, ptr [[A_SROA_3]], align 2 @@ -187,7 +187,7 @@ entry: define void @memset2(ptr %dst, ptr align 8 %src) { ; CHECK-LABEL: define void @memset2( ; CHECK-SAME: ptr [[DST:%.*]], ptr align 8 [[SRC:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [209 x i8], align 1 ; CHECK-NEXT: [[A_SROA_3:%.*]] = alloca i8, align 1 ; CHECK-NEXT: [[A_SROA_4:%.*]] = 
alloca [90 x i8], align 1 @@ -199,8 +199,8 @@ define void @memset2(ptr %dst, ptr align 8 %src) { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_4]], ptr align 2 [[A_SROA_4_0_SRC_SROA_IDX]], i32 90, i1 false) ; CHECK-NEXT: store i8 1, ptr [[A_SROA_3]], align 1 ; CHECK-NEXT: [[A_SROA_0_202_A_202_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 202 -; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_202_A_202_SROA_IDX2]], i8 42, i32 7, i1 false), !tbaa [[TBAA5]] -; CHECK-NEXT: store i8 42, ptr [[A_SROA_3]], align 1, !tbaa [[TBAA5]] +; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_202_A_202_SROA_IDX2]], i8 42, i32 7, i1 false), !tbaa [[V2F32_TBAA5]] +; CHECK-NEXT: store i8 42, ptr [[A_SROA_3]], align 1, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[A_SROA_0]], i32 209, i1 true) ; CHECK-NEXT: [[A_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 209 ; CHECK-NEXT: [[A_SROA_3_0_A_SROA_3_0_COPYLOAD1:%.*]] = load volatile i8, ptr [[A_SROA_3]], align 1 @@ -233,7 +233,7 @@ entry: define void @slice_store_v2i8_1(ptr %dst, ptr %dst.2, ptr %src) { ; CHECK-LABEL: define void @slice_store_v2i8_1( ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [6 x i8], align 1 ; CHECK-NEXT: [[A_SROA_2_SROA_0:%.*]] = alloca <2 x i8>, align 4 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_0]], ptr align 8 [[SRC]], i32 6, i1 false) @@ -268,7 +268,7 @@ entry: define void @slice_store_v2i8_2(ptr %dst, ptr %dst.2, ptr %src) { ; CHECK-LABEL: define void @slice_store_v2i8_2( ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_SROA_0_SROA_1:%.*]] = alloca <2 x i8>, align 2 ; CHECK-NEXT: [[A_SROA_0_SROA_4:%.*]] = alloca i8, align 1 ; 
CHECK-NEXT: [[A_SROA_4:%.*]] = alloca [5 x i8], align 1 @@ -317,7 +317,7 @@ define double @tbaa_struct_load(ptr %src, ptr %dst) { ; CHECK-NEXT: [[TMP_SROA_3_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8 ; CHECK-NEXT: [[TMP_SROA_3_0_COPYLOAD:%.*]] = load i64, ptr [[TMP_SROA_3_0_SRC_SROA_IDX]], align 8 ; CHECK-NEXT: store i64 [[TMP_SROA_3_0_COPYLOAD]], ptr [[TMP_SROA_3]], align 8 -; CHECK-NEXT: [[TMP_SROA_0_0_TMP_SROA_0_0_LG:%.*]] = load double, ptr [[TMP_SROA_0]], align 8, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP_SROA_0_0_TMP_SROA_0_0_LG:%.*]] = load double, ptr [[TMP_SROA_0]], align 8, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: [[TMP_SROA_0_0_TMP_SROA_0_0_COPYLOAD1:%.*]] = load volatile double, ptr [[TMP_SROA_0]], align 8 ; CHECK-NEXT: store volatile double [[TMP_SROA_0_0_TMP_SROA_0_0_COPYLOAD1]], ptr [[DST]], align 8 ; CHECK-NEXT: [[TMP_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 8 @@ -335,9 +335,9 @@ define double @tbaa_struct_load(ptr %src, ptr %dst) { define i32 @shorten_integer_store_single_field(ptr %dst, ptr %dst.2, ptr %src) { ; CHECK-LABEL: define i32 @shorten_integer_store_single_field( ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 123, ptr [[A_SROA_0]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: store i32 123, ptr [[A_SROA_0]], align 4, !tbaa [[FLOAT_TBAA0]] ; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load i32, ptr [[A_SROA_0]], align 4 ; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_COPYLOAD:%.*]] = load volatile i32, ptr [[A_SROA_0]], align 4 ; CHECK-NEXT: store volatile i32 [[A_SROA_0_0_A_SROA_0_0_COPYLOAD]], ptr [[DST]], align 1 @@ -354,9 +354,9 @@ entry: define i32 @shorten_integer_store_multiple_fields(ptr %dst, ptr %dst.2, ptr %src) { ; CHECK-LABEL: define i32 @shorten_integer_store_multiple_fields( ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) 
{ -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 123, ptr [[A_SROA_0]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: store i32 123, ptr [[A_SROA_0]], align 4, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load i32, ptr [[A_SROA_0]], align 4 ; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_COPYLOAD:%.*]] = load volatile i32, ptr [[A_SROA_0]], align 4 ; CHECK-NEXT: store volatile i32 [[A_SROA_0_0_A_SROA_0_0_COPYLOAD]], ptr [[DST]], align 1 @@ -373,7 +373,7 @@ entry: define <2 x i16> @shorten_vector_store_multiple_fields(ptr %dst, ptr %dst.2, ptr %src) { ; CHECK-LABEL: define <2 x i16> @shorten_vector_store_multiple_fields( ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca <2 x i32>, align 8 ; CHECK-NEXT: store <2 x i32> , ptr [[A_SROA_0]], align 8 ; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load <2 x i16>, ptr [[A_SROA_0]], align 8 @@ -391,7 +391,7 @@ entry: define <2 x i16> @shorten_vector_store_single_fields(ptr %dst, ptr %dst.2, ptr %src) { ; CHECK-LABEL: define <2 x i16> @shorten_vector_store_single_fields( ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca <2 x i32>, align 8 ; CHECK-NEXT: store <2 x i32> , ptr [[A_SROA_0]], align 8 ; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load <2 x i16>, ptr [[A_SROA_0]], align 8 @@ -409,7 +409,7 @@ entry: define i32 @split_load_with_tbaa_struct(i32 %x, ptr %src, ptr %dst) { ; CHECK-LABEL: define i32 @split_load_with_tbaa_struct( ; CHECK-SAME: i32 [[X:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A3_SROA_0:%.*]] = alloca i16, align 8 ; CHECK-NEXT: [[A3_SROA_3:%.*]] = alloca i16, align 2 ; CHECK-NEXT: [[A3_SROA_33:%.*]] = alloca float, align 4 @@ 
-429,11 +429,11 @@ define i32 @split_load_with_tbaa_struct(i32 %x, ptr %src, ptr %dst) { ; CHECK-NEXT: [[A3_SROA_5_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 ; CHECK-NEXT: [[A3_SROA_5_0_COPYLOAD:%.*]] = load i8, ptr [[A3_SROA_5_0_SRC_SROA_IDX]], align 1 ; CHECK-NEXT: store i8 [[A3_SROA_5_0_COPYLOAD]], ptr [[A3_SROA_5]], align 1 -; CHECK-NEXT: [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD:%.*]] = load i16, ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA5]] +; CHECK-NEXT: [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD:%.*]] = load i16, ptr [[A3_SROA_0]], align 8, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: [[LOAD4_FCA_0_INSERT:%.*]] = insertvalue { i16, float, i8 } poison, i16 [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD]], 0 -; CHECK-NEXT: [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD:%.*]] = load float, ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD:%.*]] = load float, ptr [[A3_SROA_33]], align 4, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: [[LOAD4_FCA_1_INSERT:%.*]] = insertvalue { i16, float, i8 } [[LOAD4_FCA_0_INSERT]], float [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD]], 1 -; CHECK-NEXT: [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD:%.*]] = load i8, ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA5]] +; CHECK-NEXT: [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD:%.*]] = load i8, ptr [[A3_SROA_4]], align 8, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: [[LOAD4_FCA_2_INSERT:%.*]] = insertvalue { i16, float, i8 } [[LOAD4_FCA_1_INSERT]], i8 [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD]], 2 ; CHECK-NEXT: [[UNWRAP2:%.*]] = extractvalue { i16, float, i8 } [[LOAD4_FCA_2_INSERT]], 1 ; CHECK-NEXT: [[VALCAST2:%.*]] = bitcast float [[UNWRAP2]] to i32 @@ -468,7 +468,7 @@ entry: define i32 @split_store_with_tbaa_struct(i32 %x, ptr %src, ptr %dst) { ; CHECK-LABEL: define i32 @split_store_with_tbaa_struct( ; CHECK-SAME: i32 [[X:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A3_SROA_0:%.*]] = 
alloca i16, align 8 ; CHECK-NEXT: [[A3_SROA_3:%.*]] = alloca i16, align 2 ; CHECK-NEXT: [[A3_SROA_33:%.*]] = alloca float, align 4 @@ -492,11 +492,11 @@ define i32 @split_store_with_tbaa_struct(i32 %x, ptr %src, ptr %dst) { ; CHECK-NEXT: [[I_2:%.*]] = insertvalue { i16, float, i8 } [[I_1]], float 3.000000e+00, 1 ; CHECK-NEXT: [[I_3:%.*]] = insertvalue { i16, float, i8 } [[I_2]], i8 99, 2 ; CHECK-NEXT: [[I_3_FCA_0_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 0 -; CHECK-NEXT: store i16 [[I_3_FCA_0_EXTRACT]], ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA5]] +; CHECK-NEXT: store i16 [[I_3_FCA_0_EXTRACT]], ptr [[A3_SROA_0]], align 8, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: [[I_3_FCA_1_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 1 -; CHECK-NEXT: store float [[I_3_FCA_1_EXTRACT]], ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: store float [[I_3_FCA_1_EXTRACT]], ptr [[A3_SROA_33]], align 4, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: [[I_3_FCA_2_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 2 -; CHECK-NEXT: store i8 [[I_3_FCA_2_EXTRACT]], ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA5]] +; CHECK-NEXT: store i8 [[I_3_FCA_2_EXTRACT]], ptr [[A3_SROA_4]], align 8, !tbaa [[V2F32_TBAA5]] ; CHECK-NEXT: [[A3_SROA_0_0_A3_SROA_0_0_COPYLOAD1:%.*]] = load volatile i16, ptr [[A3_SROA_0]], align 8 ; CHECK-NEXT: store volatile i16 [[A3_SROA_0_0_A3_SROA_0_0_COPYLOAD1]], ptr [[DST]], align 1 ; CHECK-NEXT: [[A3_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 2 @@ -548,11 +548,11 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias !15 = !{i64 0, i64 7, !6, i64 7, i64 1, !6} !16 = !{i64 0, i64 2, !6, i64 4, i64 4, !6, i64 8, i64 1, !6} ;. 
-; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[FLOAT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} ; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]], i64 0} ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} ; CHECK: [[META3]] = !{!"Simple C++ TBAA"} -; CHECK: [[TBAA_STRUCT4]] = !{i64 0, i64 4, [[TBAA0]], i64 4, i64 4, [[TBAA0]]} -; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[TBAA_STRUCT4]] = !{i64 0, i64 4, [[FLOAT_TBAA0]], i64 4, i64 4, [[FLOAT_TBAA0]]} +; CHECK: [[V2F32_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} ; CHECK: [[META6]] = !{!"v2f32", [[META2]], i64 0} ;. diff --git a/llvm/test/Transforms/SROA/tbaa-subload.ll b/llvm/test/Transforms/SROA/tbaa-subload.ll index b07874da7ab03..4c18006a4d1cb 100644 --- a/llvm/test/Transforms/SROA/tbaa-subload.ll +++ b/llvm/test/Transforms/SROA/tbaa-subload.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -passes='sroa' %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt -S -passes='sroa' %s | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG @@ -8,14 +8,14 @@ %class.anon = type <{ %class.ar, [7 x i8], { i64, i64 } }> define void @caller() { -; CHECK-LABEL: @caller( -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @caller() { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[AGG:%.*]] = alloca [[CLASS_ANON:%.*]], align 8 ; CHECK-NEXT: [[OFF:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[AGG]], i32 0, i32 2 ; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds { i64, i64 }, ptr [[OFF]], i32 0, i32 0 -; CHECK-NEXT: store i64 1, ptr [[DOTFCA_0_GEP]], align 8, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: store i64 1, ptr [[DOTFCA_0_GEP]], align 8, !tbaa [[CHAR_TBAA0:![0-9]+]] ; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr 
inbounds { i64, i64 }, ptr [[OFF]], i32 0, i32 1 -; CHECK-NEXT: store i64 2, ptr [[DOTFCA_1_GEP]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: store i64 2, ptr [[DOTFCA_1_GEP]], align 8, !tbaa [[CHAR_TBAA0]] ; CHECK-NEXT: call void @use(ptr [[AGG]]) ; CHECK-NEXT: ret void ; @@ -36,11 +36,11 @@ declare void @use(ptr %this) !8 = !{!"_ZTSZN2ax2baEMS_FvvE2an2arE3$_0", !9, i64 0, !3, i64 8} !9 = !{!"_ZTS2ar"} ;. -; CHECK: [[TBAA0]] = !{!1, !3, i64 8} -; CHECK: [[META1:![0-9]+]] = !{!"_ZTSZN2ax2baEMS_FvvE2an2arE3$_0", !2, i64 0, !3, i64 8} -; CHECK: [[META2:![0-9]+]] = !{!"_ZTS2ar"} -; CHECK: [[META3:![0-9]+]] = !{!"omnipotent char", !4, i64 0} -; CHECK: [[META4:![0-9]+]] = !{!"Simple C++ TBAA"} +; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META3:![0-9]+]], i64 8} +; CHECK: [[META1]] = !{!"_ZTSZN2ax2baEMS_FvvE2an2arE3$_0", [[META2:![0-9]+]], i64 0, [[META3]], i64 8} +; CHECK: [[META2]] = !{!"_ZTS2ar"} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C++ TBAA"} ;. ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} diff --git a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll index 6cb94e8f561bc..af152d4ba8d05 100644 --- a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll +++ b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt %s -passes='function(scalarizer,dce)' -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -6,21 +6,22 @@ declare <4 x float> @ext(<4 x float>) @g = global <4 x float> zeroinitializer define void @f1(<4 x float> %init, ptr %base, i32 %count) { -; CHECK-LABEL: @f1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INIT_I0:%.*]] = extractelement <4 x float> [[INIT:%.*]], i64 0 +; CHECK-LABEL: define void @f1( +; CHECK-SAME: <4 x float> [[INIT:%.*]], ptr [[BASE:%.*]], i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[INIT_I0:%.*]] = extractelement <4 x float> [[INIT]], i64 0 ; CHECK-NEXT: [[INIT_I1:%.*]] = extractelement <4 x float> [[INIT]], i64 1 ; CHECK-NEXT: [[INIT_I2:%.*]] = extractelement <4 x float> [[INIT]], i64 2 ; CHECK-NEXT: [[INIT_I3:%.*]] = extractelement <4 x float> [[INIT]], i64 3 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I0:%.*]] = phi float [ [[INIT_I0]], [[ENTRY]] ], [ [[SEL_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I1:%.*]] = phi float [ [[INIT_I1]], [[ENTRY]] ], [ [[SEL_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I2:%.*]] = phi float [ [[INIT_I2]], [[ENTRY]] ], [ [[SEL_I2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I3:%.*]] = phi float [ [[INIT_I3]], 
[[ENTRY]] ], [ [[SEL_I3:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT]], %[[ENTRY]] ], [ [[NEXTI:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I0:%.*]] = phi float [ [[INIT_I0]], %[[ENTRY]] ], [ [[SEL_I0:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I1:%.*]] = phi float [ [[INIT_I1]], %[[ENTRY]] ], [ [[SEL_I1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I2:%.*]] = phi float [ [[INIT_I2]], %[[ENTRY]] ], [ [[SEL_I2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I3:%.*]] = phi float [ [[INIT_I3]], %[[ENTRY]] ], [ [[SEL_I3:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NEXTI]] = sub i32 [[I]], 1 -; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[BASE:%.*]], i32 [[I]] +; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[BASE]], i32 [[I]] ; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[PTR]], align 16 ; CHECK-NEXT: [[PTR_I1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load float, ptr [[PTR_I1]], align 4 @@ -54,8 +55,8 @@ define void @f1(<4 x float> %init, ptr %base, i32 %count) { ; CHECK-NEXT: store float [[SEL_I2]], ptr [[PTR_I2]], align 8 ; CHECK-NEXT: store float [[SEL_I3]], ptr [[PTR_I3]], align 4 ; CHECK-NEXT: [[TEST:%.*]] = icmp eq i32 [[NEXTI]], 0 -; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[TEST]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -92,21 +93,22 @@ exit: } define void @f2(<4 x i32> %init, ptr %base, i32 %count) { -; CHECK-LABEL: @f2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INIT_I0:%.*]] = extractelement <4 x i32> [[INIT:%.*]], i64 0 +; CHECK-LABEL: define void @f2( +; CHECK-SAME: <4 x i32> [[INIT:%.*]], ptr [[BASE:%.*]], i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[INIT_I0:%.*]] = extractelement <4 x i32> [[INIT]], i64 0 ; CHECK-NEXT: [[INIT_I1:%.*]] = extractelement <4 x i32> [[INIT]], i64 1 ; CHECK-NEXT: [[INIT_I2:%.*]] = 
extractelement <4 x i32> [[INIT]], i64 2 ; CHECK-NEXT: [[INIT_I3:%.*]] = extractelement <4 x i32> [[INIT]], i64 3 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I0:%.*]] = phi i32 [ [[INIT_I0]], [[ENTRY]] ], [ [[SEL_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I1:%.*]] = phi i32 [ [[INIT_I1]], [[ENTRY]] ], [ [[SEL_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I2:%.*]] = phi i32 [ [[INIT_I2]], [[ENTRY]] ], [ [[SEL_I2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I3:%.*]] = phi i32 [ [[INIT_I3]], [[ENTRY]] ], [ [[SEL_I3:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT]], %[[ENTRY]] ], [ [[NEXTI:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I0:%.*]] = phi i32 [ [[INIT_I0]], %[[ENTRY]] ], [ [[SEL_I0:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I1:%.*]] = phi i32 [ [[INIT_I1]], %[[ENTRY]] ], [ [[SEL_I1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I2:%.*]] = phi i32 [ [[INIT_I2]], %[[ENTRY]] ], [ [[SEL_I2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I3:%.*]] = phi i32 [ [[INIT_I3]], %[[ENTRY]] ], [ [[SEL_I3:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NEXTI]] = sub i32 [[I]], 1 -; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x i8>, ptr [[BASE:%.*]], i32 [[I]] +; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x i8>, ptr [[BASE]], i32 [[I]] ; CHECK-NEXT: [[VAL_I0:%.*]] = load i8, ptr [[PTR]], align 4 ; CHECK-NEXT: [[PTR_I1:%.*]] = getelementptr i8, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load i8, ptr [[PTR_I1]], align 1 @@ -139,8 +141,8 @@ define void @f2(<4 x i32> %init, ptr %base, i32 %count) { ; CHECK-NEXT: store i8 [[TRUNC_I2]], ptr [[PTR_I2]], align 2 ; CHECK-NEXT: store i8 [[TRUNC_I3]], ptr [[PTR_I3]], align 1 ; CHECK-NEXT: [[TEST:%.*]] = icmp eq i32 [[NEXTI]], 0 -; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[TEST]], label %[[LOOP]], label %[[EXIT:.*]] +; 
CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -172,25 +174,26 @@ exit: ; Check that !tbaa information is preserved. define void @f3(ptr %src, ptr %dst) { -; CHECK-LABEL: @f3( -; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 1 +; CHECK-LABEL: define void @f3( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST]], i32 1 ; CHECK-NEXT: [[DST_I2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 ; CHECK-NEXT: [[DST_I3:%.*]] = getelementptr i32, ptr [[DST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC:%.*]], align 16, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC]], align 16, !tbaa [[SET1_TBAA0:![0-9]+]] ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr i32, ptr [[SRC]], i32 1 -; CHECK-NEXT: [[VAL_I1:%.*]] = load i32, ptr [[SRC_I1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[VAL_I1:%.*]] = load i32, ptr [[SRC_I1]], align 4, !tbaa [[SET1_TBAA0]] ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr i32, ptr [[SRC]], i32 2 -; CHECK-NEXT: [[VAL_I2:%.*]] = load i32, ptr [[SRC_I2]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[VAL_I2:%.*]] = load i32, ptr [[SRC_I2]], align 8, !tbaa [[SET1_TBAA0]] ; CHECK-NEXT: [[SRC_I3:%.*]] = getelementptr i32, ptr [[SRC]], i32 3 -; CHECK-NEXT: [[VAL_I3:%.*]] = load i32, ptr [[SRC_I3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[VAL_I3:%.*]] = load i32, ptr [[SRC_I3]], align 4, !tbaa [[SET1_TBAA0]] ; CHECK-NEXT: [[ADD_I0:%.*]] = add i32 [[VAL_I0]], [[VAL_I0]] ; CHECK-NEXT: [[ADD_I1:%.*]] = add i32 [[VAL_I1]], [[VAL_I1]] ; CHECK-NEXT: [[ADD_I2:%.*]] = add i32 [[VAL_I2]], [[VAL_I2]] ; CHECK-NEXT: [[ADD_I3:%.*]] = add i32 [[VAL_I3]], [[VAL_I3]] -; CHECK-NEXT: store i32 [[ADD_I0]], ptr [[DST]], align 16, !tbaa [[TBAA3:![0-9]+]] -; CHECK-NEXT: store i32 [[ADD_I1]], ptr [[DST_I1]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 [[ADD_I2]], ptr [[DST_I2]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 [[ADD_I3]], ptr [[DST_I3]], 
align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: store i32 [[ADD_I0]], ptr [[DST]], align 16, !tbaa [[SET2_TBAA3:![0-9]+]] +; CHECK-NEXT: store i32 [[ADD_I1]], ptr [[DST_I1]], align 4, !tbaa [[SET2_TBAA3]] +; CHECK-NEXT: store i32 [[ADD_I2]], ptr [[DST_I2]], align 8, !tbaa [[SET2_TBAA3]] +; CHECK-NEXT: store i32 [[ADD_I3]], ptr [[DST_I3]], align 4, !tbaa [[SET2_TBAA3]] ; CHECK-NEXT: ret void ; %val = load <4 x i32> , ptr %src, !tbaa !1 @@ -201,11 +204,12 @@ define void @f3(ptr %src, ptr %dst) { ; Check that !tbaa.struct information is preserved. define void @f4(ptr %src, ptr %dst) { -; CHECK-LABEL: @f4( -; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 1 +; CHECK-LABEL: define void @f4( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST]], i32 1 ; CHECK-NEXT: [[DST_I2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 ; CHECK-NEXT: [[DST_I3:%.*]] = getelementptr i32, ptr [[DST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC:%.*]], align 16, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]] +; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC]], align 16, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]] ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr i32, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load i32, ptr [[SRC_I1]], align 4, !tbaa.struct [[TBAA_STRUCT5]] ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr i32, ptr [[SRC]], i32 2 @@ -230,16 +234,17 @@ define void @f4(ptr %src, ptr %dst) { ; Check that llvm.access.group information is preserved. 
define void @f5(i32 %count, ptr %src, ptr %dst) { -; CHECK-LABEL: @f5( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT_INDEX:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_SRC:%.*]] = getelementptr <4 x i32>, ptr [[SRC:%.*]], i32 [[INDEX]] +; CHECK-LABEL: define void @f5( +; CHECK-SAME: i32 [[COUNT:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT_INDEX:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_SRC:%.*]] = getelementptr <4 x i32>, ptr [[SRC]], i32 [[INDEX]] ; CHECK-NEXT: [[THIS_SRC_I1:%.*]] = getelementptr i32, ptr [[THIS_SRC]], i32 1 ; CHECK-NEXT: [[THIS_SRC_I2:%.*]] = getelementptr i32, ptr [[THIS_SRC]], i32 2 ; CHECK-NEXT: [[THIS_SRC_I3:%.*]] = getelementptr i32, ptr [[THIS_SRC]], i32 3 -; CHECK-NEXT: [[THIS_DST:%.*]] = getelementptr <4 x i32>, ptr [[DST:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[THIS_DST:%.*]] = getelementptr <4 x i32>, ptr [[DST]], i32 [[INDEX]] ; CHECK-NEXT: [[THIS_DST_I1:%.*]] = getelementptr i32, ptr [[THIS_DST]], i32 1 ; CHECK-NEXT: [[THIS_DST_I2:%.*]] = getelementptr i32, ptr [[THIS_DST]], i32 2 ; CHECK-NEXT: [[THIS_DST_I3:%.*]] = getelementptr i32, ptr [[THIS_DST]], i32 3 @@ -256,9 +261,9 @@ define void @f5(i32 %count, ptr %src, ptr %dst) { ; CHECK-NEXT: store i32 [[ADD_I2]], ptr [[THIS_DST_I2]], align 8, !llvm.access.group [[ACC_GRP6]] ; CHECK-NEXT: store i32 [[ADD_I3]], ptr [[THIS_DST_I3]], align 4, !llvm.access.group [[ACC_GRP6]] ; CHECK-NEXT: [[NEXT_INDEX]] = add i32 [[INDEX]], -1 -; CHECK-NEXT: [[CONTINUE:%.*]] = icmp ne i32 [[NEXT_INDEX]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[CONTINUE]], label [[LOOP]], label [[END:%.*]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: end: +; CHECK-NEXT: [[CONTINUE:%.*]] = icmp ne i32 [[NEXT_INDEX]], [[COUNT]] +; CHECK-NEXT: br i1 [[CONTINUE]], label %[[LOOP]], label %[[END:.*]], 
!llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; entry: @@ -281,15 +286,16 @@ end: ; Check that fpmath information is preserved. define <4 x float> @f6(<4 x float> %x) { -; CHECK-LABEL: @f6( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <4 x float> [[X:%.*]], i64 0 -; CHECK-NEXT: [[RES_I0:%.*]] = fadd float [[X_I0]], 1.000000e+00, !fpmath !9 +; CHECK-LABEL: define <4 x float> @f6( +; CHECK-SAME: <4 x float> [[X:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <4 x float> [[X]], i64 0 +; CHECK-NEXT: [[RES_I0:%.*]] = fadd float [[X_I0]], 1.000000e+00, !fpmath [[META9:![0-9]+]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <4 x float> [[X]], i64 1 -; CHECK-NEXT: [[RES_I1:%.*]] = fadd float [[X_I1]], 2.000000e+00, !fpmath !9 +; CHECK-NEXT: [[RES_I1:%.*]] = fadd float [[X_I1]], 2.000000e+00, !fpmath [[META9]] ; CHECK-NEXT: [[X_I2:%.*]] = extractelement <4 x float> [[X]], i64 2 -; CHECK-NEXT: [[RES_I2:%.*]] = fadd float [[X_I2]], 3.000000e+00, !fpmath !9 +; CHECK-NEXT: [[RES_I2:%.*]] = fadd float [[X_I2]], 3.000000e+00, !fpmath [[META9]] ; CHECK-NEXT: [[X_I3:%.*]] = extractelement <4 x float> [[X]], i64 3 -; CHECK-NEXT: [[RES_I3:%.*]] = fadd float [[X_I3]], 4.000000e+00, !fpmath !9 +; CHECK-NEXT: [[RES_I3:%.*]] = fadd float [[X_I3]], 4.000000e+00, !fpmath [[META9]] ; CHECK-NEXT: [[RES_UPTO0:%.*]] = insertelement <4 x float> poison, float [[RES_I0]], i64 0 ; CHECK-NEXT: [[RES_UPTO1:%.*]] = insertelement <4 x float> [[RES_UPTO0]], float [[RES_I1]], i64 1 ; CHECK-NEXT: [[RES_UPTO2:%.*]] = insertelement <4 x float> [[RES_UPTO1]], float [[RES_I2]], i64 2 @@ -303,11 +309,12 @@ define <4 x float> @f6(<4 x float> %x) { ; Check that random metadata isn't kept. 
define void @f7(ptr %src, ptr %dst) { -; CHECK-LABEL: @f7( -; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 1 +; CHECK-LABEL: define void @f7( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST]], i32 1 ; CHECK-NEXT: [[DST_I2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 ; CHECK-NEXT: [[DST_I3:%.*]] = getelementptr i32, ptr [[DST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC:%.*]], align 16 +; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC]], align 16 ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr i32, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load i32, ptr [[SRC_I1]], align 4 ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr i32, ptr [[SRC]], i32 2 @@ -332,17 +339,18 @@ define void @f7(ptr %src, ptr %dst) { ; Test GEP with vectors. define void @f8(ptr %dest, <4 x ptr> %ptr0, <4 x i32> %i0, -; CHECK-LABEL: @f8( -; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST:%.*]], i32 1 +; CHECK-LABEL: define void @f8( +; CHECK-SAME: ptr [[DEST:%.*]], <4 x ptr> [[PTR0:%.*]], <4 x i32> [[I0:%.*]], ptr [[OTHER:%.*]]) { +; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr ptr, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr ptr, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[PTR0_I0:%.*]] = extractelement <4 x ptr> [[PTR0:%.*]], i64 0 +; CHECK-NEXT: [[PTR0_I0:%.*]] = extractelement <4 x ptr> [[PTR0]], i64 0 ; CHECK-NEXT: [[PTR0_I2:%.*]] = extractelement <4 x ptr> [[PTR0]], i64 2 ; CHECK-NEXT: [[PTR0_I3:%.*]] = extractelement <4 x ptr> [[PTR0]], i64 3 -; CHECK-NEXT: [[I0_I1:%.*]] = extractelement <4 x i32> [[I0:%.*]], i64 1 +; CHECK-NEXT: [[I0_I1:%.*]] = extractelement <4 x i32> [[I0]], i64 1 ; CHECK-NEXT: [[I0_I3:%.*]] = extractelement <4 x i32> [[I0]], i64 3 ; CHECK-NEXT: [[VAL_I0:%.*]] = getelementptr float, ptr [[PTR0_I0]], i32 100 -; CHECK-NEXT: [[VAL_I1:%.*]] = getelementptr float, ptr 
[[OTHER:%.*]], i32 [[I0_I1]] +; CHECK-NEXT: [[VAL_I1:%.*]] = getelementptr float, ptr [[OTHER]], i32 [[I0_I1]] ; CHECK-NEXT: [[VAL_I2:%.*]] = getelementptr float, ptr [[PTR0_I2]], i32 100 ; CHECK-NEXT: [[VAL_I3:%.*]] = getelementptr float, ptr [[PTR0_I3]], i32 [[I0_I3]] ; CHECK-NEXT: store ptr [[VAL_I0]], ptr [[DEST]], align 32 @@ -362,11 +370,12 @@ define void @f8(ptr %dest, <4 x ptr> %ptr0, <4 x i32> %i0, ; Test the handling of unaligned loads. define void @f9(ptr %dest, ptr %src) { -; CHECK-LABEL: @f9( -; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr float, ptr [[DEST:%.*]], i32 1 +; CHECK-LABEL: define void @f9( +; CHECK-SAME: ptr [[DEST:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr float, ptr [[DEST]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr float, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr float, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[SRC]], align 4 ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr float, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load float, ptr [[SRC_I1]], align 4 ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr float, ptr [[SRC]], i32 2 @@ -386,11 +395,12 @@ define void @f9(ptr %dest, ptr %src) { ; ...and again with subelement alignment. 
define void @f10(ptr %dest, ptr %src) { -; CHECK-LABEL: @f10( -; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr float, ptr [[DEST:%.*]], i32 1 +; CHECK-LABEL: define void @f10( +; CHECK-SAME: ptr [[DEST:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr float, ptr [[DEST]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr float, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr float, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[SRC:%.*]], align 1 +; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[SRC]], align 1 ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr float, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load float, ptr [[SRC_I1]], align 1 ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr float, ptr [[SRC]], i32 2 @@ -410,8 +420,9 @@ define void @f10(ptr %dest, ptr %src) { ; Test that sub-byte loads aren't scalarized. define void @f11(ptr %dest, ptr %src0) { -; CHECK-LABEL: @f11( -; CHECK-NEXT: [[SRC1:%.*]] = getelementptr <32 x i1>, ptr [[SRC0:%.*]], i32 1 +; CHECK-LABEL: define void @f11( +; CHECK-SAME: ptr [[DEST:%.*]], ptr [[SRC0:%.*]]) { +; CHECK-NEXT: [[SRC1:%.*]] = getelementptr <32 x i1>, ptr [[SRC0]], i32 1 ; CHECK-NEXT: [[VAL0:%.*]] = load <32 x i1>, ptr [[SRC0]], align 4 ; CHECK-NEXT: [[VAL0_I0:%.*]] = extractelement <32 x i1> [[VAL0]], i64 0 ; CHECK-NEXT: [[VAL0_I1:%.*]] = extractelement <32 x i1> [[VAL0]], i64 1 @@ -542,7 +553,7 @@ define void @f11(ptr %dest, ptr %src0) { ; CHECK-NEXT: [[AND_UPTO29:%.*]] = insertelement <32 x i1> [[AND_UPTO28]], i1 [[AND_I29]], i64 29 ; CHECK-NEXT: [[AND_UPTO30:%.*]] = insertelement <32 x i1> [[AND_UPTO29]], i1 [[AND_I30]], i64 30 ; CHECK-NEXT: [[AND:%.*]] = insertelement <32 x i1> [[AND_UPTO30]], i1 [[AND_I31]], i64 31 -; CHECK-NEXT: store <32 x i1> [[AND]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: store <32 x i1> [[AND]], ptr [[DEST]], align 4 ; CHECK-NEXT: ret void ; %src1 = getelementptr <32 x i1>, ptr %src0, i32 1 @@ -555,12 +566,13 @@ define void 
@f11(ptr %dest, ptr %src0) { ; Test vector GEPs with more than one index. define void @f13(ptr %dest, <4 x ptr> %ptr, <4 x i32> %i, -; CHECK-LABEL: @f13( -; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST:%.*]], i32 1 +; CHECK-LABEL: define void @f13( +; CHECK-SAME: ptr [[DEST:%.*]], <4 x ptr> [[PTR:%.*]], <4 x i32> [[I:%.*]], ptr [[OTHER:%.*]]) { +; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr ptr, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr ptr, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[PTR_I0:%.*]] = extractelement <4 x ptr> [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[I_I0:%.*]] = extractelement <4 x i32> [[I:%.*]], i64 0 +; CHECK-NEXT: [[PTR_I0:%.*]] = extractelement <4 x ptr> [[PTR]], i64 0 +; CHECK-NEXT: [[I_I0:%.*]] = extractelement <4 x i32> [[I]], i64 0 ; CHECK-NEXT: [[VAL_I0:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I0]], i32 0, i32 [[I_I0]] ; CHECK-NEXT: [[PTR_I1:%.*]] = extractelement <4 x ptr> [[PTR]], i64 1 ; CHECK-NEXT: [[I_I1:%.*]] = extractelement <4 x i32> [[I]], i64 1 @@ -587,19 +599,20 @@ define void @f13(ptr %dest, <4 x ptr> %ptr, <4 x i32> %i, ; Test combinations of vector and non-vector PHIs. 
define <4 x float> @f14(<4 x float> %acc, i32 %count) { -; CHECK-LABEL: @f14( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ACC_I0:%.*]] = extractelement <4 x float> [[ACC:%.*]], i64 0 +; CHECK-LABEL: define <4 x float> @f14( +; CHECK-SAME: <4 x float> [[ACC:%.*]], i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ACC_I0:%.*]] = extractelement <4 x float> [[ACC]], i64 0 ; CHECK-NEXT: [[ACC_I1:%.*]] = extractelement <4 x float> [[ACC]], i64 1 ; CHECK-NEXT: [[ACC_I2:%.*]] = extractelement <4 x float> [[ACC]], i64 2 ; CHECK-NEXT: [[ACC_I3:%.*]] = extractelement <4 x float> [[ACC]], i64 3 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[THIS_ACC_I0:%.*]] = phi float [ [[ACC_I0]], [[ENTRY:%.*]] ], [ [[NEXT_ACC_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_ACC_I1:%.*]] = phi float [ [[ACC_I1]], [[ENTRY]] ], [ [[NEXT_ACC_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_ACC_I2:%.*]] = phi float [ [[ACC_I2]], [[ENTRY]] ], [ [[NEXT_ACC_I2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_ACC_I3:%.*]] = phi float [ [[ACC_I3]], [[ENTRY]] ], [ [[NEXT_ACC_I3:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_COUNT:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[NEXT_COUNT:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[THIS_ACC_I0:%.*]] = phi float [ [[ACC_I0]], %[[ENTRY]] ], [ [[NEXT_ACC_I0:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_ACC_I1:%.*]] = phi float [ [[ACC_I1]], %[[ENTRY]] ], [ [[NEXT_ACC_I1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_ACC_I2:%.*]] = phi float [ [[ACC_I2]], %[[ENTRY]] ], [ [[NEXT_ACC_I2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_ACC_I3:%.*]] = phi float [ [[ACC_I3]], %[[ENTRY]] ], [ [[NEXT_ACC_I3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_COUNT:%.*]] = phi i32 [ [[COUNT]], %[[ENTRY]] ], [ [[NEXT_COUNT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[THIS_ACC_UPTO0:%.*]] = insertelement <4 x float> poison, float [[THIS_ACC_I0]], i64 0 ; CHECK-NEXT: [[THIS_ACC_UPTO1:%.*]] = insertelement <4 x float> [[THIS_ACC_UPTO0]], float 
[[THIS_ACC_I1]], i64 1 ; CHECK-NEXT: [[THIS_ACC_UPTO2:%.*]] = insertelement <4 x float> [[THIS_ACC_UPTO1]], float [[THIS_ACC_I2]], i64 2 @@ -619,8 +632,8 @@ define <4 x float> @f14(<4 x float> %acc, i32 %count) { ; CHECK-NEXT: [[NEXT_ACC:%.*]] = insertelement <4 x float> [[NEXT_ACC_UPTO2]], float [[NEXT_ACC_I3]], i64 3 ; CHECK-NEXT: [[NEXT_COUNT]] = sub i32 [[THIS_COUNT]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NEXT_COUNT]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret <4 x float> [[NEXT_ACC]] ; entry: @@ -641,13 +654,14 @@ exit: ; Test unary operator scalarization. define void @f15(<4 x float> %init, ptr %base, i32 %count) { -; CHECK-LABEL: @f15( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-LABEL: define void @f15( +; CHECK-SAME: <4 x float> [[INIT:%.*]], ptr [[BASE:%.*]], i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT]], %[[ENTRY]] ], [ [[NEXTI:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NEXTI]] = sub i32 [[I]], 1 -; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[BASE:%.*]], i32 [[I]] +; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[BASE]], i32 [[I]] ; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[PTR]], align 16 ; CHECK-NEXT: [[PTR_I1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load float, ptr [[PTR_I1]], align 4 @@ -681,8 +695,8 @@ define void @f15(<4 x float> %init, ptr %base, i32 %count) { ; CHECK-NEXT: store float [[SEL_I2]], ptr [[PTR_I2]], align 8 ; CHECK-NEXT: store float [[SEL_I3]], ptr [[PTR_I3]], align 4 ; CHECK-NEXT: [[TEST:%.*]] = icmp eq i32 [[NEXTI]], 0 -; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label 
[[EXIT:%.*]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[TEST]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -712,9 +726,10 @@ exit: ; Check that IR flags are preserved. define <2 x i32> @f16(<2 x i32> %i, <2 x i32> %j) { -; CHECK-LABEL: @f16( -; CHECK-NEXT: [[I_I0:%.*]] = extractelement <2 x i32> [[I:%.*]], i64 0 -; CHECK-NEXT: [[J_I0:%.*]] = extractelement <2 x i32> [[J:%.*]], i64 0 +; CHECK-LABEL: define <2 x i32> @f16( +; CHECK-SAME: <2 x i32> [[I:%.*]], <2 x i32> [[J:%.*]]) { +; CHECK-NEXT: [[I_I0:%.*]] = extractelement <2 x i32> [[I]], i64 0 +; CHECK-NEXT: [[J_I0:%.*]] = extractelement <2 x i32> [[J]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = add nuw nsw i32 [[I_I0]], [[J_I0]] ; CHECK-NEXT: [[I_I1:%.*]] = extractelement <2 x i32> [[I]], i64 1 ; CHECK-NEXT: [[J_I1:%.*]] = extractelement <2 x i32> [[J]], i64 1 @@ -727,9 +742,10 @@ define <2 x i32> @f16(<2 x i32> %i, <2 x i32> %j) { ret <2 x i32> %res } define <2 x i32> @f17(<2 x i32> %i, <2 x i32> %j) { -; CHECK-LABEL: @f17( -; CHECK-NEXT: [[I_I0:%.*]] = extractelement <2 x i32> [[I:%.*]], i64 0 -; CHECK-NEXT: [[J_I0:%.*]] = extractelement <2 x i32> [[J:%.*]], i64 0 +; CHECK-LABEL: define <2 x i32> @f17( +; CHECK-SAME: <2 x i32> [[I:%.*]], <2 x i32> [[J:%.*]]) { +; CHECK-NEXT: [[I_I0:%.*]] = extractelement <2 x i32> [[I]], i64 0 +; CHECK-NEXT: [[J_I0:%.*]] = extractelement <2 x i32> [[J]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = sdiv exact i32 [[I_I0]], [[J_I0]] ; CHECK-NEXT: [[I_I1:%.*]] = extractelement <2 x i32> [[I]], i64 1 ; CHECK-NEXT: [[J_I1:%.*]] = extractelement <2 x i32> [[J]], i64 1 @@ -742,9 +758,10 @@ define <2 x i32> @f17(<2 x i32> %i, <2 x i32> %j) { ret <2 x i32> %res } define <2 x float> @f18(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @f18( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 -; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y:%.*]], i64 0 +; CHECK-LABEL: define <2 x float> @f18( +; CHECK-SAME: <2 x float> 
[[X:%.*]], <2 x float> [[Y:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 +; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = fadd fast float [[X_I0]], [[Y_I0]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[Y_I1:%.*]] = extractelement <2 x float> [[Y]], i64 1 @@ -757,8 +774,9 @@ define <2 x float> @f18(<2 x float> %x, <2 x float> %y) { ret <2 x float> %res } define <2 x float> @f19(<2 x float> %x) { -; CHECK-LABEL: @f19( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-LABEL: define <2 x float> @f19( +; CHECK-SAME: <2 x float> [[X:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = fneg fast float [[X_I0]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[RES_I1:%.*]] = fneg fast float [[X_I1]] @@ -770,9 +788,10 @@ define <2 x float> @f19(<2 x float> %x) { ret <2 x float> %res } define <2 x i1> @f20(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @f20( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 -; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y:%.*]], i64 0 +; CHECK-LABEL: define <2 x i1> @f20( +; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 +; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = fcmp fast ogt float [[X_I0]], [[Y_I0]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[Y_I1:%.*]] = extractelement <2 x float> [[Y]], i64 1 @@ -786,8 +805,9 @@ define <2 x i1> @f20(<2 x float> %x, <2 x float> %y) { } declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) define <2 x float> @f21(<2 x float> %x) { -; CHECK-LABEL: @f21( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-LABEL: define 
<2 x float> @f21( +; CHECK-SAME: <2 x float> [[X:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = call fast float @llvm.sqrt.f32(float [[X_I0]]) ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[RES_I1:%.*]] = call fast float @llvm.sqrt.f32(float [[X_I1]]) @@ -800,10 +820,11 @@ define <2 x float> @f21(<2 x float> %x) { } declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) define <2 x float> @f22(<2 x float> %x, <2 x float> %y, <2 x float> %z) { -; CHECK-LABEL: @f22( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 -; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y:%.*]], i64 0 -; CHECK-NEXT: [[Z_I0:%.*]] = extractelement <2 x float> [[Z:%.*]], i64 0 +; CHECK-LABEL: define <2 x float> @f22( +; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[Z:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 +; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; CHECK-NEXT: [[Z_I0:%.*]] = extractelement <2 x float> [[Z]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = call fast float @llvm.fma.f32(float [[X_I0]], float [[Y_I0]], float [[Z_I0]]) ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[Y_I1:%.*]] = extractelement <2 x float> [[Y]], i64 1 @@ -819,10 +840,11 @@ define <2 x float> @f22(<2 x float> %x, <2 x float> %y, <2 x float> %z) { ; See https://reviews.llvm.org/D83101#2133062 define <2 x i32> @f23_crash(<2 x i32> %srcvec, i32 %v1) { -; CHECK-LABEL: @f23_crash( -; CHECK-NEXT: [[SRCVEC_I0:%.*]] = extractelement <2 x i32> [[SRCVEC:%.*]], i64 0 +; CHECK-LABEL: define <2 x i32> @f23_crash( +; CHECK-SAME: <2 x i32> [[SRCVEC:%.*]], i32 [[V1:%.*]]) { +; CHECK-NEXT: [[SRCVEC_I0:%.*]] = extractelement <2 x i32> [[SRCVEC]], i64 0 ; CHECK-NEXT: [[T1_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[SRCVEC_I0]], i64 0 -; CHECK-NEXT: 
[[T1:%.*]] = insertelement <2 x i32> [[T1_UPTO0]], i32 [[V1:%.*]], i64 1 +; CHECK-NEXT: [[T1:%.*]] = insertelement <2 x i32> [[T1_UPTO0]], i32 [[V1]], i64 1 ; CHECK-NEXT: ret <2 x i32> [[T1]] ; %v0 = extractelement <2 x i32> %srcvec, i32 0 @@ -838,3 +860,15 @@ define <2 x i32> @f23_crash(<2 x i32> %srcvec, i32 %v1) { !4 = !{ float 4.0 } !5 = !{ i64 0, i64 8, null } !13 = distinct !{} +;. +; CHECK: [[SET1_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"set1", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"root"} +; CHECK: [[SET2_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"set2", [[META2]]} +; CHECK: [[TBAA_STRUCT5]] = !{i64 0, i64 8, null} +; CHECK: [[ACC_GRP6]] = distinct !{} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP6]]} +; CHECK: [[META9]] = !{float 4.000000e+00} +;. diff --git a/llvm/test/Transforms/Scalarizer/basic.ll b/llvm/test/Transforms/Scalarizer/basic.ll index 190e8a089a5f6..82337c927a9ed 100644 --- a/llvm/test/Transforms/Scalarizer/basic.ll +++ b/llvm/test/Transforms/Scalarizer/basic.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt %s -passes='function(scalarizer,dce)' -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -6,21 +6,22 @@ declare <4 x float> @ext(<4 x float>) @g = global <4 x float> zeroinitializer define void @f1(<4 x float> %init, ptr %base, i32 %count) { -; CHECK-LABEL: @f1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INIT_I0:%.*]] = extractelement <4 x float> [[INIT:%.*]], i64 0 +; CHECK-LABEL: define void @f1( +; CHECK-SAME: <4 x float> [[INIT:%.*]], ptr [[BASE:%.*]], i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: 
+; CHECK-NEXT: [[INIT_I0:%.*]] = extractelement <4 x float> [[INIT]], i64 0 ; CHECK-NEXT: [[INIT_I1:%.*]] = extractelement <4 x float> [[INIT]], i64 1 ; CHECK-NEXT: [[INIT_I2:%.*]] = extractelement <4 x float> [[INIT]], i64 2 ; CHECK-NEXT: [[INIT_I3:%.*]] = extractelement <4 x float> [[INIT]], i64 3 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I0:%.*]] = phi float [ [[INIT_I0]], [[ENTRY]] ], [ [[SEL_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I1:%.*]] = phi float [ [[INIT_I1]], [[ENTRY]] ], [ [[SEL_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I2:%.*]] = phi float [ [[INIT_I2]], [[ENTRY]] ], [ [[SEL_I2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I3:%.*]] = phi float [ [[INIT_I3]], [[ENTRY]] ], [ [[SEL_I3:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT]], %[[ENTRY]] ], [ [[NEXTI:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I0:%.*]] = phi float [ [[INIT_I0]], %[[ENTRY]] ], [ [[SEL_I0:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I1:%.*]] = phi float [ [[INIT_I1]], %[[ENTRY]] ], [ [[SEL_I1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I2:%.*]] = phi float [ [[INIT_I2]], %[[ENTRY]] ], [ [[SEL_I2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I3:%.*]] = phi float [ [[INIT_I3]], %[[ENTRY]] ], [ [[SEL_I3:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NEXTI]] = sub i32 [[I]], 1 -; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[BASE:%.*]], i32 [[I]] +; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[BASE]], i32 [[I]] ; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[PTR]], align 16 ; CHECK-NEXT: [[PTR_I1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load float, ptr [[PTR_I1]], align 4 @@ -54,8 +55,8 @@ define void @f1(<4 x float> %init, ptr %base, i32 %count) { ; CHECK-NEXT: store float [[SEL_I2]], ptr [[PTR_I2]], align 8 ; CHECK-NEXT: store float [[SEL_I3]], ptr 
[[PTR_I3]], align 4 ; CHECK-NEXT: [[TEST:%.*]] = icmp eq i32 [[NEXTI]], 0 -; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[TEST]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -92,21 +93,22 @@ exit: } define void @f2(<4 x i32> %init, ptr %base, i32 %count) { -; CHECK-LABEL: @f2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INIT_I0:%.*]] = extractelement <4 x i32> [[INIT:%.*]], i64 0 +; CHECK-LABEL: define void @f2( +; CHECK-SAME: <4 x i32> [[INIT:%.*]], ptr [[BASE:%.*]], i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[INIT_I0:%.*]] = extractelement <4 x i32> [[INIT]], i64 0 ; CHECK-NEXT: [[INIT_I1:%.*]] = extractelement <4 x i32> [[INIT]], i64 1 ; CHECK-NEXT: [[INIT_I2:%.*]] = extractelement <4 x i32> [[INIT]], i64 2 ; CHECK-NEXT: [[INIT_I3:%.*]] = extractelement <4 x i32> [[INIT]], i64 3 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I0:%.*]] = phi i32 [ [[INIT_I0]], [[ENTRY]] ], [ [[SEL_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I1:%.*]] = phi i32 [ [[INIT_I1]], [[ENTRY]] ], [ [[SEL_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I2:%.*]] = phi i32 [ [[INIT_I2]], [[ENTRY]] ], [ [[SEL_I2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACC_I3:%.*]] = phi i32 [ [[INIT_I3]], [[ENTRY]] ], [ [[SEL_I3:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT]], %[[ENTRY]] ], [ [[NEXTI:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I0:%.*]] = phi i32 [ [[INIT_I0]], %[[ENTRY]] ], [ [[SEL_I0:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I1:%.*]] = phi i32 [ [[INIT_I1]], %[[ENTRY]] ], [ [[SEL_I1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I2:%.*]] = phi i32 [ [[INIT_I2]], %[[ENTRY]] ], [ [[SEL_I2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_I3:%.*]] = phi i32 [ [[INIT_I3]], %[[ENTRY]] ], [ [[SEL_I3:%.*]], %[[LOOP]] ] ; 
CHECK-NEXT: [[NEXTI]] = sub i32 [[I]], 1 -; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x i8>, ptr [[BASE:%.*]], i32 [[I]] +; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x i8>, ptr [[BASE]], i32 [[I]] ; CHECK-NEXT: [[VAL_I0:%.*]] = load i8, ptr [[PTR]], align 4 ; CHECK-NEXT: [[PTR_I1:%.*]] = getelementptr i8, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load i8, ptr [[PTR_I1]], align 1 @@ -139,8 +141,8 @@ define void @f2(<4 x i32> %init, ptr %base, i32 %count) { ; CHECK-NEXT: store i8 [[TRUNC_I2]], ptr [[PTR_I2]], align 2 ; CHECK-NEXT: store i8 [[TRUNC_I3]], ptr [[PTR_I3]], align 1 ; CHECK-NEXT: [[TEST:%.*]] = icmp eq i32 [[NEXTI]], 0 -; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[TEST]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -172,25 +174,26 @@ exit: ; Check that !tbaa information is preserved. define void @f3(ptr %src, ptr %dst) { -; CHECK-LABEL: @f3( -; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 1 +; CHECK-LABEL: define void @f3( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST]], i32 1 ; CHECK-NEXT: [[DST_I2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 ; CHECK-NEXT: [[DST_I3:%.*]] = getelementptr i32, ptr [[DST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC:%.*]], align 16, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC]], align 16, !tbaa [[SET1_TBAA0:![0-9]+]] ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr i32, ptr [[SRC]], i32 1 -; CHECK-NEXT: [[VAL_I1:%.*]] = load i32, ptr [[SRC_I1]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[VAL_I1:%.*]] = load i32, ptr [[SRC_I1]], align 4, !tbaa [[SET1_TBAA0]] ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr i32, ptr [[SRC]], i32 2 -; CHECK-NEXT: [[VAL_I2:%.*]] = load i32, ptr [[SRC_I2]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: [[VAL_I2:%.*]] = load i32, ptr [[SRC_I2]], align 8, !tbaa 
[[SET1_TBAA0]] ; CHECK-NEXT: [[SRC_I3:%.*]] = getelementptr i32, ptr [[SRC]], i32 3 -; CHECK-NEXT: [[VAL_I3:%.*]] = load i32, ptr [[SRC_I3]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[VAL_I3:%.*]] = load i32, ptr [[SRC_I3]], align 4, !tbaa [[SET1_TBAA0]] ; CHECK-NEXT: [[ADD_I0:%.*]] = add i32 [[VAL_I0]], [[VAL_I0]] ; CHECK-NEXT: [[ADD_I1:%.*]] = add i32 [[VAL_I1]], [[VAL_I1]] ; CHECK-NEXT: [[ADD_I2:%.*]] = add i32 [[VAL_I2]], [[VAL_I2]] ; CHECK-NEXT: [[ADD_I3:%.*]] = add i32 [[VAL_I3]], [[VAL_I3]] -; CHECK-NEXT: store i32 [[ADD_I0]], ptr [[DST]], align 16, !tbaa [[TBAA3:![0-9]+]] -; CHECK-NEXT: store i32 [[ADD_I1]], ptr [[DST_I1]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 [[ADD_I2]], ptr [[DST_I2]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: store i32 [[ADD_I3]], ptr [[DST_I3]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: store i32 [[ADD_I0]], ptr [[DST]], align 16, !tbaa [[SET2_TBAA3:![0-9]+]] +; CHECK-NEXT: store i32 [[ADD_I1]], ptr [[DST_I1]], align 4, !tbaa [[SET2_TBAA3]] +; CHECK-NEXT: store i32 [[ADD_I2]], ptr [[DST_I2]], align 8, !tbaa [[SET2_TBAA3]] +; CHECK-NEXT: store i32 [[ADD_I3]], ptr [[DST_I3]], align 4, !tbaa [[SET2_TBAA3]] ; CHECK-NEXT: ret void ; %val = load <4 x i32> , ptr %src, !tbaa !1 @@ -201,11 +204,12 @@ define void @f3(ptr %src, ptr %dst) { ; Check that !tbaa.struct information is preserved. 
define void @f4(ptr %src, ptr %dst) { -; CHECK-LABEL: @f4( -; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 1 +; CHECK-LABEL: define void @f4( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST]], i32 1 ; CHECK-NEXT: [[DST_I2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 ; CHECK-NEXT: [[DST_I3:%.*]] = getelementptr i32, ptr [[DST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC:%.*]], align 16, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]] +; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC]], align 16, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]] ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr i32, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load i32, ptr [[SRC_I1]], align 4, !tbaa.struct [[TBAA_STRUCT5]] ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr i32, ptr [[SRC]], i32 2 @@ -230,16 +234,17 @@ define void @f4(ptr %src, ptr %dst) { ; Check that llvm.access.group information is preserved. define void @f5(i32 %count, ptr %src, ptr %dst) { -; CHECK-LABEL: @f5( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT_INDEX:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_SRC:%.*]] = getelementptr <4 x i32>, ptr [[SRC:%.*]], i32 [[INDEX]] +; CHECK-LABEL: define void @f5( +; CHECK-SAME: i32 [[COUNT:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT_INDEX:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_SRC:%.*]] = getelementptr <4 x i32>, ptr [[SRC]], i32 [[INDEX]] ; CHECK-NEXT: [[THIS_SRC_I1:%.*]] = getelementptr i32, ptr [[THIS_SRC]], i32 1 ; CHECK-NEXT: [[THIS_SRC_I2:%.*]] = getelementptr i32, ptr [[THIS_SRC]], i32 2 ; CHECK-NEXT: [[THIS_SRC_I3:%.*]] = getelementptr i32, ptr [[THIS_SRC]], i32 3 -; CHECK-NEXT: [[THIS_DST:%.*]] = getelementptr <4 x i32>, ptr [[DST:%.*]], i32 
[[INDEX]] +; CHECK-NEXT: [[THIS_DST:%.*]] = getelementptr <4 x i32>, ptr [[DST]], i32 [[INDEX]] ; CHECK-NEXT: [[THIS_DST_I1:%.*]] = getelementptr i32, ptr [[THIS_DST]], i32 1 ; CHECK-NEXT: [[THIS_DST_I2:%.*]] = getelementptr i32, ptr [[THIS_DST]], i32 2 ; CHECK-NEXT: [[THIS_DST_I3:%.*]] = getelementptr i32, ptr [[THIS_DST]], i32 3 @@ -256,9 +261,9 @@ define void @f5(i32 %count, ptr %src, ptr %dst) { ; CHECK-NEXT: store i32 [[ADD_I2]], ptr [[THIS_DST_I2]], align 8, !llvm.access.group [[ACC_GRP6]] ; CHECK-NEXT: store i32 [[ADD_I3]], ptr [[THIS_DST_I3]], align 4, !llvm.access.group [[ACC_GRP6]] ; CHECK-NEXT: [[NEXT_INDEX]] = add i32 [[INDEX]], -1 -; CHECK-NEXT: [[CONTINUE:%.*]] = icmp ne i32 [[NEXT_INDEX]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[CONTINUE]], label [[LOOP]], label [[END:%.*]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: end: +; CHECK-NEXT: [[CONTINUE:%.*]] = icmp ne i32 [[NEXT_INDEX]], [[COUNT]] +; CHECK-NEXT: br i1 [[CONTINUE]], label %[[LOOP]], label %[[END:.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; entry: @@ -281,8 +286,9 @@ end: ; Check that fpmath information is preserved. define <4 x float> @f6(<4 x float> %x) { -; CHECK-LABEL: @f6( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <4 x float> [[X:%.*]], i64 0 +; CHECK-LABEL: define <4 x float> @f6( +; CHECK-SAME: <4 x float> [[X:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <4 x float> [[X]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = fadd float [[X_I0]], 1.000000e+00, !fpmath [[META9:![0-9]+]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <4 x float> [[X]], i64 1 ; CHECK-NEXT: [[RES_I1:%.*]] = fadd float [[X_I1]], 2.000000e+00, !fpmath [[META9]] @@ -303,11 +309,12 @@ define <4 x float> @f6(<4 x float> %x) { ; Check that random metadata isn't kept. 
define void @f7(ptr %src, ptr %dst) { -; CHECK-LABEL: @f7( -; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 1 +; CHECK-LABEL: define void @f7( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[DST_I1:%.*]] = getelementptr i32, ptr [[DST]], i32 1 ; CHECK-NEXT: [[DST_I2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 ; CHECK-NEXT: [[DST_I3:%.*]] = getelementptr i32, ptr [[DST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC:%.*]], align 16 +; CHECK-NEXT: [[VAL_I0:%.*]] = load i32, ptr [[SRC]], align 16 ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr i32, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load i32, ptr [[SRC_I1]], align 4 ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr i32, ptr [[SRC]], i32 2 @@ -332,17 +339,18 @@ define void @f7(ptr %src, ptr %dst) { ; Test GEP with vectors. define void @f8(ptr %dest, <4 x ptr> %ptr0, <4 x i32> %i0, -; CHECK-LABEL: @f8( -; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST:%.*]], i32 1 +; CHECK-LABEL: define void @f8( +; CHECK-SAME: ptr [[DEST:%.*]], <4 x ptr> [[PTR0:%.*]], <4 x i32> [[I0:%.*]], ptr [[OTHER:%.*]]) { +; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr ptr, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr ptr, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[PTR0_I0:%.*]] = extractelement <4 x ptr> [[PTR0:%.*]], i64 0 +; CHECK-NEXT: [[PTR0_I0:%.*]] = extractelement <4 x ptr> [[PTR0]], i64 0 ; CHECK-NEXT: [[PTR0_I2:%.*]] = extractelement <4 x ptr> [[PTR0]], i64 2 ; CHECK-NEXT: [[PTR0_I3:%.*]] = extractelement <4 x ptr> [[PTR0]], i64 3 -; CHECK-NEXT: [[I0_I1:%.*]] = extractelement <4 x i32> [[I0:%.*]], i64 1 +; CHECK-NEXT: [[I0_I1:%.*]] = extractelement <4 x i32> [[I0]], i64 1 ; CHECK-NEXT: [[I0_I3:%.*]] = extractelement <4 x i32> [[I0]], i64 3 ; CHECK-NEXT: [[VAL_I0:%.*]] = getelementptr float, ptr [[PTR0_I0]], i32 100 -; CHECK-NEXT: [[VAL_I1:%.*]] = getelementptr float, ptr 
[[OTHER:%.*]], i32 [[I0_I1]] +; CHECK-NEXT: [[VAL_I1:%.*]] = getelementptr float, ptr [[OTHER]], i32 [[I0_I1]] ; CHECK-NEXT: [[VAL_I2:%.*]] = getelementptr float, ptr [[PTR0_I2]], i32 100 ; CHECK-NEXT: [[VAL_I3:%.*]] = getelementptr float, ptr [[PTR0_I3]], i32 [[I0_I3]] ; CHECK-NEXT: store ptr [[VAL_I0]], ptr [[DEST]], align 32 @@ -362,11 +370,12 @@ define void @f8(ptr %dest, <4 x ptr> %ptr0, <4 x i32> %i0, ; Test the handling of unaligned loads. define void @f9(ptr %dest, ptr %src) { -; CHECK-LABEL: @f9( -; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr float, ptr [[DEST:%.*]], i32 1 +; CHECK-LABEL: define void @f9( +; CHECK-SAME: ptr [[DEST:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr float, ptr [[DEST]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr float, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr float, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[SRC]], align 4 ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr float, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load float, ptr [[SRC_I1]], align 4 ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr float, ptr [[SRC]], i32 2 @@ -386,11 +395,12 @@ define void @f9(ptr %dest, ptr %src) { ; ...and again with subelement alignment. 
define void @f10(ptr %dest, ptr %src) { -; CHECK-LABEL: @f10( -; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr float, ptr [[DEST:%.*]], i32 1 +; CHECK-LABEL: define void @f10( +; CHECK-SAME: ptr [[DEST:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr float, ptr [[DEST]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr float, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr float, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[SRC:%.*]], align 1 +; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[SRC]], align 1 ; CHECK-NEXT: [[SRC_I1:%.*]] = getelementptr float, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load float, ptr [[SRC_I1]], align 1 ; CHECK-NEXT: [[SRC_I2:%.*]] = getelementptr float, ptr [[SRC]], i32 2 @@ -410,8 +420,9 @@ define void @f10(ptr %dest, ptr %src) { ; Test that sub-byte loads aren't scalarized. define void @f11(ptr %dest, ptr %src0) { -; CHECK-LABEL: @f11( -; CHECK-NEXT: [[SRC1:%.*]] = getelementptr <32 x i1>, ptr [[SRC0:%.*]], i32 1 +; CHECK-LABEL: define void @f11( +; CHECK-SAME: ptr [[DEST:%.*]], ptr [[SRC0:%.*]]) { +; CHECK-NEXT: [[SRC1:%.*]] = getelementptr <32 x i1>, ptr [[SRC0]], i32 1 ; CHECK-NEXT: [[VAL0:%.*]] = load <32 x i1>, ptr [[SRC0]], align 4 ; CHECK-NEXT: [[VAL0_I0:%.*]] = extractelement <32 x i1> [[VAL0]], i64 0 ; CHECK-NEXT: [[VAL0_I1:%.*]] = extractelement <32 x i1> [[VAL0]], i64 1 @@ -542,7 +553,7 @@ define void @f11(ptr %dest, ptr %src0) { ; CHECK-NEXT: [[AND_UPTO29:%.*]] = insertelement <32 x i1> [[AND_UPTO28]], i1 [[AND_I29]], i64 29 ; CHECK-NEXT: [[AND_UPTO30:%.*]] = insertelement <32 x i1> [[AND_UPTO29]], i1 [[AND_I30]], i64 30 ; CHECK-NEXT: [[AND:%.*]] = insertelement <32 x i1> [[AND_UPTO30]], i1 [[AND_I31]], i64 31 -; CHECK-NEXT: store <32 x i1> [[AND]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: store <32 x i1> [[AND]], ptr [[DEST]], align 4 ; CHECK-NEXT: ret void ; %src1 = getelementptr <32 x i1>, ptr %src0, i32 1 @@ -555,12 +566,13 @@ define void 
@f11(ptr %dest, ptr %src0) { ; Test vector GEPs with more than one index. define void @f13(ptr %dest, <4 x ptr> %ptr, <4 x i32> %i, -; CHECK-LABEL: @f13( -; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST:%.*]], i32 1 +; CHECK-LABEL: define void @f13( +; CHECK-SAME: ptr [[DEST:%.*]], <4 x ptr> [[PTR:%.*]], <4 x i32> [[I:%.*]], ptr [[OTHER:%.*]]) { +; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr ptr, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr ptr, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[PTR_I0:%.*]] = extractelement <4 x ptr> [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[I_I0:%.*]] = extractelement <4 x i32> [[I:%.*]], i64 0 +; CHECK-NEXT: [[PTR_I0:%.*]] = extractelement <4 x ptr> [[PTR]], i64 0 +; CHECK-NEXT: [[I_I0:%.*]] = extractelement <4 x i32> [[I]], i64 0 ; CHECK-NEXT: [[VAL_I0:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I0]], i32 0, i32 [[I_I0]] ; CHECK-NEXT: [[PTR_I1:%.*]] = extractelement <4 x ptr> [[PTR]], i64 1 ; CHECK-NEXT: [[I_I1:%.*]] = extractelement <4 x i32> [[I]], i64 1 @@ -587,19 +599,20 @@ define void @f13(ptr %dest, <4 x ptr> %ptr, <4 x i32> %i, ; Test combinations of vector and non-vector PHIs. 
define <4 x float> @f14(<4 x float> %acc, i32 %count) { -; CHECK-LABEL: @f14( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ACC_I0:%.*]] = extractelement <4 x float> [[ACC:%.*]], i64 0 +; CHECK-LABEL: define <4 x float> @f14( +; CHECK-SAME: <4 x float> [[ACC:%.*]], i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ACC_I0:%.*]] = extractelement <4 x float> [[ACC]], i64 0 ; CHECK-NEXT: [[ACC_I1:%.*]] = extractelement <4 x float> [[ACC]], i64 1 ; CHECK-NEXT: [[ACC_I2:%.*]] = extractelement <4 x float> [[ACC]], i64 2 ; CHECK-NEXT: [[ACC_I3:%.*]] = extractelement <4 x float> [[ACC]], i64 3 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[THIS_ACC_I0:%.*]] = phi float [ [[ACC_I0]], [[ENTRY:%.*]] ], [ [[NEXT_ACC_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_ACC_I1:%.*]] = phi float [ [[ACC_I1]], [[ENTRY]] ], [ [[NEXT_ACC_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_ACC_I2:%.*]] = phi float [ [[ACC_I2]], [[ENTRY]] ], [ [[NEXT_ACC_I2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_ACC_I3:%.*]] = phi float [ [[ACC_I3]], [[ENTRY]] ], [ [[NEXT_ACC_I3:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[THIS_COUNT:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[NEXT_COUNT:%.*]], [[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[THIS_ACC_I0:%.*]] = phi float [ [[ACC_I0]], %[[ENTRY]] ], [ [[NEXT_ACC_I0:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_ACC_I1:%.*]] = phi float [ [[ACC_I1]], %[[ENTRY]] ], [ [[NEXT_ACC_I1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_ACC_I2:%.*]] = phi float [ [[ACC_I2]], %[[ENTRY]] ], [ [[NEXT_ACC_I2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_ACC_I3:%.*]] = phi float [ [[ACC_I3]], %[[ENTRY]] ], [ [[NEXT_ACC_I3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[THIS_COUNT:%.*]] = phi i32 [ [[COUNT]], %[[ENTRY]] ], [ [[NEXT_COUNT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[THIS_ACC_UPTO0:%.*]] = insertelement <4 x float> poison, float [[THIS_ACC_I0]], i64 0 ; CHECK-NEXT: [[THIS_ACC_UPTO1:%.*]] = insertelement <4 x float> [[THIS_ACC_UPTO0]], float 
[[THIS_ACC_I1]], i64 1 ; CHECK-NEXT: [[THIS_ACC_UPTO2:%.*]] = insertelement <4 x float> [[THIS_ACC_UPTO1]], float [[THIS_ACC_I2]], i64 2 @@ -619,8 +632,8 @@ define <4 x float> @f14(<4 x float> %acc, i32 %count) { ; CHECK-NEXT: [[NEXT_ACC:%.*]] = insertelement <4 x float> [[NEXT_ACC_UPTO2]], float [[NEXT_ACC_I3]], i64 3 ; CHECK-NEXT: [[NEXT_COUNT]] = sub i32 [[THIS_COUNT]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NEXT_COUNT]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret <4 x float> [[NEXT_ACC]] ; entry: @@ -641,13 +654,14 @@ exit: ; Test unary operator scalarization. define void @f15(<4 x float> %init, ptr %base, i32 %count) { -; CHECK-LABEL: @f15( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-LABEL: define void @f15( +; CHECK-SAME: <4 x float> [[INIT:%.*]], ptr [[BASE:%.*]], i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[COUNT]], %[[ENTRY]] ], [ [[NEXTI:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NEXTI]] = sub i32 [[I]], 1 -; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[BASE:%.*]], i32 [[I]] +; CHECK-NEXT: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[BASE]], i32 [[I]] ; CHECK-NEXT: [[VAL_I0:%.*]] = load float, ptr [[PTR]], align 16 ; CHECK-NEXT: [[PTR_I1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = load float, ptr [[PTR_I1]], align 4 @@ -681,8 +695,8 @@ define void @f15(<4 x float> %init, ptr %base, i32 %count) { ; CHECK-NEXT: store float [[SEL_I2]], ptr [[PTR_I2]], align 8 ; CHECK-NEXT: store float [[SEL_I3]], ptr [[PTR_I3]], align 4 ; CHECK-NEXT: [[TEST:%.*]] = icmp eq i32 [[NEXTI]], 0 -; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label 
[[EXIT:%.*]] -; CHECK: exit: +; CHECK-NEXT: br i1 [[TEST]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: @@ -712,9 +726,10 @@ exit: ; Check that IR flags are preserved. define <2 x i32> @f16(<2 x i32> %i, <2 x i32> %j) { -; CHECK-LABEL: @f16( -; CHECK-NEXT: [[I_I0:%.*]] = extractelement <2 x i32> [[I:%.*]], i64 0 -; CHECK-NEXT: [[J_I0:%.*]] = extractelement <2 x i32> [[J:%.*]], i64 0 +; CHECK-LABEL: define <2 x i32> @f16( +; CHECK-SAME: <2 x i32> [[I:%.*]], <2 x i32> [[J:%.*]]) { +; CHECK-NEXT: [[I_I0:%.*]] = extractelement <2 x i32> [[I]], i64 0 +; CHECK-NEXT: [[J_I0:%.*]] = extractelement <2 x i32> [[J]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = add nuw nsw i32 [[I_I0]], [[J_I0]] ; CHECK-NEXT: [[I_I1:%.*]] = extractelement <2 x i32> [[I]], i64 1 ; CHECK-NEXT: [[J_I1:%.*]] = extractelement <2 x i32> [[J]], i64 1 @@ -727,9 +742,10 @@ define <2 x i32> @f16(<2 x i32> %i, <2 x i32> %j) { ret <2 x i32> %res } define <2 x i32> @f17(<2 x i32> %i, <2 x i32> %j) { -; CHECK-LABEL: @f17( -; CHECK-NEXT: [[I_I0:%.*]] = extractelement <2 x i32> [[I:%.*]], i64 0 -; CHECK-NEXT: [[J_I0:%.*]] = extractelement <2 x i32> [[J:%.*]], i64 0 +; CHECK-LABEL: define <2 x i32> @f17( +; CHECK-SAME: <2 x i32> [[I:%.*]], <2 x i32> [[J:%.*]]) { +; CHECK-NEXT: [[I_I0:%.*]] = extractelement <2 x i32> [[I]], i64 0 +; CHECK-NEXT: [[J_I0:%.*]] = extractelement <2 x i32> [[J]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = sdiv exact i32 [[I_I0]], [[J_I0]] ; CHECK-NEXT: [[I_I1:%.*]] = extractelement <2 x i32> [[I]], i64 1 ; CHECK-NEXT: [[J_I1:%.*]] = extractelement <2 x i32> [[J]], i64 1 @@ -742,9 +758,10 @@ define <2 x i32> @f17(<2 x i32> %i, <2 x i32> %j) { ret <2 x i32> %res } define <2 x float> @f18(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @f18( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 -; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y:%.*]], i64 0 +; CHECK-LABEL: define <2 x float> @f18( +; CHECK-SAME: <2 x float> 
[[X:%.*]], <2 x float> [[Y:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 +; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = fadd fast float [[X_I0]], [[Y_I0]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[Y_I1:%.*]] = extractelement <2 x float> [[Y]], i64 1 @@ -757,8 +774,9 @@ define <2 x float> @f18(<2 x float> %x, <2 x float> %y) { ret <2 x float> %res } define <2 x float> @f19(<2 x float> %x) { -; CHECK-LABEL: @f19( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-LABEL: define <2 x float> @f19( +; CHECK-SAME: <2 x float> [[X:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = fneg fast float [[X_I0]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[RES_I1:%.*]] = fneg fast float [[X_I1]] @@ -770,9 +788,10 @@ define <2 x float> @f19(<2 x float> %x) { ret <2 x float> %res } define <2 x i1> @f20(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @f20( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 -; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y:%.*]], i64 0 +; CHECK-LABEL: define <2 x i1> @f20( +; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 +; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = fcmp fast ogt float [[X_I0]], [[Y_I0]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[Y_I1:%.*]] = extractelement <2 x float> [[Y]], i64 1 @@ -786,8 +805,9 @@ define <2 x i1> @f20(<2 x float> %x, <2 x float> %y) { } declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) define <2 x float> @f21(<2 x float> %x) { -; CHECK-LABEL: @f21( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-LABEL: define 
<2 x float> @f21( +; CHECK-SAME: <2 x float> [[X:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = call fast float @llvm.sqrt.f32(float [[X_I0]]) ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[RES_I1:%.*]] = call fast float @llvm.sqrt.f32(float [[X_I1]]) @@ -800,10 +820,11 @@ define <2 x float> @f21(<2 x float> %x) { } declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) define <2 x float> @f22(<2 x float> %x, <2 x float> %y, <2 x float> %z) { -; CHECK-LABEL: @f22( -; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 -; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y:%.*]], i64 0 -; CHECK-NEXT: [[Z_I0:%.*]] = extractelement <2 x float> [[Z:%.*]], i64 0 +; CHECK-LABEL: define <2 x float> @f22( +; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[Z:%.*]]) { +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X]], i64 0 +; CHECK-NEXT: [[Y_I0:%.*]] = extractelement <2 x float> [[Y]], i64 0 +; CHECK-NEXT: [[Z_I0:%.*]] = extractelement <2 x float> [[Z]], i64 0 ; CHECK-NEXT: [[RES_I0:%.*]] = call fast float @llvm.fma.f32(float [[X_I0]], float [[Y_I0]], float [[Z_I0]]) ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 ; CHECK-NEXT: [[Y_I1:%.*]] = extractelement <2 x float> [[Y]], i64 1 @@ -819,10 +840,11 @@ define <2 x float> @f22(<2 x float> %x, <2 x float> %y, <2 x float> %z) { ; See https://reviews.llvm.org/D83101#2133062 define <2 x i32> @f23_crash(<2 x i32> %srcvec, i32 %v1) { -; CHECK-LABEL: @f23_crash( -; CHECK-NEXT: [[SRCVEC_I0:%.*]] = extractelement <2 x i32> [[SRCVEC:%.*]], i64 0 +; CHECK-LABEL: define <2 x i32> @f23_crash( +; CHECK-SAME: <2 x i32> [[SRCVEC:%.*]], i32 [[V1:%.*]]) { +; CHECK-NEXT: [[SRCVEC_I0:%.*]] = extractelement <2 x i32> [[SRCVEC]], i64 0 ; CHECK-NEXT: [[T1_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[SRCVEC_I0]], i64 0 -; CHECK-NEXT: 
[[T1:%.*]] = insertelement <2 x i32> [[T1_UPTO0]], i32 [[V1:%.*]], i64 1 +; CHECK-NEXT: [[T1:%.*]] = insertelement <2 x i32> [[T1_UPTO0]], i32 [[V1]], i64 1 ; CHECK-NEXT: ret <2 x i32> [[T1]] ; %v0 = extractelement <2 x i32> %srcvec, i32 0 @@ -832,8 +854,9 @@ define <2 x i32> @f23_crash(<2 x i32> %srcvec, i32 %v1) { } define <2 x i32> @f24(<2 x i32> %src) { -; CHECK-LABEL: @f24( -; CHECK-NEXT: [[SRC_I0:%.*]] = extractelement <2 x i32> [[SRC:%.*]], i64 0 +; CHECK-LABEL: define <2 x i32> @f24( +; CHECK-SAME: <2 x i32> [[SRC:%.*]]) { +; CHECK-NEXT: [[SRC_I0:%.*]] = extractelement <2 x i32> [[SRC]], i64 0 ; CHECK-NEXT: [[FRZ_I0:%.*]] = freeze i32 [[SRC_I0]] ; CHECK-NEXT: [[SRC_I1:%.*]] = extractelement <2 x i32> [[SRC]], i64 1 ; CHECK-NEXT: [[FRZ_I1:%.*]] = freeze i32 [[SRC_I1]] @@ -846,8 +869,9 @@ define <2 x i32> @f24(<2 x i32> %src) { } define <2 x float> @f25(<2 x float> %src) { -; CHECK-LABEL: @f25( -; CHECK-NEXT: [[SRC_I0:%.*]] = extractelement <2 x float> [[SRC:%.*]], i64 0 +; CHECK-LABEL: define <2 x float> @f25( +; CHECK-SAME: <2 x float> [[SRC:%.*]]) { +; CHECK-NEXT: [[SRC_I0:%.*]] = extractelement <2 x float> [[SRC]], i64 0 ; CHECK-NEXT: [[ADD_I0:%.*]] = fadd float [[SRC_I0]], [[SRC_I0]] ; CHECK-NEXT: [[SRC_I1:%.*]] = extractelement <2 x float> [[SRC]], i64 1 ; CHECK-NEXT: [[ADD_I1:%.*]] = fadd float [[SRC_I1]], [[SRC_I1]] @@ -866,8 +890,9 @@ define <2 x float> @f25(<2 x float> %src) { } define <2 x i8> @test_copy_trunc_flags(<2 x i32> %src) { -; CHECK-LABEL: @test_copy_trunc_flags( -; CHECK-NEXT: [[SRC_I0:%.*]] = extractelement <2 x i32> [[SRC:%.*]], i64 0 +; CHECK-LABEL: define <2 x i8> @test_copy_trunc_flags( +; CHECK-SAME: <2 x i32> [[SRC:%.*]]) { +; CHECK-NEXT: [[SRC_I0:%.*]] = extractelement <2 x i32> [[SRC]], i64 0 ; CHECK-NEXT: [[TRUNC_I0:%.*]] = trunc nuw nsw i32 [[SRC_I0]] to i8 ; CHECK-NEXT: [[SRC_I1:%.*]] = extractelement <2 x i32> [[SRC]], i64 1 ; CHECK-NEXT: [[TRUNC_I1:%.*]] = trunc nuw nsw i32 [[SRC_I1]] to i8 @@ -886,3 +911,15 @@ define <2 x 
i8> @test_copy_trunc_flags(<2 x i32> %src) { !4 = !{ float 4.0 } !5 = !{ i64 0, i64 8, null } !13 = distinct !{} +;. +; CHECK: [[SET1_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"set1", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"root"} +; CHECK: [[SET2_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CHECK: [[META4]] = !{!"set2", [[META2]]} +; CHECK: [[TBAA_STRUCT5]] = !{i64 0, i64 8, null} +; CHECK: [[ACC_GRP6]] = distinct !{} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP6]]} +; CHECK: [[META9]] = !{float 4.000000e+00} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/PhiBlockMerge.ll b/llvm/test/Transforms/SimplifyCFG/PhiBlockMerge.ll index 2c5889a981db2..08397b5755a3f 100644 --- a/llvm/test/Transforms/SimplifyCFG/PhiBlockMerge.ll +++ b/llvm/test/Transforms/SimplifyCFG/PhiBlockMerge.ll @@ -1,20 +1,21 @@ -; NOTE: Assertions have been autogenerated by update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; Test merging of blocks that only have PHI nodes in them ; ; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s ; define i32 @test(i1 %a, i1 %b) { -; CHECK-LABEL: @test( -; CHECK: M: -; CHECK-NEXT: [[DOT:%.*]] = select i1 %b, i32 0, i32 1 -; CHECK-NEXT: [[W:%.*]] = select i1 %a, i32 2, i32 [[DOT]] +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[M:.*:]] +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[B]], i32 0, i32 1, !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: [[W:%.*]] = select i1 [[A]], i32 2, i32 [[SPEC_SELECT]], !prof [[PROF1:![0-9]+]] ; CHECK-NEXT: [[R:%.*]] = add i32 [[W]], 1 ; CHECK-NEXT: ret i32 [[R]] ; - br i1 %a, label %M, label %O + br i1 %a, label %M, label %O, !prof !0 O: ; preds = %0 - br i1 %b, label %N, label %Q + br i1 %b, label %N, label %Q, !prof !1 Q: ; 
preds = %O br label %N N: ; preds = %Q, %O @@ -27,3 +28,9 @@ M: ; preds = %N, %0 ret i32 %R } +!0 = !{!"branch_weights", i32 11, i32 7} +!1 = !{!"branch_weights", i32 3, i32 5} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 3, i32 5} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 11, i32 7} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll b/llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll index 4384847ce156b..71ad069fb8d06 100644 --- a/llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll +++ b/llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s --check-prefixes=NORMAL,BASELINE ; RUN: opt %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=2 | FileCheck %s --check-prefixes=NORMAL,AGGRESSIVE ; RUN: opt %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=4 | FileCheck %s --check-prefixes=WAYAGGRESSIVE @@ -11,12 +11,12 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, ptr %input) { ; BASELINE-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], ptr [[INPUT:%.*]]) { ; BASELINE-NEXT: [[ENTRY:.*]]: ; BASELINE-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 3 -; BASELINE-NEXT: br i1 [[CMP]], label %[[COND_END:.*]], label %[[LOR_LHS_FALSE:.*]] +; BASELINE-NEXT: br i1 [[CMP]], label %[[COND_END:.*]], label %[[LOR_LHS_FALSE:.*]], !prof [[PROF0:![0-9]+]] ; BASELINE: [[LOR_LHS_FALSE]]: ; BASELINE-NEXT: [[MUL:%.*]] = shl i32 [[C]], 1 ; BASELINE-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[A]] ; BASELINE-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B]] -; BASELINE-NEXT: br i1 [[CMP1]], label %[[COND_FALSE:.*]], label 
%[[COND_END]] +; BASELINE-NEXT: br i1 [[CMP1]], label %[[COND_FALSE:.*]], label %[[COND_END]], !prof [[PROF1:![0-9]+]] ; BASELINE: [[COND_FALSE]]: ; BASELINE-NEXT: [[TMP0:%.*]] = load i32, ptr [[INPUT]], align 4 ; BASELINE-NEXT: br label %[[COND_END]] @@ -31,8 +31,8 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, ptr %input) { ; AGGRESSIVE-NEXT: [[MUL:%.*]] = shl i32 [[C]], 1 ; AGGRESSIVE-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[A]] ; AGGRESSIVE-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B]] -; AGGRESSIVE-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP1]], i1 false -; AGGRESSIVE-NEXT: br i1 [[OR_COND]], label %[[COND_FALSE:.*]], label %[[COND_END:.*]] +; AGGRESSIVE-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP1]], i1 false, !prof [[PROF0:![0-9]+]] +; AGGRESSIVE-NEXT: br i1 [[OR_COND]], label %[[COND_FALSE:.*]], label %[[COND_END:.*]], !prof [[PROF0]] ; AGGRESSIVE: [[COND_FALSE]]: ; AGGRESSIVE-NEXT: [[TMP0:%.*]] = load i32, ptr [[INPUT]], align 4 ; AGGRESSIVE-NEXT: br label %[[COND_END]] @@ -47,8 +47,8 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, ptr %input) { ; WAYAGGRESSIVE-NEXT: [[MUL:%.*]] = shl i32 [[C]], 1 ; WAYAGGRESSIVE-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[A]] ; WAYAGGRESSIVE-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B]] -; WAYAGGRESSIVE-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP1]], i1 false -; WAYAGGRESSIVE-NEXT: br i1 [[OR_COND]], label %[[COND_FALSE:.*]], label %[[COND_END:.*]] +; WAYAGGRESSIVE-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP1]], i1 false, !prof [[PROF0:![0-9]+]] +; WAYAGGRESSIVE-NEXT: br i1 [[OR_COND]], label %[[COND_FALSE:.*]], label %[[COND_END:.*]], !prof [[PROF0]] ; WAYAGGRESSIVE: [[COND_FALSE]]: ; WAYAGGRESSIVE-NEXT: [[TMP0:%.*]] = load i32, ptr [[INPUT]], align 4 ; WAYAGGRESSIVE-NEXT: br label %[[COND_END]] @@ -58,13 +58,13 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, ptr %input) { ; entry: %cmp = icmp sgt i32 %d, 3 - br i1 %cmp, label %cond.end, label %lor.lhs.false + br i1 %cmp, 
label %cond.end, label %lor.lhs.false, !prof !0 lor.lhs.false: %mul = shl i32 %c, 1 %add = add nsw i32 %mul, %a %cmp1 = icmp slt i32 %add, %b - br i1 %cmp1, label %cond.false, label %cond.end + br i1 %cmp1, label %cond.false, label %cond.end, !prof !1 cond.false: %0 = load i32, ptr %input, align 4 @@ -160,3 +160,14 @@ cond.end: %cond = phi i32 [ %0, %cond.false ], [ 0, %lor.lhs.false ],[ 0, %pred_a ],[ 0, %pred_b ] ret i32 %cond } + +!0 = !{!"branch_weights", i32 7, i32 11} +!1 = !{!"branch_weights", i32 13, i32 5} +;. +; BASELINE: [[PROF0]] = !{!"branch_weights", i32 7, i32 11} +; BASELINE: [[PROF1]] = !{!"branch_weights", i32 13, i32 5} +;. +; AGGRESSIVE: [[PROF0]] = !{!"branch_weights", i32 143, i32 181} +;. +; WAYAGGRESSIVE: [[PROF0]] = !{!"branch_weights", i32 143, i32 181} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/branch-fold.ll b/llvm/test/Transforms/SimplifyCFG/branch-fold.ll index 2f5fb4f33013d..8e7b91ea172be 100644 --- a/llvm/test/Transforms/SimplifyCFG/branch-fold.ll +++ b/llvm/test/Transforms/SimplifyCFG/branch-fold.ll @@ -1,12 +1,12 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s define void @test(ptr %P, ptr %Q, i1 %A, i1 %B) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_NOT:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[A_NOT]], i1 true, i1 [[B:%.*]] -; CHECK-NEXT: br i1 [[BRMERGE]], label [[B:%.*]], label [[COMMON_RET:%.*]] +; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[A_NOT]], i1 true, i1 [[B:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: br i1 [[BRMERGE]], label [[B:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: b: @@ -15,9 +15,9 @@ define void @test(ptr %P, ptr %Q, i1 %A, i1 %B) { ; entry: - br i1 %A, label 
%a, label %b + br i1 %A, label %a, label %b, !prof !0 a: - br i1 %B, label %b, label %c + br i1 %B, label %b, label %c, !prof !1 b: store i32 123, ptr %P ret void @@ -146,3 +146,12 @@ Succ: } declare void @dummy() + +!0 = !{!"branch_weights", i32 3, i32 7} +!1 = !{!"branch_weights", i32 11, i32 4} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind ssp memory(read) uwtable } +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 7, i32 3} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 138, i32 12} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/merge-cond-stores.ll b/llvm/test/Transforms/SimplifyCFG/merge-cond-stores.ll index e1bd7916b3be0..b1cce4484bbab 100644 --- a/llvm/test/Transforms/SimplifyCFG/merge-cond-stores.ll +++ b/llvm/test/Transforms/SimplifyCFG/merge-cond-stores.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -passes=simplifycfg,instcombine -simplifycfg-require-and-preserve-domtree=1 < %s -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 -S | FileCheck %s ; This test should succeed and end up if-converted. 
@@ -43,7 +43,7 @@ define void @test_simple_commuted(ptr %p, i32 %a, i32 %b) { ; CHECK-NEXT: [[X2:%.*]] = icmp eq i32 [[B:%.*]], 0 ; CHECK-NEXT: [[X3:%.*]] = icmp eq i32 [[B1:%.*]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[X2]], [[X3]] -; CHECK-NEXT: br i1 [[TMP0]], label [[TMP1:%.*]], label [[TMP2:%.*]] +; CHECK-NEXT: br i1 [[TMP0]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF0:![0-9]+]] ; CHECK: 1: ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[X3]] to i32 ; CHECK-NEXT: store i32 [[SPEC_SELECT]], ptr [[P:%.*]], align 4 @@ -53,7 +53,7 @@ define void @test_simple_commuted(ptr %p, i32 %a, i32 %b) { ; entry: %x1 = icmp eq i32 %a, 0 - br i1 %x1, label %yes1, label %fallthrough + br i1 %x1, label %yes1, label %fallthrough, !prof !0 yes1: store i32 0, ptr %p @@ -61,7 +61,7 @@ yes1: fallthrough: %x2 = icmp eq i32 %b, 0 - br i1 %x2, label %yes2, label %end + br i1 %x2, label %yes2, label %end, !prof !1 yes2: store i32 1, ptr %p @@ -406,3 +406,9 @@ yes2: end: ret void } + +!0 = !{!"branch_weights", i32 7, i32 13} +!1 = !{!"branch_weights", i32 3, i32 11} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 137, i32 143} +;. 
diff --git a/llvm/test/Transforms/SimplifyCFG/no-drop-debug-loc-when-speculating-call.ll b/llvm/test/Transforms/SimplifyCFG/no-drop-debug-loc-when-speculating-call.ll new file mode 100644 index 0000000000000..dd1db41632c98 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/no-drop-debug-loc-when-speculating-call.ll @@ -0,0 +1,43 @@ +; RUN: opt -S -o - %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 | FileCheck %s + + +declare i1 @make_condition() + +define i1 @specfn() readnone nounwind speculatable { + ret i1 true +} + +; CHECK-LABEL: @test1( +; CHECK: call i1 @specfn(), !dbg +; CHECK: select i1 +define void @test1(i1 %cond) !dbg !6 { +start: + br i1 %cond, label %then, label %else, !dbg !9 + +then: ; preds = %start + %sres = call i1 @specfn(), !dbg !8 + br label %else, !dbg !11 + +else: ; preds = %then, %start + %phi = phi i1 [ %cond, %start ], [ %sres, %then ], !dbg !12 + ret void, !dbg !13 +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!3, !4} +!llvm.module.flags = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "test.ll", directory: "/") +!2 = !{} +!3 = !{i32 6} +!4 = !{i32 0} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = distinct !DISubprogram(name: "test1", linkageName: "test1", scope: null, file: !1, line: 1, type: !7, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 1, column: 1, scope: !6) +!9 = !DILocation(line: 2, column: 1, scope: !6) +!10 = !DILocation(line: 3, column: 2, scope: !6) +!11 = !DILocation(line: 4, column: 2, scope: !6) +!12 = !DILocation(line: 5, column: 3, scope: !6) +!13 = !DILocation(line: 6, column: 3, scope: !6) diff --git a/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll index 
ba542459a396c..0624f72d7a142 100644 --- a/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll +++ b/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll @@ -11,8 +11,8 @@ define void @test1(i1 %a, i1 %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_NOT:%.*]] = xor i1 [[A:%.*]], true ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A_NOT]], i1 [[C]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A_NOT]], i1 [[C]], i1 false, !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF0]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: Y: @@ -42,8 +42,8 @@ define void @test2(i1 %a, i1 %b) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false, !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF1]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: Y: @@ -73,8 +73,8 @@ define void @test3(i1 %a, i1 %b) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false, !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF2]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: Y: @@ -104,7 +104,7 @@ define void @test4(i1 %a, i1 %b) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false -; 
CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false, !prof [[PROF2]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF2]] ; CHECK: common.ret: ; CHECK-NEXT: ret void @@ -237,8 +237,8 @@ define void @test1_swap(i1 %a, i1 %b) { ; CHECK-LABEL: @test1_swap( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[C]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[Y:%.*]], label [[Z:%.*]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[C]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[Y:%.*]], label [[Z:%.*]], !prof [[PROF5]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: Y: @@ -268,8 +268,8 @@ define void @test7(i1 %a, i1 %b) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false -; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[C]] -; CHECK-NEXT: br i1 [[BRMERGE]], label [[Y:%.*]], label [[Z:%.*]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[C]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br i1 [[BRMERGE]], label [[Y:%.*]], label [[Z:%.*]], !prof [[PROF7:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: Y: @@ -300,7 +300,7 @@ define void @test8(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: @test8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LT:%.*]] = icmp slt i64 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: br i1 [[LT]], label [[A:%.*]], label [[B:%.*]], !prof [[PROF7:![0-9]+]] +; CHECK-NEXT: br i1 [[LT]], label [[A:%.*]], label [[B:%.*]], !prof [[PROF8:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: a: @@ -339,7 +339,7 @@ define i1 @test9(i32 %x, i32 %y) nounwind { ; CHECK-NEXT: i32 1, label [[END:%.*]] ; CHECK-NEXT: i32 2, label [[END]] ; CHECK-NEXT: i32 92, label [[END]] -; CHECK-NEXT: ], 
!prof [[PROF8:![0-9]+]] +; CHECK-NEXT: ], !prof [[PROF9:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i1 [ [[RETA:%.*]], [[A]] ], [ [[RET:%.*]], [[END]] ] ; CHECK-NEXT: ret i1 [[COMMON_RET_OP]] @@ -381,7 +381,7 @@ define void @test10(i32 %x) nounwind readnone ssp noredzone { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X_OFF:%.*]] = add i32 [[X:%.*]], -1 ; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i32 [[X_OFF]], 3 -; CHECK-NEXT: br i1 [[SWITCH]], label [[LOR_END:%.*]], label [[LOR_RHS:%.*]], !prof [[PROF9:![0-9]+]] +; CHECK-NEXT: br i1 [[SWITCH]], label [[LOR_END:%.*]], label [[LOR_RHS:%.*]], !prof [[PROF10:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: lor.rhs: @@ -413,7 +413,7 @@ define void @test11(i32 %x) nounwind { ; CHECK-LABEL: @test11( ; CHECK-NEXT: [[I:%.*]] = shl i32 [[X:%.*]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I]], 24 -; CHECK-NEXT: br i1 [[COND]], label [[C:%.*]], label [[A:%.*]], !prof [[PROF10:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[C:%.*]], label [[A:%.*]], !prof [[PROF11:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: a: @@ -500,8 +500,8 @@ define void @test14(ptr %old, i32 %final) { ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[BIT_0]], 0 ; CHECK-NEXT: [[V3:%.*]] = load i32, ptr @max_regno, align 4 ; CHECK-NEXT: [[CMP4:%.*]] = icmp eq i32 [[I_1]], [[V3]] -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TOBOOL]], i1 true, i1 [[CMP4]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_EXIT:%.*]], label [[FOR_INC]], !prof [[PROF11:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TOBOOL]], i1 true, i1 [[CMP4]], !prof [[PROF12:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_EXIT:%.*]], label [[FOR_INC]], !prof [[PROF12]] ; CHECK: for.inc: ; CHECK-NEXT: [[SHL]] = shl i32 [[BIT_0]], 1 ; CHECK-NEXT: [[INC19]] = add nsw i32 [[I_1]], 1 @@ -534,7 +534,7 @@ define i32 @HoistThenElseCodeToIf(i32 %n) { ; CHECK-LABEL: @HoistThenElseCodeToIf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = 
icmp eq i32 [[N:%.*]], 0 -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TOBOOL]], i32 1, i32 234, !prof [[PROF12:![0-9]+]] +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TOBOOL]], i32 1, i32 234, !prof [[PROF6]] ; CHECK-NEXT: ret i32 [[DOT]] ; entry: @@ -557,8 +557,8 @@ return: define i32 @SimplifyCondBranchToCondBranch(i1 %cmpa, i1 %cmpb) { ; CHECK-LABEL: @SimplifyCondBranchToCondBranch( ; CHECK-NEXT: block1: -; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[CMPA:%.*]], i1 true, i1 [[CMPB:%.*]] -; CHECK-NEXT: [[DOTMUX:%.*]] = select i1 [[CMPA]], i32 0, i32 2, !prof [[PROF13:![0-9]+]] +; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[CMPA:%.*]], i1 true, i1 [[CMPB:%.*]], !prof [[PROF13:![0-9]+]] +; CHECK-NEXT: [[DOTMUX:%.*]] = select i1 [[CMPA]], i32 0, i32 2, !prof [[PROF13]] ; CHECK-NEXT: [[OUTVAL:%.*]] = select i1 [[BRMERGE]], i32 [[DOTMUX]], i32 1, !prof [[PROF14:![0-9]+]] ; CHECK-NEXT: ret i32 [[OUTVAL]] ; @@ -584,8 +584,8 @@ define i32 @SimplifyCondBranchToCondBranchSwap(i1 %cmpa, i1 %cmpb) { ; CHECK-NEXT: block1: ; CHECK-NEXT: [[CMPA_NOT:%.*]] = xor i1 [[CMPA:%.*]], true ; CHECK-NEXT: [[CMPB_NOT:%.*]] = xor i1 [[CMPB:%.*]], true -; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[CMPA_NOT]], i1 true, i1 [[CMPB_NOT]] -; CHECK-NEXT: [[DOTMUX:%.*]] = select i1 [[CMPA_NOT]], i32 0, i32 2, !prof [[PROF15:![0-9]+]] +; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[CMPA_NOT]], i1 true, i1 [[CMPB_NOT]], !prof [[PROF15:![0-9]+]] +; CHECK-NEXT: [[DOTMUX:%.*]] = select i1 [[CMPA_NOT]], i32 0, i32 2, !prof [[PROF15]] ; CHECK-NEXT: [[OUTVAL:%.*]] = select i1 [[BRMERGE]], i32 [[DOTMUX]], i32 1, !prof [[PROF16:![0-9]+]] ; CHECK-NEXT: ret i32 [[OUTVAL]] ; @@ -609,7 +609,7 @@ define i32 @SimplifyCondBranchToCondBranchSwapMissingWeight(i1 %cmpa, i1 %cmpb) ; CHECK-NEXT: block1: ; CHECK-NEXT: [[CMPA_NOT:%.*]] = xor i1 [[CMPA:%.*]], true ; CHECK-NEXT: [[CMPB_NOT:%.*]] = xor i1 [[CMPB:%.*]], true -; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[CMPA_NOT]], i1 true, i1 [[CMPB_NOT]] +; CHECK-NEXT: [[BRMERGE:%.*]] = 
select i1 [[CMPA_NOT]], i1 true, i1 [[CMPB_NOT]], !prof [[PROF15]] ; CHECK-NEXT: [[DOTMUX:%.*]] = select i1 [[CMPA_NOT]], i32 0, i32 2, !prof [[PROF15]] ; CHECK-NEXT: [[OUTVAL:%.*]] = select i1 [[BRMERGE]], i32 [[DOTMUX]], i32 1, !prof [[PROF17:![0-9]+]] ; CHECK-NEXT: ret i32 [[OUTVAL]] @@ -701,8 +701,8 @@ define void @or_icmps_probably_not_harmful(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 ; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 true, i1 [[EXPENSIVE]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF20:![0-9]+]], !unpredictable [[META21:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 true, i1 [[EXPENSIVE]], !prof [[PROF20:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF20]], !unpredictable [[META21:![0-9]+]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 ; CHECK-NEXT: br label [[EXIT]] @@ -733,8 +733,8 @@ define void @or_icmps_not_that_harmful(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 ; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 true, i1 [[EXPENSIVE]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF22:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 true, i1 [[EXPENSIVE]], !prof [[PROF22:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF22]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 ; CHECK-NEXT: br label [[EXIT]] @@ -765,8 +765,8 @@ define void @or_icmps_not_that_harmful_inverted(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 ; CHECK-NEXT: 
[[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 true, i1 [[EXPENSIVE]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF23:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 true, i1 [[EXPENSIVE]], !prof [[PROF23:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF23]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 ; CHECK-NEXT: br label [[EXIT]] @@ -796,8 +796,8 @@ define void @or_icmps_useful(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sle i32 [[X:%.*]], -1 ; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 true, i1 [[EXPENSIVE]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF24:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 true, i1 [[EXPENSIVE]], !prof [[PROF24:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF24]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 ; CHECK-NEXT: br label [[EXIT]] @@ -827,7 +827,7 @@ define void @or_icmps_useful_inverted(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EXPECTED_FALSE:%.*]] = icmp sgt i32 [[X:%.*]], -1 ; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_FALSE]], i1 true, i1 [[EXPENSIVE]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_FALSE]], i1 true, i1 [[EXPENSIVE]], !prof [[PROF24]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof [[PROF24]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 @@ -956,8 +956,8 @@ define void @and_icmps_not_that_harmful(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EXPECTED_FALSE:%.*]] = icmp sgt i32 
[[X:%.*]], -1 ; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_FALSE]], i1 [[EXPENSIVE]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof [[PROF25:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_FALSE]], i1 [[EXPENSIVE]], i1 false, !prof [[PROF25:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof [[PROF25]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 ; CHECK-NEXT: br label [[EXIT]] @@ -988,7 +988,7 @@ define void @and_icmps_not_that_harmful_inverted(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sle i32 [[X:%.*]], -1 ; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 [[EXPENSIVE]], i1 false +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 [[EXPENSIVE]], i1 false, !prof [[PROF25]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof [[PROF25]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 @@ -1019,8 +1019,8 @@ define void @and_icmps_useful(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 ; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 [[EXPENSIVE]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof [[PROF26:![0-9]+]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_TRUE]], i1 [[EXPENSIVE]], i1 false, !prof [[PROF26:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof [[PROF26]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 ; CHECK-NEXT: br label [[EXIT]] @@ -1050,7 +1050,7 @@ define void @and_icmps_useful_inverted(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: entry: ; 
CHECK-NEXT: [[EXPECTED_FALSE:%.*]] = icmp sle i32 [[X:%.*]], -1 ; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_FALSE]], i1 [[EXPENSIVE]], i1 false +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[EXPECTED_FALSE]], i1 [[EXPENSIVE]], i1 false, !prof [[PROF26]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof [[PROF26]] ; CHECK: false: ; CHECK-NEXT: store i8 42, ptr [[P:%.*]], align 1 @@ -1097,23 +1097,26 @@ exit: !20 = !{} ; . +; . +; . +;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind uwtable } ; CHECK: attributes #[[ATTR1]] = { nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { noredzone nounwind ssp memory(none) } -; . +;. ; CHECK: [[PROF0]] = !{!"branch_weights", i32 5, i32 11} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 5} ; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 3} ; CHECK: [[PROF3]] = !{!"branch_weights", i32 7, i32 1, i32 2} ; CHECK: [[PROF4]] = !{!"branch_weights", i32 49, i32 12, i32 24, i32 35} ; CHECK: [[PROF5]] = !{!"branch_weights", i32 11, i32 5} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 17, i32 15} -; CHECK: [[PROF7]] = !{!"branch_weights", i32 9, i32 7} -; CHECK: [[PROF8]] = !{!"branch_weights", i32 17, i32 9, i32 8, i32 7, i32 17} -; CHECK: [[PROF9]] = !{!"branch_weights", i32 24, i32 33} -; CHECK: [[PROF10]] = !{!"branch_weights", i32 8, i32 33} -; CHECK: [[PROF11]] = !{!"branch_weights", i32 112017436, i32 -735157296} -; CHECK: [[PROF12]] = !{!"branch_weights", i32 3, i32 5} +; CHECK: [[PROF6]] = !{!"branch_weights", i32 3, i32 5} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 17, i32 15} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 9, i32 7} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 17, i32 9, i32 8, i32 7, i32 17} +; CHECK: [[PROF10]] = !{!"branch_weights", i32 24, i32 33} +; CHECK: [[PROF11]] = !{!"branch_weights", i32 8, i32 33} +; CHECK: [[PROF12]] = !{!"branch_weights", i32 112017436, i32 -735157296} ; CHECK: 
[[PROF13]] = !{!"branch_weights", i32 2, i32 3} ; CHECK: [[PROF14]] = !{!"branch_weights", i32 34, i32 21} ; CHECK: [[PROF15]] = !{!"branch_weights", i32 3, i32 2} @@ -1128,4 +1131,4 @@ exit: ; CHECK: [[PROF24]] = !{!"branch_weights", i32 101, i32 99} ; CHECK: [[PROF25]] = !{!"branch_weights", i32 1, i32 197} ; CHECK: [[PROF26]] = !{!"branch_weights", i32 99, i32 101} -; . +;. diff --git a/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll b/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll index d084e199ceb89..b118f8189d716 100644 --- a/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll +++ b/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll @@ -209,3 +209,53 @@ merge: store i32 %phi, ptr %ptr ret void } + +define void @test_nested_if_2 (i32 %val,ptr %gep, i1 %cond) { +; CHECK-LABEL: define void @test_nested_if_2( +; CHECK-SAME: i32 [[VAL:%.*]], ptr [[GEP:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true +; CHECK-NEXT: [[T825:%.*]] = icmp eq i32 [[VAL]], 0 +; CHECK-NEXT: [[T825_INV:%.*]] = xor i1 [[T825]], true +; CHECK-NEXT: br i1 [[T825]], label %[[IF:.*]], label %[[FLOW:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[LOADED:%.*]] = load [[PAIR:%.*]], ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW1:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ false, %[[ELSE:.*]] ], [ [[TMP2:%.*]], %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP0]], label %[[IF_2:.*]], label %[[EXIT:.*]] +; CHECK: [[IF_2]]: +; CHECK-NEXT: [[IF_VALUE:%.*]] = extractvalue [[PAIR]] [[TMP1:%.*]], 0 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1]] = phi [[PAIR]] [ [[LOADED]], %[[IF]] ], [ poison, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP2]] = phi i1 [ true, %[[IF]] ], [ false, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[COND_INV]], %[[IF]] ], [ [[T825_INV]], %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[TMP3]], label %[[ELSE]], label %[[FLOW1]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br label %[[FLOW1]] 
+; CHECK: [[EXIT]]: +; CHECK-NEXT: [[T_SINK168:%.*]] = phi i32 [ 0, %[[FLOW1]] ], [ [[IF_VALUE]], %[[IF_2]] ] +; CHECK-NEXT: store i32 [[T_SINK168]], ptr [[GEP]], align 4 +; CHECK-NEXT: ret void +; +entry: + %t825 = icmp eq i32 %val, 0 + br i1 %t825, label %if, label %else + +if: + %loaded = load %pair, ptr %gep + br i1 %cond, label %if_2, label %else + +if_2: + %if_value = extractvalue %pair %loaded, 0 + br label %exit + +else: + br label %exit + +exit: + %phi = phi i32 [ %if_value, %if_2 ], [ 0, %else ] + store i32 %phi,ptr %gep + ret void +} diff --git a/llvm/test/Transforms/VectorCombine/AArch64/scalarize-ext-extract-endian.ll b/llvm/test/Transforms/VectorCombine/AArch64/scalarize-ext-extract-endian.ll new file mode 100644 index 0000000000000..9796faf2e6feb --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/AArch64/scalarize-ext-extract-endian.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes='vector-combine' -S -mtriple=aarch64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=LE +; RUN: opt -passes='vector-combine' -S -mtriple=aarch64_be-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=BE + +define i64 @g(<8 x i8> %v) { +; LE-LABEL: @g( +; LE-NEXT: [[TMP1:%.*]] = freeze <8 x i8> [[V:%.*]] +; LE-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; LE-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 56 +; LE-NEXT: [[TMP4:%.*]] = and i64 [[TMP2]], 255 +; LE-NEXT: [[Z:%.*]] = zext <8 x i8> [[V]] to <8 x i64> +; LE-NEXT: [[E0:%.*]] = extractelement <8 x i64> [[Z]], i32 0 +; LE-NEXT: [[E7:%.*]] = extractelement <8 x i64> [[Z]], i32 7 +; LE-NEXT: [[SUM:%.*]] = add i64 [[TMP4]], [[TMP3]] +; LE-NEXT: ret i64 [[SUM]] +; +; BE-LABEL: @g( +; BE-NEXT: [[TMP1:%.*]] = freeze <8 x i8> [[V:%.*]] +; BE-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; BE-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 255 +; BE-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP2]], 56 +; BE-NEXT: [[Z:%.*]] = zext <8 x i8> [[V]] to <8 
x i64> +; BE-NEXT: [[E0:%.*]] = extractelement <8 x i64> [[Z]], i32 0 +; BE-NEXT: [[E7:%.*]] = extractelement <8 x i64> [[Z]], i32 7 +; BE-NEXT: [[SUM:%.*]] = add i64 [[TMP4]], [[TMP3]] +; BE-NEXT: ret i64 [[SUM]] +; + %z = zext <8 x i8> %v to <8 x i64> + %e0 = extractelement <8 x i64> %z, i32 0 + %e7 = extractelement <8 x i64> %z, i32 7 + %sum = add i64 %e0, %e7 + ret i64 %sum +} + + + diff --git a/llvm/test/Transforms/VectorCombine/PowerPC/lit.local.cfg b/llvm/test/Transforms/VectorCombine/PowerPC/lit.local.cfg new file mode 100644 index 0000000000000..15af315f104fc --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/PowerPC/lit.local.cfg @@ -0,0 +1,2 @@ +if 'PowerPC' not in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/VectorCombine/PowerPC/scalarize-ext-extract.ll b/llvm/test/Transforms/VectorCombine/PowerPC/scalarize-ext-extract.ll new file mode 100644 index 0000000000000..a9b719920c341 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/PowerPC/scalarize-ext-extract.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes='vector-combine' -S -mtriple=powerpc64-ibm-aix-xcoff %s -o - | FileCheck %s --check-prefix=BE + +define i64 @g(<8 x i8> %v) { +; BE-LABEL: @g( +; BE-NEXT: [[TMP1:%.*]] = freeze <8 x i8> [[V:%.*]] +; BE-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; BE-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 255 +; BE-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP2]], 56 +; BE-NEXT: [[Z:%.*]] = zext <8 x i8> [[V]] to <8 x i64> +; BE-NEXT: [[E0:%.*]] = extractelement <8 x i64> [[Z]], i32 0 +; BE-NEXT: [[E7:%.*]] = extractelement <8 x i64> [[Z]], i32 7 +; BE-NEXT: [[SUM:%.*]] = add i64 [[TMP4]], [[TMP3]] +; BE-NEXT: ret i64 [[SUM]] +; + %z = zext <8 x i8> %v to <8 x i64> + %e0 = extractelement <8 x i64> %z, i32 0 + %e7 = extractelement <8 x i64> %z, i32 7 + %sum = add i64 %e0, %e7 + ret i64 %sum +} + diff --git 
a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll index abd98a4dc64b8..ec3711eabb7e1 100644 --- a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll +++ b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll @@ -81,12 +81,12 @@ define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) { ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 ; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0 -; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]]) +; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]]) ; CHECK-NEXT: ret <4 x i32> [[V]] ; %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 %y.insert = insertelement <8 x i32> poison, i32 %y, i32 0 - %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert) + %v = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert) ret <4 x i32> %v } diff --git a/llvm/test/Verifier/llvm.loop.estimated_trip_count.ll b/llvm/test/Verifier/llvm.loop.estimated_trip_count.ll new file mode 100644 index 0000000000000..b1e456f5b0ad6 --- /dev/null +++ b/llvm/test/Verifier/llvm.loop.estimated_trip_count.ll @@ -0,0 +1,68 @@ +; Test "llvm.loop.estimated_trip_count" validation + +; DEFINE: %{RUN} = opt -passes=verify %t -disable-output 2>&1 | \ +; DEFINE: FileCheck %s -allow-empty -check-prefix + +define void @test() { +entry: + br label %body +body: + br i1 0, label %body, label %exit, !llvm.loop !0 +exit: + ret void +} +!0 = distinct !{!0, !1} + +; GOOD-NOT: {{.}} + +; BAD-VALUE: Expected second operand to be an integer constant of type i32 or smaller +; BAD-VALUE-NEXT: !1 = 
!{!"llvm.loop.estimated_trip_count", + +; TOO-FEW: Expected two operands +; TOO-FEW-NEXT: !1 = !{!"llvm.loop.estimated_trip_count"} + +; TOO-MANY: Expected two operands +; TOO-MANY-NEXT: !1 = !{!"llvm.loop.estimated_trip_count", i32 5, i32 5} + +; No value. +; RUN: cp %s %t +; RUN: chmod u+w %t +; RUN: echo '!1 = !{!"llvm.loop.estimated_trip_count"}' >> %t +; RUN: not %{RUN} TOO-FEW + +; i16 value. +; RUN: cp %s %t +; RUN: chmod u+w %t +; RUN: echo '!1 = !{!"llvm.loop.estimated_trip_count", i16 5}' >> %t +; RUN: %{RUN} GOOD + +; i32 value. +; RUN: cp %s %t +; RUN: chmod u+w %t +; RUN: echo '!1 = !{!"llvm.loop.estimated_trip_count", i32 5}' >> %t +; RUN: %{RUN} GOOD + +; i64 value. +; RUN: cp %s %t +; RUN: chmod u+w %t +; RUN: echo '!1 = !{!"llvm.loop.estimated_trip_count", i64 5}' >> %t +; RUN: not %{RUN} BAD-VALUE + +; MDString value. +; RUN: cp %s %t +; RUN: chmod u+w %t +; RUN: echo '!1 = !{!"llvm.loop.estimated_trip_count", !"5"}' >> %t +; RUN: not %{RUN} BAD-VALUE + +; MDNode value. +; RUN: cp %s %t +; RUN: chmod u+w %t +; RUN: echo '!1 = !{!"llvm.loop.estimated_trip_count", !2}' >> %t +; RUN: echo '!2 = !{i32 5}' >> %t +; RUN: not %{RUN} BAD-VALUE + +; Too many values. +; RUN: cp %s %t +; RUN: chmod u+w %t +; RUN: echo '!1 = !{!"llvm.loop.estimated_trip_count", i32 5, i32 5}' >> %t +; RUN: not %{RUN} TOO-MANY diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 867a44be56727..dd3f947b186b3 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -17,6 +17,17 @@ # name: The name of this test suite. config.name = "LLVM" +# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites. +# See https://github.com/llvm/llvm-project/issues/106636 for more details. +# +# We prefer the lit internal shell which provides a better user experience on failures +# and is faster unless the user explicitly disables it with LIT_USE_INTERNAL_SHELL=0 +# env var. 
+use_lit_shell = True +lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL") +if lit_shell_env: + use_lit_shell = lit.util.pythonize_bool(lit_shell_env) + # testFormat: The test format to use to interpret tests. extra_substitutions = extra_substitutions = ( [ @@ -26,9 +37,7 @@ if config.enable_profcheck else [] ) -config.test_format = lit.formats.ShTest( - not llvm_config.use_lit_shell, extra_substitutions -) +config.test_format = lit.formats.ShTest(not use_lit_shell, extra_substitutions) # suffixes: A list of file extensions to treat as test files. This is overriden # by individual lit.local.cfg files in the test subdirectories. @@ -147,7 +156,7 @@ def get_asan_rtlib(): ld64_cmd = config.ld64_executable asan_rtlib = get_asan_rtlib() if asan_rtlib: - ld64_cmd = "DYLD_INSERT_LIBRARIES={} {}".format(asan_rtlib, ld64_cmd) + ld64_cmd = "env DYLD_INSERT_LIBRARIES={} {}".format(asan_rtlib, ld64_cmd) if config.osx_sysroot: ld64_cmd = "{} -syslibroot {}".format(ld64_cmd, config.osx_sysroot) diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index bd1eb4c4e6d1c..bc49669df8a2c 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -7,10 +7,10 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9 -; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<72>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> +; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<80>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 
$sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 -; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<72>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> +; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<80>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> ; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33 ; CHECK-NEXT: t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3> ; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24 diff --git a/llvm/test/tools/llvm-debuginfod-find/cache.test b/llvm/test/tools/llvm-debuginfod-find/cache.test index eae341f3eb299..66af974a2596d 100644 --- a/llvm/test/tools/llvm-debuginfod-find/cache.test +++ b/llvm/test/tools/llvm-debuginfod-find/cache.test @@ -2,9 +2,10 @@ REQUIRES: curl UNSUPPORTED: system-windows RUN: rm -rf %t/* -RUN: mkdir -p %t/buildid/012345678901234{5,6} +RUN: mkdir -p %t/buildid/0123456789012345 +RUN: mkdir -p %t/buildid/0123456789012346 RUN: echo 'f' > %t/buildid/0123456789012345/debuginfo -RUN: cp %t/buildid/012345678901234{5,6}/debuginfo +RUN: cp %t/buildid/0123456789012345/debuginfo %t/buildid/0123456789012346/debuginfo RUN: mkdir %t/cache RUN: env DEBUGINFOD_CACHE_PATH=%t/cache DEBUGINFOD_URLS=file://%t \ RUN: llvm-debuginfod-find --debuginfo 0123456789012345 > /dev/null diff --git a/llvm/test/tools/llvm-debuginfod/llvm-debuginfod.test b/llvm/test/tools/llvm-debuginfod/llvm-debuginfod.test index edeae375a5079..c32c7b75e79c1 100644 --- a/llvm/test/tools/llvm-debuginfod/llvm-debuginfod.test +++ b/llvm/test/tools/llvm-debuginfod/llvm-debuginfod.test @@ -13,20 +13,20 @@ # RUN: rm -rf %t # RUN: mkdir %t # # Query the debuginfod server for artifacts -# RUN: 
DEBUGINFOD_CACHE_PATH=%t %python %s --server-cmd 'llvm-debuginfod -v -c 3 %S/Inputs' \ +# RUN: env DEBUGINFOD_CACHE_PATH=%t %python %s --server-cmd 'llvm-debuginfod -v -c 3 %S/Inputs' \ # RUN: --tool-cmd 'llvm-debuginfod-find --dump --executable 2c39b7557c50162aaeb5a3148c9f76e6e46012e3' | \ # RUN: diff - %S/Inputs/main.exe -# RUN: DEBUGINFOD_CACHE_PATH=%t %python %s --server-cmd 'llvm-debuginfod -v -c 3 %S/Inputs' \ +# RUN: env DEBUGINFOD_CACHE_PATH=%t %python %s --server-cmd 'llvm-debuginfod -v -c 3 %S/Inputs' \ # RUN: --tool-cmd 'llvm-debuginfod-find --dump --debuginfo 2c39b7557c50162aaeb5a3148c9f76e6e46012e3' | \ # RUN: diff - %S/Inputs/main-debug.exe # Debuginfod server does not yet support source files # # The artifacts should still be present in the cache without needing to query # # the server. -# RUN: DEBUGINFOD_CACHE_PATH=%t llvm-debuginfod-find --dump \ +# RUN: env DEBUGINFOD_CACHE_PATH=%t llvm-debuginfod-find --dump \ # RUN: --executable 2c39b7557c50162aaeb5a3148c9f76e6e46012e3 | \ # RUN: diff - %S/Inputs/main.exe -# RUN: DEBUGINFOD_CACHE_PATH=%t llvm-debuginfod-find --dump \ +# RUN: env DEBUGINFOD_CACHE_PATH=%t llvm-debuginfod-find --dump \ # RUN: --debuginfo 2c39b7557c50162aaeb5a3148c9f76e6e46012e3 | \ # RUN: diff - %S/Inputs/main-debug.exe diff --git a/llvm/test/tools/llvm-dwarfdump/X86/heterogeneous-user-ops.s b/llvm/test/tools/llvm-dwarfdump/X86/heterogeneous-user-ops.s new file mode 100644 index 0000000000000..ef90f31d92a8a --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/heterogeneous-user-ops.s @@ -0,0 +1,31 @@ +# RUN: llvm-mc %s -filetype=obj -triple=i686-pc-linux -o - | llvm-dwarfdump --debug-frame - | FileCheck %s + +# CHECK: .eh_frame contents: +# CHECK: FDE +# CHECK-NEXT: Format: DWARF32 + +foo: + .cfi_startproc + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_form_aspace_address + .cfi_escape 0x10, 0x00, 0x02, 0xe9, 0x02 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_push_lane + .cfi_escape 0x10, 
0x00, 0x02, 0xe9, 0x03 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_offset + .cfi_escape 0x10, 0x00, 0x02, 0xe9, 0x04 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_offset_uconst 0x0 + .cfi_escape 0x10, 0x00, 0x03, 0xe9, 0x05, 0x00 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_bit_offset + .cfi_escape 0x10, 0x00, 0x02, 0xe9, 0x06 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_call_frame_entry_reg EAX + .cfi_escape 0x10, 0x00, 0x03, 0xe9, 0x07, 0x00 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_undefined + .cfi_escape 0x10, 0x00, 0x02, 0xe9, 0x08 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_aspace_bregx EAX+2 + .cfi_escape 0x10, 0x00, 0x04, 0xe9, 0x09, 0x00, 0x02 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_piece_end + .cfi_escape 0x10, 0x00, 0x02, 0xe9, 0x0a + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_extend 0x0 0x0 + .cfi_escape 0x10, 0x00, 0x04, 0xe9, 0x0b, 0x00, 0x00 + # CHECK-NEXT: DW_CFA_expression: EAX DW_OP_LLVM_user DW_OP_LLVM_select_bit_piece 0x0 0x0 + .cfi_escape 0x10, 0x00, 0x04, 0xe9, 0x0c, 0x00, 0x00 + .cfi_endproc diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s index 8fe21167a5bd3..127c8c30fc2c6 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s @@ -1165,10 +1165,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 01 @@ -1176,14 +1176,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeER .. ld1 { v1.1d }, [x27], #8 # CHECK-NEXT: [0,1] D=eE---R .. 
add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER .. ld1 { v1.2d }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeER.. ld1 { v1.2s }, [x27], #8 -# CHECK-NEXT: [0,5] .D==eE---R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. ld1 { v1.4h }, [x27], #8 -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeER ld1 { v1.4s }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeER .. ld1 { v1.2d }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eE---R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER.. ld1 { v1.2s }, [x27], #8 +# CHECK-NEXT: [0,5] . D=eE---R.. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeER. ld1 { v1.4h }, [x27], #8 +# CHECK-NEXT: [0,7] . D=eE---R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeER ld1 { v1.4s }, [x27], #16 +# CHECK-NEXT: [0,9] . D=eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1194,15 +1194,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8 -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.5 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2d }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2s }, [x27], #8 +# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.4h }, [x27], #8 +# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.4s }, [x27], #16 +# CHECK-NEXT: 9. 
1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.5 # CHECK: [1] Code Region - G02 @@ -1211,10 +1211,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 01 @@ -1222,14 +1222,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeER .. ld1 { v1.8b }, [x27], #8 # CHECK-NEXT: [0,1] D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER .. ld1 { v1.8h }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeER.. ld1 { v1.16b }, [x27], #16 -# CHECK-NEXT: [0,5] .D==eE---R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. ld1 { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeER ld1 { v1.2d }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeER .. ld1 { v1.8h }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eE---R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER.. ld1 { v1.16b }, [x27], #16 +# CHECK-NEXT: [0,5] . D=eE---R.. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeER. ld1 { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE---R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeER ld1 { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1240,15 +1240,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28 -# CHECK-NEXT: 7. 
1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.5 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8h }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.16b }, [x27], #16 +# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.5 # CHECK: [2] Code Region - G03 @@ -1257,10 +1257,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 01 @@ -1268,14 +1268,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeER .. ld1 { v1.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER .. ld1 { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeER.. ld1 { v1.4s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE---R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. ld1 { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeER ld1 { v1.8h }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeER .. ld1 { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE---R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER.. ld1 { v1.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE---R.. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeER. ld1 { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE---R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeER ld1 { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . 
D=eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1286,42 +1286,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.5 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.5 # CHECK: [3] Code Region - G04 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 507 +# CHECK-NEXT: Total Cycles: 906 # CHECK-NEXT: Total uOps: 1900 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.75 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 4.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.10 +# CHECK-NEXT: IPC: 1.10 +# CHECK-NEXT: Block RThroughput: 6.3 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER .. ld1 { v1.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER .. ld1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeER.. 
ld1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE---R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeER ld1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . ld1 { v1.16b }, [x27], x28 +# CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeER . . ld1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,3] . DeE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,5] . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeER . ld1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,7] . .DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,9] . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1332,42 +1332,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.5 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 
1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.1 0.4 1.5 # CHECK: [4] Code Region - G05 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 507 +# CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.94 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER .. ld1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER .. ld1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeER.. ld1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE---R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeER ld1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,1] .DeE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,3] . DeE---R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,5] . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeER . ld1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,7] . . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1377,43 +1377,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.5 +# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.5 # CHECK: [5] Code Region - G06 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 507 +# CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.94 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER .. ld1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER .. ld1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeER.. ld1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE---R.. 
add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeER ld1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . ld1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE---R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeER . ld1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1423,43 +1423,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.5 +# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 
1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.5 # CHECK: [6] Code Region - G07 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 707 +# CHECK-NEXT: Total Cycles: 1007 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.25 -# CHECK-NEXT: IPC: 1.41 -# CHECK-NEXT: Block RThroughput: 6.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.28 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 7.7 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 0123456 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER . . ld1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER . . ld1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeER . ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . .. ld1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE---R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER. .. ld1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE---R. .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER .. ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER.. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,9] . . 
DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1469,43 +1469,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.5 0.2 1.8 +# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.8 # CHECK: [7] Code Region - G08 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 757 +# CHECK-NEXT: Total Cycles: 1007 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.30 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.48 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 8.3 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 0123456 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,1] D=eE----R . . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeER . ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . .. ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeER .. ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER .. ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER.. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1515,43 +1515,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 2.0 +# CHECK-NEXT: 1. 
1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.0 # CHECK: [8] Code Region - G09 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 757 +# CHECK-NEXT: Total Cycles: 1007 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.30 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.48 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 8.3 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 0123456 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeER . ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . .. ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeER .. ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER .. 
ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER.. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1561,43 +1561,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 2.0 +# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.0 # CHECK: [9] Code Region - G10 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 856 +# CHECK-NEXT: Total Cycles: 1007 # CHECK-NEXT: Total uOps: 2700 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.15 -# CHECK-NEXT: IPC: 1.17 -# CHECK-NEXT: Block RThroughput: 8.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.68 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 9.0 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 0123456 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeER .. ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER .. ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER.. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,9] . . 
DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1607,25 +1607,25 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 2.0 +# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.0 # CHECK: [10] Code Region - G11 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1006 +# CHECK-NEXT: Total Cycles: 1007 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 10.0 @@ -1635,15 +1635,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeER . .. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE----R . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeER. .. 
ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,3] .D=eE----R. .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==eeeeeeER .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,5] . D===eE----R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeeeeeER .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,7] . D===eE----R .. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D====eeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,9] . D=====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeER .. ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER.. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1653,25 +1653,25 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 9. 1 6.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.5 2.0 +# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 
1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.0 # CHECK: [11] Code Region - G12 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1006 +# CHECK-NEXT: Total Cycles: 1007 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.98 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 10.0 @@ -1681,15 +1681,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,1] D=eE----R . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeER. .. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE----R. .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==eeeeeeER .. ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,5] . D===eE----R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeeeeeER .. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,7] . D===eE----R .. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D====eeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,9] . D=====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeER .. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER .. ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER.. 
ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1699,43 +1699,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.5 2.0 +# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.0 # CHECK: [12] Code Region - G13 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1209 +# CHECK-NEXT: Total Cycles: 1210 # CHECK-NEXT: Total uOps: 2800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 2.32 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.31 # CHECK-NEXT: IPC: 0.83 -# CHECK-NEXT: Block RThroughput: 8.5 +# CHECK-NEXT: Block RThroughput: 9.3 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0 - -# CHECK: [0,0] DeeeeeeER . . . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE----R . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeER. . . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE----R. . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==eeeeeeER . . ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,5] . D===eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeeeeeER . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,7] . D===eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=======eeeeeeeER ld1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: [0,9] . D========eE-----R add x0, x27, #1 +# CHECK-NEXT: Index 0123456789 01 + +# CHECK: [0,0] DeeeeeeER . . .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE----R . . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER. .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE----R. .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D====eeeeeeeER ld1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: [0,9] . . 
D====eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1745,16 +1745,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 8.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: 9. 1 9.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.7 0.3 2.1 +# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: 9. 1 5.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.8 0.4 2.1 # CHECK: [13] Code Region - G14 @@ -1763,25 +1763,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 3503 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 0.57 # CHECK-NEXT: IPC: 0.29 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 01234567 # CHECK-NEXT: Index 0123456789 0123456789 # CHECK: [0,0] DeeeeeeeER. . . . . . . ld1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: [0,1] D=eE-----R. . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=======eeeeeeeER . . . . . 
ld1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: [0,3] D========eE-----R . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=============eeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: [0,5] .D==============eE-----R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D====================eeeeeeeER . . ld1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: [0,7] .D=====================eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==========================eeeeeeeER ld1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: [0,9] . D===========================eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE-----R. . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D=====eeeeeeeER . . . . . ld1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: [0,3] . D=====eE-----R . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==========eeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: [0,5] . D==========eE-----R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D===============eeeeeeeER . . ld1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: [0,7] . . D===============eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D====================eeeeeeeER ld1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: [0,9] . . D====================eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1791,16 +1791,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: 3. 1 9.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 14.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: 5. 1 15.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 21.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: 7. 1 22.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 27.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: 9. 1 28.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 14.7 0.1 2.5 +# CHECK-NEXT: 1. 
1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: 3. 1 6.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: 5. 1 11.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 16.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: 7. 1 16.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 21.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: 9. 1 21.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 11.0 0.1 2.5 # CHECK: [14] Code Region - G15 @@ -1809,25 +1809,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 3503 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 0.57 # CHECK-NEXT: IPC: 0.29 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 01234567 # CHECK-NEXT: Index 0123456789 0123456789 # CHECK: [0,0] DeeeeeeeER. . . . . . . ld1 { v1.h }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=======eeeeeeeER . . . . . ld1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: [0,3] D========eE-----R . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=============eeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: [0,5] .D==============eE-----R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D====================eeeeeeeER . . ld1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D=====================eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==========================eeeeeeeER ld1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: [0,9] . D===========================eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE-----R. . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D=====eeeeeeeER . . . . . ld1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: [0,3] . D=====eE-----R . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==========eeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: [0,5] . D==========eE-----R . . . . 
add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D===============eeeeeeeER . . ld1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . D===============eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D====================eeeeeeeER ld1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: [0,9] . . D====================eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1837,43 +1837,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.h }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: 3. 1 9.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 14.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: 5. 1 15.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 21.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: 7. 1 22.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 27.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: 9. 1 28.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 14.7 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: 3. 1 6.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: 5. 1 11.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 16.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: 7. 1 16.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 21.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: 9. 
1 21.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 11.0 0.1 2.5 # CHECK: [15] Code Region - G16 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total Cycles: 1107 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 1.81 -# CHECK-NEXT: IPC: 0.91 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: IPC: 0.90 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.d }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeER . ld1r { v1.1d }, [x27], #8 -# CHECK-NEXT: [0,3] D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeER . ld1r { v1.2d }, [x27], #8 -# CHECK-NEXT: [0,5] .D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeER. ld1r { v1.2s }, [x27], #4 -# CHECK-NEXT: [0,7] .D===eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeER ld1r { v1.4h }, [x27], #2 -# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld1r { v1.1d }, [x27], #8 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld1r { v1.2d }, [x27], #8 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld1r { v1.2s }, [x27], #4 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld1r { v1.4h }, [x27], #2 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1883,43 +1883,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.d }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 
1 2.0 0.0 0.0 ld1r { v1.1d }, [x27], #8 -# CHECK-NEXT: 3. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.2d }, [x27], #8 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.2s }, [x27], #4 -# CHECK-NEXT: 7. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], #2 -# CHECK-NEXT: 9. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1r { v1.1d }, [x27], #8 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], #8 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1r { v1.2s }, [x27], #4 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1r { v1.4h }, [x27], #2 +# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [16] Code Region - G17 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 509 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.93 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.98 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld1r { v1.4s }, [x27], #4 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeER . ld1r { v1.8b }, [x27], #1 -# CHECK-NEXT: [0,3] D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeER . ld1r { v1.8h }, [x27], #2 -# CHECK-NEXT: [0,5] .D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeER. ld1r { v1.16b }, [x27], #1 -# CHECK-NEXT: [0,7] .D===eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . 
D==eeeeeeeER ld1r { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld1r { v1.4s }, [x27], #4 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld1r { v1.8b }, [x27], #1 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld1r { v1.8h }, [x27], #2 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld1r { v1.16b }, [x27], #1 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld1r { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1929,43 +1929,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], #4 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1 -# CHECK-NEXT: 3. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.8h }, [x27], #2 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.16b }, [x27], #1 -# CHECK-NEXT: 7. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.1d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1r { v1.8b }, [x27], #1 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], #2 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1r { v1.16b }, [x27], #1 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1r { v1.1d }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [17] Code Region - G18 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 509 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.93 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.98 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld1r { v1.2d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeER . ld1r { v1.2s }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeER . ld1r { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeER. ld1r { v1.4s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeER ld1r { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld1r { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld1r { v1.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld1r { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld1r { v1.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld1r { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1975,43 +1975,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], x28 -# CHECK-NEXT: 1. 
1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1r { v1.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1r { v1.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1r { v1.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [18] Code Region - G19 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 509 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.11 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.58 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 8.7 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld1r { v1.8h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeER . ld1r { v1.16b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,7] . D==eE-----R. 
add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld1r { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld1r { v1.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2021,43 +2021,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1r { v1.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 9. 
1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [19] Code Region - G20 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 509 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.89 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.98 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 10.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,7] . D=eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2067,43 +2067,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 7. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [20] Code Region - G21 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 509 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.89 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.98 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 10.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE-----R . 
add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2113,16 +2113,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 7. 
1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [21] Code Region - G22 @@ -2131,25 +2131,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 2909 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 1.03 # CHECK-NEXT: IPC: 0.34 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK-NEXT: Block RThroughput: 10.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 01234567 # CHECK-NEXT: Index 0123456789 0123456789 # CHECK: [0,0] DeeeeeeeER. . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D======eeeeeeeER . . . . . ld2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: [0,3] .D=======eE-----R . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D============eeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: [0,5] . D=============eE-----R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==================eeeeeeeER . . ld2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: [0,7] . D===================eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D========================eeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: [0,9] . D=========================eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE-----R. . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D=====eeeeeeeER . . . . . ld2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: [0,3] . D=====eE-----R . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==========eeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: [0,5] . D==========eE-----R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D===============eeeeeeeER . . ld2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . D===============eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D====================eeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: [0,9] . . 
D====================eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2159,16 +2159,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: 3. 1 8.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: 5. 1 14.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: 7. 1 20.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: 9. 1 26.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 13.5 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: 3. 1 6.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: 5. 1 11.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 16.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: 7. 1 16.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 21.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: 9. 1 21.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 11.0 0.1 2.5 # CHECK: [22] Code Region - G23 @@ -2177,25 +2177,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 3503 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 0.86 # CHECK-NEXT: IPC: 0.29 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK-NEXT: Block RThroughput: 10.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 01234567 # CHECK-NEXT: Index 0123456789 0123456789 # CHECK: [0,0] DeeeeeeeER. . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: [0,1] D=eE-----R. . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D======eeeeeeeER . . . . . 
ld2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: [0,3] .D=======eE-----R . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D============eeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D=============eE-----R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==================eeeeeeeER . . ld2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: [0,7] . D===================eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D========================eeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: [0,9] . D=========================eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE-----R. . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D=====eeeeeeeER . . . . . ld2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: [0,3] . D=====eE-----R . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==========eeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D==========eE-----R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D===============eeeeeeeER . . ld2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: [0,7] . . D===============eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D====================eeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: [0,9] . . D====================eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2205,16 +2205,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: 3. 1 8.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: 5. 1 14.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: 7. 1 20.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: 9. 
1 26.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 13.5 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: 3. 1 6.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: 5. 1 11.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 16.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: 7. 1 16.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 21.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: 9. 1 21.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 11.0 0.1 2.5 # CHECK: [23] Code Region - G24 @@ -2223,25 +2223,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 2303 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 1.30 # CHECK-NEXT: IPC: 0.43 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK-NEXT: Block RThroughput: 10.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 012345 # CHECK: [0,0] DeeeeeeeER. . . . ld2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D======eeeeeeeER . . ld2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: [0,3] .D=======eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D============eeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D=============eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D============eeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,7] . D=============eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D============eeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16 -# CHECK-NEXT: [0,9] . D=============eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE-----R. . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D=====eeeeeeeER . . ld2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: [0,3] . D=====eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==========eeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: [0,5] . 
D==========eE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D=========eeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,7] . . D=========eE-----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D========eeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16 +# CHECK-NEXT: [0,9] . . D========eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2251,43 +2251,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: 3. 1 8.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: 5. 1 14.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 13.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 7. 1 14.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 13.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16 -# CHECK-NEXT: 9. 1 14.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 9.9 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: 3. 1 6.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: 5. 1 11.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 10.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 7. 1 10.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 9.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16 +# CHECK-NEXT: 9. 
1 9.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 7.4 0.1 2.5 # CHECK: [24] Code Region - G25 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 509 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.89 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.98 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 10.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld2r { v1.2s, v2.2s }, [x27], #8 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8 -# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2 -# CHECK-NEXT: [0,7] . D=eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4 -# CHECK-NEXT: [0,9] . D=eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld2r { v1.2s, v2.2s }, [x27], #8 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld2r { v1.4h, v2.4h }, [x27], #4 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld2r { v1.4s, v2.4s }, [x27], #8 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], #2 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2297,43 +2297,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2 -# CHECK-NEXT: 7. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4 -# CHECK-NEXT: 9. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4 +# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [25] Code Region - G26 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 509 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.89 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.98 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 10.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld2r { v1.16b, v2.16b }, [x27], #2 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE-----R . 
add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld2r { v1.16b, v2.16b }, [x27], #2 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld2r { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld2r { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld2r { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2343,43 +2343,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 7. 
1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [26] Code Region - G27 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 609 +# CHECK-NEXT: Total Cycles: 1108 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.25 -# CHECK-NEXT: IPC: 1.64 -# CHECK-NEXT: Block RThroughput: 5.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.89 +# CHECK-NEXT: IPC: 0.90 +# CHECK-NEXT: Block RThroughput: 10.7 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld2r { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld2r { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld2r { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld2r { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,9] . . 
DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2389,43 +2389,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 2.6 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 9. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.6 2.5 # CHECK: [27] Code Region - G28 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 759 +# CHECK-NEXT: Total Cycles: 1508 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.27 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.65 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 13.3 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,1] D=eE------R .. 
add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012 + +# CHECK: [0,0] DeeeeeeeeER . . . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,1] . DeE-----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,3] . DeE-----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeeeeeER . . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,5] . . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeeeER. . ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,7] . . .DeE-----R. . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,9] . . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2435,43 +2435,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 9. 
1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 3.0 +# CHECK-NEXT: 1. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 3. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 5. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 7. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 9. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.5 # CHECK: [28] Code Region - G29 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 759 +# CHECK-NEXT: Total Cycles: 1508 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.27 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.65 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 13.3 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012 + +# CHECK: [0,0] DeeeeeeeeER . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,1] . DeE-----R . . . 
add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeeeeeER . . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeeeER. . ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . . .DeE-----R. . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2481,43 +2481,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 3.0 +# CHECK-NEXT: 1. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.5 # CHECK: [29] Code Region - G30 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1909 +# CHECK-NEXT: Total Cycles: 1912 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 1.99 # CHECK-NEXT: IPC: 0.52 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK-NEXT: Block RThroughput: 12.7 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 01234567 +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 -# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D========eeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: [0,7] . D=========eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==============eeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: [0,9] . D===============eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE-----R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeeeeeER . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . . DeE-----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . D=====eeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: [0,7] . . .D====eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . D=========eeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: [0,9] . . . 
D========eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2527,16 +2527,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: 7. 1 10.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 15.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: 9. 1 16.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 6.1 0.2 2.8 +# CHECK-NEXT: 1. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 6.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: 7. 1 5.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 10.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: 9. 1 9.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.6 0.6 2.5 # CHECK: [30] Code Region - G31 @@ -2545,25 +2545,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 3503 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 1.00 # CHECK-NEXT: IPC: 0.29 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK-NEXT: Block RThroughput: 11.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 01234567 # CHECK-NEXT: Index 0123456789 0123456789 # CHECK: [0,0] DeeeeeeeER. . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D======eeeeeeeER . . . . . 
ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: [0,3] .D=======eE-----R . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D============eeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: [0,5] . D=============eE-----R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==================eeeeeeeER . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: [0,7] . D===================eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D========================eeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D=========================eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,1] . DeE----R. . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D====eeeeeeeER . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: [0,3] . D===eE-----R . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .D========eeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: [0,5] . . D=======eE-----R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . D============eeeeeeeER . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: [0,7] . . .D===========eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . D================eeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . . D===============eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2573,16 +2573,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: 3. 1 8.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: 5. 1 14.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: 7. 1 20.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 25.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: 9. 1 26.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 13.5 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 5.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: 3. 1 4.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 9.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: 5. 1 8.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 13.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: 7. 1 12.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 17.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: 9. 1 16.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 8.6 0.2 2.4 # CHECK: [31] Code Region - G32 @@ -2591,25 +2591,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 3503 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 1.00 # CHECK-NEXT: IPC: 0.29 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK-NEXT: Block RThroughput: 11.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 01234567 # CHECK-NEXT: Index 0123456789 0123456789 # CHECK: [0,0] DeeeeeeeER. . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D======eeeeeeeER . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: [0,3] .D=======eE-----R . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D============eeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D=============eE-----R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==================eeeeeeeER . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: [0,7] . D===================eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D========================eeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D=========================eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,1] . DeE----R. . . . . . . 
add x0, x27, #1 +# CHECK-NEXT: [0,2] . D====eeeeeeeER . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: [0,3] . D===eE-----R . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .D========eeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: [0,5] . . D=======eE-----R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . D============eeeeeeeER . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: [0,7] . . .D===========eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . D================eeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . . D===============eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2619,43 +2619,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: 3. 1 8.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: 5. 1 14.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: 7. 1 20.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: 9. 1 26.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 13.5 0.1 2.5 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 5.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: 3. 1 4.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 9.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: 5. 1 8.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 13.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: 7. 1 12.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 17.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: 9. 
1 16.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 8.6 0.2 2.4 # CHECK: [32] Code Region - G33 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 758 +# CHECK-NEXT: Total Cycles: 1507 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.62 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.32 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 11.7 # CHECK: Timeline view: -# CHECK-NEXT: 012345 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeER. . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 -# CHECK-NEXT: [0,5] . D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 -# CHECK-NEXT: [0,7] . D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 -# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01 + +# CHECK: [0,0] DeeeeeeeER. . .. ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: [0,1] . DeE----R. . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . .. ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 +# CHECK-NEXT: [0,3] . DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeeeeER .. ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 +# CHECK-NEXT: [0,5] . . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeeER .. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 +# CHECK-NEXT: [0,7] . . .DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 +# CHECK-NEXT: [0,9] . . . 
DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2665,43 +2665,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 -# CHECK-NEXT: 9. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 2.5 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 +# CHECK-NEXT: 5. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 +# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 +# CHECK-NEXT: 9. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.0 # CHECK: [33] Code Region - G34 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 758 +# CHECK-NEXT: Total Cycles: 1507 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.62 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.32 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 11.7 # CHECK: Timeline view: -# CHECK-NEXT: 012345 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeER. . ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 -# CHECK-NEXT: [0,1] D=eE-----R. . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 -# CHECK-NEXT: [0,5] . D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01 + +# CHECK: [0,0] DeeeeeeeER. . .. ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 +# CHECK-NEXT: [0,1] . DeE----R. . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . .. ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 +# CHECK-NEXT: [0,3] . DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 +# CHECK-NEXT: [0,5] . . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeeER .. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,7] . . .DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2711,43 +2711,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 
1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 2.5 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 +# CHECK-NEXT: 5. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.0 # CHECK: [34] Code Region - G35 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 758 +# CHECK-NEXT: Total Cycles: 1507 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.62 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.32 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 11.7 # CHECK: Timeline view: -# CHECK-NEXT: 012345 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeER. . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeER . ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01 + +# CHECK: [0,0] DeeeeeeeER. . .. ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE----R. . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . 
DeeeeeeeER . .. ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeeeeER .. ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeeER .. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . . .DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2757,43 +2757,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 2.5 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.0 # CHECK: [35] Code Region - G36 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 960 +# CHECK-NEXT: Total Cycles: 1709 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.69 -# CHECK-NEXT: IPC: 1.04 -# CHECK-NEXT: Block RThroughput: 9.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.63 +# CHECK-NEXT: IPC: 0.59 +# CHECK-NEXT: Block RThroughput: 15.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 0123456789 012345 -# CHECK: [0,0] DeeeeeeeER. . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,3] . DeE--------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER . . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,5] . DeE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,7] . .DeE------R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . . DeeeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,9] . . DeE--------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE----R. . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,3] . .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeeeeER . . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,5] . . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,7] . . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,9] . . . 
.DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2803,43 +2803,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 3. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 3. 1 1.0 1.0 6.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 5. 1 1.0 1.0 5.0 add x0, x27, #1 # CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 7. 1 1.0 1.0 5.0 add x0, x27, #1 # CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 9. 1 1.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.1 0.4 3.3 +# CHECK-NEXT: 9. 1 1.0 1.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.6 # CHECK: [36] Code Region - G37 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1009 +# CHECK-NEXT: Total Cycles: 1808 # CHECK-NEXT: Total uOps: 4800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.76 -# CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 10.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.65 +# CHECK-NEXT: IPC: 0.55 +# CHECK-NEXT: Block RThroughput: 16.0 # CHECK: Timeline view: -# CHECK-NEXT: 012345678 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345 -# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] . DeeeeeeeeeeER. . 
ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,3] . DeE--------R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,5] . DeE--------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . .DeeeeeeeeeeER ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,7] . . DeE--------R add x0, x27, #1 -# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,1] . DeE-----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,3] . .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeeeeeeER. . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,5] . . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . .DeeeeeeeeeeER . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,7] . . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,9] . . . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2849,43 +2849,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 5.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 3. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 3. 1 1.0 1.0 6.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 5. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 5. 1 1.0 1.0 6.0 add x0, x27, #1 # CHECK-NEXT: 6. 
1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 7. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 7. 1 1.0 1.0 6.0 add x0, x27, #1 # CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.0 0.5 3.6 +# CHECK-NEXT: 9. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.8 # CHECK: [37] Code Region - G38 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1011 +# CHECK-NEXT: Total Cycles: 1809 # CHECK-NEXT: Total uOps: 4800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.75 -# CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 10.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.65 +# CHECK-NEXT: IPC: 0.55 +# CHECK-NEXT: Block RThroughput: 16.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0 - -# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] . DeeeeeeeeeeER. . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,3] . DeE--------R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . .DeeeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,7] . . DeE--------R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . . DeeeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,9] . . DeE--------R add x0, x27, #1 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeeeeER . . .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE-----R . . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeER . .. ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,3] . .DeE------R . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeeeeER . .. 
ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,5] . . DeE-----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeeeeeER .. ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,7] . . . DeE------R .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,9] . . . . DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2895,16 +2895,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 5.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 3. 1 1.0 1.0 6.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 5. 1 1.0 1.0 5.0 add x0, x27, #1 # CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 7. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 7. 1 1.0 1.0 6.0 add x0, x27, #1 # CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 9. 1 1.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.0 0.5 3.6 +# CHECK-NEXT: 9. 1 1.0 1.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.8 # CHECK: [38] Code Region - G39 @@ -2913,25 +2913,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 1.25 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 10.0 +# CHECK-NEXT: Block RThroughput: 16.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . 
ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] . DeE----R . . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D====eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: [0,3] . . D==eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . . D========eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: [0,5] . . .D======eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . . D============eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: [0,7] . . . D==========eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . .D================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: [0,9] . . . . D==============eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2941,16 +2941,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 13.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 13.0 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 5.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 9.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: 5. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 13.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: 7. 1 11.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 17.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: 9. 1 15.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 8.2 0.2 2.8 # CHECK: [39] Code Region - G40 @@ -2959,25 +2959,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 1.25 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 10.0 +# CHECK-NEXT: Block RThroughput: 16.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . 
ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] . DeE----R . . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D====eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: [0,3] . . D==eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . . D========eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: [0,5] . . .D======eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . . D============eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: [0,7] . . . D==========eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . .D================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . . . D==============eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2987,43 +2987,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 13.0 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 5.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: 3. 
1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 9.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: 5. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 13.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: 7. 1 11.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 17.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: 9. 1 15.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 8.2 0.2 2.8 # CHECK: [40] Code Region - G41 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 2103 +# CHECK-NEXT: Total Cycles: 2106 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 2.38 -# CHECK-NEXT: IPC: 0.48 -# CHECK-NEXT: Block RThroughput: 10.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.37 +# CHECK-NEXT: IPC: 0.47 +# CHECK-NEXT: Block RThroughput: 16.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0123 - -# CHECK: [0,0] DeeeeeeeeER . . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: [0,1] .DeE------R . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: [0,3] . D======eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=====eeeeeeeeER. . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,5] . D=====eE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . .D======eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 -# CHECK-NEXT: [0,7] . . D======eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . . D=====eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 -# CHECK-NEXT: [0,9] . . D=====eE------R add x0, x27, #1 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeeeeER . . .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: [0,1] . DeE----R . . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . D====eeeeeeeeER . .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: [0,3] . . 
D==eE------R . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . . D=eeeeeeeeER. .. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,5] . . .DeE-----R. .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . . DeeeeeeeeER .. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 +# CHECK-NEXT: [0,7] . . . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . .DeeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 +# CHECK-NEXT: [0,9] . . . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3033,43 +3033,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 6.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 5. 1 6.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 7.0 2.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 -# CHECK-NEXT: 7. 1 7.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 6.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 -# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 5.4 0.3 3.0 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 5.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 1.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 +# CHECK-NEXT: 9. 
1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.7 0.7 2.3 # CHECK: [41] Code Region - G42 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1009 +# CHECK-NEXT: Total Cycles: 2007 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.96 -# CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 10.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.49 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 16.7 # CHECK: Timeline view: -# CHECK-NEXT: 012345678 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 -# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 -# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 -# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 -# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 -# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeeeeER . . .. ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 +# CHECK-NEXT: [0,1] . DeE----R . . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER. . .. ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 +# CHECK-NEXT: [0,3] . . DeE----R. . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeeeeER . .. ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 +# CHECK-NEXT: [0,5] . . .DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . . DeeeeeeeeER .. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 +# CHECK-NEXT: [0,7] . . . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . 
.DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 +# CHECK-NEXT: [0,9] . . . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3079,43 +3079,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 -# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 -# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 -# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 5. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 -# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 -# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.0 0.5 3.0 +# CHECK-NEXT: 9. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.0 # CHECK: [42] Code Region - G43 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1009 +# CHECK-NEXT: Total Cycles: 2007 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.96 -# CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 10.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.49 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 16.7 # CHECK: Timeline view: -# CHECK-NEXT: 012345678 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,3] . DeE------R . . 
add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeeeeER . . .. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE----R . . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER. . .. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . . DeE----R. . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeeeeER . .. ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . . .DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . . DeeeeeeeeER .. ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . . . DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . .DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . . . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3125,43 +3125,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 5. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 7. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.0 0.5 3.0 +# CHECK-NEXT: 9. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.0 # CHECK: [43] Code Region - G44 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 808 +# CHECK-NEXT: Total Cycles: 1606 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.70 -# CHECK-NEXT: IPC: 1.24 -# CHECK-NEXT: Block RThroughput: 8.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.37 +# CHECK-NEXT: IPC: 0.62 +# CHECK-NEXT: Block RThroughput: 12.7 # CHECK: Timeline view: -# CHECK-NEXT: 012345 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,1] .DeE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,3] . DeE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER. ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,5] . DeE------R. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeE-R. ldp s1, s2, [x27], #248 -# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . .D==eeeeeER ldp d1, d2, [x27], #496 -# CHECK-NEXT: [0,9] . .D===eE---R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01 + +# CHECK: [0,0] DeeeeeeeeER . .. ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE----R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER. .. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . . DeE----R. .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeeeeER .. ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . . .DeE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . . DeeeeeER.. 
ldp s1, s2, [x27], #248 +# CHECK-NEXT: [0,7] . . . DeE---R.. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeER ldp d1, d2, [x27], #496 +# CHECK-NEXT: [0,9] . . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3171,43 +3171,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 1.0 1.0 ldp s1, s2, [x27], #248 -# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ldp d1, d2, [x27], #496 -# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.8 0.5 2.6 +# CHECK-NEXT: 5. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ldp s1, s2, [x27], #248 +# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ldp d1, d2, [x27], #496 +# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.8 1.8 # CHECK: [44] Code Region - G45 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 508 +# CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.94 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ldp q1, q2, [x27], #992 -# CHECK-NEXT: [0,1] D=eE-----R. . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeE-R. . ldp s1, s2, [x27, #248]! -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeER. . ldp d1, d2, [x27, #496]! -# CHECK-NEXT: [0,5] .D==eE---R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeER ldp q1, q2, [x27, #992]! -# CHECK-NEXT: [0,7] .D===eE-----R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeE--R ldp w1, w2, [x27], #248 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . ldp q1, q2, [x27], #992 +# CHECK-NEXT: [0,1] .DeE-----R. . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER. . ldp s1, s2, [x27, #248]! +# CHECK-NEXT: [0,3] . DeE---R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER . ldp d1, d2, [x27, #496]! +# CHECK-NEXT: [0,5] . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER ldp q1, q2, [x27, #992]! +# CHECK-NEXT: [0,7] . . DeE-----R add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeE-R ldp w1, w2, [x27], #248 +# CHECK-NEXT: [0,9] . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3217,43 +3217,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp q1, q2, [x27], #992 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 1.0 ldp s1, s2, [x27, #248]! -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp d1, d2, [x27, #496]! -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldp q1, q2, [x27, #992]! -# CHECK-NEXT: 7. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 2.0 ldp w1, w2, [x27], #248 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.4 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ldp s1, s2, [x27, #248]! +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ldp d1, d2, [x27, #496]! +# CHECK-NEXT: 5. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 
1 1.0 1.0 0.0 ldp q1, q2, [x27, #992]! +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 1.0 ldp w1, w2, [x27], #248 +# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.0 # CHECK: [45] Code Region - G46 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 507 +# CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.94 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER .. ldp x1, x2, [x27], #496 -# CHECK-NEXT: [0,1] D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER .. ldp w1, w2, [x27, #248]! -# CHECK-NEXT: [0,3] D==eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeER .. ldp x1, x2, [x27, #496]! -# CHECK-NEXT: [0,5] .D==eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. ldpsw x1, x2, [x27], #248 -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeER ldpsw x1, x2, [x27, #248]! -# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . ldp x1, x2, [x27], #496 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . ldp w1, w2, [x27, #248]! +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . ldp x1, x2, [x27, #496]! +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeER . ldpsw x1, x2, [x27], #248 +# CHECK-NEXT: [0,7] . . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeER ldpsw x1, x2, [x27, #248]! +# CHECK-NEXT: [0,9] . . 
DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3263,16 +3263,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp x1, x2, [x27], #496 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldp w1, w2, [x27, #248]! -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp x1, x2, [x27, #496]! -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldpsw x1, x2, [x27], #248 -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldpsw x1, x2, [x27, #248]! -# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.2 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ldp w1, w2, [x27, #248]! +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ldp x1, x2, [x27, #496]! +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ldpsw x1, x2, [x27], #248 +# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ldpsw x1, x2, [x27, #248]! +# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.2 # CHECK: [46] Code Region - G47 @@ -3281,10 +3281,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 01 @@ -3292,14 +3292,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeER .. ldr b1, [x27], #254 # CHECK-NEXT: [0,1] D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER .. ldr h1, [x27], #254 -# CHECK-NEXT: [0,3] D==eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeER.. ldr s1, [x27], #254 -# CHECK-NEXT: [0,5] .D==eE---R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. 
ldr d1, [x27], #254 -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeER ldr q1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeER .. ldr h1, [x27], #254 +# CHECK-NEXT: [0,3] .D=eE---R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER.. ldr s1, [x27], #254 +# CHECK-NEXT: [0,5] . D=eE---R.. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeER. ldr d1, [x27], #254 +# CHECK-NEXT: [0,7] . D=eE---R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeER ldr q1, [x27], #254 +# CHECK-NEXT: [0,9] . D=eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3310,15 +3310,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr s1, [x27], #254 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27], #254 -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldr q1, [x27], #254 -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.5 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldr h1, [x27], #254 +# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldr s1, [x27], #254 +# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldr d1, [x27], #254 +# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldr q1, [x27], #254 +# CHECK-NEXT: 9. 
1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.5 # CHECK: [47] Code Region - G48 @@ -3327,10 +3327,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 01 @@ -3338,14 +3338,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeER .. ldr b1, [x27, #254]! # CHECK-NEXT: [0,1] D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeER .. ldr h1, [x27, #254]! -# CHECK-NEXT: [0,3] D==eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeER.. ldr s1, [x27, #254]! -# CHECK-NEXT: [0,5] .D==eE---R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeER. ldr d1, [x27, #254]! -# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeER ldr q1, [x27, #254]! -# CHECK-NEXT: [0,9] .D====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeER .. ldr h1, [x27, #254]! +# CHECK-NEXT: [0,3] .D=eE---R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER.. ldr s1, [x27, #254]! +# CHECK-NEXT: [0,5] . D=eE---R.. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeER. ldr d1, [x27, #254]! +# CHECK-NEXT: [0,7] . D=eE---R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeER ldr q1, [x27, #254]! +# CHECK-NEXT: [0,9] . D=eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3356,15 +3356,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27, #254]! -# CHECK-NEXT: 3. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr s1, [x27, #254]! -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27, #254]! -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 4.0 0.0 0.0 ldr q1, [x27, #254]! -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.5 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldr h1, [x27, #254]! +# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldr s1, [x27, #254]! +# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldr d1, [x27, #254]! +# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldr q1, [x27, #254]! +# CHECK-NEXT: 9. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.5 # CHECK: [48] Code Region - G49 @@ -3373,10 +3373,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -3384,14 +3384,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldr w1, [x27], #254 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . ldr x1, [x27], #254 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . ldr w1, [x27, #254]! -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. ldr x1, [x27, #254]! -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER ldrb w1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . ldr x1, [x27], #254 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . ldr w1, [x27, #254]! +# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER. ldr x1, [x27, #254]! +# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeER ldrb w1, [x27], #254 +# CHECK-NEXT: [0,9] . 
D=eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3402,15 +3402,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr w1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr x1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr w1, [x27, #254]! -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr x1, [x27, #254]! -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrb w1, [x27], #254 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldr x1, [x27], #254 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldr w1, [x27, #254]! +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldr x1, [x27, #254]! +# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldrb w1, [x27], #254 +# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.0 # CHECK: [49] Code Region - G50 @@ -3419,10 +3419,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -3430,14 +3430,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldrb w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . ldrh w1, [x27], #254 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . ldrh w1, [x27, #254]! -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. ldrsb w1, [x27], #254 -# CHECK-NEXT: [0,7] .D===eE--R. 
add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER ldrsb x1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . ldrh w1, [x27], #254 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . ldrh w1, [x27, #254]! +# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER. ldrsb w1, [x27], #254 +# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeER ldrsb x1, [x27], #254 +# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3448,15 +3448,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrb w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrh w1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrh w1, [x27, #254]! -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldrsb w1, [x27], #254 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrsb x1, [x27], #254 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldrh w1, [x27], #254 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldrh w1, [x27, #254]! +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldrsb w1, [x27], #254 +# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldrsb x1, [x27], #254 +# CHECK-NEXT: 9. 
1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.0 # CHECK: [50] Code Region - G51 @@ -3465,10 +3465,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 8 +# CHECK: Dispatch Width: 3 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -3476,14 +3476,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldrsb w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . ldrsb x1, [x27, #254]! -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . ldrsh w1, [x27], #254 -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. ldrsh x1, [x27], #254 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER ldrsh w1, [x27, #254]! -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . ldrsb x1, [x27, #254]! +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . ldrsh w1, [x27], #254 +# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER. ldrsh x1, [x27], #254 +# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeER ldrsh w1, [x27, #254]! +# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3494,41 +3494,41 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsb w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrsb x1, [x27, #254]! -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsh w1, [x27], #254 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldrsh x1, [x27], #254 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrsh w1, [x27, #254]! 
-# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldrsb x1, [x27, #254]! +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldrsh w1, [x27], #254 +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldrsh x1, [x27], #254 +# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldrsh w1, [x27, #254]! +# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.0 # CHECK: [51] Code Region - G52 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 1700 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.37 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.42 +# CHECK-NEXT: IPC: 1.42 +# CHECK-NEXT: Block RThroughput: 5.7 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]! -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER. ldrsw x1, [x27], #254 -# CHECK-NEXT: [0,3] D==eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER ldrsw x1, [x27, #254]! -# CHECK-NEXT: [0,5] .D==eE--R add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeE-R st1 { v1.1d }, [x27], #8 -# CHECK-NEXT: [0,7] .D===eE-R add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER st1 { v1.2d }, [x27], #16 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]! +# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . ldrsw x1, [x27], #254 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER. ldrsw x1, [x27, #254]! +# CHECK-NEXT: [0,5] . D=eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeE-R. st1 { v1.1d }, [x27], #8 +# CHECK-NEXT: [0,7] . DeE-R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . 
DeeER st1 { v1.2d }, [x27], #16 +# CHECK-NEXT: [0,9] . .DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3539,41 +3539,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsh x1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrsw x1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]! -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 1.0 st1 { v1.1d }, [x27], #8 -# CHECK-NEXT: 7. 1 4.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], #16 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.1 0.8 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldrsw x1, [x27], #254 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldrsw x1, [x27, #254]! +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 1.0 st1 { v1.1d }, [x27], #8 +# CHECK-NEXT: 7. 1 1.0 0.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.2d }, [x27], #16 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.3 0.2 0.8 # CHECK: [52] Code Region - G53 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.97 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeER. . st1 { v1.2s }, [x27], #8 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4h }, [x27], #8 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . 
st1 { v1.4s }, [x27], #16 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8b }, [x27], #8 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.8h }, [x27], #16 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . st1 { v1.2s }, [x27], #8 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.4h }, [x27], #8 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.4s }, [x27], #16 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.8b }, [x27], #8 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.8h }, [x27], #16 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3583,42 +3584,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], #8 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s }, [x27], #16 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], #8 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], #16 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.4h }, [x27], #8 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], #16 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.8b }, [x27], #8 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 
1 1.0 1.0 0.0 st1 { v1.8h }, [x27], #16 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [53] Code Region - G54 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.97 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeER. . st1 { v1.16b }, [x27], #16 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.2d }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.2s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . st1 { v1.16b }, [x27], #16 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.2s }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3628,42 +3630,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], #16 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 
1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.2s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.4h }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [54] Code Region - G55 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2100 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.17 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.09 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 7.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeER. . st1 { v1.4s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.8h }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.16b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,9] . 
D===eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . st1 { v1.4s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.16b }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3673,42 +3676,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.16b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 9. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [55] Code Region - G56 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2700 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.36 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.69 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 9.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . st1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3718,42 +3722,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 
1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [56] Code Region - G57 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.56 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.79 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 9.3 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . 
DeeER st1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3763,42 +3768,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [57] Code Region - G58 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.56 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.79 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 9.3 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . st1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3808,43 +3814,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 
1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [58] Code Region - G59 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 755 +# CHECK-NEXT: Total Cycles: 1504 # CHECK-NEXT: Total uOps: 3700 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.90 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.46 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 12.3 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeER . . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,1] D=eE-R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeER . . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,3] .D=eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeER . . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,5] . D==eE-R . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeER. . st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,7] . 
D==eE-R. . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeER . . . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: [0,1] . DeER . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER. . . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,3] . DeE-R. . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeER . . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,5] . . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeER. . st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,7] . . .DeER. . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,9] . . . DeE-R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3854,43 +3860,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 5. 1 3.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 7. 1 3.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 0.7 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 3. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 5. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 7. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 
1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 9. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 0.2 # CHECK: [59] Code Region - G60 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 755 +# CHECK-NEXT: Total Cycles: 1504 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.03 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.53 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 12.7 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeER . . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,1] D=eE-R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeER . . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,3] .D=eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,5] . D==eE--R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE-R. . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeER . . . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,1] . DeER . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER. . . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,3] . DeE-R. . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeER . . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,5] . . DeE-R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,7] . . .DeER. . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . . . 
DeE-R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3900,43 +3906,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 0.8 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 3. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 5. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 0.3 # CHECK: [60] Code Region - G61 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 755 +# CHECK-NEXT: Total Cycles: 1504 # CHECK-NEXT: Total uOps: 3700 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.90 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.46 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 12.3 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeER . . 
st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeER . . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeER. . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE--R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE-R. . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeER . . . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,1] . DeER . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeER . . . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeER . . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . . DeE-R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . . .DeER. . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . . . DeE-R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3946,43 +3952,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 
1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 0.7 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 0.2 # CHECK: [61] Code Region - G62 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 704 +# CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 3600 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.11 -# CHECK-NEXT: IPC: 1.42 -# CHECK-NEXT: Block RThroughput: 6.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.76 +# CHECK-NEXT: IPC: 0.77 +# CHECK-NEXT: Block RThroughput: 12.0 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeE-R . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,3] .D=eE-R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeER st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,5] . D=eE---R add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeE--R st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,7] . D==eE--R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE-R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,3] . 
DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,5] . . DeE-R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . .DeeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,9] . . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3992,43 +3998,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 1.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 3. 1 2.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 2.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.2 0.3 1.1 +# CHECK-NEXT: 1. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 5. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 9. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.7 0.2 # CHECK: [62] Code Region - G63 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 804 +# CHECK-NEXT: Total Cycles: 1603 # CHECK-NEXT: Total uOps: 4200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.22 -# CHECK-NEXT: IPC: 1.24 -# CHECK-NEXT: Block RThroughput: 8.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.62 +# CHECK-NEXT: IPC: 0.62 +# CHECK-NEXT: Block RThroughput: 14.0 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER .. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,1] .DeE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeE--R .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,3] .D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeER. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,5] . D=eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeER st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,7] . DeE---R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,1] . DeE-R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . . st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,3] . DeER . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeeER . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,5] . . DeE-R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeER. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,7] . . . DeE-R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . . . 
DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4038,43 +4044,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 2.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.8 0.4 1.3 +# CHECK-NEXT: 1. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 5. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 7. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.8 0.3 # CHECK: [63] Code Region - G64 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 705 +# CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.39 -# CHECK-NEXT: IPC: 1.42 -# CHECK-NEXT: Block RThroughput: 7.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.71 +# CHECK-NEXT: IPC: 0.71 +# CHECK-NEXT: Block RThroughput: 12.7 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 0123456 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER .. 
st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,1] .DeE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeE--R .. st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeER .. st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eER .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeER st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE---R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeE--R st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . .. st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE-R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . .. st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeER .. st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . . DeER .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . . .DeE-R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4084,43 +4090,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 2.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 2.0 0.0 2.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.9 0.2 1.4 +# CHECK-NEXT: 1. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.7 0.2 # CHECK: [64] Code Region - G65 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 706 +# CHECK-NEXT: Total Cycles: 1405 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.53 -# CHECK-NEXT: IPC: 1.42 -# CHECK-NEXT: Block RThroughput: 5.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.28 +# CHECK-NEXT: IPC: 0.71 +# CHECK-NEXT: Block RThroughput: 10.7 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,1] .DeE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] . DeeeeeER. . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,3] . DeE---R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeER . st1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: [0,5] . D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER. st1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE-R . . . 
add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . . DeE-R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeER. . st1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: [0,5] . . DeE--R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeER . st1 { v1.b }[8], [x27], #1 +# CHECK-NEXT: [0,7] . . .DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeER st1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4130,43 +4136,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 1.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.1 0.3 1.2 +# CHECK-NEXT: 3. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], #1 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.7 0.8 # CHECK: [65] Code Region - G66 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 506 +# CHECK-NEXT: Total Cycles: 1005 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . st1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . st1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeER . st1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. st1 { v1.h }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st1 { v1.h }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeER st1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: [0,9] . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4176,43 +4182,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 
1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.h }[0], [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.0 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.h }[0], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.0 # CHECK: [66] Code Region - G67 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 507 +# CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.34 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.19 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 7.3 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER .. st1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: [0,1] D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER .. st1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: [0,3] D==eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeER .. st1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: [0,5] .D==eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER.. st1 { v1.d }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D===eE--R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . 
D==eeeeeER st2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeER st2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,9] . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4222,43 +4228,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.1 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 9. 
1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.1 # CHECK: [67] Code Region - G68 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 507 +# CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.73 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.39 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER .. st2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,1] D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER .. st2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeER.. st2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE---R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER.. st2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,7] . D==eE--R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeER st2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,9] . D==eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER . st2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,5] . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeER st2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,9] . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4268,43 +4274,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 1. 
1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 9. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.1 1.2 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.2 # CHECK: [68] Code Region - G69 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 507 +# CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.13 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.58 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 8.7 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER .. st2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeER .. st2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeER .. st2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER.. 
st2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE--R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeER st2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D==eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . st2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,1] .DeE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER. . st2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE---R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeER st2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4314,43 +4320,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.9 0.1 1.3 +# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.3 # CHECK: [69] Code Region - G70 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 506 +# CHECK-NEXT: Total Cycles: 1005 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.74 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.39 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . st2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeER . st2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeER. st2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeER. st2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeER st2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER. . st2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE---R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER . st2 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeER st2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: [0,9] . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4360,43 +4366,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 1. 
1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.7 0.1 1.2 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.2 # CHECK: [70] Code Region - G71 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 506 +# CHECK-NEXT: Total Cycles: 1005 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . st2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. 
st2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st2 { v1.h, v2.h }[0], [x27], #4 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeER st2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4406,43 +4412,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.0 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.0 # CHECK: [71] Code Region - G72 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 506 +# CHECK-NEXT: Total Cycles: 1005 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . st2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st2 { v1.s, v2.s }[0], [x27], x28 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeER st2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4452,39 +4458,39 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: 1. 
1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.0 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.0 # CHECK: [72] Code Region - G73 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 600 -# CHECK-NEXT: Total Cycles: 407 +# CHECK-NEXT: Total Cycles: 706 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.91 -# CHECK-NEXT: IPC: 1.47 -# CHECK-NEXT: Block RThroughput: 3.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.83 +# CHECK-NEXT: IPC: 0.85 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 012 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,1] D=eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,3] .D=eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,5] . D==eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . . 
st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,1] . DeE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,3] . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,5] . .DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4494,39 +4500,39 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.8 0.3 1.7 +# CHECK-NEXT: 1. 1 1.0 1.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 5. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.7 1.5 # CHECK: [73] Code Region - G74 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 708 +# CHECK-NEXT: Total Cycles: 1406 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.37 -# CHECK-NEXT: IPC: 1.41 -# CHECK-NEXT: Block RThroughput: 7.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.70 +# CHECK-NEXT: IPC: 0.71 +# CHECK-NEXT: Block RThroughput: 12.7 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,3] .D=eE---R . . 
add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,1] . DeE---R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,3] . DeE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,5] . . DeE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,7] . . DeE---R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . .DeeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . . . DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4536,43 +4542,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 1.9 +# CHECK-NEXT: 1. 1 1.0 1.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 3. 
1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 5. 1 1.0 1.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 7. 1 1.0 1.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 1.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.9 1.5 # CHECK: [74] Code Region - G75 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 707 +# CHECK-NEXT: Total Cycles: 1206 # CHECK-NEXT: Total uOps: 3400 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.81 -# CHECK-NEXT: IPC: 1.41 -# CHECK-NEXT: Block RThroughput: 6.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.82 +# CHECK-NEXT: IPC: 0.83 +# CHECK-NEXT: Block RThroughput: 11.3 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeER . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeER . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeER . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE---R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER. . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE---R. . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . .DeE---R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeER. 
. st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE---R. . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . . .DeE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4582,43 +4588,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.7 0.2 1.7 +# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 1.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 1.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.7 1.5 # CHECK: [75] Code Region - G76 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 755 +# CHECK-NEXT: Total Cycles: 1504 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.30 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.66 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 13.3 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeE-R . . st3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: [0,3] .D=eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeER. . st3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: [0,5] . D==eE--R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER . st3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: [0,7] . D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE---R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER. . . st3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: [0,3] . DeE-R. . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeER . . st3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: [0,5] . . DeE-R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeER . st3 { v1.b, v2.b, v3.b }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . .DeE-R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: [0,9] . . . 
DeE-R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4628,43 +4634,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 1.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 1.4 +# CHECK-NEXT: 1. 1 1.0 1.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: 3. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: 5. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: 9. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 0.7 # CHECK: [76] Code Region - G77 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 755 +# CHECK-NEXT: Total Cycles: 1504 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.30 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.66 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 13.3 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: [0,1] D=eE--R . . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeER . . st3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: [0,3] .D=eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeER. . st3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D==eE--R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: [0,7] . D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: [0,1] . DeE-R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER. . . st3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: [0,3] . DeE-R. . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: [0,5] . . DeE-R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28 +# CHECK-NEXT: [0,7] . . .DeE-R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: [0,9] . . . DeE-R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4674,43 +4680,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 1.0 +# CHECK-NEXT: 1. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 2. 
1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: 3. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: 5. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: 9. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 0.5 # CHECK: [77] Code Region - G78 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 808 +# CHECK-NEXT: Total Cycles: 1607 # CHECK-NEXT: Total uOps: 4200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.20 -# CHECK-NEXT: IPC: 1.24 -# CHECK-NEXT: Block RThroughput: 8.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.61 +# CHECK-NEXT: IPC: 0.62 +# CHECK-NEXT: Block RThroughput: 14.0 # CHECK: Timeline view: -# CHECK-NEXT: 012345 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeER . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: [0,3] .D=eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeER . st3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D==eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeER . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,7] . D=eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012 + +# CHECK: [0,0] DeeeeER . . . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: [0,1] . DeE-R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER . . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: [0,3] . DeE--R . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . 
.DeeeeeER . . . st3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: [0,5] . . DeE--R . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeER . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,7] . . . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,9] . . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4720,43 +4726,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.0 0.3 1.7 +# CHECK-NEXT: 1. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: 3. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: 5. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 7. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 9. 
1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 1.1 # CHECK: [78] Code Region - G79 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1207 +# CHECK-NEXT: Total Cycles: 2107 # CHECK-NEXT: Total uOps: 5800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.81 -# CHECK-NEXT: IPC: 0.83 -# CHECK-NEXT: Block RThroughput: 12.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.75 +# CHECK-NEXT: IPC: 0.47 +# CHECK-NEXT: Block RThroughput: 19.3 # CHECK: Timeline view: -# CHECK-NEXT: 012345678 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234567 -# CHECK: [0,0] DeeeeeeeER. . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE-----R. . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeeER . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,3] . DeE-------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,5] . D=eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeeeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,7] . D==eE-------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . .D=eeeeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,9] . . D=eE-------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,1] . DeE----R. . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeER. . . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,3] . . DeE----R. . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeeeER . . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,5] . . DeE----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . .DeeeeeeeeeER . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,7] . . . DeE----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . 
.DeeeeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,9] . . . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4766,43 +4772,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 3. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 2.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 7. 1 3.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 9. 1 2.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.8 0.4 3.1 +# CHECK-NEXT: 5. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 9. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 2.0 # CHECK: [79] Code Region - G80 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1007 +# CHECK-NEXT: Total Cycles: 1807 # CHECK-NEXT: Total uOps: 4800 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.77 -# CHECK-NEXT: IPC: 0.99 -# CHECK-NEXT: Block RThroughput: 9.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.66 +# CHECK-NEXT: IPC: 0.55 +# CHECK-NEXT: Block RThroughput: 16.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeER . 
.. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] . DeeeeeeeER .. st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,3] . D=eE-----R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeER .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE-----R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeeeER st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE-------R add x0, x27, #1 -# CHECK-NEXT: [0,8] . .D=eeeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . .D==eE-----R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE--R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . .DeE----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeeeER . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . . DeE----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeeeeeER . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . . . DeE----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . . . . DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4812,43 +4818,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 5. 
1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.9 0.4 2.6 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 1.8 # CHECK: [80] Code Region - G81 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1057 +# CHECK-NEXT: Total Cycles: 1905 # CHECK-NEXT: Total uOps: 5200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.92 -# CHECK-NEXT: IPC: 0.95 -# CHECK-NEXT: Block RThroughput: 10.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.73 +# CHECK-NEXT: IPC: 0.52 +# CHECK-NEXT: Block RThroughput: 17.3 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123 -# CHECK: [0,0] DeeeeeeeeeER .. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,1] .DeE-------R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] . DeeeeeeeeeER .. st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,3] . DeE-------R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==eeeeeER .. st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: [0,5] . D===eE---R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D===eeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: [0,7] . D====eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . 
.D===eeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: [0,9] . .D====eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeeER . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,1] . DeE----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . . DeE----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . . DeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 +# CHECK-NEXT: [0,5] . . . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . . DeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: [0,7] . . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . .DeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4858,43 +4864,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 1.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: 5. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: 7. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.6 2.3 +# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 +# CHECK-NEXT: 5. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: 7. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 
1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: 9. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 1.4 # CHECK: [81] Code Region - G82 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 756 +# CHECK-NEXT: Total Cycles: 1505 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.29 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.66 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 13.3 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeER . . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: [0,3] .D=eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: [0,5] . D==eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: [0,7] . D==eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: [0,1] . DeE--R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER . . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeeER . . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 +# CHECK-NEXT: [0,5] . . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . .DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . . DeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: [0,9] . . . 
DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4904,41 +4910,41 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 1.5 +# CHECK-NEXT: 1. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: 3. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 +# CHECK-NEXT: 5. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: 9. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 1.0 # CHECK: [82] Code Region - G83 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 800 -# CHECK-NEXT: Total Cycles: 605 +# CHECK-NEXT: Total Cycles: 1204 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 5.29 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 6.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.66 +# CHECK-NEXT: IPC: 0.66 +# CHECK-NEXT: Block RThroughput: 10.7 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeER . 
st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: [0,1] D=eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D=eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: [0,5] . D==eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: [0,7] . D==eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeeER . . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: [0,1] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: [0,3] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . .DeeeeER . st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: [0,5] . . DeE-R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . .DeE-R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4948,34 +4954,34 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.0 0.3 1.3 +# CHECK-NEXT: 1. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: 3. 1 1.0 1.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: 5. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 6. 
1 1.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 1.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 1.0 0.8 # CHECK: [83] Code Region - G84 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 204 +# CHECK-NEXT: Total Cycles: 403 # CHECK-NEXT: Total uOps: 1000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.90 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 1.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.48 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345 +# CHECK-NEXT: Index 0123456 -# CHECK: [0,0] DeeER. stp s1, s2, [x27], #248 -# CHECK-NEXT: [0,1] D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER stp d1, d2, [x27], #496 -# CHECK-NEXT: [0,3] .D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER.. stp s1, s2, [x27], #248 +# CHECK-NEXT: [0,1] .DeER.. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER stp d1, d2, [x27], #496 +# CHECK-NEXT: [0,3] . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4985,36 +4991,37 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp s1, s2, [x27], #248 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stp d1, d2, [x27], #496 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.3 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 stp d1, d2, [x27], #496 +# CHECK-NEXT: 3. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [84] Code Region - G85 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 3100 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.41 -# CHECK-NEXT: IPC: 1.42 -# CHECK-NEXT: Block RThroughput: 6.5 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.58 +# CHECK-NEXT: IPC: 0.83 +# CHECK-NEXT: Block RThroughput: 10.3 # CHECK: Timeline view: +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeER . stp q1, q2, [x27], #992 -# CHECK-NEXT: [0,1] D=eE-R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . stp s1, s2, [x27, #248]! -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . stp d1, d2, [x27, #496]! -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeER stp q1, q2, [x27, #992]! -# CHECK-NEXT: [0,7] . D==eE-R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eE-R stp w1, w2, [x27], #248 -# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 +# CHECK: [0,0] DeeeER . . stp q1, q2, [x27], #992 +# CHECK-NEXT: [0,1] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . stp s1, s2, [x27, #248]! +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER. . stp d1, d2, [x27, #496]! +# CHECK-NEXT: [0,5] . .DeER. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . . DeeeER . stp q1, q2, [x27, #992]! +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeER. stp w1, w2, [x27], #248 +# CHECK-NEXT: [0,9] . . .DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5024,42 +5031,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp q1, q2, [x27], #992 -# CHECK-NEXT: 1. 1 2.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stp s1, s2, [x27, #248]! -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 1.0 0.0 0.0 stp d1, d2, [x27, #496]! -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 1.0 0.0 stp q1, q2, [x27, #992]! -# CHECK-NEXT: 7. 1 3.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 1.0 stp w1, w2, [x27], #248 -# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.9 0.2 0.3 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 stp s1, s2, [x27, #248]! +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 stp d1, d2, [x27, #496]! +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 stp q1, q2, [x27, #992]! +# CHECK-NEXT: 7. 1 1.0 1.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 stp w1, w2, [x27], #248 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.7 0.0 # CHECK: [85] Code Region - G86 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.56 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.29 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 7.7 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeER . . stp x1, x2, [x27], #496 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeER. . stp w1, w2, [x27, #248]! -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeER . stp x1, x2, [x27, #496]! -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeER. str b1, [x27], #254 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeER str h1, [x27], #254 -# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeER . . . stp x1, x2, [x27], #496 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeER . . 
stp w1, w2, [x27, #248]! +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeER . . stp x1, x2, [x27, #496]! +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . str b1, [x27], #254 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER str h1, [x27], #254 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5069,42 +5077,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp x1, x2, [x27], #496 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stp w1, w2, [x27, #248]! -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 stp x1, x2, [x27, #496]! -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 str b1, [x27], #254 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 str h1, [x27], #254 -# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.8 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 stp w1, w2, [x27, #248]! +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 stp x1, x2, [x27, #496]! +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 str b1, [x27], #254 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 str h1, [x27], #254 +# CHECK-NEXT: 9. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [86] Code Region - G87 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.37 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.19 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 7.3 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeER. . str s1, [x27], #254 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . str d1, [x27], #254 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . str q1, [x27], #254 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeER. str b1, [x27, #254]! -# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER str h1, [x27, #254]! -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . str s1, [x27], #254 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . str d1, [x27], #254 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . str q1, [x27], #254 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . str b1, [x27, #254]! +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER str h1, [x27, #254]! +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5114,42 +5123,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27], #254 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str d1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 2.0 0.0 0.0 str q1, [x27], #254 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 str b1, [x27, #254]! -# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str h1, [x27, #254]! -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 str d1, [x27], #254 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 str q1, [x27], #254 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 str b1, [x27, #254]! +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 str h1, [x27, #254]! +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [87] Code Region - G88 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.37 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.19 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 7.3 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeeER. . str s1, [x27, #254]! -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . str d1, [x27, #254]! -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . str q1, [x27, #254]! -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eER . str w1, [x27], #254 -# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eER. str x1, [x27], #254 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . str s1, [x27, #254]! +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . str d1, [x27, #254]! 
+# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . str q1, [x27, #254]! +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeER. . str w1, [x27], #254 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeER. str x1, [x27], #254 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5159,42 +5169,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27, #254]! -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str d1, [x27, #254]! -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str q1, [x27, #254]! -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 str w1, [x27], #254 -# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str x1, [x27], #254 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 str d1, [x27, #254]! +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 str q1, [x27, #254]! +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 str w1, [x27], #254 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 str x1, [x27], #254 +# CHECK-NEXT: 9. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [88] Code Region - G89 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.97 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 - -# CHECK: [0,0] DeER . . str w1, [x27, #254]! -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eER. . str x1, [x27, #254]! -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eER . strb w1, [x27], #254 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eER . strb w1, [x27, #254]! -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eER. strh w1, [x27], #254 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeER . . . str w1, [x27, #254]! +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeER . . str x1, [x27, #254]! +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeER . . strb w1, [x27], #254 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeER. . strb w1, [x27, #254]! +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeER. strh w1, [x27], #254 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5204,34 +5215,34 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str w1, [x27, #254]! -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str x1, [x27, #254]! -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 2.0 0.0 0.0 strb w1, [x27], #254 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 strb w1, [x27, #254]! -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 strh w1, [x27], #254 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 str x1, [x27, #254]! +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 strb w1, [x27], #254 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 strb w1, [x27, #254]! +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 strh w1, [x27], #254 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [89] Code Region - G90 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 200 -# CHECK-NEXT: Total Cycles: 104 +# CHECK-NEXT: Total Cycles: 203 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 3.85 -# CHECK-NEXT: IPC: 1.92 -# CHECK-NEXT: Block RThroughput: 0.7 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 1.97 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 1.3 # CHECK: Timeline view: # CHECK-NEXT: Index 01234 # CHECK: [0,0] DeER. strh w1, [x27, #254]! -# CHECK-NEXT: [0,1] D=eER add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5241,20 +5252,20 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 strh w1, [x27, #254]! -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.5 0.0 +# CHECK-NEXT: 1. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [90] Code Region - G91 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 142 +# CHECK-NEXT: Total Cycles: 209 # CHECK-NEXT: Total uOps: 600 -# CHECK: Dispatch Width: 8 -# CHECK-NEXT: uOps Per Cycle: 4.23 -# CHECK-NEXT: IPC: 2.82 -# CHECK-NEXT: Block RThroughput: 1.3 +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 2.87 +# CHECK-NEXT: IPC: 1.91 +# CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -5262,8 +5273,8 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldr x1, [x27], #254 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D====eeeeER ldr x2, [x1], #254 -# CHECK-NEXT: [0,3] D=eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D===eeeeER ldr x2, [x1], #254 +# CHECK-NEXT: [0,3] .DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5274,6 +5285,6 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr x1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 5.0 0.0 0.0 ldr x2, [x1], #254 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.5 0.3 2.0 +# CHECK-NEXT: 2. 1 4.0 0.0 0.0 ldr x2, [x1], #254 +# CHECK-NEXT: 3. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.0 0.3 2.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-writeback.s index 46bea36d38eb8..d105b8b8f69a1 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-writeback.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-writeback.s @@ -1185,10 +1185,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1197,13 +1197,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2s }, [x27], #8 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4s }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.2s }, [x27], #8 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.4h }, [x27], #8 +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4s }, [x27], #16 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1215,14 +1215,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16 -# CHECK-NEXT: 3. 
1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s }, [x27], #8 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], #8 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], #16 +# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.0 # CHECK: [1] Code Region - G02 @@ -1231,10 +1231,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1243,13 +1243,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.16b }, [x27], #16 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.2d }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.16b }, [x27], #16 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,7] . D==eE----R. 
add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1261,14 +1261,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.16b }, [x27], #16 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.0 # CHECK: [2] Code Region - G03 @@ -1277,10 +1277,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1289,13 +1289,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.4s }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. 
ld1 { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.8h }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.4s }, [x27], x28 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1307,14 +1307,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8h }, [x27], x28 +# CHECK-NEXT: 9. 
1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.0 # CHECK: [3] Code Region - G04 @@ -1323,10 +1323,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1900 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.74 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 3.8 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1334,14 +1334,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1352,15 +1352,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 5. 
1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [4] Code Region - G05 @@ -1369,10 +1369,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.94 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1380,14 +1380,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,5] . D=eE----R . 
add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1398,15 +1398,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [5] Code Region - G06 @@ -1415,10 +1415,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.94 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1426,14 +1426,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.2d, v2.2d }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1444,15 +1444,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 
1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [6] Code Region - G07 @@ -1461,10 +1461,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.53 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 4.3 +# CHECK-NEXT: Block RThroughput: 4.6 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1472,14 +1472,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.8h, v2.8h }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1490,15 +1490,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], x28 # CHECK-NEXT: 1. 
1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [7] Code Region - G08 @@ -1507,7 +1507,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.92 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 5.0 @@ -1518,14 +1518,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,9] . 
D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1536,15 +1536,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 9. 
1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [8] Code Region - G09 @@ -1553,7 +1553,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.92 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 5.0 @@ -1564,14 +1564,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1582,42 +1582,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 5. 
1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [9] Code Region - G10 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 608 +# CHECK-NEXT: Total Cycles: 708 # CHECK-NEXT: Total uOps: 2700 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 4.44 -# CHECK-NEXT: IPC: 1.64 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.81 +# CHECK-NEXT: IPC: 1.41 # CHECK-NEXT: Block RThroughput: 5.7 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . . 
ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,7] . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,9] . .DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1628,42 +1628,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 9. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.6 0.1 2.2 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 9. 
1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.3 0.2 2.2 # CHECK: [10] Code Region - G11 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 675 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 4.44 -# CHECK-NEXT: IPC: 1.48 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.98 +# CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,9] . . 
DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1673,43 +1673,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.9 0.2 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [11] Code Region - G12 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 675 +# CHECK-NEXT: Total Cycles: 1008 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 4.44 -# CHECK-NEXT: IPC: 1.48 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.98 +# CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 6.7 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . 
ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1719,43 +1719,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 7. 
1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.9 0.2 2.5 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.5 # CHECK: [12] Code Region - G13 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1210 +# CHECK-NEXT: Total Cycles: 1212 # CHECK-NEXT: Total uOps: 2800 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.31 # CHECK-NEXT: IPC: 0.83 # CHECK-NEXT: Block RThroughput: 5.7 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 01 - -# CHECK: [0,0] DeeeeeeeER. . .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-----R . .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE-----R . .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE-----R . .. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D========eeeeeeeeER ld1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: [0,9] . D=========eE------R add x0, x27, #1 +# CHECK-NEXT: Index 0123456789 0123 + +# CHECK: [0,0] DeeeeeeeER. . . . 
ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE-----R. . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . . . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . . ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE-----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeER . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D======eeeeeeeeER ld1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: [0,9] . . D=======eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1765,16 +1765,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: 9. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.3 0.2 2.6 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 7.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: 9. 
1 8.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.4 2.6 # CHECK: [13] Code Region - G14 @@ -1783,10 +1783,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 0.50 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -1794,14 +1794,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.b }[8], [x27], #1 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: [0,9] . 
D=============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1812,15 +1812,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.b }[8], [x27], #1 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 16.7 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 15.5 0.1 3.0 # CHECK: [14] Code Region - G15 @@ -1829,10 +1829,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 0.50 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -1840,14 +1840,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.h }[0], [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: [0,3] D=========eE------R . . . . . . 
add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1858,15 +1858,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.h }[0], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 16.7 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: 3. 
1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 15.5 0.1 3.0 # CHECK: [15] Code Region - G16 @@ -1875,10 +1875,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 1600 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 1.60 # CHECK-NEXT: IPC: 1.00 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.2 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1886,14 +1886,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld1 { v1.d }[0], [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeE-R . ld1r { v1.1d }, [x27], #8 -# CHECK-NEXT: [0,3] D==eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1r { v1.2d }, [x27], #8 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1r { v1.2s }, [x27], #4 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1r { v1.4h }, [x27], #2 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeE-R . ld1r { v1.1d }, [x27], #8 +# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1r { v1.2d }, [x27], #8 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1r { v1.2s }, [x27], #4 +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeeER ld1r { v1.4h }, [x27], #2 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1904,15 +1904,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 ld1 { v1.d }[0], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 1.0 ld1r { v1.1d }, [x27], #8 -# CHECK-NEXT: 3. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.2d }, [x27], #8 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.2s }, [x27], #4 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1r { v1.4h }, [x27], #2 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.4 +# CHECK-NEXT: 2. 1 1.0 0.0 1.0 ld1r { v1.1d }, [x27], #8 +# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.2d }, [x27], #8 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], #4 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], #2 +# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.0 0.1 2.4 # CHECK: [16] Code Region - G17 @@ -1921,10 +1921,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1933,13 +1933,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1r { v1.4s }, [x27], #4 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1r { v1.8b }, [x27], #1 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1r { v1.8h }, [x27], #2 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1r { v1.16b }, [x27], #1 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1r { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE----R. . 
add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1r { v1.8h }, [x27], #2 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1r { v1.16b }, [x27], #1 +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ld1r { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1951,14 +1951,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], #4 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.8h }, [x27], #2 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.16b }, [x27], #1 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1r { v1.1d }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.8h }, [x27], #2 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], #1 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.1d }, [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.0 # CHECK: [17] Code Region - G18 @@ -1967,10 +1967,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -1979,13 +1979,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1r { v1.2d }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . 
ld1r { v1.2s }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1r { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1r { v1.4s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1r { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1r { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1r { v1.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ld1r { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1997,14 +1997,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1r { v1.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1r { v1.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.8b }, [x27], x28 +# CHECK-NEXT: 9. 
1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.0 # CHECK: [18] Code Region - G19 @@ -2013,10 +2013,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 1900 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.73 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.8 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2025,13 +2025,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1r { v1.8h }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1r { v1.16b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,7] . D==eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,9] . D==eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2043,14 +2043,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 
1 3.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 2.6 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 9. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.1 0.1 2.6 # CHECK: [19] Code Region - G20 @@ -2059,10 +2059,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.71 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 4.8 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2070,14 +2070,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], #32 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. 
ld2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2088,15 +2088,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [20] Code Region - G21 @@ -2105,10 +2105,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.31 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.4 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2116,14 +2116,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . 
ld2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2134,15 +2134,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 7. 
1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [21] Code Region - G22 @@ -2151,10 +2151,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 3310 # CHECK-NEXT: Total uOps: 2100 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 0.63 # CHECK-NEXT: IPC: 0.30 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.2 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -2162,14 +2162,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . 
D============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2180,15 +2180,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 16.7 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 15.5 0.1 3.0 # CHECK: [22] Code Region - G23 @@ -2197,10 +2197,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 0.50 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -2208,14 +2208,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2226,15 +2226,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 31.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 16.7 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 15.5 0.1 3.0 # CHECK: [23] Code Region - G24 @@ -2243,10 +2243,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 2403 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 0.83 # CHECK-NEXT: IPC: 0.42 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 @@ -2254,14 +2254,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . .. ld2 { v1.s, v2.s }[0], [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . .. ld2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: [0,3] D=========eE------R . .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===============eeeeeeeeER ld2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: [0,5] .D================eE------R add x0, x27, #1 -# CHECK-NEXT: [0,6] .D================eeeeeeE-R ld2r { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,7] .D=================eE-----R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D================eeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16 -# CHECK-NEXT: [0,9] . D=================eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . .. ld2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: [0,3] .D========eE------R . .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER ld2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: [0,5] . 
D===============eE------R add x0, x27, #1 +# CHECK-NEXT: [0,6] . D==============eeeeeeE-R ld2r { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,7] . D===============eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==============eeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16 +# CHECK-NEXT: [0,9] . D===============eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2272,15 +2272,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 17.0 0.0 1.0 ld2r { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 7. 1 18.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 17.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16 -# CHECK-NEXT: 9. 1 18.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 12.5 0.1 2.8 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 15.0 0.0 1.0 ld2r { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 7. 1 16.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 15.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16 +# CHECK-NEXT: 9. 
1 16.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 11.3 0.1 2.8 # CHECK: [24] Code Region - G25 @@ -2289,10 +2289,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.94 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -2300,14 +2300,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld2r { v1.2s, v2.2s }, [x27], #8 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld2r { v1.4h, v2.4h }, [x27], #4 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld2r { v1.4h, v2.4h }, [x27], #4 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2318,15 +2318,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8 -# CHECK-NEXT: 5. 
1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [25] Code Region - G26 @@ -2335,10 +2335,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.94 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -2346,14 +2346,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld2r { v1.16b, v2.16b }, [x27], #2 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld2r { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld2r { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE----R . 
add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2364,27 +2364,27 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 9. 
1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [26] Code Region - G27 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 512 +# CHECK-NEXT: Total Cycles: 611 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 4.69 -# CHECK-NEXT: IPC: 1.95 -# CHECK-NEXT: Block RThroughput: 3.7 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.93 +# CHECK-NEXT: IPC: 1.64 +# CHECK-NEXT: Block RThroughput: 4.8 # CHECK: Timeline view: # CHECK-NEXT: 0123456 @@ -2392,14 +2392,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . .. ld2r { v1.4s, v2.4s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. .. ld2r { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER .. ld2r { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE----R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER .. ld2r { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R .. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,9] . D===eE--------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. .. ld2r { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER .. ld2r { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER .. ld2r { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,9] . DeE--------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2410,42 +2410,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 
1 2.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 9. 1 4.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.4 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 9. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.4 0.1 2.4 # CHECK: [27] Code Region - G28 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 761 +# CHECK-NEXT: Total Cycles: 1011 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.26 -# CHECK-NEXT: IPC: 1.31 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.96 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: -# CHECK-NEXT: 012345678 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeeER . . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,3] .D=eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,5] . D==eE--------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeeE-R . ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,7] . 
D==eE-------R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,9] . D===eE--------R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0 + +# CHECK: [0,0] DeeeeeeeeER . . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,5] . DeE--------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,9] . . DeE--------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2455,43 +2455,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 5. 1 3.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 1.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 7. 1 3.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 9. 1 4.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 3.6 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 5. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 6. 
1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 9. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.4 # CHECK: [28] Code Region - G29 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 761 +# CHECK-NEXT: Total Cycles: 1011 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.26 -# CHECK-NEXT: IPC: 1.31 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.96 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: -# CHECK-NEXT: 012345678 -# CHECK-NEXT: Index 0123456789 - -# CHECK: [0,0] DeeeeeeeeeeER . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,1] D=eE--------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeeeER . . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE--------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeeER . . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeeER. . ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,7] . D==eE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE--------R add x0, x27, #1 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0 + +# CHECK: [0,0] DeeeeeeeeeeER . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,1] .DeE--------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeER. . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE--------R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeER . 
ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE--------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2501,43 +2501,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 1. 1 2.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 3.6 +# CHECK-NEXT: 1. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.6 # CHECK: [29] Code Region - G30 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 2210 +# CHECK-NEXT: Total Cycles: 2211 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 1.81 # CHECK-NEXT: IPC: 0.45 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 01 +# CHECK-NEXT: 0123456789 012 # CHECK-NEXT: Index 0123456789 0123456789 -# CHECK: [0,0] DeeeeeeeeER . . . .. ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . . . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeeeER . . . .. ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE--------R . . . .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeeeeER . . .. ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE--------R . . .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==========eeeeeeeeER . .. ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: [0,7] . D===========eE------R . .. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=================eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: [0,9] . D==================eE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE------R . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeER. . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE--------R. . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE--------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D========eeeeeeeeER. . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: [0,7] . . D========eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . 
D==============eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: [0,9] . . D==============eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2547,16 +2547,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 8.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 11.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: 7. 1 12.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 18.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: 9. 1 19.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 7.1 0.2 3.4 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 8.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: 7. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 15.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: 9. 1 15.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 5.4 0.3 3.4 # CHECK: [30] Code Region - G31 @@ -2565,25 +2565,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 1.00 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . 
ld3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2593,16 +2593,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 15.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 15.5 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 13.0 0.1 3.0 # CHECK: [31] Code Region - G32 @@ -2611,25 +2611,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 1.00 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: [0,7] . 
D======================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2639,16 +2639,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 15.5 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 
1 13.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 13.0 0.1 3.0 # CHECK: [32] Code Region - G33 @@ -2657,7 +2657,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.92 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 5.0 @@ -2668,14 +2668,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2686,15 +2686,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [33] Code Region - G34 @@ -2703,7 +2703,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.92 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 5.0 @@ -2714,14 +2714,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . 
D==eeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2732,15 +2732,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 
1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [34] Code Region - G35 @@ -2749,7 +2749,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.92 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 5.0 @@ -2760,14 +2760,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2778,42 +2778,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 
1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [35] Code Region - G36 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 809 +# CHECK-NEXT: Total Cycles: 909 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.56 -# CHECK-NEXT: IPC: 1.24 -# CHECK-NEXT: Block RThroughput: 8.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 4.95 +# CHECK-NEXT: IPC: 1.10 +# CHECK-NEXT: Block RThroughput: 9.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE----R . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER .. ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,5] . D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeeeeeeeER. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,7] . D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . 
D==eeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] D=eE----R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,5] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,7] . .DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2825,23 +2825,23 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 2.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 2.8 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 9. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.1 0.4 2.8 # CHECK: [36] Code Region - G37 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1008 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.96 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 10.0 @@ -2851,15 +2851,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,3] .D=eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D====eeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,9] . D=====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2869,25 +2869,25 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.5 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.0 # CHECK: [37] Code Region - G38 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 1008 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.96 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 10.0 @@ -2897,15 +2897,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . 
ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D====eeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,9] . D=====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2915,16 +2915,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.5 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 3. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.0 # CHECK: [38] Code Region - G39 @@ -2933,7 +2933,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 1.25 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 10.0 @@ -2943,15 +2943,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . 
ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2961,16 +2961,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 15.5 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: 9. 
1 25.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 13.0 0.1 3.0 # CHECK: [39] Code Region - G40 @@ -2979,7 +2979,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 5000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 1.25 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 10.0 @@ -2989,15 +2989,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . 
D========================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3007,16 +3007,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 15.5 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: 9. 
1 25.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 13.0 0.1 3.0 # CHECK: [40] Code Region - G41 @@ -3025,25 +3025,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 2103 # CHECK-NEXT: Total uOps: 4900 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.33 # CHECK-NEXT: IPC: 0.48 -# CHECK-NEXT: Block RThroughput: 9.5 +# CHECK-NEXT: Block RThroughput: 9.8 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 0123 # CHECK: [0,0] DeeeeeeeeER . . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: [0,1] D=eE------R . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D========eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=======eeeeeeeeER. . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,5] . D========eE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=========eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 -# CHECK-NEXT: [0,7] . D==========eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=========eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 -# CHECK-NEXT: [0,9] . D==========eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: [0,3] . D======eE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=====eeeeeeeeER. . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,5] . D=====eE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D======eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 +# CHECK-NEXT: [0,7] . . D======eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D=====eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 +# CHECK-NEXT: [0,9] . . 
D=====eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3053,43 +3053,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 8.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 5. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 10.0 2.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 -# CHECK-NEXT: 7. 1 11.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 10.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 -# CHECK-NEXT: 9. 1 11.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 7.9 0.3 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 6.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 5. 1 6.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 7.0 2.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 +# CHECK-NEXT: 7. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 6.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 +# CHECK-NEXT: 9. 
1 6.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 5.4 0.3 3.0 # CHECK: [41] Code Region - G42 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 759 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.93 -# CHECK-NEXT: IPC: 1.32 -# CHECK-NEXT: Block RThroughput: 7.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 4.46 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 9.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER .. ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 -# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 -# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 -# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 -# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 +# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 +# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 +# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 +# CHECK-NEXT: [0,9] . . 
DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3099,43 +3099,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 -# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 +# CHECK-NEXT: 9. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.0 # CHECK: [42] Code Region - G43 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 859 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 4700 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.47 -# CHECK-NEXT: IPC: 1.16 -# CHECK-NEXT: Block RThroughput: 8.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 4.66 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 9.4 # CHECK: Timeline view: -# CHECK-NEXT: 01234567 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D====eE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . . 
DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3145,43 +3145,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.4 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.0 # CHECK: [43] Code Region - G44 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 608 +# CHECK-NEXT: Total Cycles: 1007 # CHECK-NEXT: Total uOps: 3900 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 6.41 -# CHECK-NEXT: IPC: 1.64 -# CHECK-NEXT: Block RThroughput: 5.3 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.87 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 7.8 # CHECK: Timeline view: -# CHECK-NEXT: 0123 +# CHECK-NEXT: 0123456 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,5] . D==eE------R add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeE-R ldp s1, s2, [x27], #248 -# CHECK-NEXT: [0,7] . D==eE-----R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeER ldp d1, d2, [x27], #496 -# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER .. ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE------R .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE------R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER.. ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE------R.. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER.. ldp s1, s2, [x27], #248 +# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeER ldp d1, d2, [x27], #496 +# CHECK-NEXT: [0,9] . . 
DeE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3191,43 +3191,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 1.0 ldp s1, s2, [x27], #248 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ldp d1, d2, [x27], #496 -# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.1 0.2 2.8 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ldp s1, s2, [x27], #248 +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ldp d1, d2, [x27], #496 +# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 2.6 # CHECK: [44] Code Region - G45 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 507 +# CHECK-NEXT: Total Cycles: 807 # CHECK-NEXT: Total uOps: 2700 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.33 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 4.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.35 +# CHECK-NEXT: IPC: 1.24 +# CHECK-NEXT: Block RThroughput: 5.4 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER .. ldp q1, q2, [x27], #992 -# CHECK-NEXT: [0,1] D=eE----R .. 
add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeER.. ldp s1, s2, [x27, #248]! -# CHECK-NEXT: [0,3] .D=eE----R.. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeER. ldp d1, d2, [x27, #496]! -# CHECK-NEXT: [0,5] . D=eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeER ldp q1, q2, [x27, #992]! -# CHECK-NEXT: [0,7] . D=eE----R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeE-R ldp w1, w2, [x27], #248 -# CHECK-NEXT: [0,9] . D==eE---R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . . ldp q1, q2, [x27], #992 +# CHECK-NEXT: [0,1] .DeE----R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeER . ldp s1, s2, [x27, #248]! +# CHECK-NEXT: [0,3] . DeE----R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ldp d1, d2, [x27, #496]! +# CHECK-NEXT: [0,5] . DeE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeER ldp q1, q2, [x27, #992]! +# CHECK-NEXT: [0,7] . . DeE----R add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeE-R ldp w1, w2, [x27], #248 +# CHECK-NEXT: [0,9] . . D=eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3237,16 +3237,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp q1, q2, [x27], #992 -# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldp s1, s2, [x27, #248]! -# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldp d1, d2, [x27, #496]! -# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldp q1, q2, [x27, #992]! -# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 1.0 ldp w1, w2, [x27], #248 -# CHECK-NEXT: 9. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.7 0.1 2.0 +# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ldp s1, s2, [x27, #248]! +# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ldp d1, d2, [x27, #496]! +# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 
1 1.0 1.0 0.0 ldp q1, q2, [x27, #992]! +# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 1.0 ldp w1, w2, [x27], #248 +# CHECK-NEXT: 9. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.1 0.4 2.0 # CHECK: [45] Code Region - G46 @@ -3255,10 +3255,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2100 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.15 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 4.2 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -3266,14 +3266,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldp x1, x2, [x27], #496 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . ldp w1, w2, [x27, #248]! -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . ldp x1, x2, [x27, #496]! -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. ldpsw x1, x2, [x27], #248 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER ldpsw x1, x2, [x27, #248]! -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . ldp w1, w2, [x27, #248]! +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . ldp x1, x2, [x27, #496]! +# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER. ldpsw x1, x2, [x27], #248 +# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeER ldpsw x1, x2, [x27, #248]! +# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3284,15 +3284,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp x1, x2, [x27], #496 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldp w1, w2, [x27, #248]! -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldp x1, x2, [x27, #496]! 
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldpsw x1, x2, [x27], #248 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27, #248]! -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.1 1.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldp w1, w2, [x27, #248]! +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldp x1, x2, [x27, #496]! +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldpsw x1, x2, [x27], #248 +# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldpsw x1, x2, [x27, #248]! +# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.0 # CHECK: [46] Code Region - G47 @@ -3301,10 +3301,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -3313,13 +3313,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ldr b1, [x27], #254 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ldr h1, [x27], #254 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ldr s1, [x27], #254 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ldr d1, [x27], #254 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ldr q1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ldr s1, [x27], #254 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ldr d1, [x27], #254 +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . 
D==eeeeeeER ldr q1, [x27], #254 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3331,14 +3331,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr s1, [x27], #254 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27], #254 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldr q1, [x27], #254 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldr s1, [x27], #254 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldr d1, [x27], #254 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldr q1, [x27], #254 +# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.0 # CHECK: [47] Code Region - G48 @@ -3347,10 +3347,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 012 @@ -3359,13 +3359,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ldr b1, [x27, #254]! # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ldr h1, [x27, #254]! -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ldr s1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ldr d1, [x27, #254]! -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ldr q1, [x27, #254]! 
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ldr s1, [x27, #254]! +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ldr d1, [x27, #254]! +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ldr q1, [x27, #254]! +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3377,14 +3377,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27, #254]! -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr s1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27, #254]! -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldr q1, [x27, #254]! -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldr s1, [x27, #254]! +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldr d1, [x27, #254]! +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldr q1, [x27, #254]! +# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.0 # CHECK: [48] Code Region - G49 @@ -3393,10 +3393,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -3405,13 +3405,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldr w1, [x27], #254 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER . 
ldr x1, [x27], #254 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . ldr w1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. ldr x1, [x27, #254]! -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER ldrb w1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeER . ldr w1, [x27, #254]! +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeER. ldr x1, [x27, #254]! +# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeER ldrb w1, [x27], #254 +# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3423,14 +3423,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr w1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr x1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr w1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr x1, [x27, #254]! -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrb w1, [x27], #254 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 1.0 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldr w1, [x27, #254]! +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldr x1, [x27, #254]! +# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldrb w1, [x27], #254 +# CHECK-NEXT: 9. 
1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 1.0 # CHECK: [49] Code Region - G50 @@ -3439,10 +3439,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -3451,13 +3451,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldrb w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER . ldrh w1, [x27], #254 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . ldrh w1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. ldrsb w1, [x27], #254 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER ldrsb x1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeER . ldrh w1, [x27, #254]! +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeER. ldrsb w1, [x27], #254 +# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeER ldrsb x1, [x27], #254 +# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3469,14 +3469,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrb w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrh w1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrh w1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldrsb w1, [x27], #254 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrsb x1, [x27], #254 -# CHECK-NEXT: 9. 
1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 1.0 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldrh w1, [x27, #254]! +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldrsb w1, [x27], #254 +# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldrsb x1, [x27], #254 +# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 1.0 # CHECK: [50] Code Region - G51 @@ -3485,10 +3485,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -3497,13 +3497,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldrsb w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER . ldrsb x1, [x27, #254]! -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . ldrsh w1, [x27], #254 -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. ldrsh x1, [x27], #254 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER ldrsh w1, [x27, #254]! -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeER . ldrsh w1, [x27], #254 +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeER. ldrsh x1, [x27], #254 +# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeER ldrsh w1, [x27, #254]! +# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3515,14 +3515,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsb w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrsb x1, [x27, #254]! 
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsh w1, [x27], #254 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldrsh x1, [x27], #254 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrsh w1, [x27, #254]! -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 1.0 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldrsh w1, [x27], #254 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldrsh x1, [x27], #254 +# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldrsh w1, [x27, #254]! +# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 1.0 # CHECK: [51] Code Region - G52 @@ -3531,10 +3531,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1700 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.37 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.4 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 @@ -3542,13 +3542,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]! # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER. ldrsw x1, [x27], #254 -# CHECK-NEXT: [0,3] D==eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER ldrsw x1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eE--R add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeE-R st1 { v1.1d }, [x27], #8 -# CHECK-NEXT: [0,7] .D===eE-R add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER st1 { v1.2d }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeER ldrsw x1, [x27, #254]! +# CHECK-NEXT: [0,5] .D==eE--R add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeE-R st1 { v1.1d }, [x27], #8 +# CHECK-NEXT: [0,7] . D==eE-R add x0, x27, #1 +# CHECK-NEXT: [0,8] . 
D=eeER st1 { v1.2d }, [x27], #16 +# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3560,14 +3560,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsh x1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrsw x1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 1.0 st1 { v1.1d }, [x27], #8 -# CHECK-NEXT: 7. 1 4.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 0.8 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldrsw x1, [x27, #254]! +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 1.0 st1 { v1.1d }, [x27], #8 +# CHECK-NEXT: 7. 1 3.0 0.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], #16 +# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.2 0.1 0.8 # CHECK: [52] Code Region - G53 @@ -3576,24 +3576,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.2s }, [x27], #8 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4h }, [x27], #8 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.4s }, [x27], #16 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8b }, [x27], #8 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . 
D==eeER st1 { v1.8h }, [x27], #16 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4h }, [x27], #8 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4s }, [x27], #16 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.8b }, [x27], #8 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.8h }, [x27], #16 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3604,15 +3604,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s }, [x27], #16 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], #8 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], #16 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4h }, [x27], #8 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4s }, [x27], #16 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8b }, [x27], #8 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.8h }, [x27], #16 +# CHECK-NEXT: 9. 
1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [53] Code Region - G54 @@ -3621,24 +3621,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.16b }, [x27], #16 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.2d }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.2s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.2s }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3649,15 +3649,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], #16 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.2s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 3.0 0.0 0.0 st1 { v1.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.2s }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.4h }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [54] Code Region - G55 @@ -3666,24 +3666,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.4s }, [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.8h }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.16b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.16b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,9] . 
D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3694,15 +3694,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.16b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.16b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [55] Code Region - G56 @@ -3711,24 +3711,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,7] .D===eER. 
add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3739,15 +3739,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 9. 
1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [56] Code Region - G57 @@ -3756,24 +3756,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3784,15 +3784,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 7. 
1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [57] Code Region - G58 @@ -3801,24 +3801,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,9] . 
D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3829,41 +3829,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [58] Code Region - G59 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . 
st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3873,42 +3874,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 
1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [59] Code Region - G60 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . 
add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3918,42 +3920,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [60] Code Region - G61 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3963,42 +3966,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [61] Code Region - G62 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . 
st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4008,42 +4012,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 
1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [62] Code Region - G63 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . 
st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4053,42 +4058,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [63] Code Region - G64 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . . 
DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4098,42 +4104,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [64] Code Region - G65 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 604 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 4.76 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.97 +# CHECK-NEXT: IPC: 1.66 +# CHECK-NEXT: Block RThroughput: 4.8 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . 
st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.b }[8], [x27], #1 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4143,16 +4149,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 
1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.b }[8], [x27], #1 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.3 0.2 0.0 # CHECK: [65] Code Region - G66 @@ -4161,24 +4167,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.b }[8], [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.h }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.h }[0], [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4189,15 +4195,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.h }[0], [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [66] Code Region - G67 @@ -4206,24 +4212,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st1 { v1.s }[0], [x27], #4 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.d }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . 
add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4234,15 +4240,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], #4 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [67] Code Region - G68 @@ -4251,24 +4257,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st2 { v1.2s, v2.2s }, [x27], #16 # CHECK-NEXT: [0,1] D=eER. . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4279,15 +4285,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], #16 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 
1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [68] Code Region - G69 @@ -4296,24 +4302,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st2 { v1.16b, v2.16b }, [x27], #32 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4324,15 +4330,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 
1 3.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [69] Code Region - G70 @@ -4341,24 +4347,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st2 { v1.8b, v2.8b }, [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st2 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . 
DeeER st2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4369,15 +4375,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [70] Code Region - G71 @@ -4386,24 +4392,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st2 { v1.b, v2.b }[0], [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . 
st2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st2 { v1.h, v2.h }[0], [x27], #4 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4414,15 +4420,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: 9. 
1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [71] Code Region - G72 @@ -4431,24 +4437,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . st2 { v1.h, v2.h }[4], [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st2 { v1.s, v2.s }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4459,42 +4465,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 
1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [72] Code Region - G73 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 606 +# CHECK-NEXT: Total Cycles: 706 # CHECK-NEXT: Total uOps: 2800 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 4.62 -# CHECK-NEXT: IPC: 1.65 -# CHECK-NEXT: Block RThroughput: 4.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.97 +# CHECK-NEXT: IPC: 1.42 +# CHECK-NEXT: Block RThroughput: 5.6 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 012 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeER . .. st2g x26, [x27], #4064 -# CHECK-NEXT: [0,1] D=eER. .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eER. .. st2g x26, [x27, #4064]! -# CHECK-NEXT: [0,3] D==eER .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER .. st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,5] .D==eER .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER.. st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,7] . D==eE--R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK: [0,0] DeER . . . st2g x26, [x27], #4064 +# CHECK-NEXT: [0,1] D=eER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeER. . . st2g x26, [x27, #4064]! 
+# CHECK-NEXT: [0,3] .D=eER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,7] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . .DeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,9] . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4505,42 +4511,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2g x26, [x27], #4064 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2g x26, [x27, #4064]! -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.5 0.2 0.4 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2g x26, [x27, #4064]! +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 9. 
1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.2 0.3 0.4 # CHECK: [73] Code Region - G74 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 704 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.40 -# CHECK-NEXT: IPC: 1.42 -# CHECK-NEXT: Block RThroughput: 7.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.79 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 7.6 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 012 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeER . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeER. st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,5] . D==eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,7] . D==eE--R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,7] . . DeE--R add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4550,43 +4556,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.3 0.8 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.8 # CHECK: [74] Code Region - G75 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 705 +# CHECK-NEXT: Total Cycles: 1005 # CHECK-NEXT: Total uOps: 3400 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 4.82 -# CHECK-NEXT: IPC: 1.42 -# CHECK-NEXT: Block RThroughput: 6.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.38 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.8 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER .. st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeER .. st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeER .. 
st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeER.. st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE--R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4596,42 +4602,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.7 0.2 1.0 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 
1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.0 # CHECK: [75] Code Region - G76 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 604 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.30 -# CHECK-NEXT: IPC: 1.66 -# CHECK-NEXT: Block RThroughput: 5.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.19 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.4 # CHECK: Timeline view: +# CHECK-NEXT: 012 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeE-R . st3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: [0,3] .D=eE-R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeER . st3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: [0,5] . D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeER. st3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st3 { v1.b, v2.b, v3.b }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: [0,9] . . 
DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4641,42 +4648,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 1.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: 3. 1 2.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.1 0.2 0.4 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.2 # CHECK: [76] Code Region - G77 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: [0,1] D=eER. . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4686,43 +4694,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: 3. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [77] Code Region - G78 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 506 +# CHECK-NEXT: Total Cycles: 1005 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.93 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,7] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . 
st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,9] . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4732,43 +4740,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.2 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 9. 
1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.2 # CHECK: [78] Code Region - G79 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 804 +# CHECK-NEXT: Total Cycles: 1005 # CHECK-NEXT: Total uOps: 4200 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.22 -# CHECK-NEXT: IPC: 1.24 -# CHECK-NEXT: Block RThroughput: 8.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 4.18 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 8.4 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeER .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,3] .D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeER .. st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,5] . D=eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,9] . . 
DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4778,43 +4786,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.9 0.2 1.0 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 1.0 # CHECK: [79] Code Region - G80 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 605 +# CHECK-NEXT: Total Cycles: 1005 # CHECK-NEXT: Total uOps: 3400 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.62 -# CHECK-NEXT: IPC: 1.65 -# CHECK-NEXT: Block RThroughput: 6.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.38 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.8 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . 
st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeER . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeER . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeER. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeER . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE--R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4824,43 +4832,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 9. 
1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.8 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.8 # CHECK: [80] Code Region - G81 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 704 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.40 -# CHECK-NEXT: IPC: 1.42 -# CHECK-NEXT: Block RThroughput: 7.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.79 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 7.6 # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 012 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeER . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==eeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: [0,5] . D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: [0,7] . D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeER . . 
st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4870,42 +4878,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 2.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.3 0.4 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.4 # CHECK: [81] Code Region - G82 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.95 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 2.99 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 +# CHECK: [0,0] DeeER. . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 +# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: [0,9] . . 
DeER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4915,42 +4924,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 0.0 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 0.0 # CHECK: [82] Code Region - G83 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 504 +# CHECK-NEXT: Total Cycles: 804 # CHECK-NEXT: Total uOps: 2800 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.56 -# CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 4.5 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 3.48 +# CHECK-NEXT: IPC: 1.24 +# CHECK-NEXT: Block RThroughput: 5.6 # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 01 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeER. . 
st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeER . st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eER. stg x26, [x27], #4064 -# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 +# CHECK: [0,0] DeeER. .. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: [0,1] .DeER. .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER .. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: [0,3] . DeER .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER .. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: [0,5] . DeER .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: [0,7] . . DeER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeER. stg x26, [x27], #4064 +# CHECK-NEXT: [0,9] . . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4960,16 +4970,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 0.0 0.0 stg x26, [x27], #4064 -# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.7 0.1 0.0 +# CHECK-NEXT: 1. 
1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 stg x26, [x27], #4064 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.1 0.4 0.0 # CHECK: [83] Code Region - G84 @@ -4978,24 +4988,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.37 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 4.4 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeER . . stg x26, [x27, #4064]! # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eER. . stgp x1, x2, [x27], #992 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eER . stgp x1, x2, [x27, #992]! -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. stp s1, s2, [x27], #248 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER stp d1, d2, [x27], #496 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeER. . stgp x1, x2, [x27], #992 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeER . stgp x1, x2, [x27, #992]! +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. stp s1, s2, [x27], #248 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER stp d1, d2, [x27], #496 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5006,15 +5016,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 
1 1.0 1.0 0.0 stg x26, [x27, #4064]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stgp x1, x2, [x27], #992 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stgp x1, x2, [x27, #992]! -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 stp s1, s2, [x27], #248 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 stp d1, d2, [x27], #496 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stgp x1, x2, [x27], #992 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 stgp x1, x2, [x27, #992]! +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 stp s1, s2, [x27], #248 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 stp d1, d2, [x27], #496 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [84] Code Region - G85 @@ -5023,24 +5033,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.76 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.5 +# CHECK-NEXT: Block RThroughput: 4.8 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeER. . stp q1, q2, [x27], #992 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . stp s1, s2, [x27, #248]! -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . stp d1, d2, [x27, #496]! -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. stp q1, q2, [x27, #992]! -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eER. stp w1, w2, [x27], #248 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . stp s1, s2, [x27, #248]! +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . 
stp d1, d2, [x27, #496]! +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. stp q1, q2, [x27, #992]! +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeER. stp w1, w2, [x27], #248 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5051,15 +5061,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp q1, q2, [x27], #992 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp s1, s2, [x27, #248]! -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp d1, d2, [x27, #496]! -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 stp q1, q2, [x27, #992]! -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 stp w1, w2, [x27], #248 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stp s1, s2, [x27, #248]! +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 stp d1, d2, [x27, #496]! +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 stp q1, q2, [x27, #992]! +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 stp w1, w2, [x27], #248 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [85] Code Region - G86 @@ -5068,24 +5078,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.37 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 4.4 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeER . . stp x1, x2, [x27], #496 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eER. . stp w1, w2, [x27, #248]! -# CHECK-NEXT: [0,3] D==eER . 
add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eER . stp x1, x2, [x27, #496]! -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. str b1, [x27], #254 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeER str h1, [x27], #254 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeER. . stp w1, w2, [x27, #248]! +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeER . stp x1, x2, [x27, #496]! +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. str b1, [x27], #254 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER str h1, [x27], #254 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5096,15 +5106,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp x1, x2, [x27], #496 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp w1, w2, [x27, #248]! -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp x1, x2, [x27, #496]! -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str b1, [x27], #254 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str h1, [x27], #254 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stp w1, w2, [x27, #248]! +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 stp x1, x2, [x27, #496]! +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 str b1, [x27], #254 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 str h1, [x27], #254 +# CHECK-NEXT: 9. 
1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [86] Code Region - G87 @@ -5113,24 +5123,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 505 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.95 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.8 +# CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Timeline view: # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeER. . str s1, [x27], #254 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . str d1, [x27], #254 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . str q1, [x27], #254 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeER. str b1, [x27, #254]! -# CHECK-NEXT: [0,7] .D===eE-R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeER str h1, [x27, #254]! -# CHECK-NEXT: [0,9] . D===eE-R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . str d1, [x27], #254 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . str q1, [x27], #254 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeER. str b1, [x27, #254]! +# CHECK-NEXT: [0,7] . D=eE-R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeER str h1, [x27, #254]! +# CHECK-NEXT: [0,9] . D=eE-R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5141,15 +5151,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str d1, [x27], #254 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str q1, [x27], #254 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str b1, [x27, #254]! -# CHECK-NEXT: 7. 1 4.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str h1, [x27, #254]! -# CHECK-NEXT: 9. 
1 4.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.2 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str d1, [x27], #254 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 str q1, [x27], #254 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 str b1, [x27, #254]! +# CHECK-NEXT: 7. 1 2.0 0.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 str h1, [x27, #254]! +# CHECK-NEXT: 9. 1 2.0 0.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.2 # CHECK: [87] Code Region - G88 @@ -5158,24 +5168,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 4.56 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 3.3 +# CHECK-NEXT: Block RThroughput: 4.6 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeeeER . str s1, [x27, #254]! # CHECK-NEXT: [0,1] D=eE-R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeER . str d1, [x27, #254]! -# CHECK-NEXT: [0,3] D==eE-R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . str q1, [x27, #254]! -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eER . str w1, [x27], #254 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eER. str x1, [x27], #254 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeER . str d1, [x27, #254]! +# CHECK-NEXT: [0,3] .D=eE-R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . str q1, [x27, #254]! +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeER . str w1, [x27], #254 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeER. str x1, [x27], #254 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5186,15 +5196,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27, #254]! # CHECK-NEXT: 1. 
1 2.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str d1, [x27, #254]! -# CHECK-NEXT: 3. 1 3.0 0.0 1.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str q1, [x27, #254]! -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str w1, [x27], #254 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str x1, [x27], #254 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.2 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str d1, [x27, #254]! +# CHECK-NEXT: 3. 1 2.0 0.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 str q1, [x27, #254]! +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 str w1, [x27], #254 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 str x1, [x27], #254 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.2 # CHECK: [88] Code Region - G89 @@ -5203,24 +5213,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeER . . str w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eER. . str x1, [x27, #254]! -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eER . strb w1, [x27], #254 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eER . strb w1, [x27, #254]! -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eER. strh w1, [x27], #254 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeER. . str x1, [x27, #254]! +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeER . strb w1, [x27], #254 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeER . 
strb w1, [x27, #254]! +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeER. strh w1, [x27], #254 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5231,15 +5241,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 str w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str x1, [x27, #254]! -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 strb w1, [x27], #254 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 strb w1, [x27, #254]! -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 strh w1, [x27], #254 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str x1, [x27, #254]! +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 strb w1, [x27], #254 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 strb w1, [x27, #254]! +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 strh w1, [x27], #254 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [89] Code Region - G90 @@ -5248,24 +5258,24 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 10 +# CHECK: Dispatch Width: 5 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Timeline view: # CHECK-NEXT: Index 012345678 # CHECK: [0,0] DeER . . strh w1, [x27, #254]! # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eER. . stz2g x26, [x27], #4064 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eER . stz2g x26, [x27, #4064]! -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eER . 
stzg x26, [x27], #4064 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eER. stzg x26, [x27, #4064]! -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeER. . stz2g x26, [x27], #4064 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeER . stz2g x26, [x27, #4064]! +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeER . stzg x26, [x27], #4064 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeER. stzg x26, [x27, #4064]! +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5276,27 +5286,27 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 strh w1, [x27, #254]! # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stz2g x26, [x27], #4064 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stz2g x26, [x27, #4064]! -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 stzg x26, [x27], #4064 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 stzg x26, [x27, #4064]! -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stz2g x26, [x27], #4064 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 stz2g x26, [x27, #4064]! +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 stzg x26, [x27], #4064 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 stzg x26, [x27, #4064]! +# CHECK-NEXT: 9. 
1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [90] Code Region - G91 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 110 +# CHECK-NEXT: Total Cycles: 143 # CHECK-NEXT: Total uOps: 600 -# CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 5.45 -# CHECK-NEXT: IPC: 3.64 -# CHECK-NEXT: Block RThroughput: 1.0 +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 4.20 +# CHECK-NEXT: IPC: 2.80 +# CHECK-NEXT: Block RThroughput: 1.2 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -5305,7 +5315,7 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . ldr x1, [x27], #254 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D====eeeeER ldr x2, [x1], #254 -# CHECK-NEXT: [0,3] D=eE------R add x0, x27, #1 +# CHECK-NEXT: [0,3] .DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5317,5 +5327,5 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr x1, [x27], #254 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 5.0 0.0 0.0 ldr x2, [x1], #254 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.5 0.3 2.0 +# CHECK-NEXT: 3. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.3 2.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-basic-instructions.s index e1c7bf56f45f2..72ae67e3bea71 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-basic-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-basic-instructions.s @@ -2688,7 +2688,7 @@ drps # CHECK-NEXT: 1 1 0.25 movk x7, #0, lsl #32 # CHECK-NEXT: 1 1 0.25 movz x8, #0, lsl #48 # CHECK-NEXT: 1 1 0.25 movk x9, #0, lsl #48 -# CHECK-NEXT: 1 1 0.07 U msr DAIFSet, #0 +# CHECK-NEXT: 1 1 0.12 U msr DAIFSet, #0 # CHECK-NEXT: 1 1 0.25 adr x2, #1600 # CHECK-NEXT: 1 1 0.25 adrp x21, #6553600 # CHECK-NEXT: 1 1 0.25 adr x0, #262144 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s index 37ae765148396..e0eb35917dc5e 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s @@ -58,7 +58,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 41 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.88 # CHECK-NEXT: IPC: 4.88 # CHECK-NEXT: Block RThroughput: 0.3 @@ -134,7 +134,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -211,7 +211,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -288,7 +288,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# 
CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -365,7 +365,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -442,7 +442,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -519,7 +519,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -596,7 +596,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 44 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.55 # CHECK-NEXT: IPC: 4.55 # CHECK-NEXT: Block RThroughput: 0.3 @@ -673,7 +673,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 403 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.50 # CHECK-NEXT: IPC: 0.50 # CHECK-NEXT: Block RThroughput: 0.5 @@ -750,7 +750,7 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 300 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.30 # CHECK-NEXT: IPC: 0.20 # CHECK-NEXT: Block RThroughput: 0.5 @@ -805,9 +805,9 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: [1,0] D==========eeeeeeeeER . . . . . ld1 { v0.b }[0], [sp] # CHECK-NEXT: [1,1] D==================eeER . . . . . add v0.16b, v0.16b, v0.16b # CHECK-NEXT: [2,0] D====================eeeeeeeeER . . . 
ld1 { v0.b }[0], [sp] -# CHECK-NEXT: [2,1] D============================eeER . . . add v0.16b, v0.16b, v0.16b -# CHECK-NEXT: [3,0] D==============================eeeeeeeeER . ld1 { v0.b }[0], [sp] -# CHECK-NEXT: [3,1] D======================================eeER add v0.16b, v0.16b, v0.16b +# CHECK-NEXT: [2,1] .D===========================eeER . . . add v0.16b, v0.16b, v0.16b +# CHECK-NEXT: [3,0] .D=============================eeeeeeeeER . ld1 { v0.b }[0], [sp] +# CHECK-NEXT: [3,1] .D=====================================eeER add v0.16b, v0.16b, v0.16b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -816,6 +816,6 @@ add v0.16b, v0.16b, v0.16b # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 4 16.0 0.3 0.0 ld1 { v0.b }[0], [sp] -# CHECK-NEXT: 1. 4 24.0 0.0 0.0 add v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 4 20.0 0.1 0.0 +# CHECK-NEXT: 0. 4 15.8 0.3 0.0 ld1 { v0.b }[0], [sp] +# CHECK-NEXT: 1. 4 23.5 0.0 0.0 add v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 4 19.6 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s index a720a6bde305b..c3ccf1ceba307 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s @@ -238,7 +238,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.57 # CHECK-NEXT: IPC: 0.57 # CHECK-NEXT: Block RThroughput: 3.0 @@ -276,7 +276,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.57 # CHECK-NEXT: IPC: 0.57 # CHECK-NEXT: Block RThroughput: 3.0 @@ -314,7 +314,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1703 # CHECK-NEXT: Total uOps: 
600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.35 # CHECK-NEXT: IPC: 0.35 # CHECK-NEXT: Block RThroughput: 1.5 @@ -331,10 +331,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmadd d0, d0, d1, d2 # CHECK-NEXT: [1,0] D=================eeER . . .. fadd d0, d0, d0 # CHECK-NEXT: [1,1] D===================eeeeER . .. fmadd d0, d1, d2, d0 -# CHECK-NEXT: [1,2] D=======================eeeER . .. fmul d0, d0, d0 -# CHECK-NEXT: [1,3] D========================eeeeER .. fmadd d0, d1, d2, d0 -# CHECK-NEXT: [1,4] D==========================eeeeER .. fmadd d0, d1, d2, d0 -# CHECK-NEXT: [1,5] D==============================eeeeER fmadd d0, d0, d1, d2 +# CHECK-NEXT: [1,2] .D======================eeeER . .. fmul d0, d0, d0 +# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,4] .D=========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,5] .D=============================eeeeER fmadd d0, d0, d1, d2 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -345,11 +345,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0 # CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0 -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmul d0, d0, d0 -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmadd d0, d1, d2, d0 -# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0 -# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmadd d0, d0, d1, d2 -# CHECK-NEXT: 2 15.7 0.1 0.0 +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 fmul d0, d0, d0 +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 4. 2 18.0 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 5. 
2 22.0 0.0 0.0 fmadd d0, d0, d1, d2 +# CHECK-NEXT: 2 15.3 0.1 0.0 # CHECK: [3] Code Region - saba @@ -358,7 +358,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.5 @@ -396,7 +396,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.5 @@ -434,7 +434,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1103 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.36 # CHECK-NEXT: Block RThroughput: 0.8 @@ -472,7 +472,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1103 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.36 # CHECK-NEXT: Block RThroughput: 0.8 @@ -510,7 +510,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 2.0 @@ -548,7 +548,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 2.0 @@ -586,7 +586,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.5 @@ -624,7 +624,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 
1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.0 @@ -662,7 +662,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1703 # CHECK-NEXT: Total uOps: 600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.35 # CHECK-NEXT: IPC: 0.35 # CHECK-NEXT: Block RThroughput: 1.5 @@ -679,10 +679,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d # CHECK-NEXT: [1,0] D=================eeeER . . .. fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: [1,1] D==================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: [1,2] D======================eeER . .. fadd v0.2d, v0.2d, v0.2d -# CHECK-NEXT: [1,3] D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: [1,4] D==========================eeeeER .. fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: [1,5] D==============================eeeeER fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: [1,2] .D=====================eeER . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,4] .D=========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,5] .D=============================eeeeER fmla v0.2d, v0.2d, v1.2d # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -693,11 +693,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 9.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: 2. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d -# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d -# CHECK-NEXT: 2 15.3 0.1 0.0 +# CHECK-NEXT: 2. 2 14.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 
2 16.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 4. 2 18.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 5. 2 22.0 0.0 0.0 fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: 2 15.0 0.1 0.0 # CHECK: [12] Code Region - fmlal @@ -706,7 +706,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 2203 # CHECK-NEXT: Total uOps: 600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.27 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 1.5 @@ -723,10 +723,10 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,5] D=================eeeeeER. . . . .. fmlal v0.4s, v0.4h, v1.4h # CHECK-NEXT: [1,0] D======================eeeER . . . .. fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: [1,1] D=========================eeeeeER . . .. fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: [1,2] D==============================eeER. . .. fadd v0.2d, v0.2d, v0.2d -# CHECK-NEXT: [1,3] D================================eeeeeER. .. fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: [1,4] D==================================eeeeeER .. fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: [1,5] D=======================================eeeeeER fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: [1,2] .D=============================eeER. . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] .D===============================eeeeeER. .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,4] .D=================================eeeeeER .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,5] .D======================================eeeeeER fmlal v0.4s, v0.4h, v1.4h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -737,11 +737,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 12.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1. 2 15.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: 2. 2 20.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 3. 2 22.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h -# CHECK-NEXT: 5. 
2 29.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h -# CHECK-NEXT: 2 20.3 0.1 0.0 +# CHECK-NEXT: 2. 2 19.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 21.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 4. 2 23.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 5. 2 28.5 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: 2 20.0 0.1 0.0 # CHECK: [13] Code Region - bfdot @@ -750,7 +750,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.0 @@ -788,7 +788,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1603 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.25 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 1.0 @@ -826,7 +826,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 1.0 @@ -864,7 +864,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 1100 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.78 # CHECK-NEXT: IPC: 0.78 # CHECK-NEXT: Block RThroughput: 10.0 @@ -881,20 +881,20 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,5] D=======eeER . . . . crc32h w0, w0, w21 # CHECK-NEXT: [0,6] D========eeER . . . . crc32w w0, w0, w24 # CHECK-NEXT: [0,7] D=========eeER . . . . crc32x w0, w0, x25 -# CHECK-NEXT: [0,8] D==========eeER. . . . crc32ch w0, w0, w16 -# CHECK-NEXT: [0,9] D===========eeER . . . crc32cw w0, w0, w23 -# CHECK-NEXT: [0,10] D============eeER . . . crc32cx w0, w0, x5 -# CHECK-NEXT: [1,0] D==============eeER . . . mul w0, w0, w0 -# CHECK-NEXT: [1,1] D================eeER . . 
crc32cb w0, w0, w1 -# CHECK-NEXT: [1,2] D=================eeER . . crc32cb w0, w0, w1 -# CHECK-NEXT: [1,3] D===================eeER . . crc32cb w0, w0, w0 +# CHECK-NEXT: [0,8] .D=========eeER. . . . crc32ch w0, w0, w16 +# CHECK-NEXT: [0,9] .D==========eeER . . . crc32cw w0, w0, w23 +# CHECK-NEXT: [0,10] .D===========eeER . . . crc32cx w0, w0, x5 +# CHECK-NEXT: [1,0] .D=============eeER . . . mul w0, w0, w0 +# CHECK-NEXT: [1,1] .D===============eeER . . crc32cb w0, w0, w1 +# CHECK-NEXT: [1,2] .D================eeER . . crc32cb w0, w0, w1 +# CHECK-NEXT: [1,3] .D==================eeER . . crc32cb w0, w0, w0 # CHECK-NEXT: [1,4] .D===================eeER. . crc32b w0, w0, w15 -# CHECK-NEXT: [1,5] .D====================eeER . crc32h w0, w0, w21 -# CHECK-NEXT: [1,6] .D=====================eeER . crc32w w0, w0, w24 -# CHECK-NEXT: [1,7] .D======================eeER . crc32x w0, w0, x25 -# CHECK-NEXT: [1,8] .D=======================eeER . crc32ch w0, w0, w16 -# CHECK-NEXT: [1,9] .D========================eeER. crc32cw w0, w0, w23 -# CHECK-NEXT: [1,10] .D=========================eeER crc32cx w0, w0, x5 +# CHECK-NEXT: [1,5] . D===================eeER . crc32h w0, w0, w21 +# CHECK-NEXT: [1,6] . D====================eeER . crc32w w0, w0, w24 +# CHECK-NEXT: [1,7] . D=====================eeER . crc32x w0, w0, x25 +# CHECK-NEXT: [1,8] . D======================eeER . crc32ch w0, w0, w16 +# CHECK-NEXT: [1,9] . D=======================eeER. crc32cw w0, w0, w23 +# CHECK-NEXT: [1,10] . D========================eeER crc32cx w0, w0, x5 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -903,18 +903,18 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul w0, w0, w0 -# CHECK-NEXT: 1. 2 10.0 0.0 0.0 crc32cb w0, w0, w1 -# CHECK-NEXT: 2. 2 11.0 0.0 0.0 crc32cb w0, w0, w1 -# CHECK-NEXT: 3. 2 13.0 0.0 0.0 crc32cb w0, w0, w0 +# CHECK-NEXT: 0. 
2 7.5 0.5 0.0 mul w0, w0, w0 +# CHECK-NEXT: 1. 2 9.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 2. 2 10.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 3. 2 12.5 0.0 0.0 crc32cb w0, w0, w0 # CHECK-NEXT: 4. 2 13.5 0.0 0.0 crc32b w0, w0, w15 -# CHECK-NEXT: 5. 2 14.5 0.0 0.0 crc32h w0, w0, w21 -# CHECK-NEXT: 6. 2 15.5 0.0 0.0 crc32w w0, w0, w24 -# CHECK-NEXT: 7. 2 16.5 0.0 0.0 crc32x w0, w0, x25 -# CHECK-NEXT: 8. 2 17.5 0.0 0.0 crc32ch w0, w0, w16 -# CHECK-NEXT: 9. 2 18.5 0.0 0.0 crc32cw w0, w0, w23 -# CHECK-NEXT: 10. 2 19.5 0.0 0.0 crc32cx w0, w0, x5 -# CHECK-NEXT: 2 14.3 0.0 0.0 +# CHECK-NEXT: 5. 2 14.0 0.0 0.0 crc32h w0, w0, w21 +# CHECK-NEXT: 6. 2 15.0 0.0 0.0 crc32w w0, w0, w24 +# CHECK-NEXT: 7. 2 16.0 0.0 0.0 crc32x w0, w0, x25 +# CHECK-NEXT: 8. 2 16.5 0.0 0.0 crc32ch w0, w0, w16 +# CHECK-NEXT: 9. 2 17.5 0.0 0.0 crc32cw w0, w0, w23 +# CHECK-NEXT: 10. 2 18.5 0.0 0.0 crc32cx w0, w0, x5 +# CHECK-NEXT: 2 13.7 0.0 0.0 # CHECK: [17] Code Region - Z sdot.s @@ -923,7 +923,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.42 # CHECK-NEXT: IPC: 0.33 # CHECK-NEXT: Block RThroughput: 2.0 @@ -938,8 +938,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b # CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b -# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b -# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b +# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -950,9 +950,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: 1. 
2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b -# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b -# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b +# CHECK-NEXT: 2 11.8 0.1 0.0 # CHECK: [18] Code Region - Z sudot @@ -961,7 +961,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.42 # CHECK-NEXT: IPC: 0.33 # CHECK-NEXT: Block RThroughput: 2.0 @@ -976,8 +976,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1] # CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b[1] -# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b[1] -# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b[1] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -988,9 +988,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] -# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] -# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1] -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 3. 
2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: 2 11.8 0.1 0.0 # CHECK: [19] Code Region - Z sdot.d @@ -999,7 +999,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1403 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.36 # CHECK-NEXT: IPC: 0.29 # CHECK-NEXT: Block RThroughput: 5.0 @@ -1014,8 +1014,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h # CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h -# CHECK-NEXT: [1,2] D====================eeeeER . sdot z0.d, z1.h, z2.h -# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h +# CHECK-NEXT: [1,2] .D===================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,3] .D=======================eeeeER sdot z0.d, z0.h, z1.h # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1026,9 +1026,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h -# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h -# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h -# CHECK-NEXT: 2 13.3 0.1 0.0 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sdot z0.d, z0.h, z1.h +# CHECK-NEXT: 2 13.0 0.1 0.0 # CHECK: [20] Code Region - Z smmla @@ -1037,7 +1037,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.42 # CHECK-NEXT: IPC: 0.33 # CHECK-NEXT: Block RThroughput: 2.0 @@ -1052,8 +1052,8 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,3] D=========eeeER. . .. smmla z0.s, z0.b, z1.b # CHECK-NEXT: [1,0] D============eeeeeER. .. 
mul z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: [1,1] D=================eeeER .. smmla z0.s, z1.b, z2.b -# CHECK-NEXT: [1,2] D==================eeeER .. smmla z0.s, z1.b, z2.b -# CHECK-NEXT: [1,3] D=====================eeeER smmla z0.s, z0.b, z1.b +# CHECK-NEXT: [1,2] .D=================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] .D====================eeeER smmla z0.s, z0.b, z1.b # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1064,9 +1064,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: 1. 2 12.0 0.0 0.0 smmla z0.s, z1.b, z2.b -# CHECK-NEXT: 2. 2 13.0 0.0 0.0 smmla z0.s, z1.b, z2.b -# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smmla z0.s, z0.b, z1.b -# CHECK-NEXT: 2 12.0 0.1 0.0 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 15.5 0.0 0.0 smmla z0.s, z0.b, z1.b +# CHECK-NEXT: 2 11.8 0.1 0.0 # CHECK: [21] Code Region - Z mla.d @@ -1075,7 +1075,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1703 # CHECK-NEXT: Total uOps: 800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.47 # CHECK-NEXT: IPC: 0.23 # CHECK-NEXT: Block RThroughput: 8.0 @@ -1088,9 +1088,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mla z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mla z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [0,3] D============eeeeeER. . . .. mla z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d -# CHECK-NEXT: [1,1] D======================eeeeeER. .. mla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: [1,2] D========================eeeeeER .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,0] .D================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] .D=====================eeeeeER. .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] .D=======================eeeeeER .. 
mla z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [1,3] .D============================eeeeeER mla z0.d, p0/m, z0.d, z1.d # CHECK: Average Wait times (based on the timeline view): @@ -1100,11 +1100,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d -# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.0 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.0 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: 3. 2 21.0 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: 2 15.4 0.1 0.0 +# CHECK-NEXT: 2 15.0 0.1 0.0 # CHECK: [22] Code Region - Z mad.d @@ -1113,7 +1113,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1703 # CHECK-NEXT: Total uOps: 800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.47 # CHECK-NEXT: IPC: 0.23 # CHECK-NEXT: Block RThroughput: 8.0 @@ -1126,9 +1126,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mad z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mad z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [0,3] D============eeeeeER. . . .. mad z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d -# CHECK-NEXT: [1,1] D======================eeeeeER. .. mad z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: [1,2] D========================eeeeeER .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,0] .D================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] .D=====================eeeeeER. .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] .D=======================eeeeeER .. 
mad z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [1,3] .D============================eeeeeER mad z0.d, p0/m, z0.d, z1.d # CHECK: Average Wait times (based on the timeline view): @@ -1138,11 +1138,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d -# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.0 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.0 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: 3. 2 21.0 0.0 0.0 mad z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: 2 15.4 0.1 0.0 +# CHECK-NEXT: 2 15.0 0.1 0.0 # CHECK: [23] Code Region - Z msb.d @@ -1151,7 +1151,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1703 # CHECK-NEXT: Total uOps: 800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.47 # CHECK-NEXT: IPC: 0.23 # CHECK-NEXT: Block RThroughput: 8.0 @@ -1164,9 +1164,9 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. msb z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. msb z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [0,3] D============eeeeeER. . . .. msb z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d -# CHECK-NEXT: [1,1] D======================eeeeeER. .. msb z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: [1,2] D========================eeeeeER .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,0] .D================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] .D=====================eeeeeER. .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] .D=======================eeeeeER .. 
msb z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: [1,3] .D============================eeeeeER msb z0.d, p0/m, z0.d, z1.d # CHECK: Average Wait times (based on the timeline view): @@ -1176,11 +1176,11 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d -# CHECK-NEXT: 1. 2 14.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d -# CHECK-NEXT: 2. 2 16.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.0 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.0 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d # CHECK-NEXT: 3. 2 21.0 0.0 0.0 msb z0.d, p0/m, z0.d, z1.d -# CHECK-NEXT: 2 15.4 0.1 0.0 +# CHECK-NEXT: 2 15.0 0.1 0.0 # CHECK: [24] Code Region - Z fcmla ZPmZZ @@ -1189,7 +1189,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.27 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 2.0 @@ -1227,7 +1227,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.27 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 2.0 @@ -1265,7 +1265,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 2.0 @@ -1303,7 +1303,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 2.0 @@ -1341,7 +1341,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1303 # CHECK-NEXT: Total uOps: 400 -# 
CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.31 # CHECK-NEXT: IPC: 0.31 # CHECK-NEXT: Block RThroughput: 2.0 @@ -1379,7 +1379,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1603 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.25 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 2.0 @@ -1417,7 +1417,7 @@ bfmlalb z0.s, z0.h, z1.h # CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.27 # CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 2.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-misc-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-misc-instructions.s index 4f48de8b42926..2af85a87c51af 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-misc-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-misc-instructions.s @@ -30,21 +30,21 @@ sysl x16, #5, c11, c8, #5 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 1 0.07 U at s12e1r, x28 -# CHECK-NEXT: 1 1 0.07 U brk #0x8415 -# CHECK-NEXT: 1 1 0.07 * * U clrex -# CHECK-NEXT: 1 1 0.07 * * U csdb -# CHECK-NEXT: 1 1 0.07 U dcps1 -# CHECK-NEXT: 1 1 0.07 U dcps2 -# CHECK-NEXT: 1 1 0.07 U dcps3 -# CHECK-NEXT: 1 1 0.07 * * U dmb sy -# CHECK-NEXT: 1 1 0.07 U hlt #0x7a67 -# CHECK-NEXT: 1 1 0.07 U hvc #0xecb9 -# CHECK-NEXT: 1 1 0.07 * * U isb -# CHECK-NEXT: 1 1 0.07 * * U pssbb -# CHECK-NEXT: 1 1 0.07 U smc #0x7e57 -# CHECK-NEXT: 1 1 0.07 U svc #0x89cb -# CHECK-NEXT: 1 1 0.07 U sysl x16, #5, c11, c8, #5 +# CHECK-NEXT: 1 1 0.12 U at s12e1r, x28 +# CHECK-NEXT: 1 1 0.12 U brk #0x8415 +# CHECK-NEXT: 1 1 0.12 * * U clrex +# CHECK-NEXT: 1 1 0.12 * * U csdb +# CHECK-NEXT: 1 1 0.12 U dcps1 +# CHECK-NEXT: 1 1 0.12 U dcps2 +# CHECK-NEXT: 1 1 0.12 U dcps3 +# CHECK-NEXT: 1 1 0.12 * * U dmb sy +# CHECK-NEXT: 1 1 0.12 
U hlt #0x7a67 +# CHECK-NEXT: 1 1 0.12 U hvc #0xecb9 +# CHECK-NEXT: 1 1 0.12 * * U isb +# CHECK-NEXT: 1 1 0.12 * * U pssbb +# CHECK-NEXT: 1 1 0.12 U smc #0x7e57 +# CHECK-NEXT: 1 1 0.12 U svc #0x89cb +# CHECK-NEXT: 1 1 0.12 U sysl x16, #5, c11, c8, #5 # CHECK: Resources: # CHECK-NEXT: [0.0] - V1UnitB diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-sve-instructions.s index d855ba06ec992..3c0f0b3ddcb15 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-sve-instructions.s @@ -3991,19 +3991,19 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 2 2 2.00 2 V1UnitI[2],V1UnitM[2],V1UnitM0[2] ANDS_PPzPP movs p0.b, p0/z, p0.b # CHECK-NEXT: 2 2 2.00 2 V1UnitI[2],V1UnitM[2],V1UnitM0[2] ORRS_PPzPP movs p15.b, p15.b # CHECK-NEXT: 2 2 2.00 2 V1UnitI[2],V1UnitM[2],V1UnitM0[2] ANDS_PPzPP movs p15.b, p15/z, p15.b -# CHECK-NEXT: 1 1 0.07 U 1 MRS mrs x3, ID_AA64ZFR0_EL1 -# CHECK-NEXT: 1 1 0.07 U 1 MRS mrs x3, ZCR_EL1 -# CHECK-NEXT: 1 1 0.07 U 1 MRS mrs x3, ZCR_EL12 -# CHECK-NEXT: 1 1 0.07 U 1 MRS mrs x3, ZCR_EL2 -# CHECK-NEXT: 1 1 0.07 U 1 MRS mrs x3, ZCR_EL3 -# CHECK-NEXT: 1 1 0.07 U 1 MSR msr ZCR_EL1, x3 +# CHECK-NEXT: 1 1 0.12 U 1 MRS mrs x3, ID_AA64ZFR0_EL1 +# CHECK-NEXT: 1 1 0.12 U 1 MRS mrs x3, ZCR_EL1 +# CHECK-NEXT: 1 1 0.12 U 1 MRS mrs x3, ZCR_EL12 +# CHECK-NEXT: 1 1 0.12 U 1 MRS mrs x3, ZCR_EL2 +# CHECK-NEXT: 1 1 0.12 U 1 MRS mrs x3, ZCR_EL3 +# CHECK-NEXT: 1 1 0.12 U 1 MSR msr ZCR_EL1, x3 # CHECK-NEXT: 2 5 2.00 2 V1UnitV[2],V1UnitV0[2],V1UnitV01[2],V1UnitV02[2] MSB_ZPmZZ_D msb z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: 1 4 1.00 4 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 MSB_ZPmZZ_B msb z18.b, p1/m, z27.b, z0.b # CHECK-NEXT: 1 4 1.00 4 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 MSB_ZPmZZ_H msb z27.h, p5/m, z23.h, z1.h # CHECK-NEXT: 1 4 1.00 4 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 MSB_ZPmZZ_S msb z26.s, p2/m, z0.s, z2.s -# CHECK-NEXT: 1 1 0.07 U 1 MSR msr 
ZCR_EL12, x3 -# CHECK-NEXT: 1 1 0.07 U 1 MSR msr ZCR_EL2, x3 -# CHECK-NEXT: 1 1 0.07 U 1 MSR msr ZCR_EL3, x3 +# CHECK-NEXT: 1 1 0.12 U 1 MSR msr ZCR_EL12, x3 +# CHECK-NEXT: 1 1 0.12 U 1 MSR msr ZCR_EL2, x3 +# CHECK-NEXT: 1 1 0.12 U 1 MSR msr ZCR_EL3, x3 # CHECK-NEXT: 1 4 1.00 4 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 MUL_ZPmZ_B mul z0.b, p7/m, z0.b, z31.b # CHECK-NEXT: 2 5 2.00 5 V1UnitV[2],V1UnitV0[2],V1UnitV01[2],V1UnitV02[2] MUL_ZPmZ_D mul z0.d, p7/m, z0.d, z31.d # CHECK-NEXT: 1 4 1.00 4 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 MUL_ZPmZ_H mul z0.h, p7/m, z0.h, z31.h diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-writeback.s index 264ad8bccc58e..1961b24ae6aac 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-writeback.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-writeback.s @@ -1165,7 +1165,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1179,11 +1179,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16 # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2s }, [x27], #8 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER. ld1 { v1.4h }, [x27], #8 -# CHECK-NEXT: [0,7] D====eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] D====eeeeeeER ld1 { v1.4s }, [x27], #16 -# CHECK-NEXT: [0,9] D=====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8 +# CHECK-NEXT: [0,7] .D===eE----R. 
add x0, x27, #1 +# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4s }, [x27], #16 +# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1197,12 +1197,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16 # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1 { v1.4h }, [x27], #8 -# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ld1 { v1.4s }, [x27], #16 -# CHECK-NEXT: 9. 1 6.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.1 2.0 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8 +# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16 +# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.0 0.1 2.0 # CHECK: [1] Code Region - G02 @@ -1211,7 +1211,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1225,11 +1225,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16 # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.16b }, [x27], #16 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER. ld1 { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,7] D====eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] D====eeeeeeER ld1 { v1.2d }, [x27], x28 -# CHECK-NEXT: [0,9] D=====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,7] .D===eE----R. 
add x0, x27, #1 +# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1243,12 +1243,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16 # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1 { v1.1d }, [x27], x28 -# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ld1 { v1.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.1 2.0 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.0 0.1 2.0 # CHECK: [2] Code Region - G03 @@ -1257,7 +1257,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1271,11 +1271,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28 # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.4s }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER. ld1 { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,7] D====eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] D====eeeeeeER ld1 { v1.8h }, [x27], x28 -# CHECK-NEXT: [0,9] D=====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,7] .D===eE----R. 
add x0, x27, #1 +# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1289,12 +1289,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1 { v1.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ld1 { v1.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.1 2.0 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.0 0.1 2.0 # CHECK: [3] Code Region - G04 @@ -1303,7 +1303,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1900 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.74 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 3.0 @@ -1316,12 +1316,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16 # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,7] D====eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. 
ld1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1334,13 +1334,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16 # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.3 0.1 2.0 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 2.0 # CHECK: [4] Code Region - G05 @@ -1349,7 +1349,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.94 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 3.3 @@ -1362,12 +1362,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16 # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,4] .D=eeeeeeER . 
ld1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32 # CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1380,13 +1380,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16 # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32 # CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 2.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 2.0 # CHECK: [5] Code Region - G06 @@ -1395,7 +1395,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.94 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 3.3 @@ -1408,12 +1408,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28 # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . 
ld1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28 # CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1426,13 +1426,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28 # CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 2.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 2.0 # CHECK: [6] Code Region - G07 @@ -1441,7 +1441,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2300 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.53 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 4.3 @@ -1454,12 +1454,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE----R . . 
add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28 # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1472,13 +1472,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 9. 
1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.0 # CHECK: [7] Code Region - G08 @@ -1487,7 +1487,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.92 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 5.0 @@ -1498,14 +1498,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1516,15 +1516,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 5. 
1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [8] Code Region - G09 @@ -1533,7 +1533,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.92 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 5.0 @@ -1544,14 +1544,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE----R . 
add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1562,15 +1562,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [9] Code Region - G10 @@ -1579,7 +1579,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 608 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.11 # CHECK-NEXT: IPC: 1.64 # CHECK-NEXT: Block RThroughput: 5.0 @@ -1590,14 +1590,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,7] .D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,9] .D====eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,7] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1608,15 +1608,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 9. 1 5.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.1 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 
1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.8 0.1 2.1 # CHECK: [10] Code Region - G11 @@ -1625,7 +1625,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 509 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.72 # CHECK-NEXT: IPC: 1.96 # CHECK-NEXT: Block RThroughput: 4.7 @@ -1638,12 +1638,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,5] D===eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,7] .D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,9] .D====eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,5] .D==eE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1656,13 +1656,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 # CHECK-NEXT: 3. 
1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 5. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 9. 1 5.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 2.2 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 2.2 # CHECK: [11] Code Region - G12 @@ -1671,7 +1671,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.72 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 4.7 @@ -1682,14 +1682,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeER. . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 # CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeER. ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] . 
DeeeeeeeER. ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE-----R. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1700,15 +1700,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 # CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 2.2 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 9. 
1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.7 0.1 2.2 # CHECK: [12] Code Region - G13 @@ -1717,7 +1717,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 1110 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.34 # CHECK-NEXT: IPC: 0.90 # CHECK-NEXT: Block RThroughput: 5.0 @@ -1728,14 +1728,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeER. . . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE-----R. . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE----R. . . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeER . . ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeER . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE-----R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] .D=========eeeeeeeeER ld1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: [0,9] .D==========eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeER. . . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE----R. . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeER . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE-----R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D======eeeeeeeeER ld1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: [0,9] . D=======eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1746,15 +1746,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 3. 
1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 10.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: 9. 1 11.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 4.2 0.1 2.5 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 7.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: 9. 1 8.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 2.5 # CHECK: [13] Code Region - G14 @@ -1763,7 +1763,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.50 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1776,12 +1776,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 # CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28 # CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] D================eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: [0,5] D=================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] D========================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . 
ld1 { v1.h }[0], [x27], #2 # CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: [0,9] .D================================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1794,13 +1794,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28 # CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28 -# CHECK-NEXT: 5. 1 18.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 25.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2 +# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28 +# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2 # CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 32.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: 9. 1 33.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 17.2 0.1 3.0 +# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 16.7 0.1 3.0 # CHECK: [14] Code Region - G15 @@ -1809,7 +1809,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.50 # CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1822,12 +1822,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 # CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28 # CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] D================eeeeeeeeER . . . . 
ld1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: [0,5] D=================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] D========================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28 # CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: [0,9] .D================================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1840,13 +1840,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28 # CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4 -# CHECK-NEXT: 5. 1 18.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 25.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28 +# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4 +# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28 # CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 32.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: 9. 1 33.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 17.2 0.1 3.0 +# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: 9. 
1 32.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 16.7 0.1 3.0 # CHECK: [15] Code Region - G16 @@ -1855,7 +1855,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 1203 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 1.66 # CHECK-NEXT: IPC: 0.83 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1868,12 +1868,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.1d }, [x27], #8 # CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld1r { v1.2d }, [x27], #8 -# CHECK-NEXT: [0,5] D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeeeER. ld1r { v1.2s }, [x27], #4 +# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.2d }, [x27], #8 +# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.2s }, [x27], #4 # CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld1r { v1.4h }, [x27], #2 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.4h }, [x27], #2 +# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1886,13 +1886,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.1d }, [x27], #8 # CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.2d }, [x27], #8 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1r { v1.2s }, [x27], #4 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.2d }, [x27], #8 +# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.2s }, [x27], #4 # CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1r { v1.4h }, [x27], #2 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 3.0 +# CHECK-NEXT: 8. 
1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], #2 +# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 3.0 # CHECK: [16] Code Region - G17 @@ -1901,7 +1901,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.92 # CHECK-NEXT: IPC: 1.96 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1914,12 +1914,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.8b }, [x27], #1 # CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld1r { v1.8h }, [x27], #2 -# CHECK-NEXT: [0,5] D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeeeER. ld1r { v1.16b }, [x27], #1 +# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.8h }, [x27], #2 +# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.16b }, [x27], #1 # CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld1r { v1.1d }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1932,13 +1932,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1 # CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.8h }, [x27], #2 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1r { v1.16b }, [x27], #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.8h }, [x27], #2 +# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.16b }, [x27], #1 # CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1r { v1.1d }, [x27], x28 -# CHECK-NEXT: 9. 
1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 3.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.1d }, [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 3.0 # CHECK: [17] Code Region - G18 @@ -1947,7 +1947,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.92 # CHECK-NEXT: IPC: 1.96 # CHECK-NEXT: Block RThroughput: 2.5 @@ -1960,12 +1960,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.2s }, [x27], x28 # CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld1r { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeeeER. ld1r { v1.4s }, [x27], x28 +# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.4s }, [x27], x28 # CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld1r { v1.8b }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -1978,13 +1978,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ld1r { v1.4s }, [x27], x28 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.4s }, [x27], x28 # CHECK-NEXT: 7. 
1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1r { v1.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 3.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 3.0 # CHECK: [18] Code Region - G19 @@ -1993,10 +1993,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.71 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2006,12 +2006,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.16b }, [x27], x28 # CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,5] D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,7] . D==eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,9] . D==eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2024,13 +2024,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 5. 
1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 3.0 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 9. 1 3.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 3.0 # CHECK: [19] Code Region - G20 @@ -2039,10 +2039,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2900 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.69 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 3.0 +# CHECK-NEXT: Block RThroughput: 3.6 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2050,14 +2050,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], #32 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. 
ld2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2068,15 +2068,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [20] Code Region - G21 @@ -2085,10 +2085,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2700 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.29 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.4 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2096,14 +2096,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . 
ld2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2114,15 +2114,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 7. 
1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [21] Code Region - G22 @@ -2131,10 +2131,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 3310 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.79 # CHECK-NEXT: IPC: 0.30 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -2142,14 +2142,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] D================eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: [0,9] .D================================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . 
D============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2160,15 +2160,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28 -# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 32.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28 -# CHECK-NEXT: 9. 1 33.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 17.0 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28 +# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28 +# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 15.5 0.1 3.0 # CHECK: [22] Code Region - G23 @@ -2177,10 +2177,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.62 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.1 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -2188,14 +2188,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . 
add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] D================eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: [0,5] D=================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: [0,9] .D================================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2206,15 +2206,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: 5. 1 18.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28 -# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 32.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8 -# CHECK-NEXT: 9. 1 33.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 17.1 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28 +# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8 +# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 15.5 0.1 3.0 # CHECK: [23] Code Region - G24 @@ -2223,10 +2223,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 2603 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.96 # CHECK-NEXT: IPC: 0.38 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.1 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 @@ -2234,14 +2234,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . ld2 { v1.s, v2.s }[0], [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: [0,3] D=========eE------R . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] D================eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: [0,5] D=================eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D================eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,7] .D=================eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D=================eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16 -# CHECK-NEXT: [0,9] .D==================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: [0,3] .D========eE------R . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . 
ld2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D===============eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D==============eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,7] . D===============eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==============eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16 +# CHECK-NEXT: [0,9] . D===============eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2252,15 +2252,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: 5. 1 18.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 17.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 7. 1 18.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 18.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16 -# CHECK-NEXT: 9. 1 19.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 12.9 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 15.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 7. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 15.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16 +# CHECK-NEXT: 9. 
1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 11.3 0.1 3.0 # CHECK: [24] Code Region - G25 @@ -2269,10 +2269,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.90 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.1 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2280,14 +2280,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld2r { v1.2s, v2.2s }, [x27], #8 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8 -# CHECK-NEXT: [0,5] D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2298,15 +2298,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 3.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [25] Code Region - G26 @@ -2315,10 +2315,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.90 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 2.5 +# CHECK-NEXT: Block RThroughput: 3.1 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2326,14 +2326,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], #2 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . 
DeeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2344,15 +2344,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 9. 
1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [26] Code Region - G27 @@ -2361,10 +2361,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 2800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.49 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 2.8 +# CHECK-NEXT: Block RThroughput: 3.5 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2372,14 +2372,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,5] D===eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,9] .D====eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2390,15 +2390,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 3.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 9. 1 5.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [27] Code Region - G28 @@ -2407,10 +2407,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 3700 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 7.25 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK-NEXT: Block RThroughput: 4.6 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2418,14 +2418,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . 
ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2436,15 +2436,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 9. 
1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [28] Code Region - G29 @@ -2453,10 +2453,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 7.45 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 4.3 +# CHECK-NEXT: Block RThroughput: 4.8 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2464,14 +2464,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 # CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2482,15 +2482,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 2. 
1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28 # CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.6 0.1 3.0 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [29] Code Region - G30 @@ -2499,10 +2499,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 1910 # CHECK-NEXT: Total uOps: 3700 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 1.94 # CHECK-NEXT: IPC: 0.52 -# CHECK-NEXT: Block RThroughput: 4.0 +# CHECK-NEXT: Block RThroughput: 4.6 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 @@ -2510,14 +2510,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=========eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: [0,7] .D==========eE------R . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D================eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: [0,9] . 
D=================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=======eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: [0,7] . D========eE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==============eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: [0,9] . D===============eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2528,15 +2528,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 10.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: 7. 1 11.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 17.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: 9. 1 18.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 6.9 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 8.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: 7. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 15.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: 9. 
1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 5.7 0.1 3.0 # CHECK: [30] Code Region - G31 @@ -2545,10 +2545,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.87 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 3.8 +# CHECK-NEXT: Block RThroughput: 4.4 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -2556,14 +2556,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: [0,9] . 
D=============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2574,15 +2574,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 -# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 16.7 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 15.5 0.1 3.0 # CHECK: [31] Code Region - G32 @@ -2591,10 +2591,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 0.87 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 3.8 +# CHECK-NEXT: Block RThroughput: 4.4 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 @@ -2602,14 +2602,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . 
ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2620,15 +2620,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 31.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 16.7 0.1 3.0 +# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 15.5 0.1 3.0 # CHECK: [32] Code Region - G33 @@ -2637,10 +2637,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 6.86 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 3.8 +# CHECK-NEXT: Block RThroughput: 4.4 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2648,14 +2648,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. 
ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2666,15 +2666,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [33] Code Region - G34 @@ -2683,10 +2683,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 6.86 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 3.8 +# CHECK-NEXT: Block RThroughput: 4.4 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2694,14 +2694,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . 
ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2712,15 +2712,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 3.0 +# CHECK-NEXT: 2. 
1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [34] Code Region - G35 @@ -2729,10 +2729,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 510 # CHECK-NEXT: Total uOps: 3500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 6.86 # CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 3.8 +# CHECK-NEXT: Block RThroughput: 4.4 # CHECK: Timeline view: # CHECK-NEXT: 01234 @@ -2740,14 +2740,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . 
DeeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2758,42 +2758,42 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 3.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 3.0 # CHECK: [35] Code Region - G36 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 611 +# CHECK-NEXT: Total Cycles: 910 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 15 -# CHECK-NEXT: uOps Per Cycle: 7.36 -# CHECK-NEXT: IPC: 1.64 -# CHECK-NEXT: Block RThroughput: 5.3 +# CHECK: Dispatch Width: 8 +# CHECK-NEXT: uOps Per Cycle: 4.95 +# CHECK-NEXT: IPC: 1.10 +# CHECK-NEXT: Block RThroughput: 5.6 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER .. 
ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeeER .. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,3] .D=eE-------R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER .. ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,5] . D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,7] . D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,9] . D==eE-------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 +# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,3] . DeE-------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER . . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,5] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,7] . .DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: [0,9] . . DeE-------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2805,41 +2805,41 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 2.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 9. 1 3.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.7 0.2 3.2 +# CHECK-NEXT: 3. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 +# CHECK-NEXT: 9. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.1 0.4 3.2 # CHECK: [36] Code Region - G37 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 610 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 4800 -# CHECK: Dispatch Width: 15 -# CHECK-NEXT: uOps Per Cycle: 7.87 -# CHECK-NEXT: IPC: 1.64 +# CHECK: Dispatch Width: 8 +# CHECK-NEXT: uOps Per Cycle: 4.76 +# CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: 012345 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,3] .D=eE-------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,5] . D=eE-------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeeeER. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE-------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,9] . D==eE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . 
ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,3] . DeE-------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,5] . DeE-------R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeeER. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE-------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2849,43 +2849,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 5. 1 2.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.7 0.2 3.3 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 3. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 5. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.3 # CHECK: [37] Code Region - G38 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 660 +# CHECK-NEXT: Total Cycles: 1010 # CHECK-NEXT: Total uOps: 4800 -# CHECK: Dispatch Width: 15 -# CHECK-NEXT: uOps Per Cycle: 7.27 -# CHECK-NEXT: IPC: 1.52 +# CHECK: Dispatch Width: 8 +# CHECK-NEXT: uOps Per Cycle: 4.75 +# CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeeER .. ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-------R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER .. ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeeeER.. ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE-------R.. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=eeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,9] . D==eE-------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE-------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,9] . . 
DeE-------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2895,16 +2895,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 2.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 7.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.7 0.2 3.3 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 9. 1 1.0 0.0 7.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.3 # CHECK: [38] Code Region - G39 @@ -2913,25 +2913,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 1.12 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK-NEXT: Block RThroughput: 5.6 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: [0,1] D=eE------R . . . . . . 
. add x0, x27, #1 -# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2941,16 +2941,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: 5. 
1 16.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 -# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 -# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 15.5 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 +# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 13.0 0.1 3.0 # CHECK: [39] Code Region - G40 @@ -2959,25 +2959,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 4003 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 1.12 # CHECK-NEXT: IPC: 0.25 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK-NEXT: Block RThroughput: 5.6 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 012 # CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: [0,7] . D======================eE------R. . . 
add x0, x27, #1 -# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -2987,16 +2987,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 -# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 -# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 15.5 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 
1 13.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 +# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 13.0 0.1 3.0 # CHECK: [40] Code Region - G41 @@ -3005,25 +3005,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 1903 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.36 # CHECK-NEXT: IPC: 0.53 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK-NEXT: Block RThroughput: 5.6 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 01 # CHECK: [0,0] DeeeeeeeeER . .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: [0,1] D=eE------R . .. add x0, x27, #1 -# CHECK-NEXT: [0,2] .D=======eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: [0,3] .D========eE------R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=======eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,5] . D========eE------R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=======eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 -# CHECK-NEXT: [0,7] . D========eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=======eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 -# CHECK-NEXT: [0,9] . D========eE------R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE------R . .. add x0, x27, #1 +# CHECK-NEXT: [0,2] . D======eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: [0,3] . D======eE------R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=====eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,5] . D=====eE------R.. add x0, x27, #1 +# CHECK-NEXT: [0,6] . .D====eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 +# CHECK-NEXT: [0,7] . 
. D====eE------R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . . D===eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 +# CHECK-NEXT: [0,9] . . D===eE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3033,43 +3033,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 8.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 5. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 8.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 -# CHECK-NEXT: 7. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 8.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 -# CHECK-NEXT: 9. 1 9.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 7.1 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 6.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 5. 1 6.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 5.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32 +# CHECK-NEXT: 7. 1 5.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16 +# CHECK-NEXT: 9. 
1 4.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 4.6 0.1 3.0 # CHECK: [41] Code Region - G42 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 510 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 15 -# CHECK-NEXT: uOps Per Cycle: 8.82 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 8 +# CHECK-NEXT: uOps Per Cycle: 4.46 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 5.6 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 -# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 -# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 -# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 -# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 -# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 +# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 +# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 +# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 +# CHECK-NEXT: [0,9] . . 
DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3078,44 +3078,44 @@ add x0, x27, 1 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 -# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 -# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 -# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 3.0 +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4 +# CHECK-NEXT: 9. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.0 # CHECK: [42] Code Region - G43 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 510 +# CHECK-NEXT: Total Cycles: 1009 # CHECK-NEXT: Total uOps: 4500 -# CHECK: Dispatch Width: 15 -# CHECK-NEXT: uOps Per Cycle: 8.82 -# CHECK-NEXT: IPC: 1.96 -# CHECK-NEXT: Block RThroughput: 5.0 +# CHECK: Dispatch Width: 8 +# CHECK-NEXT: uOps Per Cycle: 4.46 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 5.6 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1 +# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . . 
DeE------R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3125,43 +3125,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 1 1.5 0.1 3.0 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 9. 
1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.0 0.5 3.0 # CHECK: [43] Code Region - G44 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 508 +# CHECK-NEXT: Total Cycles: 609 # CHECK-NEXT: Total uOps: 3300 -# CHECK: Dispatch Width: 15 -# CHECK-NEXT: uOps Per Cycle: 6.50 -# CHECK-NEXT: IPC: 1.97 -# CHECK-NEXT: Block RThroughput: 3.7 +# CHECK: Dispatch Width: 8 +# CHECK-NEXT: uOps Per Cycle: 5.42 +# CHECK-NEXT: IPC: 1.64 +# CHECK-NEXT: Block RThroughput: 4.1 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeeER. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE------R. add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,5] . D=eE------R add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeE-R ldp s1, s2, [x27], #248 -# CHECK-NEXT: [0,7] . D==eE-----R add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER ldp d1, d2, [x27], #496 -# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE------R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . DeE------R add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeE-R ldp s1, s2, [x27], #248 +# CHECK-NEXT: [0,7] . D=eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeeER ldp d1, d2, [x27], #496 +# CHECK-NEXT: [0,9] . 
D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3171,16 +3171,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 1.0 ldp s1, s2, [x27], #248 -# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldp d1, d2, [x27], #496 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.1 0.1 2.8 +# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 1.0 ldp s1, s2, [x27], #248 +# CHECK-NEXT: 7. 1 2.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ldp d1, d2, [x27], #496 +# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.4 0.3 2.8 # CHECK: [44] Code Region - G45 @@ -3189,7 +3189,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 1700 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.35 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3202,12 +3202,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE----R .. add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeeeER.. ldp s1, s2, [x27, #248]! # CHECK-NEXT: [0,3] D==eE----R.. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeeeER. ldp d1, d2, [x27, #496]! -# CHECK-NEXT: [0,5] D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER ldp q1, q2, [x27, #992]! 
-# CHECK-NEXT: [0,7] D====eE----R add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeE-R ldp w1, w2, [x27], #248 -# CHECK-NEXT: [0,9] .D====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER. ldp d1, d2, [x27, #496]! +# CHECK-NEXT: [0,5] .D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER ldp q1, q2, [x27, #992]! +# CHECK-NEXT: [0,7] .D===eE----R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeE-R ldp w1, w2, [x27], #248 +# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3220,13 +3220,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldp s1, s2, [x27, #248]! # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldp d1, d2, [x27, #496]! -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ldp q1, q2, [x27, #992]! -# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 1.0 ldp w1, w2, [x27], #248 -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.3 0.1 2.0 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp d1, d2, [x27, #496]! +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldp q1, q2, [x27, #992]! +# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 1.0 ldp w1, w2, [x27], #248 +# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 2.0 # CHECK: [45] Code Region - G46 @@ -3235,7 +3235,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 507 # CHECK-NEXT: Total uOps: 1900 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.75 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 3.0 @@ -3248,12 +3248,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE--R .. add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER .. ldp w1, w2, [x27, #248]! # CHECK-NEXT: [0,3] D==eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER .. ldp x1, x2, [x27, #496]! 
-# CHECK-NEXT: [0,5] D===eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeER. ldpsw x1, x2, [x27], #248 -# CHECK-NEXT: [0,7] D====eE---R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeeER ldpsw x1, x2, [x27, #248]! -# CHECK-NEXT: [0,9] .D====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeER .. ldp x1, x2, [x27, #496]! +# CHECK-NEXT: [0,5] .D==eE--R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeER. ldpsw x1, x2, [x27], #248 +# CHECK-NEXT: [0,7] .D===eE---R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeER ldpsw x1, x2, [x27, #248]! +# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3266,13 +3266,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldp w1, w2, [x27, #248]! # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldp x1, x2, [x27, #496]! -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27], #248 -# CHECK-NEXT: 7. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27, #248]! -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.3 0.1 1.2 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp x1, x2, [x27, #496]! +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldpsw x1, x2, [x27], #248 +# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldpsw x1, x2, [x27, #248]! +# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 1.2 # CHECK: [46] Code Region - G47 @@ -3281,7 +3281,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3295,11 +3295,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ldr h1, [x27], #254 # CHECK-NEXT: [0,3] D==eE----R. . 
add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeeeER . ldr s1, [x27], #254 -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER. ldr d1, [x27], #254 -# CHECK-NEXT: [0,7] D====eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] D====eeeeeeER ldr q1, [x27], #254 -# CHECK-NEXT: [0,9] D=====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. ldr d1, [x27], #254 +# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] .D===eeeeeeER ldr q1, [x27], #254 +# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3313,12 +3313,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27], #254 # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr s1, [x27], #254 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ldr d1, [x27], #254 -# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ldr q1, [x27], #254 -# CHECK-NEXT: 9. 1 6.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.1 2.0 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27], #254 +# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldr q1, [x27], #254 +# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.0 0.1 2.0 # CHECK: [47] Code Region - G48 @@ -3327,7 +3327,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.95 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3341,11 +3341,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeeeER. . ldr h1, [x27, #254]! # CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeeeER . ldr s1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeeeER. 
ldr d1, [x27, #254]! -# CHECK-NEXT: [0,7] D====eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] D====eeeeeeER ldr q1, [x27, #254]! -# CHECK-NEXT: [0,9] D=====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. ldr d1, [x27, #254]! +# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] .D===eeeeeeER ldr q1, [x27, #254]! +# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3359,12 +3359,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27, #254]! # CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr s1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ldr d1, [x27, #254]! -# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ldr q1, [x27, #254]! -# CHECK-NEXT: 9. 1 6.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.1 2.0 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27, #254]! +# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldr q1, [x27, #254]! +# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.0 0.1 2.0 # CHECK: [48] Code Region - G49 @@ -3373,7 +3373,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3387,11 +3387,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeER . ldr x1, [x27], #254 # CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeER . ldr w1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeER. ldr x1, [x27, #254]! -# CHECK-NEXT: [0,7] D====eE--R. 
add x0, x27, #1 -# CHECK-NEXT: [0,8] D====eeeeER ldrb w1, [x27], #254 -# CHECK-NEXT: [0,9] D=====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeER. ldr x1, [x27, #254]! +# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] .D===eeeeER ldrb w1, [x27], #254 +# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3405,12 +3405,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr x1, [x27], #254 # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr w1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ldr x1, [x27, #254]! -# CHECK-NEXT: 7. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ldrb w1, [x27], #254 -# CHECK-NEXT: 9. 1 6.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.1 1.0 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr x1, [x27, #254]! +# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrb w1, [x27], #254 +# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.0 0.1 1.0 # CHECK: [49] Code Region - G50 @@ -3419,7 +3419,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3433,11 +3433,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeER . ldrh w1, [x27], #254 # CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeER . ldrh w1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeER. ldrsb w1, [x27], #254 -# CHECK-NEXT: [0,7] D====eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] D====eeeeER ldrsb x1, [x27], #254 -# CHECK-NEXT: [0,9] D=====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE--R . 
add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeER. ldrsb w1, [x27], #254 +# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] .D===eeeeER ldrsb x1, [x27], #254 +# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3451,12 +3451,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrh w1, [x27], #254 # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrh w1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ldrsb w1, [x27], #254 -# CHECK-NEXT: 7. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ldrsb x1, [x27], #254 -# CHECK-NEXT: 9. 1 6.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.1 1.0 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldrsb w1, [x27], #254 +# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrsb x1, [x27], #254 +# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.0 0.1 1.0 # CHECK: [50] Code Region - G51 @@ -3465,7 +3465,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 1500 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 2.96 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3479,11 +3479,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeER . ldrsb x1, [x27, #254]! # CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeER . ldrsh w1, [x27], #254 -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeER. ldrsh x1, [x27], #254 -# CHECK-NEXT: [0,7] D====eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] D====eeeeER ldrsh w1, [x27, #254]! -# CHECK-NEXT: [0,9] D=====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeER. ldrsh x1, [x27], #254 +# CHECK-NEXT: [0,7] .D===eE--R. 
add x0, x27, #1 +# CHECK-NEXT: [0,8] .D===eeeeER ldrsh w1, [x27, #254]! +# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3497,12 +3497,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrsb x1, [x27, #254]! # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsh w1, [x27], #254 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 ldrsh x1, [x27], #254 -# CHECK-NEXT: 7. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 ldrsh w1, [x27, #254]! -# CHECK-NEXT: 9. 1 6.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.1 1.0 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldrsh x1, [x27], #254 +# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrsh w1, [x27, #254]! +# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 3.0 0.1 1.0 # CHECK: [51] Code Region - G52 @@ -3511,7 +3511,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 1700 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.37 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3524,11 +3524,11 @@ add x0, x27, 1 # CHECK-NEXT: [0,2] D=eeeeER. ldrsw x1, [x27], #254 # CHECK-NEXT: [0,3] D==eE--R. add x0, x27, #1 # CHECK-NEXT: [0,4] D==eeeeER ldrsw x1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eE--R add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeE-R st1 { v1.1d }, [x27], #8 -# CHECK-NEXT: [0,7] D====eE-R add x0, x27, #1 +# CHECK-NEXT: [0,5] .D==eE--R add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeE-R st1 { v1.1d }, [x27], #8 +# CHECK-NEXT: [0,7] .D===eE-R add x0, x27, #1 # CHECK-NEXT: [0,8] .D===eeER st1 { v1.2d }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,9] . 
D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3542,12 +3542,12 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrsw x1, [x27], #254 # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 1.0 st1 { v1.1d }, [x27], #8 -# CHECK-NEXT: 7. 1 5.0 0.0 1.0 add x0, x27, #1 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 1.0 st1 { v1.1d }, [x27], #8 +# CHECK-NEXT: 7. 1 4.0 0.0 1.0 add x0, x27, #1 # CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.3 0.1 0.8 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.9 0.1 0.8 # CHECK: [52] Code Region - G53 @@ -3556,7 +3556,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3568,12 +3568,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeER . st1 { v1.4h }, [x27], #8 # CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . st1 { v1.4s }, [x27], #16 -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeER. st1 { v1.8b }, [x27], #8 +# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.4s }, [x27], #16 +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8b }, [x27], #8 # CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER st1 { v1.8h }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeER st1 { v1.8h }, [x27], #16 +# CHECK-NEXT: [0,9] . 
D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3586,13 +3586,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8 # CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.4s }, [x27], #16 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.8b }, [x27], #8 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s }, [x27], #16 +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], #8 # CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.8h }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 0.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], #16 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 0.0 # CHECK: [53] Code Region - G54 @@ -3601,7 +3601,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3613,12 +3613,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeER . st1 { v1.1d }, [x27], x28 # CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . st1 { v1.2d }, [x27], x28 -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeER. st1 { v1.2s }, [x27], x28 +# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.2d }, [x27], x28 +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.2s }, [x27], x28 # CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER st1 { v1.4h }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h }, [x27], x28 +# CHECK-NEXT: [0,9] . 
D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3631,13 +3631,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.2d }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.2s }, [x27], x28 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.2s }, [x27], x28 # CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.4h }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 0.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.4h }, [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 0.0 # CHECK: [54] Code Region - G55 @@ -3646,7 +3646,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -3658,12 +3658,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeER . st1 { v1.8b }, [x27], x28 # CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . st1 { v1.8h }, [x27], x28 -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeER. st1 { v1.16b }, [x27], x28 +# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.8h }, [x27], x28 +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.16b }, [x27], x28 # CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER st1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeER st1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: [0,9] . 
D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3676,13 +3676,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.16b }, [x27], x28 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8h }, [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.16b }, [x27], x28 # CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 0.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 0.0 # CHECK: [55] Code Region - G56 @@ -3691,7 +3691,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.76 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 3.5 @@ -3701,14 +3701,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . st1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER st1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeER . 
st1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeER st1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3719,15 +3719,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16 +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.1 0.1 0.0 # CHECK: [56] Code Region - G57 @@ -3736,7 +3736,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.16 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 4.0 @@ -3746,14 +3746,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . 
st1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . st1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER st1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3764,15 +3764,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 
1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [57] Code Region - G58 @@ -3781,7 +3781,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.16 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 4.0 @@ -3791,14 +3791,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . st1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER st1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER. st1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeER st1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3809,15 +3809,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 
1 3.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 0.0 # CHECK: [58] Code Region - G59 @@ -3826,7 +3826,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 3400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.84 # CHECK-NEXT: IPC: 1.42 # CHECK-NEXT: Block RThroughput: 6.0 @@ -3836,14 +3836,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D===eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,7] .D====eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: [0,9] . D====eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,7] . D==eER. 
add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3854,15 +3854,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 7. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.2 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48 +# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.9 0.2 0.0 # CHECK: [59] Code Region - G60 @@ -3871,7 +3871,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 3600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.12 # CHECK-NEXT: IPC: 1.42 # CHECK-NEXT: Block RThroughput: 6.5 @@ -3881,14 +3881,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . 
st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D===eeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: [0,7] .D====eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D====eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3899,15 +3899,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 -# CHECK-NEXT: 7. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.2 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28 +# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 
1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.9 0.2 0.0 # CHECK: [60] Code Region - G61 @@ -3916,7 +3916,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 3400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.84 # CHECK-NEXT: IPC: 1.42 # CHECK-NEXT: Block RThroughput: 6.0 @@ -3926,14 +3926,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeER. . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D====eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3944,15 +3944,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 
1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.2 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.7 0.2 0.0 # CHECK: [61] Code Region - G62 @@ -3961,7 +3961,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 704 # CHECK-NEXT: Total uOps: 3600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.11 # CHECK-NEXT: IPC: 1.42 # CHECK-NEXT: Block RThroughput: 6.5 @@ -3972,14 +3972,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D==eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,5] .D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D===eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,7] . D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D====eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: [0,9] . D=====eER add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,5] . D=eER . 
add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,7] . D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3990,15 +3990,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 -# CHECK-NEXT: 9. 1 6.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.4 0.3 0.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.2 0.3 0.0 # CHECK: [62] Code Region - G63 @@ -4007,7 +4007,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 804 # CHECK-NEXT: Total uOps: 4200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.22 # CHECK-NEXT: IPC: 1.24 # CHECK-NEXT: Block RThroughput: 8.0 @@ -4017,15 +4017,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeER. .. 
st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,1] D=eER. .. add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,1] .DeER. .. add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 # CHECK-NEXT: [0,3] .D=eER .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D==eeER .. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,5] .D===eER .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeER .. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,7] . D===eER .. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D=====eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: [0,9] . D=====eER add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeER .. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,5] . D=eER .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeER .. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,7] . DeER .. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4035,16 +4035,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 # CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 6.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.3 0.4 0.0 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.8 0.4 0.0 # CHECK: [63] Code Region - G64 @@ -4053,7 +4053,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 703 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.41 # CHECK-NEXT: IPC: 1.42 # CHECK-NEXT: Block RThroughput: 7.0 @@ -4062,15 +4062,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,1] .DeER. . add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 # CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D==eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,5] .D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D==eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,7] . D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . 
D==eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4080,16 +4080,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 # CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.2 0.0 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.9 0.2 0.0 # CHECK: [64] Code Region - G65 @@ -4098,7 +4098,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 706 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.53 # CHECK-NEXT: IPC: 1.42 # CHECK-NEXT: Block RThroughput: 5.5 @@ -4108,15 +4108,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eER. . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eER . . 
add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===eeeeER . st1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: [0,5] .D====eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D===eeeeER. st1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: [0,7] . D====eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D====eeeeER st1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D=====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeeeER . st1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: [0,5] . D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeER. st1 { v1.b }[8], [x27], #1 +# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4126,16 +4126,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 4.0 2.0 0.0 st1 { v1.b }[0], [x27], #1 -# CHECK-NEXT: 5. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.b }[8], [x27], #1 -# CHECK-NEXT: 7. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 0.0 0.0 st1 { v1.b }[0], [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.5 0.3 0.6 +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.b }[0], [x27], #1 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.b }[8], [x27], #1 +# CHECK-NEXT: 7. 
1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.b }[0], [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.1 0.3 0.6 # CHECK: [65] Code Region - G66 @@ -4144,7 +4144,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.95 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -4157,12 +4157,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER . st1 { v1.h }[0], [x27], #2 # CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . st1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeER. st1 { v1.h }[0], [x27], x28 +# CHECK-NEXT: [0,4] .D=eeeeER . st1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeER. st1 { v1.h }[0], [x27], x28 # CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER st1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4175,13 +4175,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2 # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], #2 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.h }[0], [x27], x28 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.h }[4], [x27], #2 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.h }[0], [x27], x28 # CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.h }[4], [x27], x28 -# CHECK-NEXT: 9. 
1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 1.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 1.0 # CHECK: [66] Code Region - G67 @@ -4190,7 +4190,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.35 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 3.0 @@ -4203,12 +4203,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER . st1 { v1.s }[0], [x27], x28 # CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . st1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeER. st1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: [0,4] .D=eeeeER . st1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeER. st1 { v1.d }[0], [x27], x28 # CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER st2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4221,13 +4221,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], #8 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.d }[0], [x27], x28 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.d }[0], [x27], #8 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], x28 # CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 4.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 1.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32 +# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 1.0 # CHECK: [67] Code Region - G68 @@ -4236,7 +4236,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.74 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 3.5 @@ -4249,12 +4249,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.4h, v2.4h }, [x27], #16 # CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . st2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER st2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeER. st2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeER st2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4267,13 +4267,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16 # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 
1 4.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 1.0 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16 +# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32 +# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 1.0 # CHECK: [68] Code Region - G69 @@ -4282,7 +4282,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.14 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 4.0 @@ -4293,14 +4293,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . st2 { v1.16b, v2.16b }, [x27], #32 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . st2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER st2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . st2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeER. st2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeER st2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: [0,9] . 
D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4311,15 +4311,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28 +# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.9 0.1 1.0 # CHECK: [69] Code Region - G70 @@ -4328,7 +4328,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.74 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 3.5 @@ -4339,14 +4339,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . st2 { v1.8b, v2.8b }, [x27], x28 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . st2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. 
st2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER st2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . st2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st2 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER. st2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeER st2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4357,15 +4357,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.0 0.1 1.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2 +# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2 +# CHECK-NEXT: 9. 
1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.7 0.1 1.0 # CHECK: [70] Code Region - G71 @@ -4374,7 +4374,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.95 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -4387,12 +4387,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.b, v2.b }[8], [x27], x28 # CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . st2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeER. st2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.h, v2.h }[0], [x27], #4 +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.h, v2.h }[4], [x27], #4 # CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER st2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4405,13 +4405,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28 # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4 # CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28 -# CHECK-NEXT: 9. 
1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 1.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 1.0 # CHECK: [71] Code Region - G72 @@ -4420,7 +4420,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.95 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -4433,12 +4433,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.s, v2.s }[0], [x27], #8 # CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeeeER . st2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeeeER. st2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.s, v2.s }[0], [x27], x28 +# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.d, v2.d }[0], [x27], #16 # CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeeeER st2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4451,13 +4451,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8 # CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16 # CHECK-NEXT: 7. 
1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 1.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 1.0 # CHECK: [72] Code Region - G73 @@ -4466,7 +4466,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 406 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.93 # CHECK-NEXT: IPC: 1.48 # CHECK-NEXT: Block RThroughput: 3.5 @@ -4476,10 +4476,10 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 # CHECK-NEXT: [0,1] D=eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D==eeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: [0,5] .D===eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: [0,5] . D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4490,11 +4490,11 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48 # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.5 0.3 1.2 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24 +# CHECK-NEXT: 5. 
1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.8 0.3 1.2 # CHECK: [73] Code Region - G74 @@ -4503,7 +4503,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 707 # CHECK-NEXT: Total uOps: 3800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.37 # CHECK-NEXT: IPC: 1.41 # CHECK-NEXT: Block RThroughput: 7.0 @@ -4514,14 +4514,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 # CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: [0,3] D==eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D==eeeeeER . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: [0,5] .D===eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D===eeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: [0,7] . D===eE---R . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D====eeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: [0,9] . D=====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: [0,3] .D=eE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeeeeER . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: [0,5] . D==eE---R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: [0,7] . D==eE---R . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4532,15 +4532,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48 # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 -# CHECK-NEXT: 5. 
1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 -# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.4 0.3 1.4 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48 +# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48 +# CHECK-NEXT: 7. 1 3.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28 +# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.3 1.4 # CHECK: [74] Code Region - G75 @@ -4549,7 +4549,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 706 # CHECK-NEXT: Total uOps: 3400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.82 # CHECK-NEXT: IPC: 1.42 # CHECK-NEXT: Block RThroughput: 6.0 @@ -4560,14 +4560,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 # CHECK-NEXT: [0,1] D=eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: [0,3] D==eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeER. . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: [0,5] .D==eE---R. . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: [0,7] .D===eE--R. . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: [0,9] . D====eE---R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: [0,3] .D=eE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeER. . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: [0,5] . D=eE---R. . 
add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER. . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE--R. . add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: [0,9] . D==eE---R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4578,15 +4578,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.2 1.2 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 3.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.7 0.2 1.2 # CHECK: [75] Code Region - G76 @@ -4595,7 +4595,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 606 # CHECK-NEXT: Total uOps: 3200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.28 # CHECK-NEXT: IPC: 1.65 # CHECK-NEXT: Block RThroughput: 5.5 @@ -4606,14 +4606,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeER .. st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 # CHECK-NEXT: [0,1] D=eE---R .. 
add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER .. st3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: [0,3] D==eE--R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D==eeeeER.. st3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: [0,5] .D===eE--R.. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D===eeeeER. st3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D====eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: [0,9] . D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER .. st3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: [0,3] .D=eE--R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeeeER.. st3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: [0,5] . D==eE--R.. add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeER. st3 { v1.b, v2.b, v3.b }[0], [x27], x28 +# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D=eeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4624,15 +4624,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28 -# CHECK-NEXT: 7. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.3 0.2 1.1 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3 +# CHECK-NEXT: 5. 
1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28 +# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.1 0.2 1.1 # CHECK: [76] Code Region - G77 @@ -4641,7 +4641,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 3000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.93 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 5.0 @@ -4652,14 +4652,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], #6 # CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. st3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER. st3 { v1.h, v2.h, v3.h }[4], [x27], x28 +# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4670,15 +4670,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 
1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28 -# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12 -# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.7 0.1 1.0 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12 +# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 1.0 # CHECK: [77] Code Region - G78 @@ -4687,25 +4687,25 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 706 # CHECK-NEXT: Total uOps: 3600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.10 # CHECK-NEXT: IPC: 1.42 # CHECK-NEXT: Block RThroughput: 6.5 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 0123 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: [0,3] D==eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: [0,5] .D==eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeeeER. . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: [0,7] . D==eE--R. . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: [0,9] . 
D===eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeER . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28 +# CHECK-NEXT: [0,1] D=eE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] .DeeeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: [0,3] .D=eE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: [0,5] . D=eE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeER. . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: [0,7] . DeE--R. . add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4716,15 +4716,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24 -# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28 -# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 -# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 -# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.6 0.1 1.2 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24 +# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28 +# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64 +# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32 +# CHECK-NEXT: 9. 
1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.4 0.2 1.2 # CHECK: [78] Code Region - G79 @@ -4733,7 +4733,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 1205 # CHECK-NEXT: Total uOps: 5800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.81 # CHECK-NEXT: IPC: 0.83 # CHECK-NEXT: Block RThroughput: 12.0 @@ -4745,13 +4745,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 # CHECK-NEXT: [0,1] D=eE----R . .. add x0, x27, #1 # CHECK-NEXT: [0,2] .DeeeeeeeER .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: [0,3] .D=eE-----R .. add x0, x27, #1 -# CHECK-NEXT: [0,4] . D=eeeeeeER .. st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: [0,5] . D==eE----R .. add x0, x27, #1 -# CHECK-NEXT: [0,6] . D===eeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: [0,7] . D====eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: [0,9] . D====eE-----R add x0, x27, #1 +# CHECK-NEXT: [0,3] . DeE-----R .. add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER .. st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: [0,5] . D=eE----R .. add x0, x27, #1 +# CHECK-NEXT: [0,6] . D==eeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . .D=eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: [0,9] . . D=eE-----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4763,14 +4763,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 2.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 2.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 -# CHECK-NEXT: 7. 1 5.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 -# CHECK-NEXT: 9. 1 5.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.4 2.3 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 2.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64 +# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64 +# CHECK-NEXT: 9. 1 2.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.8 0.4 2.3 # CHECK: [79] Code Region - G80 @@ -4779,7 +4779,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 1006 # CHECK-NEXT: Total uOps: 4800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.77 # CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 9.5 @@ -4789,15 +4789,15 @@ add x0, x27, 1 # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeER . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE--R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeER. . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===eeeeeeER . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: [0,5] . D===eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D===eeeeeeeER. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: [0,7] . D====eE-----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D====eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: [0,9] . D=====eE----R add x0, x27, #1 +# CHECK-NEXT: [0,1] .DeE--R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeER . 
st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: [0,3] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeeeeeER . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeeER. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: [0,7] . D=eE-----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . .D=eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: [0,9] . .D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4807,43 +4807,43 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 +# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28 # CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 4.0 2.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 -# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 -# CHECK-NEXT: 7. 1 5.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 5.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 -# CHECK-NEXT: 9. 1 6.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.4 0.4 1.9 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28 +# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28 +# CHECK-NEXT: 7. 1 2.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28 +# CHECK-NEXT: 9. 
1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.9 0.4 1.9 # CHECK: [80] Code Region - G81 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 807 +# CHECK-NEXT: Total Cycles: 808 # CHECK-NEXT: Total uOps: 5200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 6.44 # CHECK-NEXT: IPC: 1.24 -# CHECK-NEXT: Block RThroughput: 6.0 +# CHECK-NEXT: Block RThroughput: 6.5 # CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1 -# CHECK-NEXT: [0,2] .DeeeeeeeER . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1 -# CHECK-NEXT: [0,4] . DeeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 -# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D===eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: [0,7] . D===eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D===eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: [0,9] . D====eE----R add x0, x27, #1 +# CHECK: [0,0] DeeeeeeeER. . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 +# CHECK-NEXT: [0,1] .DeE-----R. . add x0, x27, #1 +# CHECK-NEXT: [0,2] . DeeeeeeeER . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: [0,3] . DeE-----R . add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . .D=eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: [0,9] . 
.D==eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4853,16 +4853,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28 -# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 -# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 +# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28 +# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4 # CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 2.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 -# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 -# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.6 0.3 2.2 +# CHECK-NEXT: 6. 1 2.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4 +# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28 +# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.7 0.4 2.2 # CHECK: [81] Code Region - G82 @@ -4871,7 +4871,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 4000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 7.87 # CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 5.0 @@ -4882,14 +4882,14 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 # CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: [0,2] .DeeeeeeER. . 
st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 # CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,6] . D=eeeeeeER. st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,8] . D==eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1 +# CHECK-NEXT: [0,4] . DeeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 +# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] . DeeeeeeER. st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . DeeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4900,15 +4900,15 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8 # CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 -# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 -# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 -# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.3 0.1 2.0 +# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8 +# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28 +# CHECK-NEXT: 7. 
1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28 +# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 1.5 0.1 2.0 # CHECK: [82] Code Region - G83 @@ -4917,10 +4917,10 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 506 # CHECK-NEXT: Total uOps: 2800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.53 # CHECK-NEXT: IPC: 1.58 -# CHECK-NEXT: Block RThroughput: 2.0 +# CHECK-NEXT: Block RThroughput: 3.5 # CHECK: Timeline view: # CHECK-NEXT: 0 @@ -4928,12 +4928,12 @@ add x0, x27, 1 # CHECK: [0,0] DeeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 # CHECK-NEXT: [0,1] D=eE----R . add x0, x27, #1 -# CHECK-NEXT: [0,2] D=eeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: [0,2] .DeeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 # CHECK-NEXT: [0,3] .D=eE----R. add x0, x27, #1 -# CHECK-NEXT: [0,4] .D==eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: [0,5] .D===eE--R. add x0, x27, #1 -# CHECK-NEXT: [0,6] .D===eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: [0,7] .D====eE--R add x0, x27, #1 +# CHECK-NEXT: [0,4] . D=eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: [0,5] . D==eE--R. add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: [0,7] . D==eE--R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -4944,13 +4944,13 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16 # CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28 # CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1 -# CHECK-NEXT: 4. 
1 3.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 -# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 -# CHECK-NEXT: 7. 1 5.0 0.0 2.0 add x0, x27, #1 -# CHECK-NEXT: 1 2.9 0.3 1.5 +# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32 +# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28 +# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.0 0.3 1.5 # CHECK: [83] Code Region - G84 @@ -4959,7 +4959,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 204 # CHECK-NEXT: Total uOps: 800 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.92 # CHECK-NEXT: IPC: 1.96 # CHECK-NEXT: Block RThroughput: 1.0 @@ -4992,7 +4992,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 4.37 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 3.5 @@ -5003,13 +5003,13 @@ add x0, x27, 1 # CHECK: [0,0] DeeER. . stp q1, q2, [x27], #992 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeER . stp s1, s2, [x27, #248]! -# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . stp d1, d2, [x27, #496]! -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D==eeER. stp q1, q2, [x27, #992]! -# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eER. stp w1, w2, [x27], #248 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeER . stp d1, d2, [x27, #496]! +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] . D=eeER. stp q1, q2, [x27, #992]! +# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eER. stp w1, w2, [x27], #248 +# CHECK-NEXT: [0,9] . 
D==eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5021,14 +5021,14 @@ add x0, x27, 1 # CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp q1, q2, [x27], #992 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp s1, s2, [x27, #248]! -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stp d1, d2, [x27, #496]! -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 3.0 0.0 0.0 stp q1, q2, [x27, #992]! -# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 stp w1, w2, [x27], #248 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.1 0.1 0.0 +# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp d1, d2, [x27, #496]! +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 2.0 0.0 0.0 stp q1, q2, [x27, #992]! +# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 stp w1, w2, [x27], #248 +# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.3 0.1 0.0 # CHECK: [85] Code Region - G86 @@ -5037,7 +5037,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -5049,12 +5049,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eER. . stp w1, w2, [x27, #248]! # CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eER . stp x1, x2, [x27, #496]! -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeER. str b1, [x27], #254 +# CHECK-NEXT: [0,4] .D=eER . stp x1, x2, [x27, #496]! +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeER. str b1, [x27], #254 # CHECK-NEXT: [0,7] .D===eER. 
add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER str h1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeER str h1, [x27], #254 +# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5067,13 +5067,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp w1, w2, [x27, #248]! # CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stp x1, x2, [x27, #496]! -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 str b1, [x27], #254 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp x1, x2, [x27, #496]! +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str b1, [x27], #254 # CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 str h1, [x27], #254 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 0.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str h1, [x27], #254 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 0.0 # CHECK: [86] Code Region - G87 @@ -5082,7 +5082,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -5094,12 +5094,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeER . str d1, [x27], #254 # CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . str q1, [x27], #254 -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eeER. str b1, [x27, #254]! +# CHECK-NEXT: [0,4] .D=eeER . str q1, [x27], #254 +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeER. str b1, [x27, #254]! # CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eeER str h1, [x27, #254]! 
-# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeER str h1, [x27, #254]! +# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5112,13 +5112,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 str d1, [x27], #254 # CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 str q1, [x27], #254 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 str b1, [x27, #254]! +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str q1, [x27], #254 +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str b1, [x27, #254]! # CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 str h1, [x27, #254]! -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 0.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str h1, [x27, #254]! +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 0.0 # CHECK: [87] Code Region - G88 @@ -5127,7 +5127,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -5139,12 +5139,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eeER . str d1, [x27, #254]! # CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eeER . str q1, [x27, #254]! -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eER . str w1, [x27], #254 +# CHECK-NEXT: [0,4] .D=eeER . str q1, [x27, #254]! +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eER . str w1, [x27], #254 # CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eER. str x1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eER. 
str x1, [x27], #254 +# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5157,13 +5157,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 str d1, [x27, #254]! # CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 str q1, [x27, #254]! -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 str w1, [x27], #254 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str q1, [x27, #254]! +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str w1, [x27], #254 # CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 str x1, [x27], #254 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 0.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str x1, [x27], #254 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 0.0 # CHECK: [88] Code Region - G89 @@ -5172,7 +5172,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 504 # CHECK-NEXT: Total uOps: 2000 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.97 # CHECK-NEXT: IPC: 1.98 # CHECK-NEXT: Block RThroughput: 2.5 @@ -5184,12 +5184,12 @@ add x0, x27, 1 # CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1 # CHECK-NEXT: [0,2] D=eER. . str x1, [x27, #254]! # CHECK-NEXT: [0,3] D==eER . add x0, x27, #1 -# CHECK-NEXT: [0,4] D==eER . strb w1, [x27], #254 -# CHECK-NEXT: [0,5] D===eER . add x0, x27, #1 -# CHECK-NEXT: [0,6] D===eER . strb w1, [x27, #254]! +# CHECK-NEXT: [0,4] .D=eER . strb w1, [x27], #254 +# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eER . strb w1, [x27, #254]! # CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1 -# CHECK-NEXT: [0,8] .D===eER. strh w1, [x27], #254 -# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eER. strh w1, [x27], #254 +# CHECK-NEXT: [0,9] . 
D===eER add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -5202,13 +5202,13 @@ add x0, x27, 1 # CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1 # CHECK-NEXT: 2. 1 2.0 0.0 0.0 str x1, [x27, #254]! # CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 3.0 0.0 0.0 strb w1, [x27], #254 -# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 4.0 0.0 0.0 strb w1, [x27, #254]! +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 strb w1, [x27], #254 +# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 strb w1, [x27, #254]! # CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 4.0 0.0 0.0 strh w1, [x27], #254 -# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 3.2 0.1 0.0 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 strh w1, [x27], #254 +# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 0.0 # CHECK: [89] Code Region - G90 @@ -5217,7 +5217,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 104 # CHECK-NEXT: Total uOps: 400 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.85 # CHECK-NEXT: IPC: 1.92 # CHECK-NEXT: Block RThroughput: 0.5 @@ -5246,7 +5246,7 @@ add x0, x27, 1 # CHECK-NEXT: Total Cycles: 110 # CHECK-NEXT: Total uOps: 600 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 5.45 # CHECK-NEXT: IPC: 3.64 # CHECK-NEXT: Block RThroughput: 1.0 @@ -5272,3 +5272,4 @@ add x0, x27, 1 # CHECK-NEXT: 2. 1 5.0 0.0 0.0 ldr x2, [x1], #254 # CHECK-NEXT: 3. 
1 2.0 0.0 6.0 add x0, x27, #1 # CHECK-NEXT: 1 2.5 0.3 2.0 + diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-zero-dependency.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-zero-dependency.s index 8b1c8a4e4ca55..3954cbd8c5490 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-zero-dependency.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-zero-dependency.s @@ -9,7 +9,7 @@ cmp x0, #4 # CHECK-NEXT: Total Cycles: 54 # CHECK-NEXT: Total uOps: 200 -# CHECK: Dispatch Width: 15 +# CHECK: Dispatch Width: 8 # CHECK-NEXT: uOps Per Cycle: 3.70 # CHECK-NEXT: IPC: 3.70 # CHECK-NEXT: Block RThroughput: 0.5 diff --git a/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list-ext.expected b/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list-ext.expected new file mode 100644 index 0000000000000..f7e7499a2c781 --- /dev/null +++ b/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list-ext.expected @@ -0,0 +1,44 @@ +Function: main: 368038, 0, 7 sampled lines +Samples collected in the function's body { + 4: 1068 + 4.2: 1068 + 5: 2150 + 5.1: 2150 + 6: 4160 + 7: 1068 + 9: 4128, calls: _Z3bari:2942 _Z3fooi:1262 + 9: vtables: _ZTVbar:2942 _ZTVfoo:1260 +} +Samples collected in inlined callsites { + 10: inlined callee: inline1: 2000, 0, 1 sampled lines + Samples collected in the function's body { + 1: 2000 + } + No inlined callsites in this function + 10: inlined callee: inline2: 4000, 0, 1 sampled lines + Samples collected in the function's body { + 1: 4000 + } + No inlined callsites in this function + 10: vtables: _ZTVinline1:2000 _ZTVinline2:4000 +} +Function: _Z3bari: 40602, 2874, 1 sampled lines +Samples collected in the function's body { + 1: 2874 +} +No inlined callsites in this function +Function: _Z3fooi: 15422, 1220, 1 sampled lines +Samples collected in the function's body { + 1: 1220 +} +No inlined callsites in this function +======== Dump profile symbol list ======== +_Z3goov +_Z3sumii +__libc_csu_fini +__libc_csu_init +_dl_relocate_static_pie 
+_fini +_init +_start +main diff --git a/llvm/test/tools/llvm-profdata/Inputs/sample-profile-ext.proftext b/llvm/test/tools/llvm-profdata/Inputs/sample-profile-ext.proftext new file mode 100644 index 0000000000000..100133fa17ccb --- /dev/null +++ b/llvm/test/tools/llvm-profdata/Inputs/sample-profile-ext.proftext @@ -0,0 +1,18 @@ +main:184019:0 + 4: 534 + 4.2: 534 + 5: 1075 + 5.1: 1075 + 6: 2080 + 7: 534 + 9: 2064 _Z3bari:1471 _Z3fooi:631 + 9: vtables _ZTVbar:1471 _ZTVfoo:630 + 10: inline1:1000 + 1: 1000 + 10: inline2:2000 + 1: 2000 + 10: vtables _ZTVinline1:1000 _ZTVinline2:2000 +_Z3bari:20301:1437 + 1: 1437 +_Z3fooi:7711:610 + 1: 610 diff --git a/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test b/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test index b445695c8e8e4..8383bcc1a2fbe 100644 --- a/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test +++ b/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test @@ -4,3 +4,12 @@ REQUIRES: zlib ; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections %t.1.output %t.2.output -o %t.3.output ; RUN: llvm-profdata show -sample -show-prof-sym-list %t.3.output > %t.4.output ; RUN: diff -b %S/Inputs/profile-symbol-list.expected %t.4.output + +;; Generate two SampleFDO binary profiles and merge them. +;; Tests that the vtable counters in the merged profile are the aggregated +;; result from both sources. 
+; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections -extbinary-write-vtable-type-prof -prof-sym-list=%S/Inputs/profile-symbol-list-1.text %S/Inputs/sample-profile-ext.proftext -o %t.1.output +; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections -extbinary-write-vtable-type-prof -prof-sym-list=%S/Inputs/profile-symbol-list-2.text %S/Inputs/sample-profile-ext.proftext -o %t.2.output +; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections -extbinary-write-vtable-type-prof %t.1.output %t.2.output -o %t.3.output +; RUN: llvm-profdata show -sample -show-prof-sym-list %t.3.output > %t.4.output +; RUN: diff -b %S/Inputs/profile-symbol-list-ext.expected %t.4.output diff --git a/llvm/test/tools/llvm-profdata/profile-symbol-list.test b/llvm/test/tools/llvm-profdata/profile-symbol-list.test index 39dcd11ec1db7..6845531066c76 100644 --- a/llvm/test/tools/llvm-profdata/profile-symbol-list.test +++ b/llvm/test/tools/llvm-profdata/profile-symbol-list.test @@ -7,3 +7,12 @@ ; RUN: llvm-profdata show -sample -show-sec-info-only %t.5.output | FileCheck %s -check-prefix=NOSYMLIST ; NOSYMLIST: ProfileSymbolListSection {{.*}} Size: 0 + +;; Generate two SampleFDO binary profiles and merge them. +;; Tests that the vtable counters in the merged profile are the aggregated +;; result from both sources. 
+; RUN: llvm-profdata merge -sample -extbinary -extbinary-write-vtable-type-prof -prof-sym-list=%S/Inputs/profile-symbol-list-1.text %S/Inputs/sample-profile-ext.proftext -o %t.1.output +; RUN: llvm-profdata merge -sample -extbinary -extbinary-write-vtable-type-prof -prof-sym-list=%S/Inputs/profile-symbol-list-2.text %S/Inputs/sample-profile-ext.proftext -o %t.2.output +; RUN: llvm-profdata merge -sample -extbinary -extbinary-write-vtable-type-prof %t.1.output %t.2.output -o %t.3.output +; RUN: llvm-profdata show -sample -show-prof-sym-list %t.3.output > %t.4.output +; RUN: diff -b %S/Inputs/profile-symbol-list-ext.expected %t.4.output diff --git a/llvm/test/tools/llvm-profdata/roundtrip.test b/llvm/test/tools/llvm-profdata/roundtrip.test index 7af76e0a58224..eb55534763877 100644 --- a/llvm/test/tools/llvm-profdata/roundtrip.test +++ b/llvm/test/tools/llvm-profdata/roundtrip.test @@ -16,3 +16,9 @@ RUN: llvm-profdata merge --sample --binary -output=%t.4.profdata %S/Inputs/sampl RUN: llvm-profdata merge --sample --extbinary -output=%t.5.profdata %t.4.profdata RUN: llvm-profdata merge --sample --text -output=%t.4.proftext %t.5.profdata RUN: diff -b %t.4.proftext %S/Inputs/sample-profile.proftext +# Round trip from text --> extbinary --> text. +# The vtable profile is supported by ext-binary profile but not raw binary profile format, +# so we don't use raw binary profile format in this roundtrip. 
+RUN: llvm-profdata merge --sample --extbinary -extbinary-write-vtable-type-prof --output=%t.5.profdata %S/Inputs/sample-profile-ext.proftext +RUN: llvm-profdata merge --sample --text --output=%t.5.proftext %t.5.profdata +RUN: diff -b %t.5.proftext %S/Inputs/sample-profile-ext.proftext diff --git a/llvm/test/tools/llvm-profgen/Inputs/css-pgo-perf.script b/llvm/test/tools/llvm-profgen/Inputs/css-pgo-perf.script new file mode 100644 index 0000000000000..6b1bfc8754381 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/Inputs/css-pgo-perf.script @@ -0,0 +1,19 @@ +PERF_RECORD_MMAP2 21450/21450: [0x260000(0x153000) @ 0x5f000 fd:01 21193997 0]: r-xp /path/to/dap.bin +PERF_RECORD_MMAP2 21450/21450: [0x7ff768d52000(0x2000) @ 0 00:00 0 0]: r-xp [vdso] + + 26090f + 30efd4 + 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/5//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/8//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/2//- 0x26080f/0x260c30/P/-/-/2//- 0x26088f/0x260800/P/-/-/7//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/4//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- + + 351098 + 21 + 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/4//- 0x260832/0x260894/P/-/-/1//- 
0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/10//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26087a/0x260be0/P/-/-/2//- 0x2608ac/0x260870/P/-/-/1//- 0x2607fc/0x2608a4/P/-/-/2//- 0x2608a2/0x2607f0/P/-/-/6//- 0x260832/0x260894/P/-/-/1//- 0x260c52/0x260827/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/12//- 0x350879/0x350887/P/-/-/3//- + + 350f40 + 26090f + 30efd4 + 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26087a/0x260be0/P/-/-/2//- 0x2608ac/0x260870/P/-/-/1//- 0x2607fc/0x2608a4/P/-/-/2//- 0x2608a2/0x2607f0/P/-/-/5//- 0x260832/0x260894/P/-/-/1//- 0x260c52/0x260827/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/10//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/2//- 0x260822/0x260c30/P/-/-/1//- 0x260808/0x26081d/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/4//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/10//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- + + 351098 + 21 + 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/4//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/14//- 
0x350879/0x350887/P/-/-/4//- 0x260c47/0x350850/P/-/-/4//- 0x26080f/0x260c30/P/-/-/2//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/4//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/14//- diff --git a/llvm/test/tools/llvm-profgen/Inputs/dap-perf-trace.txt b/llvm/test/tools/llvm-profgen/Inputs/dap-perf-trace.txt new file mode 100644 index 0000000000000..04025c79fa9a2 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/Inputs/dap-perf-trace.txt @@ -0,0 +1,37 @@ +0 0x7b10 [0x88]: PERF_RECORD_MMAP2 3446532/3446532: [0x200000(0x60000) @ 0 08:01 527501 0]: r--p /path/to/dap.bin +0 0x7b98 [0x88]: PERF_RECORD_MMAP2 3446532/3446532: [0x260000(0x153000) @ 0x5f000 08:01 527501 0]: r-xp /path/to/dap.bin +0 0x7c20 [0x88]: PERF_RECORD_MMAP2 3446532/3446532: [0x3b3000(0xc000) @ 0x1b1000 08:01 527501 0]: r--p /path/to/dap.bin +0 0x7ca8 [0x88]: PERF_RECORD_MMAP2 3446532/3446532: [0x3bf000(0x3000) @ 0x1bc000 08:01 527501 0]: rw-p /path/to/dap.bin +1282514021937402 0x8660 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514022939813 0x87b0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3fb0 +1282514023932029 0x8a00 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3fb0 +1282514024937981 0x8d48 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3fb0 +1282514028925828 0x94c0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3fb0 +1282514028934870 0x9678 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 
3446532/3446532: 0x2608ac period: 233 addr: 0x3b3fc0 +1282514029934094 0x9830 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3fb0 +1282514040934785 0xb1d0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3fc0 +1282514052924510 0xcbb8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514053932406 0xcfb0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3fc0 +1282514063928248 0xe5c8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514073928057 0xfd20 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514081925013 0x10f28 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514084927335 0x11678 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514088926926 0x11f90 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514089929492 0x12270 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514119919997 0x16610 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514120924169 0x16920 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514145923603 0x1a338 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514146917708 0x1a428 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514173914003 0x1e1b0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514188915199 0x20488 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514210915866 0x236d8 [0x60]: 
PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514212908181 0x23a50 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3f70 +1282514480886012 0x4a098 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 +1282514840855333 0x7dd48 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 +1282514955835364 0x8e380 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 +1282514967839429 0x8fef8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 +1282515023830209 0x97f98 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 +1282515356804308 0xc7b28 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 +1282515410794371 0xcf590 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 +1282515541786485 0xe2280 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 +1282515703761203 0xf93c0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 3446532/3446532: 0x2608ac period: 233 addr: 0x3b3f80 diff --git a/llvm/test/tools/llvm-profgen/Inputs/dap-pie.bin b/llvm/test/tools/llvm-profgen/Inputs/dap-pie.bin new file mode 100644 index 0000000000000..c09df29ca12ab Binary files /dev/null and b/llvm/test/tools/llvm-profgen/Inputs/dap-pie.bin differ diff --git a/llvm/test/tools/llvm-profgen/Inputs/dap.bin b/llvm/test/tools/llvm-profgen/Inputs/dap.bin new file mode 100755 index 0000000000000..2b0802d448654 Binary files /dev/null and b/llvm/test/tools/llvm-profgen/Inputs/dap.bin differ diff --git a/llvm/test/tools/llvm-profgen/Inputs/lbr-perf-for-dap.script b/llvm/test/tools/llvm-profgen/Inputs/lbr-perf-for-dap.script new file mode 100644 index 0000000000000..3885fb6daa87c --- /dev/null +++ 
b/llvm/test/tools/llvm-profgen/Inputs/lbr-perf-for-dap.script @@ -0,0 +1,14 @@ +PERF_RECORD_MMAP2 3446532/3446532: [0x260000(0x153000) @ 0x5f000 08:01 527501 0]: r-xp /path/to/dap.perfbin +PERF_RECORD_MMAP2 3446532/3446532: [0x7fff5ff28000(0x2000) @ 0 00:00 0 0]: r-xp [vdso] +PERF_RECORD_MMAP2 3446532/3446532: [0xffffffffff600000(0x1000) @ 0 00:00 0 0]: --xp [vsyscall] + 350fd4 0x260832/0x260894/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- + 350fd4 0x260832/0x260894/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 
0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- + 350ff8 0x2608a2/0x2607f0/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x260c52/0x260827/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/2//- 0x260822/0x260c30/P/-/-/1//- 0x260808/0x26081d/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- + 350866 0x351059/0x351098/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26087a/0x260be0/P/-/-/2//- 0x2608ac/0x260870/P/-/-/1//- 0x2607fc/0x2608a4/P/-/-/2//- 0x2608a2/0x2607f0/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x260c52/0x260827/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/2//- 0x260822/0x260c30/P/-/-/1//- 0x260808/0x26081d/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 
0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- + 260880 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- + 26090f 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 
0x260c47/0x350850/P/-/-/3//- + 260884 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- + 2608a0 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26087a/0x260be0/P/-/-/2//- 0x2608ac/0x260870/P/-/-/1//- + 350866 0x351059/0x351098/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 
0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26087a/0x260be0/P/-/-/2//- 0x2608ac/0x260870/P/-/-/1//- 0x2607fc/0x2608a4/P/-/-/2//- 0x2608a2/0x2607f0/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x260c52/0x260827/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- + 260800 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- + 260800 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 
0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- 0x260c47/0x350850/P/-/-/3//- 0x26080f/0x260c30/P/-/-/1//- 0x26088f/0x260800/P/-/-/3//- 0x26090a/0x260880/P/-/-/1//- 0x26091c/0x260900/P/-/-/1//- 0x2608bb/0x26090f/P/-/-/2//- 0x3510ab/0x2608af/P/-/-/3//- 0x351059/0x351098/P/-/-/7//- 0x350f8c/0x350fb4/P/-/-/4//- 0x260bf4/0x350f40/P/-/-/1//- 0x260be4/0x260bf0/P/-/-/1//- 0x26085a/0x260be0/P/-/-/2//- 0x2608ac/0x260850/P/-/-/1//- 0x26084c/0x2608a4/P/-/-/2//- 0x2608a2/0x260840/P/-/-/2//- 0x260832/0x260894/P/-/-/1//- 0x26081b/0x26082e/P/-/-/1//- 0x260c52/0x260814/P/-/-/1//- 0x3508da/0x260c4c/P/-/-/5//- 0x350879/0x350887/P/-/-/3//- diff --git a/llvm/test/tools/llvm-profgen/Inputs/pie-dap-perf.txt b/llvm/test/tools/llvm-profgen/Inputs/pie-dap-perf.txt new file mode 100644 index 0000000000000..106d2923d1fec --- /dev/null +++ b/llvm/test/tools/llvm-profgen/Inputs/pie-dap-perf.txt @@ -0,0 +1,79 @@ +0 0x2a38 [0xb0]: PERF_RECORD_MMAP2 1725662/1725662: [0x55b978932000(0x1000) @ 0 fd:01 21244721 0]: r--p /usr/local/google/home/mingmingl/llvm-sdp/llvm-project/build/src/dap-pie.bin +0 0x2ae8 [0xb0]: PERF_RECORD_MMAP2 1725662/1725662: [0x55b978933000(0x1000) @ 0 fd:01 21244721 0]: r-xp /usr/local/google/home/mingmingl/llvm-sdp/llvm-project/build/src/dap-pie.bin +0 0x2b98 [0xb0]: PERF_RECORD_MMAP2 1725662/1725662: [0x55b978934000(0x2000) @ 0 fd:01 21244721 0]: r--p /usr/local/google/home/mingmingl/llvm-sdp/llvm-project/build/src/dap-pie.bin +0 0x2c48 [0xb0]: PERF_RECORD_MMAP2 1725662/1725662: [0x55b978936000(0x1000) @ 0x1000 fd:01 21244721 0]: rw-p /usr/local/google/home/mingmingl/llvm-sdp/llvm-project/build/src/dap-pie.bin +712804701634173 0x1e9d0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712804836290301 0x28f68 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 
+712804837605543 0x28fc8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712805007191186 0x35588 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712805018823760 0x36178 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712805020138664 0x361d8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712805113189243 0x3b938 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712805141066561 0x3d290 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712805283363367 0x469d8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712805677956073 0x5fd50 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712805679249452 0x5fdb0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712805693565180 0x601f8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712806019001624 0x816f0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712806020316894 0x81750 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712806032091602 0x81ad8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712806033406859 0x81b38 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712806159075140 0x8b5c0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 
0x55b978934e20 +712808020339685 0x126910 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712808021641071 0x126970 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712808022947274 0x1269d0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712808689837848 0x15d9f8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712808711116732 0x15e5b8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712808711634977 0x15e618 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712808712152486 0x15e678 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712808719972492 0x15eb88 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712808779739123 0x164c78 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712809032526371 0x177348 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712809839042086 0x1ba6c0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712809840367013 0x1ba720 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712810017729661 0x1c8698 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712810019114828 0x1c8760 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712810030692973 0x1c8c00 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 
0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712810988845241 0x218cc8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712811415653266 0x239f38 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712811416968337 0x239f98 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712811585896657 0x246f48 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712812025437162 0x26ad10 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712814437226587 0x32b278 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712814438523218 0x32b2d8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712815028856637 0x35c470 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712815030142353 0x35c4d0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712818017027908 0x4577f0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712818021039899 0x457968 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712818022355102 0x4579c8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712818692960009 0x490df0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712819018376007 0x4abe20 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712819048387059 0x4ac708 [0x60]: PERF_RECORD_SAMPLE(IP, 
0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712820021151706 0x4fec00 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712822024629005 0x5a3be0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e20 +712823015752028 0x5f3760 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712824020617614 0x6462a0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712824021918372 0x646300 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712826019891672 0x6ec3c8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712826021207387 0x6ec428 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712826022522240 0x6ec488 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712827015608752 0x73ee18 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712827665092533 0x773768 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712828018715780 0x78ef00 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712828423585934 0x7aede8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712829019664051 0x7e0210 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712829395596728 0x7fe558 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712830018663760 0x831e18 
[0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712830023673654 0x831f90 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712830180382629 0x83ebc0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712830180777038 0x83ec20 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712830344366327 0x8468f8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712831478881799 0x899a00 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712831992570428 0x8bec30 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e70 +712831999013143 0x8bf080 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712832000314956 0x8bf0e0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712833027320322 0x906c38 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712833188532818 0x9100a0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cd2 period: 1000003 addr: 0x55b978934e60 +712833981258399 0x9498a8 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e30 +712833981258450 0x949908 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e30 +712833981258510 0x949968 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): 1725662/1725662: 0x55b978933cdc period: 1000003 addr: 0x55b978934e30 diff --git a/llvm/test/tools/llvm-profgen/Inputs/pie-lbr-perf.script b/llvm/test/tools/llvm-profgen/Inputs/pie-lbr-perf.script new file 
mode 100644 index 0000000000000..9294fc4c3dd67 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/Inputs/pie-lbr-perf.script @@ -0,0 +1,17 @@ +PERF_RECORD_MMAP2 1725662/1725662: [0x55b978933000(0x1000) @ 0 fd:01 21244721 0]: r-xp /usr/local/google/home/mingmingl/llvm-sdp/llvm-project/build/src/dap-pie.bin +PERF_RECORD_MMAP2 1725662/1725662: [0x7f3ffcc32000(0x165000) @ 0x28000 fd:01 20075535 0]: r-xp /usr/lib/x86_64-linux-gnu/libc.so.6 +PERF_RECORD_MMAP2 1725662/1725662: [0x7f3ffce9d000(0x129000) @ 0x9d000 fd:01 20054678 0]: r-xp /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.33 +PERF_RECORD_MMAP2 1725662/1725662: [0x7f3ffd0f5000(0x23000) @ 0x4000 fd:01 20059537 0]: r-xp /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 +PERF_RECORD_MMAP2 1725662/1725662: [0x7f3ffd12f000(0x7d000) @ 0x11000 fd:01 20075696 0]: r-xp /usr/lib/x86_64-linux-gnu/libm.so.6 +PERF_RECORD_MMAP2 1725662/1725662: [0x7f3ffd22c000(0x2000) @ 0 00:00 0 0]: r-xp [vdso] +PERF_RECORD_MMAP2 1725662/1725662: [0x7f3ffd22f000(0x28000) @ 0x1000 fd:01 20075532 0]: r-xp /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 + 55b978933cb4 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933c80/P/-/-/1//- 0x55b978933c7c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c70/P/-/-/3//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x55b978933c4b/0x55b978933c5e/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c44/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/19//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/3//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c3f/0x55b978933dc0/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 
0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- + 7f3ffccacbc6 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c52/0x55b978933dc0/P/-/-/1//- 0x55b978933c38/0x55b978933c4d/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933c80/P/-/-/1//- 0x55b978933c7c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c70/P/-/-/3//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x55b978933c4b/0x55b978933c5e/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c44/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/19//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/3//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c3f/0x55b978933dc0/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- + 55b978933cc4 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/4//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c52/0x55b978933dc0/P/-/-/1//- 0x55b978933c38/0x55b978933c4d/P/-/-/1//- 
0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933c80/P/-/-/1//- 0x55b978933c7c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c70/P/-/-/3//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x55b978933c4b/0x55b978933c5e/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c44/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/19//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/3//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c3f/0x55b978933dc0/P/-/-/2//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- + 55b978933cba 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933caa/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933ca0/P/-/-/1//- 0x55b978933c2c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c20/P/-/-/4//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c57/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/16//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/4//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c52/0x55b978933dc0/P/-/-/1//- 0x55b978933c38/0x55b978933c4d/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 
0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- + 7f3ffccacbbc 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c3f/0x55b978933dc0/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933caa/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933ca0/P/-/-/1//- 0x55b978933c2c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c20/P/-/-/4//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c57/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/16//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/4//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c52/0x55b978933dc0/P/-/-/1//- 0x55b978933c38/0x55b978933c4d/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- + 7f3ffccacba7 0x55b978933cbf/0x55b978933c30/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 
0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933c80/P/-/-/1//- 0x55b978933c7c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c70/P/-/-/3//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x55b978933c4b/0x55b978933c5e/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c44/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/17//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/4//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c3f/0x55b978933dc0/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- + 7f3ffccad320 0x55b978933caa/0x55b978933dd0/P/-/-/1//- 0x55b978933caa/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933ca0/P/-/-/1//- 0x55b978933c2c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c20/P/-/-/4//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c57/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/16//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/3//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c52/0x55b978933dc0/P/-/-/1//- 0x55b978933c38/0x55b978933c4d/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 
0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933c80/P/-/-/1//- 0x55b978933c7c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c70/P/-/-/3//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x55b978933c4b/0x55b978933c5e/P/-/-/1//- + 7f3ffccacba7 0x55b978933cbf/0x55b978933c30/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933c80/P/-/-/1//- 0x55b978933c7c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c70/P/-/-/3//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x55b978933c4b/0x55b978933c5e/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c44/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/19//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/3//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c3f/0x55b978933dc0/P/-/-/2//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/3//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- + 7f3ffccacbcd 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/3//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c52/0x55b978933dc0/P/-/-/1//- 0x55b978933c38/0x55b978933c4d/P/-/-/1//- 
0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933c80/P/-/-/1//- 0x55b978933c7c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c70/P/-/-/5//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x55b978933c4b/0x55b978933c5e/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c44/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/13//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/4//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c3f/0x55b978933dc0/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- + 55b978933cb0 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- 0x55b978933cdc/0x55b978933c80/P/-/-/1//- 0x55b978933c7c/0x55b978933cd4/P/-/-/2//- 0x55b978933cd2/0x55b978933c70/P/-/-/3//- 0x55b978933c62/0x55b978933cc4/P/-/-/1//- 0x55b978933c4b/0x55b978933c5e/P/-/-/1//- 0x7f3ffceb3bb2/0x55b978933c44/P/-/-/1//- 0x7f3ffccacbe1/0x7f3ffceb3bac/P/-/-/19//- 0x7f3ffccacb7b/0x7f3ffccacb89/P/-/-/3//- 0x7f3ffce9de50/0x7f3ffccacb50/P/-/-/1//- 0x7f3ffceb3ba7/0x7f3ffce9de50/P/-/-/2//- 0x55b978933dc0/0x7f3ffceb3b90/P/-/-/1//- 0x55b978933c3f/0x55b978933dc0/P/-/-/1//- 0x55b978933cbf/0x55b978933c30/P/-/-/3//- 0x55b978933d3a/0x55b978933cb0/P/-/-/1//- 0x55b978933d4c/0x55b978933d30/P/-/-/1//- 
0x55b978933ceb/0x55b978933d3f/P/-/-/2//- 0x7f3ffccad3ab/0x55b978933cdf/P/-/-/2//- 0x7f3ffccad356/0x7f3ffccad398/P/-/-/8//- 0x7f3ffccad28e/0x7f3ffccad2b6/P/-/-/4//- 0x7f3ffce9e890/0x7f3ffccad240/P/-/-/1//- 0x7f3ffceb1d64/0x7f3ffce9e890/P/-/-/1//- 0x7f3ffce9e070/0x7f3ffceb1d60/P/-/-/1//- 0x7f3ffceb1d74/0x7f3ffce9e070/P/-/-/1//- 0x55b978933dd0/0x7f3ffceb1d70/P/-/-/1//- 0x55b978933c8a/0x55b978933dd0/P/-/-/2//- diff --git a/llvm/test/tools/llvm-profgen/afdo-with-vtable-pie.test b/llvm/test/tools/llvm-profgen/afdo-with-vtable-pie.test new file mode 100644 index 0000000000000..dbef3aac66786 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/afdo-with-vtable-pie.test @@ -0,0 +1,30 @@ +RUN: llvm-profgen --perfscript=%p/Inputs/pie-lbr-perf.script \ +RUN: --data-access-perftrace=%p/Inputs/pie-dap-perf.txt \ +RUN: --binary=%p/Inputs/dap-pie.bin --format=text --pid=1725662 \ +RUN: -ignore-stack-samples -use-dwarf-correlation -o %t.afdo + +RUN: llvm-profdata show --sample --function=_Z9loop_funciii %t.afdo 2>&1 | FileCheck %s --dump-input=always + +RUN: not llvm-profgen --perfscript=%p/Inputs/pie-lbr-perf.script \ +RUN: --data-access-perftrace=%p/Inputs/pie-dap-perf.txt \ +RUN: --binary=%p/Inputs/dap-pie.bin --format=text --pid=1725662 \ +RUN: -ignore-stack-samples -use-dwarf-correlation=false -o - 2>&1 | FileCheck %s --dump-input=always --check-prefix=PSEUDOERR + +RUN: not llvm-profgen --perfscript=%p/Inputs/css-pgo-perf.script \ +RUN: --data-access-perftrace=%p/Inputs/pie-dap-perf.txt \ +RUN: --binary=%p/Inputs/dap-pie.bin --format=text --pid=21450 \ +RUN: -ignore-stack-samples=false -use-dwarf-correlation=true -o - 2>&1 | FileCheck %s --dump-input=always --check-prefix=PSEUDOERR + +CHECK: Function: _Z9loop_funciii: 746, 14, 5 sampled lines +CHECK-NEXT: Samples collected in the function's body { +CHECK-NEXT: 0: 14 +CHECK-NEXT: 1: 18, calls: _Z10createTypei:18 +CHECK-NEXT: 3: 11, calls: _ZN12_GLOBAL__N_18Derived24funcEii:8 _ZN8Derived14funcEii:3 +CHECK-NEXT: 3: vtables: 
_ZTV8Derived1:16 _ZTVN12_GLOBAL__N_18Derived2E:47 +CHECK-NEXT: 5.1: 11, calls: _ZN12_GLOBAL__N_18Derived2D0Ev:8 _ZN8Derived1D0Ev:3 +CHECK-NEXT: 5.1: vtables: _ZTV8Derived1:3 _ZTVN12_GLOBAL__N_18Derived2E:9 +CHECK-NEXT: 7: 13 +CHECK-NEXT: } +CHECK-NEXT: No inlined callsites in this function + +PSEUDOERR: Symbolizing vtables from data access profiles is not yet supported for context-sensitive perf traces or when pseudo-probe based mapping is enabled. diff --git a/llvm/test/tools/llvm-profgen/afdo-with-vtable.test b/llvm/test/tools/llvm-profgen/afdo-with-vtable.test new file mode 100644 index 0000000000000..799d59d56526e --- /dev/null +++ b/llvm/test/tools/llvm-profgen/afdo-with-vtable.test @@ -0,0 +1,27 @@ +RUN: llvm-profgen --perfscript=%p/Inputs/lbr-perf-for-dap.script --data-access-perftrace=%p/Inputs/dap-perf-trace.txt \ +RUN: --binary=%p/Inputs/dap.bin --format=text --pid=3446532 \ +RUN: -ignore-stack-samples -use-dwarf-correlation -o %t.afdo + +RUN: llvm-profdata show --sample --function=_Z9loop_funciii %t.afdo 2>&1 | FileCheck %s + +RUN: not llvm-profgen --perfscript=%p/Inputs/lbr-perf-for-dap.script --data-access-perftrace=%p/Inputs/dap-perf-trace.txt \ +RUN: --binary=%p/Inputs/dap.bin --format=text --pid=3446532 \ +RUN: -ignore-stack-samples -use-dwarf-correlation=false -o - 2>&1 | FileCheck %s --check-prefix=PSEUDOERR + +RUN: not llvm-profgen --perfscript=%p/Inputs/css-pgo-perf.script --data-access-perftrace=%p/Inputs/dap-perf-trace.txt \ +RUN: --binary=%p/Inputs/dap.bin --format=text --pid=21450 \ +RUN: -ignore-stack-samples=false -use-dwarf-correlation=true -o - 2>&1 | FileCheck %s --check-prefix=PSEUDOERR + +CHECK: Function: _Z9loop_funciii: 958, 15, 5 sampled lines +CHECK-NEXT: Samples collected in the function's body { +CHECK-NEXT: 0: 15 +CHECK-NEXT: 1: 19, calls: _Z10createTypei:15 +CHECK-NEXT: 3: 19, calls: _ZN12_GLOBAL__N_18Derived24funcEii:16 _ZN8Derived14funcEii:3 +CHECK-NEXT: 3: vtables: _ZTV8Derived1:16 _ZTVN12_GLOBAL__N_18Derived2E:5 
+CHECK-NEXT: 5.1: 19, calls: _ZN12_GLOBAL__N_18Derived2D0Ev:16 _ZN8Derived1D0Ev:3 +CHECK-NEXT: 5.1: vtables: _ZTV8Derived1:9 _ZTVN12_GLOBAL__N_18Derived2E:3 +CHECK-NEXT: 7: 12 +CHECK-NEXT: } +CHECK-NEXT: No inlined callsites in this function + +PSEUDOERR: Symbolizing vtables from data access profiles is not yet supported for context-sensitive perf traces or when pseudo-probe based mapping is enabled. diff --git a/llvm/tools/llvm-exegesis/lib/LlvmState.cpp b/llvm/tools/llvm-exegesis/lib/LlvmState.cpp index 2c6f5d275cf00..d7bf9c1f77b89 100644 --- a/llvm/tools/llvm-exegesis/lib/LlvmState.cpp +++ b/llvm/tools/llvm-exegesis/lib/LlvmState.cpp @@ -56,6 +56,11 @@ Expected LLVMState::Create(std::string TripleName, std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TheTriple, CpuName, "")); + if (!STI) { + return make_error("unable to create subtarget info", + inconvertibleErrorCode()); + } + assert(STI && "Unable to create subtarget info!"); if (!STI->isCPUStringValid(CpuName)) { return make_error(Twine("invalid CPU name (") diff --git a/llvm/tools/llvm-lipo/llvm-lipo.cpp b/llvm/tools/llvm-lipo/llvm-lipo.cpp index d4b1f8f3dd7d4..3e1d4165e8ed7 100644 --- a/llvm/tools/llvm-lipo/llvm-lipo.cpp +++ b/llvm/tools/llvm-lipo/llvm-lipo.cpp @@ -460,8 +460,8 @@ printInfo(LLVMContext &LLVMCtx, ArrayRef> InputBinaries) { for (auto &IB : InputBinaries) { const Binary *Binary = IB.getBinary(); if (!Binary->isMachOUniversalBinary()) { - assert(Binary->isMachO() || - Binary->isArchive() && "expected MachO binary"); + assert((Binary->isMachO() || Binary->isArchive()) && + "expected MachO binary"); outs() << "Non-fat file: " << Binary->getFileName() << " is architecture: "; printBinaryArchs(LLVMCtx, Binary, outs()); diff --git a/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp b/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp index 615a39ee9bb4d..dca64af657669 100644 --- a/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp +++ 
b/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp @@ -156,7 +156,7 @@ int AssembleOneInput(const uint8_t *Data, size_t Size) { std::unique_ptr MRI(TheTarget->createMCRegInfo(TripleName)); if (!MRI) { - errs() << "Unable to create target register info!"; + errs() << "Unable to create target register info!\n"; abort(); } @@ -164,12 +164,16 @@ int AssembleOneInput(const uint8_t *Data, size_t Size) { std::unique_ptr MAI( TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); if (!MAI) { - errs() << "Unable to create target asm info!"; + errs() << "Unable to create target asm info!\n"; abort(); } std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TripleName, MCPU, FeaturesStr)); + if (!STI) { + errs() << "Unable to create subtargettarget info!\n"; + abort(); + } MCContext Ctx(TheTriple, MAI.get(), MRI.get(), STI.get(), &SrcMgr); std::unique_ptr MOFI( diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp index 136cd69526a3c..224fd80f6a6d3 100644 --- a/llvm/tools/llvm-mc/llvm-mc.cpp +++ b/llvm/tools/llvm-mc/llvm-mc.cpp @@ -469,7 +469,10 @@ int main(int argc, char **argv) { std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TheTriple, MCPU, FeaturesStr)); - assert(STI && "Unable to create subtarget info!"); + if (!STI) { + WithColor::error(errs(), ProgName) << "unable to create subtarget info\n"; + return 1; + } // FIXME: This is not pretty. MCContext has a ptr to MCObjectFileInfo and // MCObjectFileInfo needs a MCContext reference in order to initialize itself. diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp index d330b44894e7a..a04b6ee9d90cf 100644 --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -83,9 +83,9 @@ static cl::opt cl::cat(ToolOptions)); static cl::opt - TripleName("mtriple", - cl::desc("Target triple. See -version for available targets"), - cl::cat(ToolOptions)); + TripleNameOpt("mtriple", + cl::desc("Target triple. 
See -version for available targets"), + cl::cat(ToolOptions)); static cl::opt MCPU("mcpu", @@ -292,11 +292,7 @@ static cl::opt DisableInstrumentManager( namespace { -const Target *getTarget(const char *ProgName) { - if (TripleName.empty()) - TripleName = Triple::normalize(sys::getDefaultTargetTriple()); - Triple TheTriple(TripleName); - +const Target *getTarget(Triple &TheTriple, const char *ProgName) { // Get the target specific parser. std::string Error; const Target *TheTarget = @@ -306,9 +302,6 @@ const Target *getTarget(const char *ProgName) { return nullptr; } - // Update TripleName with the updated triple from the target lookup. - TripleName = TheTriple.str(); - // Return the found target. return TheTarget; } @@ -387,18 +380,18 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "llvm machine code performance analyzer.\n"); + Triple TheTriple(TripleNameOpt.empty() + ? Triple::normalize(sys::getDefaultTargetTriple()) + : TripleNameOpt); + // Get the target from the triple. If a triple is not specified, then select // the default triple for the host. If the triple doesn't correspond to any // registered target, then exit with an error message. const char *ProgName = argv[0]; - const Target *TheTarget = getTarget(ProgName); + const Target *TheTarget = getTarget(TheTriple, ProgName); if (!TheTarget) return 1; - // GetTarget() may replaced TripleName with a default triple. - // For safety, reconstruct the Triple object. 
- Triple TheTriple(TripleName); - ErrorOr> BufferPtr = MemoryBuffer::getFileOrSTDIN(InputFilename); if (std::error_code EC = BufferPtr.getError()) { @@ -420,7 +413,11 @@ int main(int argc, char **argv) { std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TheTriple, MCPU, FeaturesStr)); - assert(STI && "Unable to create subtarget info!"); + if (!STI) { + WithColor::error() << "unable to create subtarget info\n"; + return 1; + } + if (!STI->isCPUStringValid(MCPU)) return 1; @@ -469,7 +466,7 @@ int main(int argc, char **argv) { unsigned IPtempOutputAsmVariant = OutputAsmVariant == -1 ? 0 : OutputAsmVariant; std::unique_ptr IPtemp(TheTarget->createMCInstPrinter( - Triple(TripleName), IPtempOutputAsmVariant, *MAI, *MCII, *MRI)); + TheTriple, IPtempOutputAsmVariant, *MAI, *MCII, *MRI)); if (!IPtemp) { WithColor::error() << "unable to create instruction printer for target triple '" @@ -558,7 +555,7 @@ int main(int argc, char **argv) { if (OutputAsmVariant >= 0) AssemblerDialect = static_cast(OutputAsmVariant); std::unique_ptr IP(TheTarget->createMCInstPrinter( - Triple(TripleName), AssemblerDialect, *MAI, *MCII, *MRI)); + TheTriple, AssemblerDialect, *MAI, *MCII, *MRI)); if (!IP) { WithColor::error() << "unable to create instruction printer for target triple '" diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp index ae58b0da6d69b..cda86e77f3eb4 100644 --- a/llvm/tools/llvm-ml/llvm-ml.cpp +++ b/llvm/tools/llvm-ml/llvm-ml.cpp @@ -325,7 +325,10 @@ int llvm_ml_main(int Argc, char **Argv, const llvm::ToolContext &) { std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TheTriple, /*CPU=*/"", /*Features=*/"")); - assert(STI && "Unable to create subtarget info!"); + if (!STI) { + WithColor::error(errs(), ProgName) << "unable to create subtarget info\n"; + exit(1); + } // FIXME: This is not pretty. MCContext has a ptr to MCObjectFileInfo and // MCObjectFileInfo needs a MCContext reference in order to initialize itself. 
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp index 9a805f2941753..183b248a72320 100644 --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -6,11 +6,14 @@ // //===----------------------------------------------------------------------===// #include "PerfReader.h" +#include "ErrorHandling.h" #include "Options.h" #include "ProfileGenerator.h" #include "llvm/ADT/SmallString.h" #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/LineIterator.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Process.h" #include "llvm/Support/ToolOutputFile.h" @@ -374,6 +377,77 @@ PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput, return PerfReader; } +Error PerfReaderBase::parseDataAccessPerfTraces( + StringRef DataAccessPerfTraceFile, std::optional PIDFilter) { + // A perf_record_sample line is like + // . 1282514022939813 0x87b0 [0x60]: PERF_RECORD_SAMPLE(IP, 0x4002): + // 3446532/3446532: 0x2608a2 period: 233 addr: 0x3b3fb0 + constexpr static StringRef DataAccessSamplePattern = + "PERF_RECORD_SAMPLE\\([A-Za-z]+, 0x[0-9a-fA-F]+\\): " + "([0-9]+)\\/[0-9]+: 0x([0-9a-fA-F]+) period: [0-9]+ addr: " + "0x([0-9a-fA-F]+)"; + + llvm::Regex LogRegex(DataAccessSamplePattern); + + auto BufferOrErr = MemoryBuffer::getFile(DataAccessPerfTraceFile); + std::error_code EC = BufferOrErr.getError(); + if (EC) + return make_error("Failed to open perf trace file: " + + DataAccessPerfTraceFile, + inconvertibleErrorCode()); + + assert(!SampleCounters.empty() && "Sample counters should not be empty!"); + SampleCounter &Counter = SampleCounters.begin()->second; + line_iterator LineIt(*BufferOrErr.get(), true); + + for (; !LineIt.is_at_eof(); ++LineIt) { + StringRef Line = *LineIt; + + MMapEvent MMap; + if (Line.contains("PERF_RECORD_MMAP2")) { + if (PerfScriptReader::extractMMapEventForBinary(Binary, Line, MMap)) { + if 
(!MMap.MemProtectionFlag.contains("x")) { + if (Error E = Binary->addMMapNonTextEvent(MMap)) { + return E; + } + } + } + continue; + } + + SmallVector Fields; + if (LogRegex.match(Line, &Fields)) { + int32_t PID = 0; + if (Fields[1].getAsInteger(10, PID)) + return make_error( + "Failed to parse PID from perf trace line: " + Line, + inconvertibleErrorCode()); + + if (PIDFilter.has_value() && *PIDFilter != PID) { + continue; + } + + uint64_t DataAddress = 0; + if (Fields[3].getAsInteger(16, DataAddress)) + return make_error( + "Failed to parse data address from perf trace line: " + Line, + inconvertibleErrorCode()); + // Out of all the memory access events, the vtable accesses are used to + // construct type profiles. We assume that this is under the Itanium + // C++ ABI so we can use `_ZTV` prefix to identify vtable. + StringRef DataSymbol = Binary->symbolizeDataAddress( + Binary->CanonicalizeNonTextAddress(DataAddress)); + if (DataSymbol.starts_with("_ZTV")) { + uint64_t IP = 0; + Fields[2].getAsInteger(16, IP); + Counter.recordDataAccessCount(Binary->canonicalizeVirtualAddress(IP), + DataSymbol, 1); + } + } + } + return Error::success(); +} + PerfInputFile PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, bool SkipPID, PerfInputFile &File, @@ -994,14 +1068,14 @@ bool PerfScriptReader::extractMMapEventForBinary(ProfiledBinary *Binary, constexpr static const char *const MMap2Pattern = "PERF_RECORD_MMAP2 (-?[0-9]+)/[0-9]+: " "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " - "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)"; + "(0x[a-f0-9]+|0) .*\\]: ([-a-z]+) (.*)"; // Parse a MMap line like // PERF_RECORD_MMAP -1/0: [0xffffffff81e00000(0x3e8fa000) @ \ // 0xffffffff81e00000]: x [kernel.kallsyms]_text constexpr static const char *const MMapPattern = "PERF_RECORD_MMAP (-?[0-9]+)/[0-9]+: " "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " - "(0x[a-f0-9]+|0)\\]: [-a-z]+ (.*)"; + "(0x[a-f0-9]+|0)\\]: ([-a-z]+) (.*)"; // Field 0 - whole line // Field 1 - PID // Field 2 - base address 
@@ -1014,11 +1088,12 @@ bool PerfScriptReader::extractMMapEventForBinary(ProfiledBinary *Binary, MMAPPED_ADDRESS = 2, MMAPPED_SIZE = 3, PAGE_OFFSET = 4, - BINARY_PATH = 5 + MEM_PROTECTION_FLAG = 5, + BINARY_PATH = 6, }; bool R = false; - SmallVector Fields; + SmallVector Fields; if (Line.contains("PERF_RECORD_MMAP2 ")) { Regex RegMmap2(MMap2Pattern); R = RegMmap2.match(Line, &Fields); @@ -1039,6 +1114,7 @@ bool PerfScriptReader::extractMMapEventForBinary(ProfiledBinary *Binary, Fields[MMAPPED_ADDRESS].getAsInteger(0, MMap.Address); Fields[MMAPPED_SIZE].getAsInteger(0, MMap.Size); Fields[PAGE_OFFSET].getAsInteger(0, MMap.Offset); + MMap.MemProtectionFlag = Fields[MEM_PROTECTION_FLAG]; MMap.BinaryPath = Fields[BINARY_PATH]; if (ShowMmapEvents) { outs() << "Mmap: Binary " << MMap.BinaryPath << " loaded at " diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h index 19451915812e1..6e233d17f8e62 100644 --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -12,6 +12,7 @@ #include "ProfiledBinary.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" #include "llvm/Support/Regex.h" #include #include @@ -392,10 +393,14 @@ using BranchSample = std::map, uint64_t>; // The counter of range samples for one function indexed by the range, // which is represented as the start and end offset pair. using RangeSample = std::map, uint64_t>; +// <, count> map for data access samples. +// The instruction address is the virtual address in the binary. 
+using DataAccessSample = std::map, uint64_t>; // Wrapper for sample counters including range counter and branch counter struct SampleCounter { RangeSample RangeCounter; BranchSample BranchCounter; + DataAccessSample DataAccessCounter; void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) { assert(Start <= End && "Invalid instruction range"); @@ -404,6 +409,10 @@ struct SampleCounter { void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) { BranchCounter[{Source, Target}] += Repeat; } + void recordDataAccessCount(uint64_t InstAddr, StringRef DataSymbol, + uint64_t Repeat) { + DataAccessCounter[{InstAddr, DataSymbol}] += Repeat; + } }; // Sample counter with context to support context-sensitive profile @@ -569,6 +578,13 @@ class PerfReaderBase { // Entry of the reader to parse multiple perf traces virtual void parsePerfTraces() = 0; + + // Parse the from the data access perf trace file, + // and accumulate the data access count for each pair. + Error + parseDataAccessPerfTraces(StringRef DataAccessPerfFile, + std::optional PIDFilter = std::nullopt); + const ContextSampleCounterMap &getSampleCounters() const { return SampleCounters; } @@ -595,6 +611,14 @@ class PerfScriptReader : public PerfReaderBase { // Entry of the reader to parse multiple perf traces void parsePerfTraces() override; + + // Parse a single line of a PERF_RECORD_MMAP event looking for a + // mapping between the binary name and its memory layout. + // TODO: Move this static method from PerScriptReader (subclass) to + // PerfReaderBase (superclass). 
+ static bool extractMMapEventForBinary(ProfiledBinary *Binary, StringRef Line, + MMapEvent &MMap); + // Generate perf script from perf data static PerfInputFile convertPerfDataToTrace(ProfiledBinary *Binary, bool SkipPID, PerfInputFile &File, @@ -608,23 +632,10 @@ class PerfScriptReader : public PerfReaderBase { static SmallVector TempFileCleanups; protected: - // The parsed MMap event - struct MMapEvent { - int64_t PID = 0; - uint64_t Address = 0; - uint64_t Size = 0; - uint64_t Offset = 0; - StringRef BinaryPath; - }; - // Check whether a given line is LBR sample static bool isLBRSample(StringRef Line); // Check whether a given line is MMAP event static bool isMMapEvent(StringRef Line); - // Parse a single line of a PERF_RECORD_MMAP event looking for a - // mapping between the binary name and its memory layout. - static bool extractMMapEventForBinary(ProfiledBinary *Binary, StringRef Line, - MMapEvent &MMap); // Update base address based on mmap events void updateBinaryAddress(const MMapEvent &Event); // Parse mmap event and update binary address diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index c611e410ebd58..3b875c5de3c09 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -558,6 +558,7 @@ void ProfileGenerator::generateLineNumBasedProfile() { populateBodySamplesForAllFunctions(SC.RangeCounter); // Fill in boundary sample counts as well as call site samples for calls populateBoundarySamplesForAllFunctions(SC.BranchCounter); + populateTypeSamplesForAllFunctions(SC.DataAccessCounter); updateFunctionSamples(); } @@ -624,8 +625,8 @@ void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( getLeafProfileAndAddTotalSamples(FrameVec, 0); FunctionProfile.addCalledTargetSamples( FrameVec.back().Location.LineOffset, - FrameVec.back().Location.Discriminator, - FunctionId(CalleeName), Count); + FrameVec.back().Location.Discriminator, 
FunctionId(CalleeName), + Count); } } } @@ -648,8 +649,7 @@ FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); FunctionSamplesMap &SamplesMap = FunctionProfile->functionSamplesAt(Callsite); - auto Ret = - SamplesMap.emplace(FrameVec[I].Func, FunctionSamples()); + auto Ret = SamplesMap.emplace(FrameVec[I].Func, FunctionSamples()); if (Ret.second) { SampleContext Context(FrameVec[I].Func); Ret.first->second.setContext(Context); @@ -764,6 +764,26 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions( } } +void ProfileGenerator::populateTypeSamplesForAllFunctions( + const DataAccessSample &DataAccessSamples) { + // For each instruction with vtable accesses, get its symbolized inline + // stack, and add the vtable counters to the function samples. + for (const auto &[IpData, Count] : DataAccessSamples) { + uint64_t InstAddr = IpData.first; + const SampleContextFrameVector &FrameVec = + Binary->getCachedFrameLocationStack(InstAddr, + /* UseProbeDiscriminator= */ false); + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, /* Count= */ 0); + LineLocation Loc( + FrameVec.back().Location.LineOffset, + getBaseDiscriminator(FrameVec.back().Location.Discriminator)); + FunctionProfile.addTypeSamplesAt(Loc, FunctionId(IpData.second), Count); + } + } +} + void ProfileGeneratorBase::calculateBodySamplesAndSize( const FunctionSamples &FSamples, uint64_t &TotalBodySamples, uint64_t &FuncBodySize) { diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h index dbf9d469063e1..a4b738016ec3a 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.h +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -181,6 +181,8 @@ class ProfileGenerator : public ProfileGeneratorBase { populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); void 
populateBoundarySamplesWithProbesForAllFunctions( const BranchSample &BranchCounters); + void + populateTypeSamplesForAllFunctions(const DataAccessSample &DataAccessSamples); void postProcessProfiles(); void trimColdProfiles(const SampleProfileMap &Profiles, uint64_t ColdCntThreshold); diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp index 0f3ece275e6f5..6865e364f7e43 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -342,6 +342,12 @@ void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile &Obj, PreferredTextSegmentAddresses.push_back(Phdr.p_vaddr & ~(PageSize - 1U)); TextSegmentOffsets.push_back(Phdr.p_offset & ~(PageSize - 1U)); + } else { + PhdrInfo Info; + Info.FileOffset = Phdr.p_offset; + Info.FileSz = Phdr.p_filesz; + Info.VirtualAddr = Phdr.p_vaddr; + NonTextPhdrInfo.push_back(Info); } } } @@ -350,6 +356,36 @@ void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile &Obj, exitWithError("no executable segment found", FileName); } +uint64_t ProfiledBinary::CanonicalizeNonTextAddress(uint64_t Address) { + uint64_t FileOffset = 0; + auto MMapIter = NonTextMMapEvents.lower_bound(Address); + if (MMapIter == NonTextMMapEvents.end()) + return Address; // No non-text mmap event found, return the address as is. + + const auto &MMapEvent = MMapIter->second; + + // If the address is within the non-text mmap event, calculate its file + // offset in the binary. + if (MMapEvent.Address <= Address && + Address < MMapEvent.Address + MMapEvent.Size) + FileOffset = Address - MMapEvent.Address + MMapEvent.Offset; + + // If the address is not within the non-text mmap event, return the address + // as is. + if (FileOffset == 0) + return Address; + + for (const auto &PhdrInfo : NonTextPhdrInfo) { + // Find the program section that contains the file offset and map the + // file offset to the virtual address. 
+ if (PhdrInfo.FileOffset <= FileOffset && + FileOffset < PhdrInfo.FileOffset + PhdrInfo.FileSz) + return PhdrInfo.VirtualAddr + (FileOffset - PhdrInfo.FileOffset); + } + + return Address; +} + void ProfiledBinary::setPreferredTextSegmentAddresses(const COFFObjectFile *Obj, StringRef FileName) { uint64_t ImageBase = Obj->getImageBase(); @@ -919,11 +955,10 @@ SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP, bool UseProbeDiscriminator) { assert(this == IP.Binary && "Binary should only symbolize its own instruction"); - auto Addr = object::SectionedAddress{IP.Address, - object::SectionedAddress::UndefSection}; - DIInliningInfo InlineStack = unwrapOrError( - Symbolizer->symbolizeInlinedCode(SymbolizerPath.str(), Addr), - SymbolizerPath); + DIInliningInfo InlineStack = + unwrapOrError(Symbolizer->symbolizeInlinedCode( + SymbolizerPath.str(), getSectionedAddress(IP.Address)), + SymbolizerPath); SampleContextFrameVector CallStack; for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) { @@ -952,6 +987,16 @@ SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP, return CallStack; } +StringRef ProfiledBinary::symbolizeDataAddress(uint64_t Address) { + DIGlobal DataDIGlobal = + unwrapOrError(Symbolizer->symbolizeData(SymbolizerPath.str(), + getSectionedAddress(Address)), + SymbolizerPath); + decltype(NameStrings)::iterator Iter; + std::tie(Iter, std::ignore) = NameStrings.insert(DataDIGlobal.Name); + return StringRef(*Iter); +} + void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t RangeBegin, uint64_t RangeEnd) { InstructionPointer IP(this, RangeBegin, true); diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h index 9c0bff591337a..e82fbab02f56a 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -176,6 +176,16 @@ class BinarySizeContextTracker { using AddressRange = std::pair; +// The parsed MMap 
event +struct MMapEvent { + int64_t PID = 0; + uint64_t Address = 0; + uint64_t Size = 0; + uint64_t Offset = 0; + StringRef MemProtectionFlag; + StringRef BinaryPath; +}; + class ProfiledBinary { // Absolute path of the executable binary. std::string Path; @@ -267,6 +277,19 @@ class ProfiledBinary { // String table owning function name strings created from the symbolizer. std::unordered_set NameStrings; + // MMap events for PT_LOAD segments without 'x' memory protection flag. + std::map> NonTextMMapEvents; + + // Records the file offset, file size and virtual address of program headers. + struct PhdrInfo { + uint64_t FileOffset; + uint64_t FileSz; + uint64_t VirtualAddr; + }; + + // Program header information for non-text PT_LOAD segments. + SmallVector NonTextPhdrInfo; + // A collection of functions to print disassembly for. StringSet<> DisassembleFunctionSet; @@ -296,6 +319,16 @@ class ProfiledBinary { void setPreferredTextSegmentAddresses(const object::ObjectFile *O); + // LLVMSymbolizer's symbolize{Code, Data} interfaces requires a section index + // for each address to be symbolized. This is a helper function to + // construct a SectionedAddress object with the given address and section + // index. The section index is set to UndefSection by default. + static object::SectionedAddress getSectionedAddress( + uint64_t Address, + uint64_t SectionIndex = object::SectionedAddress::UndefSection) { + return object::SectionedAddress{Address, SectionIndex}; + } + template void setPreferredTextSegmentAddresses(const object::ELFFile &Obj, StringRef FileName); @@ -355,6 +388,10 @@ class ProfiledBinary { ProfiledBinary(const StringRef ExeBinPath, const StringRef DebugBinPath); ~ProfiledBinary(); + /// Symbolize an address and return the symbol name. The returned StringRef is + /// owned by this ProfiledBinary object. 
+ StringRef symbolizeDataAddress(uint64_t Address); + void decodePseudoProbe(); StringRef getPath() const { return Path; } @@ -493,7 +530,7 @@ class ProfiledBinary { void setProfiledFunctions(std::unordered_set &Funcs) { ProfiledFunctions = Funcs; } - + BinaryFunction *getBinaryFunction(FunctionId FName) { if (FName.isStringRef()) { auto I = BinaryFunctions.find(FName.str()); @@ -595,6 +632,46 @@ class ProfiledBinary { return ProbeDecoder.getInlinerDescForProbe(Probe); } + bool isNonOverlappingAddressInterval(std::pair LHS, + std::pair RHS) { + if (LHS.second <= RHS.first || RHS.second <= LHS.first) + return true; + return false; + } + + Error addMMapNonTextEvent(MMapEvent Event) { + // Given the mmap events of the profiled binary, the virtual address + // intervals of mmaps most often doesn't overlap with each other. The + // implementation validates so, and runtime data address is mapped to + // a mmap event using look-up. With this implementation, data addresses + // from dynamic shared libraries (not the profiled binary) are not mapped or + // symbolized. To map runtime address to binary address in case of + // overlapping mmap events, the implementation could store all the mmap + // events in a vector and in the order they are added and reverse iterate + // the vector to find the mmap events. We opt'ed for the non-overlapping + // implementation for simplicity. + for (const auto &ExistingMMap : NonTextMMapEvents) { + if (isNonOverlappingAddressInterval( + {ExistingMMap.second.Address, + ExistingMMap.second.Address + ExistingMMap.second.Size}, + {Event.Address, Event.Address + Event.Size})) { + continue; + } + return createStringError( + inconvertibleErrorCode(), + "Non-text mmap event overlaps with existing event at address: %lx", + Event.Address); + } + NonTextMMapEvents[Event.Address] = Event; + return Error::success(); + } + + // Given a non-text runtime address, canonicalize it to the virtual address in + // the binary. 
+ // TODO: Consider unifying the canonicalization of text and non-text addresses + // in the ProfiledBinary class. + uint64_t CanonicalizeNonTextAddress(uint64_t Address); + bool getTrackFuncContextSize() { return TrackFuncContextSize; } bool getIsLoadedByMMap() { return IsLoadedByMMap; } diff --git a/llvm/tools/llvm-profgen/llvm-profgen.cpp b/llvm/tools/llvm-profgen/llvm-profgen.cpp index 7e070a1ea6489..8d2ed2850e38a 100644 --- a/llvm/tools/llvm-profgen/llvm-profgen.cpp +++ b/llvm/tools/llvm-profgen/llvm-profgen.cpp @@ -73,6 +73,12 @@ static cl::opt DebugBinPath( "from it instead of the executable binary."), cl::cat(ProfGenCategory)); +static cl::opt DataAccessProfileFilename( + "data-access-perftrace", cl::value_desc("data-access-perftrace"), + cl::desc("File path of a Linux perf raw trace (generated by `perf report " + "-D`) consisting of memory access events."), + cl::cat(ProfGenCategory)); + // Validate the command line input. static void validateCommandLine() { // Allow the missing perfscript if we only use to show binary disassembly. @@ -180,6 +186,23 @@ int main(int argc, const char *argv[]) { // Parse perf events and samples Reader->parsePerfTraces(); + if (!DataAccessProfileFilename.empty()) { + if (Reader->profileIsCS() || Binary->usePseudoProbes()) { + exitWithError("Symbolizing vtables from data access profiles is not " + "yet supported for context-sensitive perf traces or " + "when pseudo-probe based mapping is enabled. "); + } + // Parse the data access perf traces into pairs, symbolize + // the data-addr to data-symbol. If the data-addr is a vtable, increment + // counters for the pair. 
+ if (Error E = Reader->parseDataAccessPerfTraces(DataAccessProfileFilename, + PIDFilter)) { + handleAllErrors(std::move(E), [&](const StringError &SE) { + exitWithError(SE.getMessage()); + }); + } + } + if (SkipSymbolization) return EXIT_SUCCESS; diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp index b9b8929a0f703..0c991b71a6b26 100644 --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -60,6 +60,9 @@ cl::opt VerifyEachDebugInfoPreserve( cl::desc("Start each pass with collecting and end it with checking of " "debug info preservation.")); +static cl::opt EnableLoopFusion("enable-loopfusion", cl::init(false), + cl::Hidden, + cl::desc("Enable the LoopFuse Pass")); cl::opt VerifyDIPreserveExport("verify-di-preserve-export", cl::desc("Export debug info preservation failures into " @@ -446,6 +449,7 @@ bool llvm::runPassPipeline( // option has been enabled. PTO.LoopUnrolling = !DisableLoopUnrolling; PTO.UnifiedLTO = UnifiedLTO; + PTO.LoopFusion = EnableLoopFusion; PassBuilder PB(TM, PTO, P, &PIC); registerEPCallbacks(PB); diff --git a/llvm/unittests/ADT/BitTest.cpp b/llvm/unittests/ADT/BitTest.cpp index 2377ce3b78261..88ae36c44bdb9 100644 --- a/llvm/unittests/ADT/BitTest.cpp +++ b/llvm/unittests/ADT/BitTest.cpp @@ -297,6 +297,14 @@ TEST(BitTest, CountrZero) { EXPECT_EQ(1, llvm::countr_zero(NZ16)); EXPECT_EQ(1, llvm::countr_zero(NZ32)); EXPECT_EQ(1, llvm::countr_zero(NZ64)); + + EXPECT_EQ(0, llvm::countr_zero(uint8_t(1))); + EXPECT_EQ(3, llvm::countr_zero(uint8_t(8))); + EXPECT_EQ(7, llvm::countr_zero(uint8_t(128))); + + EXPECT_EQ(0, llvm::countr_zero(uint16_t(1))); + EXPECT_EQ(8, llvm::countr_zero(uint16_t(256))); + EXPECT_EQ(15, llvm::countr_zero(uint16_t(32768))); } TEST(BitTest, CountlOne) { diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index 785ab16271d93..50e9c6e138ef1 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -962,4 
+962,73 @@ TEST(DenseMapCustomTest, PairPrinting) { EXPECT_EQ(R"({ (1, "one"), (2, "two") })", ::testing::PrintToString(Map)); } +TEST(DenseMapCustomTest, InitSize) { + constexpr unsigned ElemSize = sizeof(std::pair); + + { + DenseMap Map; + EXPECT_EQ(ElemSize * 0U, Map.getMemorySize()); + } + { + DenseMap Map(0); + EXPECT_EQ(ElemSize * 0U, Map.getMemorySize()); + } + { + DenseMap Map(1); + EXPECT_EQ(ElemSize * 4U, Map.getMemorySize()); + } + { + DenseMap Map(2); + EXPECT_EQ(ElemSize * 4U, Map.getMemorySize()); + } + { + DenseMap Map(3); + EXPECT_EQ(ElemSize * 8U, Map.getMemorySize()); + } + { + int A, B; + DenseMap Map = {{&A, 1}, {&B, 2}}; + EXPECT_EQ(ElemSize * 4U, Map.getMemorySize()); + } + { + int A, B, C; + DenseMap Map = {{&A, 1}, {&B, 2}, {&C, 3}}; + EXPECT_EQ(ElemSize * 8U, Map.getMemorySize()); + } +} + +TEST(SmallDenseMapCustomTest, InitSize) { + constexpr unsigned ElemSize = sizeof(std::pair); + { + SmallDenseMap Map; + EXPECT_EQ(ElemSize * 4U, Map.getMemorySize()); + } + { + SmallDenseMap Map(0); + EXPECT_EQ(ElemSize * 4U, Map.getMemorySize()); + } + { + SmallDenseMap Map(1); + EXPECT_EQ(ElemSize * 4U, Map.getMemorySize()); + } + { + SmallDenseMap Map(2); + EXPECT_EQ(ElemSize * 4U, Map.getMemorySize()); + } + { + SmallDenseMap Map(3); + EXPECT_EQ(ElemSize * 8U, Map.getMemorySize()); + } + { + int A, B; + SmallDenseMap Map = {{&A, 1}, {&B, 2}}; + EXPECT_EQ(ElemSize * 4U, Map.getMemorySize()); + } + { + int A, B, C; + SmallDenseMap Map = {{&A, 1}, {&B, 2}, {&C, 3}}; + EXPECT_EQ(ElemSize * 8U, Map.getMemorySize()); + } +} + } // namespace diff --git a/llvm/unittests/ADT/PackedVectorTest.cpp b/llvm/unittests/ADT/PackedVectorTest.cpp index b4e017971efac..30fc7c0b6d07f 100644 --- a/llvm/unittests/ADT/PackedVectorTest.cpp +++ b/llvm/unittests/ADT/PackedVectorTest.cpp @@ -61,6 +61,16 @@ TEST(PackedVectorTest, Operation) { EXPECT_EQ(3U, Vec[1]); } +TEST(PackedVectorTest, RawBitsSize) { + PackedVector Vec; + EXPECT_EQ(0u, Vec.raw_bits().size()); + 
Vec.push_back(2); + Vec.push_back(0); + Vec.push_back(1); + Vec.push_back(3); + EXPECT_EQ(12u, Vec.raw_bits().size()); +} + #ifdef EXPECT_DEBUG_DEATH TEST(PackedVectorTest, UnsignedValues) { diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp index 9fda0d912a2f5..5020acda95b0b 100644 --- a/llvm/unittests/ADT/STLExtrasTest.cpp +++ b/llvm/unittests/ADT/STLExtrasTest.cpp @@ -398,6 +398,8 @@ struct some_struct { std::string swap_val; }; +struct derives_from_some_struct : some_struct {}; + std::vector::const_iterator begin(const some_struct &s) { return s.data.begin(); } @@ -500,6 +502,15 @@ TEST(STLExtrasTest, ToVector) { } } +TEST(STLExtrasTest, AllTypesEqual) { + static_assert(all_types_equal_v<>); + static_assert(all_types_equal_v); + static_assert(all_types_equal_v); + + static_assert(!all_types_equal_v); + static_assert(!all_types_equal_v); +} + TEST(STLExtrasTest, ConcatRange) { std::vector Expected = {1, 2, 3, 4, 5, 6, 7, 8}; std::vector Test; @@ -532,6 +543,43 @@ TEST(STLExtrasTest, ConcatRangeADL) { EXPECT_THAT(concat(S0, S1), ElementsAre(1, 2, 3, 4)); } +TEST(STLExtrasTest, ConcatRangePtrToSameClass) { + some_namespace::some_struct S0{}; + some_namespace::some_struct S1{}; + SmallVector V0{&S0}; + SmallVector V1{&S1, &S1}; + + // Dereferencing all iterators yields `some_namespace::some_struct *&`; no + // conversion takes place, `reference_type` is + // `some_namespace::some_struct *&`. + auto C = concat(V0, V1); + static_assert( + std::is_same_v); + EXPECT_THAT(C, ElementsAre(&S0, &S1, &S1)); + // `reference_type` should still allow container modification. 
+ for (auto &i : C) + if (i == &S0) + i = nullptr; + EXPECT_THAT(C, ElementsAre(nullptr, &S1, &S1)); +} + +TEST(STLExtrasTest, ConcatRangePtrToDerivedClass) { + some_namespace::some_struct S0{}; + some_namespace::derives_from_some_struct S1{}; + SmallVector V0{&S0}; + SmallVector V1{&S1, &S1}; + + // Dereferencing all iterators yields different (but convertible types); + // conversion takes place, `reference_type` is + // `some_namespace::some_struct *`. + auto C = concat(V0, V1); + static_assert( + std::is_same_v); + EXPECT_THAT(C, + ElementsAre(&S0, static_cast(&S1), + static_cast(&S1))); +} + TEST(STLExtrasTest, MakeFirstSecondRangeADL) { // Make sure that we use the `begin`/`end` functions from `some_namespace`, // using ADL. @@ -1602,6 +1650,16 @@ TEST(STLExtrasTest, Fill) { EXPECT_THAT(V2, ElementsAre(Val, Val, Val, Val)); } +TEST(STLExtrasTest, Accumulate) { + EXPECT_EQ(accumulate(std::vector(), 0), 0); + EXPECT_EQ(accumulate(std::vector(), 3), 3); + std::vector V1 = {1, 2, 3, 4, 5}; + EXPECT_EQ(accumulate(V1, 0), std::accumulate(V1.begin(), V1.end(), 0)); + EXPECT_EQ(accumulate(V1, 10), std::accumulate(V1.begin(), V1.end(), 10)); + EXPECT_EQ(accumulate(drop_begin(V1), 7), + std::accumulate(V1.begin() + 1, V1.end(), 7)); +} + struct Foo; struct Bar {}; diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp index d5f8dc41cdb6b..1ace29e96dbb8 100644 --- a/llvm/unittests/ADT/StringRefTest.cpp +++ b/llvm/unittests/ADT/StringRefTest.cpp @@ -1124,14 +1124,13 @@ TEST(StringRefTest, StringLiteral) { constexpr StringRef StringRefs[] = {"Foo", "Bar"}; EXPECT_EQ(StringRef("Foo"), StringRefs[0]); EXPECT_EQ(3u, (std::integral_constant::value)); - EXPECT_EQ(false, - (std::integral_constant::value)); + EXPECT_EQ(false, (std::bool_constant::value)); EXPECT_EQ(StringRef("Bar"), StringRefs[1]); constexpr StringLiteral Strings[] = {"Foo", "Bar"}; EXPECT_EQ(StringRef("Foo"), Strings[0]); EXPECT_EQ(3u, (std::integral_constant::value)); - 
EXPECT_EQ(false, (std::integral_constant::value)); + EXPECT_EQ(false, (std::bool_constant::value)); EXPECT_EQ(StringRef("Bar"), Strings[1]); } diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt index ff081007f31bc..ab709e30369bf 100644 --- a/llvm/unittests/CAS/CMakeLists.txt +++ b/llvm/unittests/CAS/CMakeLists.txt @@ -1,3 +1,7 @@ +if (LLVM_ENABLE_ONDISK_CAS) + add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1) +endif() + set(LLVM_LINK_COMPONENTS Support CAS @@ -8,6 +12,7 @@ add_llvm_unittest(CASTests ActionCacheTest.cpp CASTestConfig.cpp ObjectStoreTest.cpp + ProgramTest.cpp ) target_link_libraries(CASTests PRIVATE LLVMTestingSupport) diff --git a/llvm/unittests/CAS/ProgramTest.cpp b/llvm/unittests/CAS/ProgramTest.cpp new file mode 100644 index 0000000000000..3ba5f697eeed5 --- /dev/null +++ b/llvm/unittests/CAS/ProgramTest.cpp @@ -0,0 +1,239 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Program.h" +#include "llvm/CAS/MappedFileRegionArena.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/ExponentialBackoff.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/ThreadPool.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#if defined(__APPLE__) +#include +#elif !defined(_MSC_VER) +// Forward declare environ in case it's not provided by stdlib.h. 
+extern char **environ; +#endif + +using namespace llvm; +using namespace llvm::cas; + +extern const char *TestMainArgv0; +static char ProgramID = 0; + +class CASProgramTest : public testing::Test { + std::vector EnvTable; + std::vector EnvStorage; + +protected: + void SetUp() override { + auto EnvP = [] { +#if defined(_WIN32) + _wgetenv(L"TMP"); // Populate _wenviron, initially is null + return _wenviron; +#elif defined(__APPLE__) + return *_NSGetEnviron(); +#else + return environ; +#endif + }(); + ASSERT_TRUE(EnvP); + + auto prepareEnvVar = [this](decltype(*EnvP) Var) -> StringRef { +#if defined(_WIN32) + // On Windows convert UTF16 encoded variable to UTF8 + auto Len = wcslen(Var); + ArrayRef Ref{reinterpret_cast(Var), + Len * sizeof(*Var)}; + EnvStorage.emplace_back(); + auto convStatus = llvm::convertUTF16ToUTF8String(Ref, EnvStorage.back()); + EXPECT_TRUE(convStatus); + return EnvStorage.back(); +#else + (void)this; + return StringRef(Var); +#endif + }; + + while (*EnvP != nullptr) { + auto S = prepareEnvVar(*EnvP); + if (!StringRef(S).starts_with("GTEST_")) + EnvTable.emplace_back(S); + ++EnvP; + } + } + + void TearDown() override { + EnvTable.clear(); + EnvStorage.clear(); + } + + void addEnvVar(StringRef Var) { EnvTable.emplace_back(Var); } + + ArrayRef getEnviron() const { return EnvTable; } +}; + +#if LLVM_ENABLE_ONDISK_CAS + +static Error emptyConstructor(MappedFileRegionArena &) { + return Error::success(); +} + +TEST_F(CASProgramTest, MappedFileRegionArenaTest) { + auto TestAllocator = [](StringRef Path) { + std::optional Alloc; + ASSERT_THAT_ERROR( + MappedFileRegionArena::create(Path, /*Capacity=*/10 * 1024 * 1024, + /*HeaderOffset=*/0, emptyConstructor) + .moveInto(Alloc), + Succeeded()); + + std::vector AllocatedPtr; + AllocatedPtr.resize(100); + DefaultThreadPool Threads; + for (unsigned I = 0; I < 100; ++I) { + Threads.async( + [&](unsigned Idx) { + // Allocate a buffer that is larger than needed so allocator hits + // additional pages for test 
coverage. + unsigned *P = (unsigned *)cantFail(Alloc->allocate(100)); + *P = Idx; + AllocatedPtr[Idx] = P; + }, + I); + } + + Threads.wait(); + for (unsigned I = 0; I < 100; ++I) + EXPECT_EQ(*AllocatedPtr[I], I); + }; + + if (const char *File = getenv("LLVM_CAS_TEST_MAPPED_FILE_REGION")) { + TestAllocator(File); + exit(0); + } + + SmallString<128> FilePath; + sys::fs::createUniqueDirectory("MappedFileRegionArena", FilePath); + sys::path::append(FilePath, "allocation-file"); + + std::string Executable = + sys::fs::getMainExecutable(TestMainArgv0, &ProgramID); + StringRef Argv[] = { + Executable, "--gtest_filter=CASProgramTest.MappedFileRegionArenaTest"}; + + // Add LLVM_PROGRAM_TEST_LOCKED_FILE to the environment of the child. + std::string EnvVar = "LLVM_CAS_TEST_MAPPED_FILE_REGION="; + EnvVar += FilePath.str(); + addEnvVar(EnvVar); + + std::string Error; + bool ExecutionFailed; + sys::ProcessInfo PI = sys::ExecuteNoWait(Executable, Argv, getEnviron(), {}, + 0, &Error, &ExecutionFailed); + TestAllocator(FilePath); + + ASSERT_FALSE(ExecutionFailed) << Error; + ASSERT_NE(PI.Pid, sys::ProcessInfo::InvalidPid) << "Invalid process id"; + PI = llvm::sys::Wait(PI, /*SecondsToWait=*/5, &Error); + ASSERT_TRUE(PI.ReturnCode == 0); + ASSERT_TRUE(Error.empty()); + + // Clean up after both processes finish testing. 
+ sys::fs::remove(FilePath); + sys::fs::remove_directories(sys::path::parent_path(FilePath)); +} + +TEST_F(CASProgramTest, MappedFileRegionArenaSizeTest) { + using namespace std::chrono_literals; + if (const char *File = getenv("LLVM_CAS_TEST_MAPPED_FILE_REGION")) { + ExponentialBackoff Backoff(5s); + do { + if (sys::fs::exists(File)) { + break; + } + } while (Backoff.waitForNextAttempt()); + + std::optional Alloc; + ASSERT_THAT_ERROR(MappedFileRegionArena::create(File, /*Capacity=*/1024, + /*HeaderOffset=*/0, + emptyConstructor) + .moveInto(Alloc), + Succeeded()); + ASSERT_TRUE(Alloc->capacity() == 2048); + + Alloc.reset(); + ASSERT_THAT_ERROR(MappedFileRegionArena::create(File, /*Capacity=*/4096, + /*HeaderOffset=*/0, + emptyConstructor) + .moveInto(Alloc), + Succeeded()); + ASSERT_TRUE(Alloc->capacity() == 2048); + Alloc.reset(); + + ASSERT_THAT_ERROR( + MappedFileRegionArena::create(File, /*Capacity=*/2048, + /*HeaderOffset=*/32, emptyConstructor) + .moveInto(Alloc), + FailedWithMessage( + "specified header offset (32) does not match existing config (0)")); + + ASSERT_THAT_ERROR(MappedFileRegionArena::create(File, /*Capacity=*/2048, + /*HeaderOffset=*/0, + emptyConstructor) + .moveInto(Alloc), + Succeeded()); + + exit(0); + } + + SmallString<128> FilePath; + sys::fs::createUniqueDirectory("MappedFileRegionArena", FilePath); + sys::path::append(FilePath, "allocation-file"); + + std::string Executable = + sys::fs::getMainExecutable(TestMainArgv0, &ProgramID); + StringRef Argv[] = { + Executable, + "--gtest_filter=CASProgramTest.MappedFileRegionArenaSizeTest"}; + + // Add LLVM_PROGRAM_TEST_LOCKED_FILE to the environment of the child. 
+ std::string EnvVar = "LLVM_CAS_TEST_MAPPED_FILE_REGION="; + EnvVar += FilePath.str(); + addEnvVar(EnvVar); + + std::optional Alloc; + ASSERT_THAT_ERROR(MappedFileRegionArena::create(FilePath, /*Capacity=*/2048, + /*HeaderOffset=*/0, + emptyConstructor) + .moveInto(Alloc), + Succeeded()); + + std::string Error; + bool ExecutionFailed; + sys::ProcessInfo PI = sys::ExecuteNoWait(Executable, Argv, getEnviron(), {}, + 0, &Error, &ExecutionFailed); + + ASSERT_FALSE(ExecutionFailed) << Error; + ASSERT_NE(PI.Pid, sys::ProcessInfo::InvalidPid) << "Invalid process id"; + PI = llvm::sys::Wait(PI, /*SecondsToWait=*/100, &Error); + ASSERT_TRUE(PI.ReturnCode == 0); + ASSERT_TRUE(Error.empty()); + + // Size is still the requested 2048. + ASSERT_TRUE(Alloc->capacity() == 2048); + + // Clean up after both processes finish testing. + sys::fs::remove(FilePath); + sys::fs::remove_directories(sys::path::parent_path(FilePath)); +} + +#endif // LLVM_ENABLE_ONDISK_CAS diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp index 089fb00d6080d..8563d7f1f15c9 100644 --- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp @@ -190,7 +190,7 @@ TEST_F(AArch64GISelMITest, TestKnownBitsDecreasingCstPHIWithLoop) { // Therefore, %14's known zero are 0x80 shifted by one 0xC0. // If we had simulated the loop we could have more zero bits, basically // up to 0xFC (count leading zero of 5, + 1). 
- EXPECT_EQ((uint64_t)0xC0, Res.Zero.getZExtValue()); + EXPECT_EQ((uint64_t)0xFC, Res.Zero.getZExtValue()); KnownBits Res2 = Info.getKnownBits(DstReg); EXPECT_EQ(Res.One.getZExtValue(), Res2.One.getZExtValue()); diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp index 73ddf0c88d3ed..6b70ae9739179 100644 --- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp @@ -220,7 +220,7 @@ TEST_F(AArch64GISelMITest, TestKnownBitsVectorDecreasingCstPHIWithLoop) { GISelValueTracking Info(*MF, /*MaxDepth=*/24); KnownBits Res = Info.getKnownBits(SrcReg); EXPECT_EQ((uint64_t)0, Res.One.getZExtValue()); - EXPECT_EQ((uint64_t)0xC0, Res.Zero.getZExtValue()); + EXPECT_EQ((uint64_t)0xFC, Res.Zero.getZExtValue()); KnownBits Res2 = Info.getKnownBits(DstReg); EXPECT_EQ(Res.One.getZExtValue(), Res2.One.getZExtValue()); diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index 53bc0246c126e..3a625b299a96f 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -159,7 +159,7 @@ class InstrRefLDVTest : public testing::Test { // Setup things like the artifical block map, and BlockNo <=> RPO Order // mappings. 
LDV->initialSetup(*MF); - LDV->LS.initialize(*MF); + LDV->LS.scanFunction(*MF); addMTracker(MF); return &*LDV; } diff --git a/llvm/unittests/CodeGen/LexicalScopesTest.cpp b/llvm/unittests/CodeGen/LexicalScopesTest.cpp index 563d496d1e600..34bd37a4afdc2 100644 --- a/llvm/unittests/CodeGen/LexicalScopesTest.cpp +++ b/llvm/unittests/CodeGen/LexicalScopesTest.cpp @@ -44,6 +44,7 @@ class LexicalScopesTest : public testing::Test { std::unique_ptr MF; DICompileUnit *OurCU; DIFile *OurFile; + DISubroutineType *OurSubT; DISubprogram *OurFunc; DILexicalBlock *OurBlock, *AnotherBlock; DISubprogram *ToInlineFunc; @@ -103,7 +104,7 @@ class LexicalScopesTest : public testing::Test { OurFile = DIB.createFile("xyzzy.c", "/cave"); OurCU = DIB.createCompileUnit(dwarf::DW_LANG_C99, OurFile, "nou", false, "", 0); - auto OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({})); + OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({})); OurFunc = DIB.createFunction(OurCU, "bees", "", OurFile, 1, OurSubT, 1, DINode::FlagZero, DISubprogram::SPFlagDefinition); @@ -136,10 +137,10 @@ TEST_F(LexicalScopesTest, FlatLayout) { LexicalScopes LS; EXPECT_TRUE(LS.empty()); - LS.reset(); + LS.resetFunction(); EXPECT_EQ(LS.getCurrentFunctionScope(), nullptr); - LS.initialize(*MF); + LS.scanFunction(*MF); EXPECT_FALSE(LS.empty()); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); EXPECT_EQ(FuncScope->getParent(), nullptr); @@ -182,7 +183,7 @@ TEST_F(LexicalScopesTest, BlockScopes) { BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); EXPECT_EQ(FuncScope->getDesc(), OurFunc); auto &Children = FuncScope->getChildren(); @@ -217,7 +218,7 @@ TEST_F(LexicalScopesTest, InlinedScopes) { BuildMI(*MBB4, MBB4->end(), InlinedLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); auto &Children = 
FuncScope->getChildren(); ASSERT_EQ(Children.size(), 1u); @@ -252,7 +253,7 @@ TEST_F(LexicalScopesTest, FuncWithEmptyGap) { BuildMI(*MBB4, MBB4->end(), OutermostLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); // A gap in a range that contains no other location, is not actually a @@ -273,7 +274,7 @@ TEST_F(LexicalScopesTest, FuncWithRealGap) { MachineInstr *LastI = BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get()); ASSERT_NE(BlockScope, nullptr); @@ -306,7 +307,7 @@ TEST_F(LexicalScopesTest, NotNested) { MachineInstr *FourthI = BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get()); LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get()); @@ -344,7 +345,7 @@ TEST_F(LexicalScopesTest, TestDominates) { BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get()); LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get()); @@ -386,7 +387,7 @@ TEST_F(LexicalScopesTest, TestGetBlocks) { BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get()); LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get()); @@ -443,7 +444,7 @@ TEST_F(LexicalScopesTest, TestMetaInst) { BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); 
LexicalScope *FuncScope = LS.getCurrentFunctionScope(); LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get()); ASSERT_NE(FuncScope, nullptr); @@ -459,4 +460,24 @@ TEST_F(LexicalScopesTest, TestMetaInst) { EXPECT_TRUE(LS.dominates(InBlockLoc.get(), MBB4)); } +// Test function map creation. +TEST_F(LexicalScopesTest, TestFunctionScan) { + auto MF2 = createMachineFunction(Ctx, Mod, "Test2"); + DIBuilder DIB(Mod, false, OurCU); + DISubprogram *Func2 = + DIB.createFunction(OurCU, "Func2", "", OurFile, 1, OurSubT, 1, + DINode::FlagZero, DISubprogram::SPFlagDefinition); + DISubprogram *UnattachedFunc = + DIB.createFunction(OurCU, "UnattachedFunc", "", OurFile, 1, OurSubT, 1, + DINode::FlagZero, DISubprogram::SPFlagDefinition); + MF2->getFunction().setSubprogram(Func2); + DIB.finalize(); + + LexicalScopes LS; + LS.initialize(Mod); + ASSERT_EQ(LS.getFunction(OurFunc), &MF->getFunction()); + ASSERT_EQ(LS.getFunction(Func2), &MF2->getFunction()); + ASSERT_EQ(LS.getFunction(UnattachedFunc), nullptr); +} + } // anonymous namespace diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index cb4a2410df08b..a86a68cb4adf1 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -132,10 +132,10 @@ BogusTargetMachine *createTargetMachine() { return &BogusTM; } -std::unique_ptr createMachineFunction(LLVMContext &Ctx, - Module &M) { +std::unique_ptr +createMachineFunction(LLVMContext &Ctx, Module &M, const Twine &Name = "Test") { auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); - auto F = Function::Create(Type, GlobalValue::ExternalLinkage, "Test", &M); + auto F = Function::Create(Type, GlobalValue::ExternalLinkage, Name, &M); auto TM = createTargetMachine(); unsigned FunctionNum = 42; @@ -145,4 +145,3 @@ std::unique_ptr createMachineFunction(LLVMContext &Ctx, return std::make_unique(*F, *TM, STI, MMI.getContext(), FunctionNum); } - diff --git 
a/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp index 41acc8240c720..306ce67f93793 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp @@ -141,3 +141,9 @@ TEST_F(DWARFExpressionCompactPrinterTest, Test_OP_nop_OP_reg) { TEST_F(DWARFExpressionCompactPrinterTest, Test_OP_LLVM_nop_OP_reg) { TestExprPrinter({DW_OP_LLVM_user, DW_OP_LLVM_nop, DW_OP_reg0}, "R0"); } + +TEST_F(DWARFExpressionCompactPrinterTest, Test_OP_LLVM_user_unknown_subop) { + TestExprPrinter({DW_OP_LLVM_user, DW_OP_LLVM_form_aspace_address}, + ""); +} diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt index 8b7bd3997ea27..d62ce66ef9d34 100644 --- a/llvm/unittests/IR/CMakeLists.txt +++ b/llvm/unittests/IR/CMakeLists.txt @@ -28,6 +28,7 @@ add_llvm_unittest(IRTests DominatorTreeBatchUpdatesTest.cpp DroppedVariableStatsIRTest.cpp FunctionTest.cpp + GlobalObjectTest.cpp PassBuilderCallbacksTest.cpp IRBuilderTest.cpp InstructionsTest.cpp diff --git a/llvm/unittests/IR/DataLayoutTest.cpp b/llvm/unittests/IR/DataLayoutTest.cpp index afa72a53ab2c0..e0c0f35847f07 100644 --- a/llvm/unittests/IR/DataLayoutTest.cpp +++ b/llvm/unittests/IR/DataLayoutTest.cpp @@ -677,11 +677,4 @@ TEST(DataLayoutTest, VectorAlign) { EXPECT_EQ(Align(4 * 8), DL->getPrefTypeAlign(V8F32Ty)); } -TEST(DataLayoutTest, UEFI) { - Triple TT = Triple("x86_64-unknown-uefi"); - - // Test UEFI X86_64 Mangling Component. 
- EXPECT_STREQ(DataLayout::getManglingComponent(TT), "-m:w"); -} - } // anonymous namespace diff --git a/llvm/unittests/IR/GlobalObjectTest.cpp b/llvm/unittests/IR/GlobalObjectTest.cpp new file mode 100644 index 0000000000000..0e16d01e759de --- /dev/null +++ b/llvm/unittests/IR/GlobalObjectTest.cpp @@ -0,0 +1,80 @@ +//===- GlobalObjectTest.cpp - Global object unit tests --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/GlobalObject.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/SourceMgr.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +using namespace llvm; +namespace { +using testing::Eq; +using testing::Optional; +using testing::StrEq; + +static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + std::unique_ptr Mod = parseAssemblyString(IR, Err, C); + if (!Mod) + Err.print("GlobalObjectTests", errs()); + return Mod; +} + +static LLVMContext C; +static std::unique_ptr M; + +class GlobalObjectTest : public testing::Test { +public: + static void SetUpTestSuite() { + M = parseIR(C, R"( +@foo = global i32 3, !section_prefix !0 +@bar = global i32 0 + +!0 = !{!"section_prefix", !"hot"} +)"); + } +}; + +TEST_F(GlobalObjectTest, SectionPrefix) { + GlobalVariable *Foo = M->getGlobalVariable("foo"); + + // Initial section prefix is hot. + ASSERT_NE(Foo, nullptr); + ASSERT_THAT(Foo->getSectionPrefix(), Optional(StrEq("hot"))); + + // Test that set method returns false since existing section prefix is hot. + EXPECT_FALSE(Foo->setSectionPrefix("hot")); + + // Set prefix from hot to unlikely. 
+ Foo->setSectionPrefix("unlikely"); + EXPECT_THAT(Foo->getSectionPrefix(), Optional(StrEq("unlikely"))); + + // Set prefix to empty is the same as clear. + Foo->setSectionPrefix(""); + // Test that section prefix is cleared. + EXPECT_THAT(Foo->getSectionPrefix(), Eq(std::nullopt)); + + GlobalVariable *Bar = M->getGlobalVariable("bar"); + + // Initial section prefix is empty. + ASSERT_NE(Bar, nullptr); + ASSERT_THAT(Bar->getSectionPrefix(), Eq(std::nullopt)); + + // Test that set method returns false since Bar doesn't have prefix metadata. + EXPECT_FALSE(Bar->setSectionPrefix("")); + + // Set from empty to hot. + EXPECT_TRUE(Bar->setSectionPrefix("hot")); + EXPECT_THAT(Bar->getSectionPrefix(), Optional(StrEq("hot"))); + + // Test that set method returns true and section prefix is cleared. + EXPECT_TRUE(Bar->setSectionPrefix("")); + EXPECT_THAT(Bar->getSectionPrefix(), Eq(std::nullopt)); +} +} // namespace diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index 4cf8f61e83c8d..b0ad208625436 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -172,8 +172,8 @@ TEST(RootSignature, HeaderData) { NumStaticSamplers: 0 StaticSamplersOffset: 48 Parameters: - - ParameterType: 1 - ShaderVisibility: 2 + - ParameterType: Constants32Bit + ShaderVisibility: Hull Constants: Num32BitValues: 16 ShaderRegister: 15 @@ -224,8 +224,8 @@ TEST(RootSignature, ParseRootConstants) { NumStaticSamplers: 0 StaticSamplersOffset: 48 Parameters: - - ParameterType: 1 - ShaderVisibility: 2 + - ParameterType: Constants32Bit + ShaderVisibility: Hull Constants: Num32BitValues: 16 ShaderRegister: 15 @@ -276,8 +276,8 @@ TEST(RootSignature, ParseRootDescriptorsV10) { NumStaticSamplers: 0 StaticSamplersOffset: 44 Parameters: - - ParameterType: 2 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: CBV + ShaderVisibility: Domain Descriptor: ShaderRegister: 31 
RegisterSpace: 32 @@ -327,8 +327,8 @@ TEST(RootSignature, ParseRootDescriptorsV11) { NumStaticSamplers: 0 StaticSamplersOffset: 48 Parameters: - - ParameterType: 2 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: CBV + ShaderVisibility: Domain Descriptor: ShaderRegister: 31 RegisterSpace: 32 @@ -379,12 +379,12 @@ TEST(RootSignature, ParseDescriptorTableV10) { NumStaticSamplers: 0 StaticSamplersOffset: 64 Parameters: - - ParameterType: 0 # SRV - ShaderVisibility: 3 # Domain + - ParameterType: DescriptorTable + ShaderVisibility: Domain Table: NumRanges: 1 Ranges: - - RangeType: 0 + - RangeType: SRV NumDescriptors: 41 BaseShaderRegister: 42 RegisterSpace: 43 @@ -435,12 +435,12 @@ TEST(RootSignature, ParseDescriptorTableV11) { NumStaticSamplers: 0 StaticSamplersOffset: 68 Parameters: - - ParameterType: 0 # Descriptor Table - ShaderVisibility: 3 # Domain + - ParameterType: DescriptorTable + ShaderVisibility: Domain Table: NumRanges: 1 Ranges: - - RangeType: 0 + - RangeType: SRV NumDescriptors: -1 BaseShaderRegister: 42 RegisterSpace: 43 @@ -492,19 +492,19 @@ TEST(RootSignature, ParseStaticSamplers) { StaticSamplersOffset: 24 Parameters: [] Samplers: - - Filter: 10 - AddressU: 1 - AddressV: 2 - AddressW: 5 + - Filter: MinLinearMagMipPoint + AddressU: Wrap + AddressV: Mirror + AddressW: MirrorOnce MipLODBias: 1.23 MaxAnisotropy: 20 - ComparisonFunc: 4 - BorderColor: 0 + ComparisonFunc: LessEqual + BorderColor: TransparentBlack MinLOD: 4.56 MaxLOD: 8.90 ShaderRegister: 31 RegisterSpace: 32 - ShaderVisibility: 7 + ShaderVisibility: Mesh AllowInputAssemblerInputLayout: true DenyGeometryShaderRootAccess: true )")); @@ -517,7 +517,7 @@ TEST(RootSignature, ParseStaticSamplers) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x4c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x18, 
0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0xa4, 0x70, 0x9d, 0x3f, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0xeb, 0x91, 0x40, 0x66, 0x66, 0x0e, 0x41, diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp index ec81e5f274efa..b268aa7cdd057 100644 --- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp +++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp @@ -64,6 +64,7 @@ namespace { struct OutputFunctionCoverageData { StringRef Name; uint64_t Hash; + std::vector FilenamesStorage; std::vector Filenames; std::vector Regions; std::vector Expressions; @@ -71,8 +72,10 @@ struct OutputFunctionCoverageData { OutputFunctionCoverageData() : Hash(0) {} OutputFunctionCoverageData(OutputFunctionCoverageData &&OFCD) - : Name(OFCD.Name), Hash(OFCD.Hash), Filenames(std::move(OFCD.Filenames)), - Regions(std::move(OFCD.Regions)) {} + : Name(OFCD.Name), Hash(OFCD.Hash), + FilenamesStorage(std::move(OFCD.FilenamesStorage)), + Filenames(std::move(OFCD.Filenames)), Regions(std::move(OFCD.Regions)) { + } OutputFunctionCoverageData(const OutputFunctionCoverageData &) = delete; OutputFunctionCoverageData & @@ -135,7 +138,6 @@ struct InputFunctionCoverageData { struct CoverageMappingTest : ::testing::TestWithParam> { bool UseMultipleReaders; StringMap Files; - std::vector Filenames; std::vector InputFunctions; std::vector OutputFunctions; @@ -233,13 +235,11 @@ struct CoverageMappingTest : ::testing::TestWithParam> { void readCoverageRegions(const std::string &Coverage, OutputFunctionCoverageData &Data) { - // We will re-use the StringRef in duplicate tests, clear it to avoid - // clobber previous ones. - Filenames.clear(); - Filenames.resize(Files.size() + 1); + // +1 here since `Files` (filename to index map) uses 1-based index. 
+ Data.FilenamesStorage.resize(Files.size() + 1); for (const auto &E : Files) - Filenames[E.getValue()] = E.getKey().str(); - ArrayRef FilenameRefs = llvm::ArrayRef(Filenames); + Data.FilenamesStorage[E.getValue()] = E.getKey().str(); + ArrayRef FilenameRefs = llvm::ArrayRef(Data.FilenamesStorage); RawCoverageMappingReader Reader(Coverage, FilenameRefs, Data.Filenames, Data.Expressions, Data.Regions); EXPECT_THAT_ERROR(Reader.read(), Succeeded()); diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index 0910a0b296dd0..d1dfb1dc4a722 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -125,6 +125,8 @@ add_llvm_unittest(SupportTests intrinsics_gen ) +add_subdirectory(LSP) + target_link_libraries(SupportTests PRIVATE LLVMTestingSupport) # Disable all warning for AlignOfTest.cpp, diff --git a/llvm/unittests/Support/DebugLogTest.cpp b/llvm/unittests/Support/DebugLogTest.cpp index e087705b72586..4df5a809d11aa 100644 --- a/llvm/unittests/Support/DebugLogTest.cpp +++ b/llvm/unittests/Support/DebugLogTest.cpp @@ -27,8 +27,8 @@ TEST(DebugLogTest, Basic) { { std::string str; raw_string_ostream os(str); - DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, nullptr) << "NoType"; - EXPECT_FALSE(StringRef(os.str()).starts_with('[')); + LDGB_STREAM_LEVEL_AND_TYPE(os, "", 0) << "NoType"; + EXPECT_TRUE(StringRef(os.str()).starts_with('[')); EXPECT_TRUE(StringRef(os.str()).ends_with("NoType\n")); } @@ -36,8 +36,8 @@ TEST(DebugLogTest, Basic) { { std::string str; raw_string_ostream os(str); - DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "A"; - DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << "B"; + LDGB_STREAM_LEVEL_AND_TYPE(os, 0, "A") << "A"; + LDGB_STREAM_LEVEL_AND_TYPE(os, "B", 0) << "B"; EXPECT_TRUE(StringRef(os.str()).starts_with('[')); EXPECT_THAT(os.str(), AllOf(HasSubstr("A\n"), HasSubstr("B\n"))); } @@ -48,18 +48,18 @@ TEST(DebugLogTest, Basic) { raw_string_ostream os(str); // Just check that the macro doesn't 
result in dangling else. if (true) - DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "A"; + LDGB_STREAM_LEVEL_AND_TYPE(os, 0, "A") << "A"; else - DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "B"; - DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << "B"; + LDGB_STREAM_LEVEL_AND_TYPE(os, 0, "A") << "B"; + LDGB_STREAM_LEVEL_AND_TYPE(os, 0, "B") << "B"; EXPECT_THAT(os.str(), AllOf(HasSubstr("A\n"), Not(HasSubstr("B\n")))); int count = 0; auto inc = [&]() { return ++count; }; EXPECT_THAT(count, Eq(0)); - DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << inc(); + LDGB_STREAM_LEVEL_AND_TYPE(os, 0, "A") << inc(); EXPECT_THAT(count, Eq(1)); - DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << inc(); + LDGB_STREAM_LEVEL_AND_TYPE(os, 0, "B") << inc(); EXPECT_THAT(count, Eq(1)); } } @@ -75,10 +75,10 @@ TEST(DebugLogTest, BasicWithLevel) { raw_string_ostream os(str); for (auto type : {"A", "B", "C", "D"}) for (int level : llvm::seq(0, 4)) - DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(os, level, type, type, level) + LDBG_STREAM_LEVEL_TYPE_FILE_AND_LINE(os, level, type, type, level) << level; - EXPECT_EQ(os.str(), "[A:0] A:0 0\n[A:1] A:1 1\n[A:2] A:2 2\n[A:3] A:3 " - "3\n[B:0] B:0 0\n[B:1] B:1 1\n[C:0] C:0 0\n"); + EXPECT_EQ(os.str(), "[A:0 0] 0\n[A:1 1] 1\n[A:2 2] 2\n[A:3 3] 3\n[B:0 0] " + "0\n[B:1 1] 1\n[C:0 0] 0\n"); } TEST(DebugLogTest, NegativeLevel) { @@ -92,9 +92,10 @@ TEST(DebugLogTest, NegativeLevel) { raw_string_ostream os(str); for (auto type : {"A", "B"}) for (int level : llvm::seq(0, 2)) - DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(os, level, type, type, level) + LDBG_STREAM_LEVEL_TYPE_FILE_AND_LINE( + os, level, type, (std::string(type) + ".cpp").c_str(), level) << level; - EXPECT_EQ(os.str(), "[A:0] A:0 0\n[B:0] B:0 0\n[B:1] B:1 1\n"); + EXPECT_EQ(os.str(), "[A A.cpp:0 0] 0\n[B B.cpp:0 0] 0\n[B B.cpp:1 1] 1\n"); } TEST(DebugLogTest, StreamPrefix) { @@ -128,6 +129,188 @@ TEST(DebugLogTest, DestructorPrefix) { // After destructors, nothing should have been printed. 
EXPECT_EQ(os.str(), ""); } + +TEST(DebugLogTest, LDBG_MACROS) { + llvm::DebugFlag = true; + static const char *DT[] = {"A:3", "B:2"}; + setCurrentDebugTypes(DT, sizeof(DT) / sizeof(DT[0])); + std::string Str; + raw_string_ostream DebugOs(Str); + std::string StrExpected; + raw_string_ostream ExpectedOs(StrExpected); +#undef LDBG_STREAM +#define LDBG_STREAM DebugOs +#define DEBUG_TYPE "A" + LDBG() << "Hello, world!"; + ExpectedOs << "[A " << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 1] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a level, no type. + LDBG(2) << "Hello, world!"; + ExpectedOs << "[A " << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 2] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + +// Now check when we don't use DEBUG_TYPE, the file name is implicitly used +// instead. +#undef DEBUG_TYPE + + // Repeat the tests above, they won't match since the debug types defined + // above don't match the file name. + LDBG() << "Hello, world!"; + EXPECT_EQ(DebugOs.str(), ""); + Str.clear(); + StrExpected.clear(); + + // Test with a level, no type. + LDBG(2) << "Hello, world!"; + EXPECT_EQ(DebugOs.str(), ""); + Str.clear(); + StrExpected.clear(); + + // Now enable the debug types that match the file name. + auto fileNameAndLevel = std::string(__LLVM_FILE_NAME__) + ":3"; + static const char *DT2[] = {fileNameAndLevel.c_str(), "B:2"}; + setCurrentDebugTypes(DT2, sizeof(DT2) / sizeof(DT2[0])); + + // Repeat the tests above, they should match now. + + LDBG() << "Hello, world!"; + ExpectedOs << "[" << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 1] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a level, no type. 
+ LDBG(2) << "Hello, world!"; + ExpectedOs << "[" << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 2] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a type + LDBG("B") << "Hello, world!"; + ExpectedOs << "[B " << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 1] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a type and a level + LDBG("B", 2) << "Hello, world!"; + ExpectedOs << "[B " << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 2] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a type not enabled. + LDBG("C", 1) << "Hello, world!"; + EXPECT_EQ(DebugOs.str(), ""); + + // Test with a level not enabled. + LDBG("B", 3) << "Hello, world!"; + EXPECT_EQ(DebugOs.str(), ""); + LDBG(__LLVM_FILE_NAME__, 4) << "Hello, world!"; + EXPECT_EQ(DebugOs.str(), ""); +} + +TEST(DebugLogTest, LDBG_OS_MACROS) { + llvm::DebugFlag = true; + static const char *DT[] = {"A:3", "B:2"}; + setCurrentDebugTypes(DT, sizeof(DT) / sizeof(DT[0])); + std::string Str; + raw_string_ostream DebugOs(Str); + std::string StrExpected; + raw_string_ostream ExpectedOs(StrExpected); +#undef LDBG_STREAM +#define LDBG_STREAM DebugOs +#define DEBUG_TYPE "A" + LDBG_OS([](raw_ostream &Os) { Os << "Hello, world!"; }); + ExpectedOs << "[A " << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 1] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a level, no type. + LDBG_OS(2, [](raw_ostream &Os) { Os << "Hello, world!"; }); + ExpectedOs << "[A " << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 2] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + +// Now check when we don't use DEBUG_TYPE, the file name is implicitly used +// instead. 
+#undef DEBUG_TYPE + + // Repeat the tests above, they won't match since the debug types defined + // above don't match the file name. + LDBG_OS([](raw_ostream &Os) { Os << "Hello, world!"; }); + EXPECT_EQ(DebugOs.str(), ""); + Str.clear(); + StrExpected.clear(); + + // Test with a level, no type. + LDBG_OS(2, [](raw_ostream &Os) { Os << "Hello, world!"; }); + EXPECT_EQ(DebugOs.str(), ""); + Str.clear(); + StrExpected.clear(); + + // Now enable the debug types that match the file name. + auto fileNameAndLevel = std::string(__LLVM_FILE_NAME__) + ":3"; + static const char *DT2[] = {fileNameAndLevel.c_str(), "B:2"}; + setCurrentDebugTypes(DT2, sizeof(DT2) / sizeof(DT2[0])); + + // Repeat the tests above, they should match now. + LDBG_OS([](raw_ostream &Os) { Os << "Hello, world!"; }); + ExpectedOs << "[" << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 1] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a level, no type. + LDBG_OS(2, [](raw_ostream &Os) { Os << "Hello, world!"; }); + ExpectedOs << "[" << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 2] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a type. + LDBG_OS("B", [](raw_ostream &Os) { Os << "Hello, world!"; }); + ExpectedOs << "[B " << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 1] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a type and a level + LDBG_OS("B", 2, [](raw_ostream &Os) { Os << "Hello, world!"; }); + ExpectedOs << "[B " << __LLVM_FILE_NAME__ << ":" << (__LINE__ - 1) + << " 2] Hello, world!\n"; + EXPECT_EQ(DebugOs.str(), ExpectedOs.str()); + Str.clear(); + StrExpected.clear(); + + // Test with a type not enabled. + LDBG_OS("C", 1, [](raw_ostream &Os) { Os << "Hello, world!"; }); + EXPECT_EQ(DebugOs.str(), ""); + + // Test with a level not enabled. 
+ LDBG_OS("B", 3, [](raw_ostream &Os) { Os << "Hello, world!"; }); + EXPECT_EQ(DebugOs.str(), ""); +} + #else TEST(DebugLogTest, Basic) { // LDBG should be compiled out in NDEBUG, so just check it compiles and has diff --git a/llvm/unittests/Support/EndianTest.cpp b/llvm/unittests/Support/EndianTest.cpp index 59281c0ed5444..c48b7707b7751 100644 --- a/llvm/unittests/Support/EndianTest.cpp +++ b/llvm/unittests/Support/EndianTest.cpp @@ -201,26 +201,26 @@ TEST(Endian, WriteBitAligned) { TEST(Endian, Write) { unsigned char data[5]; - endian::write(data, -1362446643); + endian::write(data, -1362446643, llvm::endianness::big); EXPECT_EQ(data[0], 0xAE); EXPECT_EQ(data[1], 0xCA); EXPECT_EQ(data[2], 0xB6); EXPECT_EQ(data[3], 0xCD); - endian::write(data + 1, - -1362446643); + endian::write(data + 1, -1362446643, + llvm::endianness::big); EXPECT_EQ(data[1], 0xAE); EXPECT_EQ(data[2], 0xCA); EXPECT_EQ(data[3], 0xB6); EXPECT_EQ(data[4], 0xCD); - endian::write(data, - -1362446643); + endian::write(data, -1362446643, + llvm::endianness::little); EXPECT_EQ(data[0], 0xCD); EXPECT_EQ(data[1], 0xB6); EXPECT_EQ(data[2], 0xCA); EXPECT_EQ(data[3], 0xAE); - endian::write(data + 1, - -1362446643); + endian::write(data + 1, -1362446643, + llvm::endianness::little); EXPECT_EQ(data[1], 0xCD); EXPECT_EQ(data[2], 0xB6); EXPECT_EQ(data[3], 0xCA); diff --git a/llvm/unittests/Support/LSP/CMakeLists.txt b/llvm/unittests/Support/LSP/CMakeLists.txt new file mode 100644 index 0000000000000..790a8b725469b --- /dev/null +++ b/llvm/unittests/Support/LSP/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_LINK_COMPONENTS + SupportLSP +) + +add_llvm_unittest(LLVMSupportLSPTests + Protocol.cpp + Transport.cpp +) diff --git a/mlir/unittests/Tools/lsp-server-support/Protocol.cpp b/llvm/unittests/Support/LSP/Protocol.cpp similarity index 93% rename from mlir/unittests/Tools/lsp-server-support/Protocol.cpp rename to llvm/unittests/Support/LSP/Protocol.cpp index 04d7b2fbb440f..43c548c24b38b 100644 --- 
a/mlir/unittests/Tools/lsp-server-support/Protocol.cpp +++ b/llvm/unittests/Support/LSP/Protocol.cpp @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Tools/lsp-server-support/Protocol.h" +#include "llvm/Support/LSP/Protocol.h" #include "gtest/gtest.h" -using namespace mlir; -using namespace mlir::lsp; +using namespace llvm; +using namespace llvm::lsp; using namespace testing; namespace { diff --git a/mlir/unittests/Tools/lsp-server-support/Transport.cpp b/llvm/unittests/Support/LSP/Transport.cpp similarity index 96% rename from mlir/unittests/Tools/lsp-server-support/Transport.cpp rename to llvm/unittests/Support/LSP/Transport.cpp index 92581bd2bad08..514e93e983523 100644 --- a/mlir/unittests/Tools/lsp-server-support/Transport.cpp +++ b/llvm/unittests/Support/LSP/Transport.cpp @@ -6,15 +6,15 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Tools/lsp-server-support/Transport.h" -#include "mlir/Tools/lsp-server-support/Logging.h" -#include "mlir/Tools/lsp-server-support/Protocol.h" +#include "llvm/Support/LSP/Transport.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/LSP/Logging.h" +#include "llvm/Support/LSP/Protocol.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -using namespace mlir; -using namespace mlir::lsp; +using namespace llvm; +using namespace llvm::lsp; using namespace testing; namespace { @@ -88,7 +88,7 @@ class TransportInputTest : public Test { TEST_F(TransportInputTest, RequestWithInvalidParams) { struct Handler { void onMethod(const TextDocumentItem ¶ms, - mlir::lsp::Callback callback) {} + llvm::lsp::Callback callback) {} } handler; getMessageHandler().method("invalid-params-request", &handler, &Handler::onMethod); diff --git a/llvm/unittests/Support/ProgramTest.cpp b/llvm/unittests/Support/ProgramTest.cpp index d30bf458f233c..eac0246d8c59e 100644 --- a/llvm/unittests/Support/ProgramTest.cpp +++ 
b/llvm/unittests/Support/ProgramTest.cpp @@ -680,4 +680,22 @@ TEST_F(ProgramEnvTest, TestExecuteWithNoStacktraceHandler) { ASSERT_EQ(0, RetCode); } +TEST_F(ProgramEnvTest, TestExecuteEmptyEnvironment) { + using namespace llvm::sys; + + std::string Executable = + sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1); + StringRef argv[] = { + Executable, + "--gtest_filter=" // A null invocation to avoid infinite recursion + }; + + std::string Error; + bool ExecutionFailed; + int RetCode = ExecuteAndWait(Executable, argv, ArrayRef{}, {}, 0, + 0, &Error, &ExecutionFailed); + EXPECT_FALSE(ExecutionFailed) << Error; + ASSERT_EQ(0, RetCode); +} + } // end anonymous namespace diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index febd06ea51739..e953c0d11590b 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1184,6 +1184,7 @@ R"(All available -march extensions for RISC-V Experimental extensions p 0.15 + zibi 0.1 zicfilp 1.0 This is a long dummy description zicfiss 1.0 zalasr 0.1 diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index e6979cf49ce82..256756650d21f 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -10,6 +10,7 @@ #include "llvm/Support/CodeGen.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/VersionTuple.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" using namespace llvm; @@ -3300,4 +3301,12 @@ TEST(TripleTest, isCompatibleWith) { EXPECT_TRUE(DoTest(C.B, C.A, C.Result)); } } + +TEST(DataLayoutTest, UEFI) { + Triple TT = Triple("x86_64-unknown-uefi"); + + // Test UEFI X86_64 Mangling Component. 
+ EXPECT_THAT(TT.computeDataLayout(), testing::HasSubstr("-m:w-")); +} + } // end anonymous namespace diff --git a/llvm/unittests/Transforms/Utils/CMakeLists.txt b/llvm/unittests/Transforms/Utils/CMakeLists.txt index 5c7ec28709c16..7d649f25c830a 100644 --- a/llvm/unittests/Transforms/Utils/CMakeLists.txt +++ b/llvm/unittests/Transforms/Utils/CMakeLists.txt @@ -19,6 +19,7 @@ add_llvm_unittest(UtilsTests CodeLayoutTest.cpp CodeMoverUtilsTest.cpp DebugifyTest.cpp + DebugSSAUpdaterTest.cpp FunctionComparatorTest.cpp IntegerDivisionTest.cpp LocalTest.cpp diff --git a/llvm/unittests/Transforms/Utils/DebugSSAUpdaterTest.cpp b/llvm/unittests/Transforms/Utils/DebugSSAUpdaterTest.cpp new file mode 100644 index 0000000000000..cc20aec580591 --- /dev/null +++ b/llvm/unittests/Transforms/Utils/DebugSSAUpdaterTest.cpp @@ -0,0 +1,219 @@ +//===- DebugSSAUpdater.cpp - Unit tests for debug variable tracking -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/DebugSSAUpdater.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugProgramInstruction.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/SourceMgr.h" +#include "gtest/gtest.h" + +using namespace llvm; + +static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + std::unique_ptr Mod = parseAssemblyString(IR, Err, C); + if (!Mod) + Err.print("DebugSSAUpdaterTests", errs()); + return Mod; +} + +namespace { + +// Verify that two conflicting live-in values result in no live-in range for a +// block. 
+TEST(DebugSSAUpdater, EmptyPHIRange) { + LLVMContext C; + + std::unique_ptr M = + parseIR(C, + R"(define i32 @foo(i32 %a, i1 %b) !dbg !7 { +entry: + #dbg_value(i32 %a, !6, !DIExpression(), !10) + br i1 %b, label %if.then, label %if.else, !dbg !11 + +if.then: + %c = add i32 %a, 10, !dbg !12 + #dbg_value(i32 %c, !6, !DIExpression(), !13) + br label %exit, !dbg !14 + +if.else: + %d = mul i32 %a, 3, !dbg !15 + #dbg_value(i32 %d, !6, !DIExpression(), !16) + br label %exit, !dbg !17 + +exit: + %res = phi i32 [ %c, %if.then ], [ %d, %if.else ], !dbg !18 + ret i32 %res, !dbg !19 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_17, file: !1, producer: "clang version 20.0.0") +!1 = !DIFile(filename: "test.cpp", directory: ".") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 20.0.0"} +!6 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 11, type: !8) +!7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 10, type: !9, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!9 = !DISubroutineType(types: !2) +!10 = !DILocation(line: 10, scope: !7) +!11 = !DILocation(line: 11, scope: !7) +!12 = !DILocation(line: 12, scope: !7) +!13 = !DILocation(line: 13, scope: !7) +!14 = !DILocation(line: 14, scope: !7) +!15 = !DILocation(line: 15, scope: !7) +!16 = !DILocation(line: 16, scope: !7) +!17 = !DILocation(line: 17, scope: !7) +!18 = !DILocation(line: 18, scope: !7) +!19 = !DILocation(line: 19, scope: !7) +)"); + + Function *Foo = &*M->begin(); + DebugVariableAggregate VarA(cast( + Foo->begin()->begin()->getDbgRecordRange().begin())); + DbgValueRangeTable DbgValueRanges; + DbgValueRanges.addVariable(Foo, VarA); + BasicBlock *ExitBlock = &Foo->back(); + // We should have 5 ranges: 1 in the 
entry block, and 2 in each `if` block, + // while there should be no range for the exit block. + EXPECT_EQ(DbgValueRanges.getVariableRanges(VarA).size(), 5u); + EXPECT_TRUE(none_of(DbgValueRanges.getVariableRanges(VarA), + [&](DbgRangeEntry VarRange) { + return VarRange.Start->getParent() == ExitBlock; + })); +} + +// Verify that we correctly set live-in variable values through loops. +TEST(DebugSSAUpdater, LoopPHI) { + LLVMContext C; + + std::unique_ptr M = + parseIR(C, + R"(define i32 @foo(i32 %a, i32 %max) !dbg !7 { +entry: + #dbg_value(i32 %a, !6, !DIExpression(), !10) + %cond.entry = icmp slt i32 %a, %max, !dbg !11 + br i1 %cond.entry, label %loop, label %exit, !dbg !12 + +loop: + %loop.a = phi i32 [ %a, %entry ], [ %inc, %loop ] + %inc = add i32 %loop.a, 1, !dbg !13 + %cond.loop = icmp slt i32 %inc, %max, !dbg !14 + br i1 %cond.loop, label %loop, label %exit, !dbg !15 + +exit: + %res = phi i32 [ %a, %entry ], [ %loop.a, %loop ] + ret i32 %res, !dbg !16 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_17, file: !1, producer: "clang version 20.0.0") +!1 = !DIFile(filename: "test.cpp", directory: ".") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 20.0.0"} +!6 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 11, type: !8) +!7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 10, type: !9, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!9 = !DISubroutineType(types: !2) +!10 = !DILocation(line: 10, scope: !7) +!11 = !DILocation(line: 11, scope: !7) +!12 = !DILocation(line: 12, scope: !7) +!13 = !DILocation(line: 13, scope: !7) +!14 = !DILocation(line: 14, scope: !7) +!15 = !DILocation(line: 15, scope: !7) +!16 = !DILocation(line: 16, scope: !7) +)"); + + Function 
*Foo = &*M->begin(); + DebugVariableAggregate VarA(cast( + Foo->begin()->begin()->getDbgRecordRange().begin())); + DbgValueRangeTable DbgValueRanges; + DbgValueRanges.addVariable(Foo, VarA); + // We should have 3 ranges: 1 in the entry block, and 1 live-in entry for each + // of the loops. + EXPECT_EQ(DbgValueRanges.getVariableRanges(VarA).size(), 3u); + EXPECT_TRUE( + all_of(DbgValueRanges.getVariableRanges(VarA), + [&](DbgRangeEntry VarRange) { return !VarRange.Value.IsUndef; })); +} + +// Verify that when a variable has only undef debug values, it has no live +// ranges. +TEST(DebugSSAUpdater, AllUndefVar) { + LLVMContext C; + + std::unique_ptr M = + parseIR(C, + R"(define i32 @foo(i32 %a, i1 %b) !dbg !7 { +entry: + #dbg_value(i32 poison, !6, !DIExpression(), !10) + br i1 %b, label %if.then, label %if.else, !dbg !11 + +if.then: + %c = add i32 %a, 10, !dbg !12 + #dbg_value(i32 poison, !6, !DIExpression(), !13) + br label %exit, !dbg !14 + +if.else: + %d = mul i32 %a, 3, !dbg !15 + #dbg_value(i32 poison, !6, !DIExpression(), !16) + br label %exit, !dbg !17 + +exit: + %res = phi i32 [ %c, %if.then ], [ %d, %if.else ], !dbg !18 + ret i32 %res, !dbg !19 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_17, file: !1, producer: "clang version 20.0.0") +!1 = !DIFile(filename: "test.cpp", directory: ".") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 20.0.0"} +!6 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 11, type: !8) +!7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 10, type: !9, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!9 = !DISubroutineType(types: !2) +!10 = !DILocation(line: 10, scope: !7) +!11 = !DILocation(line: 11, scope: !7) +!12 = 
!DILocation(line: 12, scope: !7) +!13 = !DILocation(line: 13, scope: !7) +!14 = !DILocation(line: 14, scope: !7) +!15 = !DILocation(line: 15, scope: !7) +!16 = !DILocation(line: 16, scope: !7) +!17 = !DILocation(line: 17, scope: !7) +!18 = !DILocation(line: 18, scope: !7) +!19 = !DILocation(line: 19, scope: !7) +)"); + + Function *Foo = &*M->begin(); + DebugVariableAggregate VarA(cast( + Foo->begin()->begin()->getDbgRecordRange().begin())); + DbgValueRangeTable DbgValueRanges; + DbgValueRanges.addVariable(Foo, VarA); + // There should be no variable ranges emitted for a variable that has only + // undef dbg_values. + EXPECT_EQ(DbgValueRanges.getVariableRanges(VarA).size(), 0u); +} +} // namespace diff --git a/llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp b/llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp index c22a3582bee86..ce002e9239960 100644 --- a/llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp +++ b/llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp @@ -142,3 +142,56 @@ TEST(LoopUtils, IsKnownNonPositiveInLoopTest) { EXPECT_EQ(isKnownNonPositiveInLoop(ArgSCEV, L, SE), true); }); } + +// The inner and outer loop here share a latch. Because any loop metadata must +// be attached to that latch, loop metadata cannot distinguish between the two +// loops. Until that problem is solved (by moving loop metadata to loops' +// header blocks instead), getLoopEstimatedTripCount and +// setLoopEstimatedTripCount must refuse to operate on at least one of the two +// loops. They choose to reject the outer loop here because the latch does not +// exit it. 
+TEST(LoopUtils, nestedLoopSharedLatchEstimatedTripCount) { + LLVMContext C; + std::unique_ptr M = + parseIR(C, "declare i1 @f()\n" + "declare i1 @g()\n" + "define void @foo() {\n" + "entry:\n" + " br label %outer\n" + "outer:\n" + " %c0 = call i1 @f()" + " br i1 %c0, label %inner, label %exit, !prof !0\n" + "inner:\n" + " %c1 = call i1 @g()" + " br i1 %c1, label %inner, label %outer, !prof !1\n" + "exit:\n" + " ret void\n" + "}\n" + "!0 = !{!\"branch_weights\", i32 100, i32 1}\n" + "!1 = !{!\"branch_weights\", i32 4, i32 1}\n" + "\n"); + + run(*M, "foo", + [&](Function &F, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI) { + assert(LI.end() - LI.begin() == 1 && "Expected one outer loop"); + Loop *Outer = *LI.begin(); + assert(Outer->end() - Outer->begin() == 1 && "Expected one inner loop"); + Loop *Inner = *Outer->begin(); + + // Even before llvm.loop.estimated_trip_count is added to either loop, + // getLoopEstimatedTripCount rejects the outer loop. + EXPECT_EQ(getLoopEstimatedTripCount(Inner), 5); + EXPECT_EQ(getLoopEstimatedTripCount(Outer), std::nullopt); + + // setLoopEstimatedTripCount for the inner loop does not affect + // getLoopEstimatedTripCount for the outer loop. + EXPECT_EQ(setLoopEstimatedTripCount(Inner, 100), true); + EXPECT_EQ(getLoopEstimatedTripCount(Inner), 100); + EXPECT_EQ(getLoopEstimatedTripCount(Outer), std::nullopt); + + // setLoopEstimatedTripCount rejects the outer loop. 
+ EXPECT_EQ(setLoopEstimatedTripCount(Outer, 999), false); + EXPECT_EQ(getLoopEstimatedTripCount(Inner), 100); + EXPECT_EQ(getLoopEstimatedTripCount(Outer), std::nullopt); + }); +} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp index e38b4fad80b0e..582094bed3ef7 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp @@ -51,5 +51,29 @@ TEST_F(VPPatternMatchTest, ScalarIVSteps) { m_SpecificInt(2), m_Specific(VF)))); } +TEST_F(VPPatternMatchTest, GetElementPtr) { + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB = Plan.createVPBasicBlock("entry"); + VPBuilder Builder(VPBB); + + IntegerType *I64Ty = IntegerType::get(C, 64); + VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1)); + VPValue *Two = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 2)); + VPValue *Ptr = + Plan.getOrAddLiveIn(Constant::getNullValue(PointerType::get(C, 0))); + + VPInstruction *PtrAdd = Builder.createPtrAdd(Ptr, One); + VPInstruction *WidePtrAdd = Builder.createWidePtrAdd(Ptr, Two); + + using namespace VPlanPatternMatch; + ASSERT_TRUE( + match(PtrAdd, m_GetElementPtr(m_Specific(Ptr), m_SpecificInt(1)))); + ASSERT_FALSE( + match(PtrAdd, m_GetElementPtr(m_Specific(Ptr), m_SpecificInt(2)))); + ASSERT_TRUE( + match(WidePtrAdd, m_GetElementPtr(m_Specific(Ptr), m_SpecificInt(2)))); + ASSERT_FALSE( + match(WidePtrAdd, m_GetElementPtr(m_Specific(Ptr), m_SpecificInt(1)))); +} } // namespace } // namespace llvm diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 09d29b8522f54..a702838afe463 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -629,21 +629,24 @@ static constexpr uint16_t IntrinsicsToAttributesMap[] = {)"; UniqAttributes.try_emplace(&Int, ID); } - constexpr uint16_t NoFunctionAttrsID = 
255; - if (UniqAttributes.size() > 256) - PrintFatalError("Too many unique argument attributes for table!"); - // Note, ID 255 is used to indicate no function attributes. - if (UniqFnAttributes.size() > 255) - PrintFatalError("Too many unique function attributes for table!"); - - // Assign a 16-bit packed ID for each intrinsic. The lower 8-bits will be its - // "argument attribute ID" (index in UniqAttributes) and upper 8 bits will be + const uint8_t UniqAttributesBitSize = Log2_32_Ceil(UniqAttributes.size() + 1); + // Note, ID `-1` is used to indicate no function attributes. + const uint8_t UniqFnAttributesBitSize = + Log2_32_Ceil(UniqFnAttributes.size() + 2); + const uint16_t NoFunctionAttrsID = + maskTrailingOnes(UniqFnAttributesBitSize); + if (UniqAttributesBitSize + UniqFnAttributesBitSize > 16) + PrintFatalError( + "More than 16 bits are used for IntrinsicsToAttributesMap's entry!"); + + // Assign a 16-bit packed ID for each intrinsic. The lower bits will be its + // "argument attribute ID" (index in UniqAttributes) and upper bits will be // its "function attribute ID" (index in UniqFnAttributes). for (const CodeGenIntrinsic &Int : Ints) { uint16_t FnAttrIndex = hasFnAttributes(Int) ? 
UniqFnAttributes[&Int] : NoFunctionAttrsID; - OS << formatv("\n {} << 8 | {}, // {}", FnAttrIndex, - UniqAttributes[&Int], Int.Name); + OS << formatv("\n {} << {} | {}, // {}", FnAttrIndex, + UniqAttributesBitSize, UniqAttributes[&Int], Int.Name); } OS << R"( @@ -749,8 +752,8 @@ AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id, return AttributeList(); uint16_t PackedID = IntrinsicsToAttributesMap[id - 1]; - uint8_t FnAttrID = PackedID >> 8; - uint8_t ArgAttrID = PackedID & 0xFF; + uint16_t FnAttrID = PackedID >> ({}); + uint16_t ArgAttrID = PackedID & ({}); using PairTy = std::pair; alignas(PairTy) char ASStorage[sizeof(PairTy) * {}]; PairTy *AS = reinterpret_cast(ASStorage); @@ -772,10 +775,20 @@ AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id, } return AttributeList::get(C, ArrayRef(AS, NumAttrs)); } + +AttributeSet Intrinsic::getFnAttributes(LLVMContext &C, ID id) { + if (id == 0) + return AttributeSet(); + uint16_t PackedID = IntrinsicsToAttributesMap[id - 1]; + uint16_t FnAttrID = PackedID >> ({}); + return getIntrinsicFnAttributeSet(C, FnAttrID); +} #endif // GET_INTRINSIC_ATTRIBUTES )", - MaxNumAttrs, NoFunctionAttrsID); + UniqAttributesBitSize, + maskTrailingOnes(UniqAttributesBitSize), MaxNumAttrs, + NoFunctionAttrsID, UniqAttributesBitSize); } void IntrinsicEmitter::EmitIntrinsicToBuiltinMap( diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index 6a57ef6c90b36..a61ba54d3ffd2 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" @@ -48,30 +49,29 @@ using namespace llvm; namespace { class CodeEmitterGen { - const RecordKeeper &Records; + const RecordKeeper &RK; + CodeGenTarget Target; + 
const CodeGenHwModes &CGH; public: - CodeEmitterGen(const RecordKeeper &R) : Records(R) {} + explicit CodeEmitterGen(const RecordKeeper &RK); void run(raw_ostream &O); private: int getVariableBit(const std::string &VarName, const BitsInit *BI, int Bit); - std::pair - getInstructionCases(const Record *R, const CodeGenTarget &Target); + std::pair getInstructionCases(const Record *R); void addInstructionCasesForEncoding(const Record *R, const Record *EncodingDef, - const CodeGenTarget &Target, std::string &Case, std::string &BitOffsetCase); bool addCodeToMergeInOperand(const Record *R, const BitsInit *BI, const std::string &VarName, std::string &Case, - std::string &BitOffsetCase, - const CodeGenTarget &Target); + std::string &BitOffsetCase); void emitInstructionBaseValues( raw_ostream &O, ArrayRef NumberedInstructions, - const CodeGenTarget &Target, unsigned HwMode = DefaultMode); + unsigned HwMode = DefaultMode); void emitCaseMap(raw_ostream &O, const std::map> &CaseMap); @@ -102,8 +102,7 @@ bool CodeEmitterGen::addCodeToMergeInOperand(const Record *R, const BitsInit *BI, const std::string &VarName, std::string &Case, - std::string &BitOffsetCase, - const CodeGenTarget &Target) { + std::string &BitOffsetCase) { CodeGenInstruction &CGI = Target.getInstruction(R); // Determine if VarName actually contributes to the Inst encoding. @@ -141,58 +140,28 @@ bool CodeEmitterGen::addCodeToMergeInOperand(const Record *R, StringRef EncoderMethodName = CGI.Operands[SO.first].EncoderMethodNames[SO.second]; - if (UseAPInt) - Case += " op.clearAllBits();\n"; + raw_string_ostream OS(Case); + indent Indent(6); + + OS << Indent << "// op: " << VarName << '\n'; - Case += " // op: " + VarName + "\n"; + if (UseAPInt) + OS << Indent << "op.clearAllBits();\n"; - // If the source operand has a custom encoder, use it. 
if (!EncoderMethodName.empty()) { - raw_string_ostream CaseOS(Case); - CaseOS << indent(6); if (UseAPInt) - CaseOS << EncoderMethodName << "(MI, " << OpIdx << ", op"; + OS << Indent << EncoderMethodName << "(MI, " << OpIdx + << ", op, Fixups, STI);\n"; else - CaseOS << "op = " << EncoderMethodName << "(MI, " << OpIdx; - CaseOS << ", Fixups, STI);\n"; + OS << Indent << "op = " << EncoderMethodName << "(MI, " << OpIdx + << ", Fixups, STI);\n"; } else { - if (UseAPInt) { - Case += - " getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; - Case += ", op, Fixups, STI"; - } else { - Case += " op = getMachineOpValue(MI, MI.getOperand(" + - utostr(OpIdx) + ")"; - Case += ", Fixups, STI"; - } - Case += ");\n"; - } - - // Precalculate the number of lits this variable contributes to in the - // operand. If there is a single lit (consecutive range of bits) we can use a - // destructive sequence on APInt that reduces memory allocations. - int NumOperandLits = 0; - for (int TmpBit = Bit; TmpBit >= 0;) { - int VarBit = getVariableBit(VarName, BI, TmpBit); - - // If this bit isn't from a variable, skip it. - if (VarBit == -1) { - --TmpBit; - continue; - } - - // Figure out the consecutive range of bits covered by this operand, in - // order to generate better encoding code. 
- int BeginVarBit = VarBit; - int N = 1; - for (--TmpBit; TmpBit >= 0;) { - VarBit = getVariableBit(VarName, BI, TmpBit); - if (VarBit == -1 || VarBit != (BeginVarBit - N)) - break; - ++N; - --TmpBit; - } - ++NumOperandLits; + if (UseAPInt) + OS << Indent << "getMachineOpValue(MI, MI.getOperand(" << OpIdx + << "), op, Fixups, STI);\n"; + else + OS << Indent << "op = getMachineOpValue(MI, MI.getOperand(" << OpIdx + << "), Fixups, STI);\n"; } unsigned BitOffset = -1; @@ -218,52 +187,25 @@ bool CodeEmitterGen::addCodeToMergeInOperand(const Record *R, --Bit; } - std::string MaskStr; - int OpShift; - unsigned LoBit = BeginVarBit - N + 1; - unsigned HiBit = LoBit + N; unsigned LoInstBit = BeginInstBit - N + 1; BitOffset = LoInstBit; if (UseAPInt) { - std::string ExtractStr; - if (N >= 64) { - ExtractStr = "op.extractBits(" + itostr(HiBit - LoBit) + ", " + - itostr(LoBit) + ")"; - Case += " Value.insertBits(" + ExtractStr + ", " + - itostr(LoInstBit) + ");\n"; - } else { - ExtractStr = "op.extractBitsAsZExtValue(" + itostr(HiBit - LoBit) + - ", " + itostr(LoBit) + ")"; - Case += " Value.insertBits(" + ExtractStr + ", " + - itostr(LoInstBit) + ", " + itostr(HiBit - LoBit) + ");\n"; - } + if (N > 64) + OS << Indent << "Value.insertBits(op.extractBits(" << N << ", " << LoBit + << "), " << LoInstBit << ");\n"; + else + OS << Indent << "Value.insertBits(op.extractBitsAsZExtValue(" << N + << ", " << LoBit << "), " << LoInstBit << ", " << N << ");\n"; } else { - uint64_t OpMask = ~(uint64_t)0 >> (64 - N); - OpShift = BeginVarBit - N + 1; - OpMask <<= OpShift; - MaskStr = "UINT64_C(" + utostr(OpMask) + ")"; - OpShift = BeginInstBit - BeginVarBit; - - if (NumOperandLits == 1) { - Case += " op &= " + MaskStr + ";\n"; - if (OpShift > 0) { - Case += " op <<= " + itostr(OpShift) + ";\n"; - } else if (OpShift < 0) { - Case += " op >>= " + itostr(-OpShift) + ";\n"; - } - Case += " Value |= op;\n"; - } else { - if (OpShift > 0) { - Case += " Value |= (op & " + MaskStr + ") << " + - 
itostr(OpShift) + ";\n"; - } else if (OpShift < 0) { - Case += " Value |= (op & " + MaskStr + ") >> " + - itostr(-OpShift) + ";\n"; - } else { - Case += " Value |= (op & " + MaskStr + ");\n"; - } - } + uint64_t OpMask = maskTrailingOnes(N) << LoBit; + OS << Indent << "Value |= (op & " << format_hex(OpMask, 0) << ')'; + int OpShift = BeginInstBit - BeginVarBit; + if (OpShift > 0) + OS << " << " << OpShift; + else if (OpShift < 0) + OS << " >> " << -OpShift; + OS << ";\n"; } } @@ -277,8 +219,7 @@ bool CodeEmitterGen::addCodeToMergeInOperand(const Record *R, } std::pair -CodeEmitterGen::getInstructionCases(const Record *R, - const CodeGenTarget &Target) { +CodeEmitterGen::getInstructionCases(const Record *R) { std::string Case, BitOffsetCase; auto Append = [&](const std::string &S) { @@ -287,8 +228,7 @@ CodeEmitterGen::getInstructionCases(const Record *R, }; if (const Record *RV = R->getValueAsOptionalDef("EncodingInfos")) { - const CodeGenHwModes &HWM = Target.getHwModes(); - EncodingInfoByHwMode EBM(RV, HWM); + EncodingInfoByHwMode EBM(RV, CGH); // Invoke the interface to obtain the HwMode ID controlling the // EncodingInfo for the current subtarget. 
This interface will @@ -304,7 +244,7 @@ CodeEmitterGen::getInstructionCases(const Record *R, " case " + itostr(DefaultMode) + ": InstBitsByHw = InstBits"; } else { Case += " case " + itostr(ModeId) + ": InstBitsByHw = InstBits_" + - HWM.getMode(ModeId).Name.str(); + CGH.getMode(ModeId).Name.str(); } Case += "; break;\n"; } @@ -326,20 +266,20 @@ CodeEmitterGen::getInstructionCases(const Record *R, Append(" default: llvm_unreachable(\"Unhandled HwMode\");\n"); for (auto &[ModeId, Encoding] : EBM) { Append(" case " + itostr(ModeId) + ": {\n"); - addInstructionCasesForEncoding(R, Encoding, Target, Case, BitOffsetCase); + addInstructionCasesForEncoding(R, Encoding, Case, BitOffsetCase); Append(" break;\n"); Append(" }\n"); } Append(" }\n"); return {std::move(Case), std::move(BitOffsetCase)}; } - addInstructionCasesForEncoding(R, R, Target, Case, BitOffsetCase); + addInstructionCasesForEncoding(R, R, Case, BitOffsetCase); return {std::move(Case), std::move(BitOffsetCase)}; } void CodeEmitterGen::addInstructionCasesForEncoding( - const Record *R, const Record *EncodingDef, const CodeGenTarget &Target, - std::string &Case, std::string &BitOffsetCase) { + const Record *R, const Record *EncodingDef, std::string &Case, + std::string &BitOffsetCase) { const BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst"); // Loop over all of the fields in the instruction, determining which are the @@ -354,8 +294,8 @@ void CodeEmitterGen::addInstructionCasesForEncoding( if (RV.isNonconcreteOK() || RV.getValue()->isComplete()) continue; - Success &= addCodeToMergeInOperand(R, BI, RV.getName().str(), Case, - BitOffsetCase, Target); + Success &= + addCodeToMergeInOperand(R, BI, RV.getName().str(), Case, BitOffsetCase); } // Avoid empty switches. 
if (BitOffsetCase.size() == BitOffsetCaseSizeBeforeLoop) @@ -389,19 +329,18 @@ static void emitInstBits(raw_ostream &OS, const APInt &Bits) { void CodeEmitterGen::emitInstructionBaseValues( raw_ostream &O, ArrayRef NumberedInstructions, - const CodeGenTarget &Target, unsigned HwMode) { - const CodeGenHwModes &HWM = Target.getHwModes(); + unsigned HwMode) { if (HwMode == DefaultMode) O << " static const uint64_t InstBits[] = {\n"; else - O << " static const uint64_t InstBits_" - << HWM.getModeName(HwMode, /*IncludeDefault=*/true) << "[] = {\n"; + O << " static const uint64_t InstBits_" << CGH.getModeName(HwMode) + << "[] = {\n"; for (const CodeGenInstruction *CGI : NumberedInstructions) { const Record *R = CGI->TheDef; const Record *EncodingDef = R; if (const Record *RV = R->getValueAsOptionalDef("EncodingInfos")) { - EncodingInfoByHwMode EBM(RV, HWM); + EncodingInfoByHwMode EBM(RV, CGH); if (EBM.hasMode(HwMode)) { EncodingDef = EBM.get(HwMode); } else { @@ -447,29 +386,29 @@ void CodeEmitterGen::emitCaseMap( } } +CodeEmitterGen::CodeEmitterGen(const RecordKeeper &RK) + : RK(RK), Target(RK), CGH(Target.getHwModes()) { + // For little-endian instruction bit encodings, reverse the bit order. + Target.reverseBitsForLittleEndianEncoding(); +} + void CodeEmitterGen::run(raw_ostream &O) { emitSourceFileHeader("Machine Code Emitter", O); - CodeGenTarget Target(Records); - - // For little-endian instruction bit encodings, reverse the bit order - Target.reverseBitsForLittleEndianEncoding(); - ArrayRef EncodedInstructions = Target.getTargetNonPseudoInstructions(); if (Target.hasVariableLengthEncodings()) { - emitVarLenCodeEmitter(Records, O); + emitVarLenCodeEmitter(RK, O); return; } - const CodeGenHwModes &HWM = Target.getHwModes(); // The set of HwModes used by instruction encodings. 
std::set HwModes; BitWidth = 0; for (const CodeGenInstruction *CGI : EncodedInstructions) { const Record *R = CGI->TheDef; if (const Record *RV = R->getValueAsOptionalDef("EncodingInfos")) { - EncodingInfoByHwMode EBM(RV, HWM); + EncodingInfoByHwMode EBM(RV, CGH); for (const auto &[Key, Value] : EBM) { const BitsInit *BI = Value->getValueAsBitsInit("Inst"); BitWidth = std::max(BitWidth, BI->getNumBits()); @@ -498,13 +437,13 @@ void CodeEmitterGen::run(raw_ostream &O) { } // Emit instruction base values - emitInstructionBaseValues(O, EncodedInstructions, Target, DefaultMode); + emitInstructionBaseValues(O, EncodedInstructions, DefaultMode); if (!HwModes.empty()) { // Emit table for instrs whose encodings are controlled by HwModes. for (unsigned HwMode : HwModes) { if (HwMode == DefaultMode) continue; - emitInstructionBaseValues(O, EncodedInstructions, Target, HwMode); + emitInstructionBaseValues(O, EncodedInstructions, HwMode); } // This pointer will be assigned to the HwMode table later. 
@@ -521,7 +460,7 @@ void CodeEmitterGen::run(raw_ostream &O) { std::string InstName = (R->getValueAsString("Namespace") + "::" + R->getName()).str(); std::string Case, BitOffsetCase; - std::tie(Case, BitOffsetCase) = getInstructionCases(R, Target); + std::tie(Case, BitOffsetCase) = getInstructionCases(R); CaseMap[Case].push_back(InstName); BitOffsetCaseMap[BitOffsetCase].push_back(std::move(InstName)); diff --git a/llvm/utils/TableGen/Common/CMakeLists.txt b/llvm/utils/TableGen/Common/CMakeLists.txt index 7342156980f35..66279a3ed3755 100644 --- a/llvm/utils/TableGen/Common/CMakeLists.txt +++ b/llvm/utils/TableGen/Common/CMakeLists.txt @@ -29,6 +29,7 @@ add_llvm_library(LLVMTableGenCommon STATIC OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_ CodeGenTarget.cpp DAGISelMatcher.cpp InfoByHwMode.cpp + InstructionEncoding.cpp OptEmitter.cpp PredicateExpander.cpp SubtargetFeatureInfo.cpp diff --git a/llvm/utils/TableGen/Common/InstructionEncoding.cpp b/llvm/utils/TableGen/Common/InstructionEncoding.cpp new file mode 100644 index 0000000000000..22163e1898333 --- /dev/null +++ b/llvm/utils/TableGen/Common/InstructionEncoding.cpp @@ -0,0 +1,429 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "InstructionEncoding.h" +#include "CodeGenInstruction.h" +#include "VarLenCodeEmitterGen.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/TableGen/Error.h" + +using namespace llvm; + +static std::string findOperandDecoderMethod(const Record *Record) { + std::string Decoder; + + const RecordVal *DecoderString = Record->getValue("DecoderMethod"); + const StringInit *String = + DecoderString ? 
dyn_cast(DecoderString->getValue()) : nullptr; + if (String) { + Decoder = String->getValue().str(); + if (!Decoder.empty()) + return Decoder; + } + + if (Record->isSubClassOf("RegisterOperand")) + // Allows use of a DecoderMethod in referenced RegisterClass if set. + return findOperandDecoderMethod(Record->getValueAsDef("RegClass")); + + if (Record->isSubClassOf("RegisterClass")) { + Decoder = "Decode" + Record->getName().str() + "RegisterClass"; + } else if (Record->isSubClassOf("PointerLikeRegClass")) { + Decoder = "DecodePointerLikeRegClass" + + utostr(Record->getValueAsInt("RegClassKind")); + } + + return Decoder; +} + +static OperandInfo getOpInfo(const Record *TypeRecord) { + const RecordVal *HasCompleteDecoderVal = + TypeRecord->getValue("hasCompleteDecoder"); + const BitInit *HasCompleteDecoderBit = + HasCompleteDecoderVal + ? dyn_cast(HasCompleteDecoderVal->getValue()) + : nullptr; + bool HasCompleteDecoder = + HasCompleteDecoderBit ? HasCompleteDecoderBit->getValue() : true; + + return OperandInfo(findOperandDecoderMethod(TypeRecord), HasCompleteDecoder); +} + +void InstructionEncoding::parseVarLenEncoding(const VarLenInst &VLI) { + InstBits = KnownBits(VLI.size()); + SoftFailMask = APInt(VLI.size(), 0); + + // Parse Inst field. 
+ unsigned I = 0; + for (const EncodingSegment &S : VLI) { + if (const auto *SegmentBits = dyn_cast(S.Value)) { + for (const Init *V : SegmentBits->getBits()) { + if (const auto *B = dyn_cast(V)) { + if (B->getValue()) + InstBits.One.setBit(I); + else + InstBits.Zero.setBit(I); + } + ++I; + } + } else if (const auto *B = dyn_cast(S.Value)) { + if (B->getValue()) + InstBits.One.setBit(I); + else + InstBits.Zero.setBit(I); + ++I; + } else { + I += S.BitWidth; + } + } + assert(I == VLI.size()); +} + +void InstructionEncoding::parseFixedLenEncoding( + const BitsInit &RecordInstBits) { + // For fixed length instructions, sometimes the `Inst` field specifies more + // bits than the actual size of the instruction, which is specified in `Size`. + // In such cases, we do some basic validation and drop the upper bits. + unsigned BitWidth = EncodingDef->getValueAsInt("Size") * 8; + unsigned InstNumBits = RecordInstBits.getNumBits(); + + // Returns true if all bits in `Bits` are zero or unset. + auto CheckAllZeroOrUnset = [&](ArrayRef Bits, + const RecordVal *Field) { + bool AllZeroOrUnset = llvm::all_of(Bits, [](const Init *Bit) { + if (const auto *BI = dyn_cast(Bit)) + return !BI->getValue(); + return isa(Bit); + }); + if (AllZeroOrUnset) + return; + PrintNote([Field](raw_ostream &OS) { Field->print(OS); }); + PrintFatalError(EncodingDef, Twine(Name) + ": Size is " + Twine(BitWidth) + + " bits, but " + Field->getName() + + " bits beyond that are not zero/unset"); + }; + + if (InstNumBits < BitWidth) + PrintFatalError(EncodingDef, Twine(Name) + ": Size is " + Twine(BitWidth) + + " bits, but Inst specifies only " + + Twine(InstNumBits) + " bits"); + + if (InstNumBits > BitWidth) { + // Ensure that all the bits beyond 'Size' are 0 or unset (i.e., carry no + // actual encoding). 
+ ArrayRef UpperBits = + RecordInstBits.getBits().drop_front(BitWidth); + const RecordVal *InstField = EncodingDef->getValue("Inst"); + CheckAllZeroOrUnset(UpperBits, InstField); + } + + ArrayRef ActiveInstBits = + RecordInstBits.getBits().take_front(BitWidth); + InstBits = KnownBits(BitWidth); + SoftFailMask = APInt(BitWidth, 0); + + // Parse Inst field. + for (auto [I, V] : enumerate(ActiveInstBits)) { + if (const auto *B = dyn_cast(V)) { + if (B->getValue()) + InstBits.One.setBit(I); + else + InstBits.Zero.setBit(I); + } + } + + // Parse SoftFail field. + const RecordVal *SoftFailField = EncodingDef->getValue("SoftFail"); + if (!SoftFailField) + return; + + const auto *SFBits = dyn_cast(SoftFailField->getValue()); + if (!SFBits || SFBits->getNumBits() != InstNumBits) { + PrintNote(EncodingDef->getLoc(), "in record"); + PrintFatalError(SoftFailField, + formatv("SoftFail field, if defined, must be " + "of the same type as Inst, which is bits<{}>", + InstNumBits)); + } + + if (InstNumBits > BitWidth) { + // Ensure that all upper bits of `SoftFail` are 0 or unset. 
+ ArrayRef UpperBits = SFBits->getBits().drop_front(BitWidth); + CheckAllZeroOrUnset(UpperBits, SoftFailField); + } + + ArrayRef ActiveSFBits = SFBits->getBits().take_front(BitWidth); + for (auto [I, V] : enumerate(ActiveSFBits)) { + if (const auto *B = dyn_cast(V); B && B->getValue()) { + if (!InstBits.Zero[I] && !InstBits.One[I]) { + PrintNote(EncodingDef->getLoc(), "in record"); + PrintError(SoftFailField, + formatv("SoftFail{{{0}} = 1 requires Inst{{{0}} " + "to be fully defined (0 or 1, not '?')", + I)); + } + SoftFailMask.setBit(I); + } + } +} + +void InstructionEncoding::parseVarLenOperands(const VarLenInst &VLI) { + SmallVector TiedTo; + + for (const auto &[Idx, Op] : enumerate(Inst->Operands)) { + if (Op.MIOperandInfo && Op.MIOperandInfo->getNumArgs() > 0) + for (auto *Arg : Op.MIOperandInfo->getArgs()) + Operands.push_back(getOpInfo(cast(Arg)->getDef())); + else + Operands.push_back(getOpInfo(Op.Rec)); + + int TiedReg = Op.getTiedRegister(); + TiedTo.push_back(-1); + if (TiedReg != -1) { + TiedTo[Idx] = TiedReg; + TiedTo[TiedReg] = Idx; + } + } + + unsigned CurrBitPos = 0; + for (const auto &EncodingSegment : VLI) { + unsigned Offset = 0; + StringRef OpName; + + if (const StringInit *SI = dyn_cast(EncodingSegment.Value)) { + OpName = SI->getValue(); + } else if (const DagInit *DI = dyn_cast(EncodingSegment.Value)) { + OpName = cast(DI->getArg(0))->getValue(); + Offset = cast(DI->getArg(2))->getValue(); + } + + if (!OpName.empty()) { + auto OpSubOpPair = Inst->Operands.parseOperandName(OpName); + unsigned OpIdx = Inst->Operands.getFlattenedOperandNumber(OpSubOpPair); + Operands[OpIdx].addField(CurrBitPos, EncodingSegment.BitWidth, Offset); + if (!EncodingSegment.CustomDecoder.empty()) + Operands[OpIdx].Decoder = EncodingSegment.CustomDecoder.str(); + + int TiedReg = TiedTo[OpSubOpPair.first]; + if (TiedReg != -1) { + unsigned OpIdx = Inst->Operands.getFlattenedOperandNumber( + {TiedReg, OpSubOpPair.second}); + Operands[OpIdx].addField(CurrBitPos, 
EncodingSegment.BitWidth, Offset); + } + } + + CurrBitPos += EncodingSegment.BitWidth; + } +} + +static void debugDumpRecord(const Record &Rec) { + // Dump the record, so we can see what's going on. + PrintNote([&Rec](raw_ostream &OS) { + OS << "Dumping record for previous error:\n"; + OS << Rec; + }); +} + +/// For an operand field named OpName: populate OpInfo.InitValue with the +/// constant-valued bit values, and OpInfo.Fields with the ranges of bits to +/// insert from the decoded instruction. +static void addOneOperandFields(const Record *EncodingDef, + const BitsInit &InstBits, + std::map &TiedNames, + const Record *OpRec, StringRef OpName, + OperandInfo &OpInfo) { + OpInfo.Name = OpName; + + // Find a field with the operand's name. + const RecordVal *OpEncodingField = EncodingDef->getValue(OpName); + + // If there is no such field, try tied operand's name. + if (!OpEncodingField) { + if (auto I = TiedNames.find(OpName); I != TiedNames.end()) + OpEncodingField = EncodingDef->getValue(I->second); + + // If still no luck, we're done with this operand. + if (!OpEncodingField) { + OpInfo.HasNoEncoding = true; + return; + } + } + + // Some or all bits of the operand may be required to be 0 or 1 depending + // on the instruction's encoding. Collect those bits. + if (const auto *OpBit = dyn_cast(OpEncodingField->getValue())) { + OpInfo.InitValue = OpBit->getValue(); + return; + } + if (const auto *OpBits = dyn_cast(OpEncodingField->getValue())) { + if (OpBits->getNumBits() == 0) { + if (OpInfo.Decoder.empty()) { + PrintError(EncodingDef->getLoc(), "operand '" + OpName + "' of type '" + + OpRec->getName() + + "' must have a decoder method"); + } + return; + } + for (unsigned I = 0; I < OpBits->getNumBits(); ++I) { + if (const auto *OpBit = dyn_cast(OpBits->getBit(I))) + OpInfo.InitValue = OpInfo.InitValue.value_or(0) | + static_cast(OpBit->getValue()) << I; + } + } + + // Find out where the variable bits of the operand are encoded. 
The bits don't + // have to be consecutive or in ascending order. For example, an operand could + // be encoded as follows: + // + // 7 6 5 4 3 2 1 0 + // {1, op{5}, op{2}, op{1}, 0, op{4}, op{3}, ?} + // + // In this example the operand is encoded in three segments: + // + // Base Width Offset + // op{2...1} 4 2 1 + // op{4...3} 1 2 3 + // op{5} 6 1 5 + // + for (unsigned I = 0, J = 0; I != InstBits.getNumBits(); I = J) { + const VarInit *Var; + unsigned Offset = 0; + for (; J != InstBits.getNumBits(); ++J) { + const Init *BitJ = InstBits.getBit(J); + if (const auto *VBI = dyn_cast(BitJ)) { + Var = dyn_cast(VBI->getBitVar()); + if (I == J) + Offset = VBI->getBitNum(); + else if (VBI->getBitNum() != Offset + J - I) + break; + } else { + Var = dyn_cast(BitJ); + } + if (!Var || + (Var->getName() != OpName && Var->getName() != TiedNames[OpName])) + break; + } + if (I == J) + ++J; + else + OpInfo.addField(I, J - I, Offset); + } +} + +void InstructionEncoding::parseFixedLenOperands(const BitsInit &Bits) { + // Search for tied operands, so that we can correctly instantiate + // operands that are not explicitly represented in the encoding. + std::map TiedNames; + for (const auto &Op : Inst->Operands) { + for (const auto &[J, CI] : enumerate(Op.Constraints)) { + if (!CI.isTied()) + continue; + std::pair SO = + Inst->Operands.getSubOperandNumber(CI.getTiedOperand()); + StringRef TiedName = Inst->Operands[SO.first].SubOpNames[SO.second]; + if (TiedName.empty()) + TiedName = Inst->Operands[SO.first].Name; + StringRef MyName = Op.SubOpNames[J]; + if (MyName.empty()) + MyName = Op.Name; + + TiedNames[MyName] = TiedName; + TiedNames[TiedName] = MyName; + } + } + + // For each operand, see if we can figure out where it is encoded. + for (const CGIOperandList::OperandInfo &Op : Inst->Operands) { + // Lookup the decoder method and construct a new OperandInfo to hold our + // result. + OperandInfo OpInfo = getOpInfo(Op.Rec); + + // If we have named sub-operands... 
+ if (Op.MIOperandInfo && !Op.SubOpNames[0].empty()) { + // Then there should not be a custom decoder specified on the top-level + // type. + if (!OpInfo.Decoder.empty()) { + PrintError(EncodingDef, + "DecoderEmitter: operand \"" + Op.Name + "\" has type \"" + + Op.Rec->getName() + + "\" with a custom DecoderMethod, but also named " + "sub-operands."); + continue; + } + + // Decode each of the sub-ops separately. + for (auto [SubOpName, SubOp] : + zip_equal(Op.SubOpNames, Op.MIOperandInfo->getArgs())) { + const Record *SubOpRec = cast(SubOp)->getDef(); + OperandInfo SubOpInfo = getOpInfo(SubOpRec); + addOneOperandFields(EncodingDef, Bits, TiedNames, SubOpRec, SubOpName, + SubOpInfo); + Operands.push_back(std::move(SubOpInfo)); + } + continue; + } + + // Otherwise, if we have an operand with sub-operands, but they aren't + // named... + if (Op.MIOperandInfo && OpInfo.Decoder.empty()) { + // If we have sub-ops, we'd better have a custom decoder. + // (Otherwise we don't know how to populate them properly...) + if (Op.MIOperandInfo->getNumArgs()) { + PrintError(EncodingDef, + "DecoderEmitter: operand \"" + Op.Name + + "\" has non-empty MIOperandInfo, but doesn't " + "have a custom decoder!"); + debugDumpRecord(*EncodingDef); + continue; + } + } + + addOneOperandFields(EncodingDef, Bits, TiedNames, Op.Rec, Op.Name, OpInfo); + Operands.push_back(std::move(OpInfo)); + } +} + +InstructionEncoding::InstructionEncoding(const Record *EncodingDef, + const CodeGenInstruction *Inst) + : EncodingDef(EncodingDef), Inst(Inst) { + const Record *InstDef = Inst->TheDef; + + // Give this encoding a name. 
+ if (EncodingDef != InstDef) + Name = (EncodingDef->getName() + Twine(':')).str(); + Name.append(InstDef->getName()); + + DecoderNamespace = EncodingDef->getValueAsString("DecoderNamespace"); + DecoderMethod = EncodingDef->getValueAsString("DecoderMethod"); + if (!DecoderMethod.empty()) + HasCompleteDecoder = EncodingDef->getValueAsBit("hasCompleteDecoder"); + + const RecordVal *InstField = EncodingDef->getValue("Inst"); + if (const auto *DI = dyn_cast(InstField->getValue())) { + VarLenInst VLI(DI, InstField); + parseVarLenEncoding(VLI); + // If the encoding has a custom decoder, don't bother parsing the operands. + if (DecoderMethod.empty()) + parseVarLenOperands(VLI); + } else { + const auto *BI = cast(InstField->getValue()); + parseFixedLenEncoding(*BI); + // If the encoding has a custom decoder, don't bother parsing the operands. + if (DecoderMethod.empty()) + parseFixedLenOperands(*BI); + } + + if (DecoderMethod.empty()) { + // A generated decoder is always successful if none of the operand + // decoders can fail (all are always successful). + HasCompleteDecoder = all_of(Operands, [](const OperandInfo &Op) { + // By default, a generated operand decoder is assumed to always succeed. + // This can be overridden by the user. + return Op.Decoder.empty() || Op.HasCompleteDecoder; + }); + } +} diff --git a/llvm/utils/TableGen/Common/InstructionEncoding.h b/llvm/utils/TableGen/Common/InstructionEncoding.h new file mode 100644 index 0000000000000..40c89dd4c6f2d --- /dev/null +++ b/llvm/utils/TableGen/Common/InstructionEncoding.h @@ -0,0 +1,150 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UTILS_TABLEGEN_COMMON_INSTRUCTIONENCODING_H +#define LLVM_UTILS_TABLEGEN_COMMON_INSTRUCTIONENCODING_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/KnownBits.h" +#include +#include +#include + +namespace llvm { + +class BitsInit; +class CodeGenInstruction; +class Record; +class RecordVal; +class VarLenInst; + +// Represents a span of bits in the instruction encoding that's based on a span +// of bits in an operand's encoding. +// +// Width is the width of the span. +// Base is the starting position of that span in the instruction encoding. +// Offset if the starting position of that span in the operand's encoding. +// That is, bits {Base + Width - 1, Base} in the instruction encoding form +// bits {Offset + Width - 1, Offset} in the operands encoding. +struct EncodingField { + unsigned Base, Width, Offset; + EncodingField(unsigned B, unsigned W, unsigned O) + : Base(B), Width(W), Offset(O) {} +}; + +struct OperandInfo { + StringRef Name; + bool HasNoEncoding = false; + std::vector Fields; + std::string Decoder; + bool HasCompleteDecoder; + std::optional InitValue; + + OperandInfo(std::string D, bool HCD) : Decoder(D), HasCompleteDecoder(HCD) {} + + void addField(unsigned Base, unsigned Width, unsigned Offset) { + Fields.emplace_back(Base, Width, Offset); + } + + ArrayRef fields() const { return Fields; } +}; + +/// Represents a parsed InstructionEncoding record or a record derived from it. +class InstructionEncoding { + /// The Record this encoding originates from. + const Record *EncodingDef; + + /// The instruction this encoding is for. + const CodeGenInstruction *Inst; + + /// The name of this encoding (for debugging purposes). + std::string Name; + + /// The namespace in which this encoding exists. 
+ StringRef DecoderNamespace; + + /// Known bits of this encoding. This is the value of the `Inst` field + /// with any variable references replaced with '?'. + KnownBits InstBits; + + /// Mask of bits that should be considered unknown during decoding. + /// This is the value of the `SoftFail` field. + APInt SoftFailMask; + + /// The name of the function to use for decoding. May be an empty string, + /// meaning the decoder is generated. + StringRef DecoderMethod; + + /// Whether the custom decoding function always succeeds. If a custom decoder + /// function is specified, the value is taken from the target description, + /// otherwise it is inferred. + bool HasCompleteDecoder; + + /// Information about the operands' contribution to this encoding. + SmallVector Operands; + +public: + InstructionEncoding(const Record *EncodingDef, + const CodeGenInstruction *Inst); + + /// Returns the Record this encoding originates from. + const Record *getRecord() const { return EncodingDef; } + + /// Returns the instruction this encoding is for. + const CodeGenInstruction *getInstruction() const { return Inst; } + + /// Returns the name of this encoding, for debugging purposes. + StringRef getName() const { return Name; } + + /// Returns the namespace in which this encoding exists. + StringRef getDecoderNamespace() const { return DecoderNamespace; } + + /// Returns the size of this encoding, in bits. + unsigned getBitWidth() const { return InstBits.getBitWidth(); } + + /// Returns the known bits of this encoding. + const KnownBits &getInstBits() const { return InstBits; } + + /// Returns a mask of bits that should be considered unknown during decoding. + const APInt &getSoftFailMask() const { return SoftFailMask; } + + /// Returns the known bits of this encoding that must match for + /// successful decoding. + KnownBits getMandatoryBits() const { + KnownBits EncodingBits = InstBits; + // Mark all bits that are allowed to change according to SoftFail mask + // as unknown. 
+ EncodingBits.Zero &= ~SoftFailMask; + EncodingBits.One &= ~SoftFailMask; + return EncodingBits; + } + + /// Returns the name of the function to use for decoding, or an empty string + /// if the decoder is generated. + StringRef getDecoderMethod() const { return DecoderMethod; } + + /// Returns whether the decoder (either generated or specified by the user) + /// always succeeds. + bool hasCompleteDecoder() const { return HasCompleteDecoder; } + + /// Returns information about the operands' contribution to this encoding. + ArrayRef getOperands() const { return Operands; } + +private: + void parseVarLenEncoding(const VarLenInst &VLI); + void parseFixedLenEncoding(const BitsInit &RecordInstBits); + + void parseVarLenOperands(const VarLenInst &VLI); + void parseFixedLenOperands(const BitsInit &Bits); +}; + +} // namespace llvm + +#endif // LLVM_UTILS_TABLEGEN_COMMON_INSTRUCTIONENCODING_H diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp index bd8af32ee5096..a426bf4ef4b77 100644 --- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp +++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp @@ -190,8 +190,9 @@ void SubtargetFeatureInfo::emitMCPredicateCheck( bool ParenIfBinOp = range_size(MCPredicates) > 1; for (const Record *R : MCPredicates) { OS << LS; - emitFeaturesAux(TargetName, *R->getValueAsDag("AssemblerCondDag"), - ParenIfBinOp, OS); + if (emitFeaturesAux(TargetName, *R->getValueAsDag("AssemblerCondDag"), + ParenIfBinOp, OS)) + PrintFatalError(R, "Invalid AssemblerCondDag!"); } } @@ -206,12 +207,14 @@ void SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures( OS << "const "; OS << "{\n"; OS << " FeatureBitset Features;\n"; - for (const auto &SF : SubtargetFeatures) { - const SubtargetFeatureInfo &SFI = SF.second; + for (const SubtargetFeatureInfo &SFI : make_second_range(SubtargetFeatures)) { + const Record *Def = SFI.TheDef; OS << " if ("; - emitFeaturesAux(TargetName, 
*SFI.TheDef->getValueAsDag("AssemblerCondDag"), - /*ParenIfBinOp=*/false, OS); + if (emitFeaturesAux(TargetName, *Def->getValueAsDag("AssemblerCondDag"), + /*ParenIfBinOp=*/false, OS)) + PrintFatalError(Def, "Invalid AssemblerCondDag!"); + OS << ")\n"; OS << " Features.set(" << SFI.getEnumBitName() << ");\n"; } diff --git a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp index 8cb2c22736f8a..1bf4c7a8fd80a 100644 --- a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp +++ b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp @@ -266,6 +266,25 @@ void DFAPacketizerEmitter::emitForItineraries( } OS << " " << ScheduleClasses.size() << "\n};\n\n"; + // Output the mapping from proc ID to ResourceIndexStart + Idx = 1; + OS << "int " << TargetName << DFAName + << "GetResourceIndex(unsigned ProcID) { \n" + << " static const unsigned " << TargetName << DFAName + << "ProcIdToProcResourceIdxTable[][2] = {\n"; + for (const CodeGenProcModel *Model : ProcModels) { + OS << " { " << Model->Index << ", " << Idx++ << " }, // " + << Model->ModelName << "\n"; + } + OS << " };\n" + << " auto It = llvm::lower_bound(" << TargetName << DFAName + << "ProcIdToProcResourceIdxTable, ProcID,\n" + << " [](const unsigned LHS[], unsigned Val) { return LHS[0] < Val; " + "});\n" + << " assert(*It[0] == ProcID);\n" + << " return (*It)[1];\n" + << "}\n\n"; + // The type of a state in the nondeterministic automaton we're defining. 
using NfaStateTy = uint64_t; @@ -339,16 +358,17 @@ void DFAPacketizerEmitter::emitForItineraries( std::string SubTargetClassName = TargetName + "GenSubtargetInfo"; OS << "namespace llvm {\n"; - OS << "DFAPacketizer *" << SubTargetClassName << "::" - << "create" << DFAName + OS << "DFAPacketizer *" << SubTargetClassName << "::" << "create" << DFAName << "DFAPacketizer(const InstrItineraryData *IID) const {\n" << " static Automaton A(ArrayRef<" << TargetAndDFAName << "Transition>(" << TargetAndDFAName << "Transitions), " << TargetAndDFAName << "TransitionInfo);\n" + << " unsigned Index = " << TargetName << DFAName + << "GetResourceIndex(IID->SchedModel.ProcID);\n" << " unsigned ProcResIdxStart = " << TargetAndDFAName - << "ProcResourceIndexStart[IID->SchedModel.ProcID];\n" + << "ProcResourceIndexStart[Index];\n" << " unsigned ProcResIdxNum = " << TargetAndDFAName - << "ProcResourceIndexStart[IID->SchedModel.ProcID + 1] - " + << "ProcResourceIndexStart[Index + 1] - " "ProcResIdxStart;\n" << " return new DFAPacketizer(IID, A, {&" << TargetAndDFAName << "ResourceIndices[ProcResIdxStart], ProcResIdxNum});\n" diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 8747d02ac892b..3a464e01042dc 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -15,6 +15,8 @@ #include "Common/CodeGenInstruction.h" #include "Common/CodeGenTarget.h" #include "Common/InfoByHwMode.h" +#include "Common/InstructionEncoding.h" +#include "Common/SubtargetFeatureInfo.h" #include "Common/VarLenCodeEmitterGen.h" #include "TableGenBackends.h" #include "llvm/ADT/APInt.h" @@ -33,6 +35,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/KnownBits.h" @@ -53,6 +56,7 @@ #include using namespace llvm; +using namespace 
llvm::MCD; #define DEBUG_TYPE "decoder-emitter" @@ -143,126 +147,6 @@ static void printKnownBits(raw_ostream &OS, const KnownBits &Bits, namespace { -// Represents a span of bits in the instruction encoding that's based on a span -// of bits in an operand's encoding. -// -// Width is the width of the span. -// Base is the starting position of that span in the instruction encoding. -// Offset if the starting position of that span in the operand's encoding. -// That is, bits {Base + Width - 1, Base} in the instruction encoding form -// bits {Offset + Width - 1, Offset} in the operands encoding. -struct EncodingField { - unsigned Base, Width, Offset; - EncodingField(unsigned B, unsigned W, unsigned O) - : Base(B), Width(W), Offset(O) {} -}; - -struct OperandInfo { - std::vector Fields; - std::string Decoder; - bool HasCompleteDecoder; - std::optional InitValue; - - OperandInfo(std::string D, bool HCD) : Decoder(D), HasCompleteDecoder(HCD) {} - - void addField(unsigned Base, unsigned Width, unsigned Offset) { - Fields.emplace_back(Base, Width, Offset); - } - - unsigned numFields() const { return Fields.size(); } - - ArrayRef fields() const { return Fields; } -}; - -/// Represents a parsed InstructionEncoding record or a record derived from it. -class InstructionEncoding { - /// The Record this encoding originates from. - const Record *EncodingDef; - - /// The instruction this encoding is for. - const CodeGenInstruction *Inst; - - /// The name of this encoding (for debugging purposes). - std::string Name; - - /// The namespace in which this encoding exists. - StringRef DecoderNamespace; - - /// Known bits of this encoding. This is the value of the `Inst` field - /// with any variable references replaced with '?'. - KnownBits InstBits; - - /// Mask of bits that should be considered unknown during decoding. - /// This is the value of the `SoftFail` field. - APInt SoftFailMask; - - /// The name of the function to use for decoding. 
May be an empty string, - /// meaning the decoder is generated. - StringRef DecoderMethod; - - /// Whether the custom decoding function always succeeds. If a custom decoder - /// function is specified, the value is taken from the target description, - /// otherwise it is inferred. - bool HasCompleteDecoder; - - /// Information about the operands' contribution to this encoding. - SmallVector Operands; - -public: - InstructionEncoding(const Record *EncodingDef, - const CodeGenInstruction *Inst); - - /// Returns the Record this encoding originates from. - const Record *getRecord() const { return EncodingDef; } - - /// Returns the instruction this encoding is for. - const CodeGenInstruction *getInstruction() const { return Inst; } - - /// Returns the name of this encoding, for debugging purposes. - StringRef getName() const { return Name; } - - /// Returns the namespace in which this encoding exists. - StringRef getDecoderNamespace() const { return DecoderNamespace; } - - /// Returns the size of this encoding, in bits. - unsigned getBitWidth() const { return InstBits.getBitWidth(); } - - /// Returns the known bits of this encoding. - const KnownBits &getInstBits() const { return InstBits; } - - /// Returns a mask of bits that should be considered unknown during decoding. - const APInt &getSoftFailMask() const { return SoftFailMask; } - - /// Returns the known bits of this encoding that must match for - /// successful decoding. - KnownBits getMandatoryBits() const { - KnownBits EncodingBits = InstBits; - // Mark all bits that are allowed to change according to SoftFail mask - // as unknown. - EncodingBits.Zero &= ~SoftFailMask; - EncodingBits.One &= ~SoftFailMask; - return EncodingBits; - } - - /// Returns the name of the function to use for decoding, or an empty string - /// if the decoder is generated. - StringRef getDecoderMethod() const { return DecoderMethod; } - - /// Returns whether the decoder (either generated or specified by the user) - /// always succeeds. 
- bool hasCompleteDecoder() const { return HasCompleteDecoder; } - - /// Returns information about the operands' contribution to this encoding. - ArrayRef getOperands() const { return Operands; } - -private: - void parseVarLenEncoding(const VarLenInst &VLI); - void parseFixedLenEncoding(const BitsInit &RecordInstBits); - - void parseVarLenOperands(const VarLenInst &VLI); - void parseFixedLenOperands(const BitsInit &Bits); -}; - /// Sorting predicate to sort encoding IDs by encoding width. class LessEncodingIDByWidth { ArrayRef Encodings; @@ -276,8 +160,8 @@ class LessEncodingIDByWidth { } }; -typedef SmallSetVector PredicateSet; -typedef SmallSetVector DecoderSet; +using PredicateSet = SetVector; +using DecoderSet = SetVector; class DecoderTable { public: @@ -292,7 +176,7 @@ class DecoderTable { const_iterator end() const { return Data.end(); } /// Inserts a state machine opcode into the table. - void insertOpcode(MCD::DecoderOps Opcode) { Data.push_back(Opcode); } + void insertOpcode(DecoderOps Opcode) { Data.push_back(Opcode); } /// Inserts a uint8 encoded value into the table. 
void insertUInt8(unsigned Value) { @@ -342,6 +226,28 @@ struct DecoderTableInfo { DecoderTable Table; PredicateSet Predicates; DecoderSet Decoders; + bool HasCheckPredicate; + bool HasSoftFail; + + void insertPredicate(StringRef Predicate) { + Predicates.insert(CachedHashString(Predicate)); + } + + void insertDecoder(StringRef Decoder) { + Decoders.insert(CachedHashString(Decoder)); + } + + unsigned getPredicateIndex(StringRef Predicate) const { + auto I = find(Predicates, Predicate); + assert(I != Predicates.end()); + return std::distance(Predicates.begin(), I); + } + + unsigned getDecoderIndex(StringRef Decoder) const { + auto I = find(Decoders, Decoder); + assert(I != Decoders.end()); + return std::distance(Decoders.begin(), I); + } }; using NamespacesHwModesMap = std::map>; @@ -362,15 +268,14 @@ class DecoderEmitter { const CodeGenTarget &getTarget() const { return Target; } - // Emit the decoder state machine table. Returns a mask of MCD decoder ops - // that were emitted. - unsigned emitTable(formatted_raw_ostream &OS, DecoderTable &Table, - StringRef Namespace, unsigned HwModeID, unsigned BitWidth, - ArrayRef EncodingIDs) const; + // Emit the decoder state machine table. 
+ void emitTable(formatted_raw_ostream &OS, DecoderTableInfo &TableInfo, + StringRef Namespace, unsigned HwModeID, unsigned BitWidth, + ArrayRef EncodingIDs) const; void emitInstrLenTable(formatted_raw_ostream &OS, ArrayRef InstrLen) const; void emitPredicateFunction(formatted_raw_ostream &OS, - PredicateSet &Predicates) const; + const PredicateSet &Predicates) const; void emitDecoderFunction(formatted_raw_ostream &OS, const DecoderSet &Decoders, unsigned BucketBitWidth) const; @@ -621,22 +526,6 @@ class DecoderTableBuilder { } private: - void emitBinaryParser(raw_ostream &OS, indent Indent, - const OperandInfo &OpInfo) const; - - void emitDecoder(raw_ostream &OS, indent Indent, unsigned EncodingID) const; - - unsigned getDecoderIndex(unsigned EncodingID) const; - - unsigned getPredicateIndex(StringRef P) const; - - bool emitPredicateMatchAux(const Init &Val, bool ParenIfBinOp, - raw_ostream &OS) const; - - bool emitPredicateMatch(raw_ostream &OS, unsigned EncodingID) const; - - bool doesOpcodeNeedPredicate(unsigned EncodingID) const; - void emitPredicateTableEntry(unsigned EncodingID) const; void emitSoftFailTableEntry(unsigned EncodingID) const; @@ -722,12 +611,29 @@ unsigned Filter::usefulness() const { // // ////////////////////////////////// -// Emit the decoder state machine table. Returns a mask of MCD decoder ops -// that were emitted. -unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, - DecoderTable &Table, StringRef Namespace, - unsigned HwModeID, unsigned BitWidth, - ArrayRef EncodingIDs) const { +static StringRef getDecoderOpName(DecoderOps Op) { +#define CASE(OP) \ + case OP: \ + return #OP + switch (Op) { + CASE(OPC_Scope); + CASE(OPC_ExtractField); + CASE(OPC_FilterValueOrSkip); + CASE(OPC_FilterValue); + CASE(OPC_CheckField); + CASE(OPC_CheckPredicate); + CASE(OPC_Decode); + CASE(OPC_SoftFail); + } +#undef CASE + llvm_unreachable("Unknown decoder op"); +} + +// Emit the decoder state machine table. 
+void DecoderEmitter::emitTable(formatted_raw_ostream &OS, + DecoderTableInfo &TableInfo, StringRef Namespace, + unsigned HwModeID, unsigned BitWidth, + ArrayRef EncodingIDs) const { // We'll need to be able to map from a decoded opcode into the corresponding // EncodingID for this specific combination of BitWidth and Namespace. This // is used below to index into Encodings. @@ -741,10 +647,10 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, OS << "static const uint8_t DecoderTable" << Namespace; if (HwModeID != DefaultMode) OS << '_' << Target.getHwModes().getModeName(HwModeID); - OS << BitWidth << "[" << Table.size() << "] = {\n"; + OS << BitWidth << "[" << TableInfo.Table.size() << "] = {\n"; // Emit ULEB128 encoded value to OS, returning the number of bytes emitted. - auto emitULEB128 = [](DecoderTable::const_iterator &I, + auto EmitULEB128 = [](DecoderTable::const_iterator &I, formatted_raw_ostream &OS) { while (*I >= 128) OS << (unsigned)*I++ << ", "; @@ -753,7 +659,7 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, // Emit `getNumToSkipInBytes()`-byte numtoskip value to OS, returning the // NumToSkip value. - auto emitNumToSkip = [](DecoderTable::const_iterator &I, + auto EmitNumToSkip = [](DecoderTable::const_iterator &I, formatted_raw_ostream &OS) { uint8_t Byte = *I++; uint32_t NumToSkip = Byte; @@ -771,149 +677,151 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, // FIXME: We may be able to use the NumToSkip values to recover // appropriate indentation levels. + DecoderTable &Table = TableInfo.Table; DecoderTable::const_iterator I = Table.begin(); DecoderTable::const_iterator E = Table.end(); const uint8_t *const EndPtr = Table.data() + Table.size(); - auto emitNumToSkipComment = [&](uint32_t NumToSkip, bool InComment = false) { - uint32_t Index = ((I - Table.begin()) + NumToSkip); - OS << (InComment ? 
", " : "// "); - OS << "Skip to: " << Index; + auto EmitPos = [&OS](uint32_t Pos) { + constexpr uint32_t StartColumn = 12; + OS << "/* " << Pos << " */"; + OS.PadToColumn(StartColumn); + }; + + auto StartComment = [&OS]() { + constexpr uint32_t CommentColumn = 52; + OS.PadToColumn(CommentColumn); + OS << "// "; + }; + + auto EmitNumToSkipComment = [&](uint32_t NumToSkip) { + uint32_t Index = (I - Table.begin()) + NumToSkip; + OS << "skip to " << Index; }; // The first entry when specializing decoders per bitwidth is the bitwidth. // This will be used for additional checks in `decodeInstruction`. if (SpecializeDecodersPerBitwidth) { - OS << "/* 0 */"; - OS.PadToColumn(14); - emitULEB128(I, OS); - OS << " // Bitwidth " << BitWidth << '\n'; + EmitPos(0); + EmitULEB128(I, OS); + StartComment(); + OS << "Bitwidth " << BitWidth << '\n'; } - unsigned OpcodeMask = 0; + auto DecodeAndEmitULEB128 = [EndPtr, + &EmitULEB128](DecoderTable::const_iterator &I, + formatted_raw_ostream &OS) { + const char *ErrMsg = nullptr; + uint64_t Value = decodeULEB128(&*I, nullptr, EndPtr, &ErrMsg); + assert(ErrMsg == nullptr && "ULEB128 value too large!"); + + EmitULEB128(I, OS); + return Value; + }; while (I != E) { assert(I < E && "incomplete decode table entry!"); - uint64_t Pos = I - Table.begin(); - OS << "/* " << Pos << " */"; - OS.PadToColumn(12); - + uint32_t Pos = I - Table.begin(); + EmitPos(Pos); const uint8_t DecoderOp = *I++; - OpcodeMask |= (1 << DecoderOp); + OS << getDecoderOpName(static_cast(DecoderOp)) << ", "; switch (DecoderOp) { default: PrintFatalError("Invalid decode table opcode: " + Twine((int)DecoderOp) + " at index " + Twine(Pos)); - case MCD::OPC_Scope: { - OS << " MCD::OPC_Scope, "; - uint32_t NumToSkip = emitNumToSkip(I, OS); - emitNumToSkipComment(NumToSkip); - OS << '\n'; + case OPC_Scope: { + uint32_t NumToSkip = EmitNumToSkip(I, OS); + StartComment(); + uint32_t Index = (I - Table.begin()) + NumToSkip; + OS << "end scope at " << Index; break; } - case 
MCD::OPC_ExtractField: { - OS << " MCD::OPC_ExtractField, "; - + case OPC_ExtractField: { // ULEB128 encoded start value. - const char *ErrMsg = nullptr; - unsigned Start = decodeULEB128(&*I, nullptr, EndPtr, &ErrMsg); - assert(ErrMsg == nullptr && "ULEB128 value too large!"); - emitULEB128(I, OS); - + unsigned Start = DecodeAndEmitULEB128(I, OS); unsigned Len = *I++; - OS << Len << ", // Inst{"; + OS << Len << ','; + StartComment(); + OS << "Field = Inst{"; if (Len > 1) - OS << (Start + Len - 1) << "-"; - OS << Start << "} ...\n"; + OS << (Start + Len - 1) << '-'; + OS << Start << '}'; break; } - case MCD::OPC_FilterValueOrSkip: { - OS << " MCD::OPC_FilterValueOrSkip, "; + case OPC_FilterValueOrSkip: { // The filter value is ULEB128 encoded. - emitULEB128(I, OS); - uint32_t NumToSkip = emitNumToSkip(I, OS); - emitNumToSkipComment(NumToSkip); - OS << '\n'; + uint64_t FilterVal = DecodeAndEmitULEB128(I, OS); + uint32_t NumToSkip = EmitNumToSkip(I, OS); + StartComment(); + OS << "if Field != " << format_hex(FilterVal, 0) << ' '; + EmitNumToSkipComment(NumToSkip); break; } - case MCD::OPC_FilterValue: { - OS << " MCD::OPC_FilterValue, "; + case OPC_FilterValue: { // The filter value is ULEB128 encoded. - emitULEB128(I, OS); - OS << '\n'; + uint64_t FilterVal = DecodeAndEmitULEB128(I, OS); + + StartComment(); + OS << "if Field != " << format_hex(FilterVal, 0) << " pop scope"; break; } - case MCD::OPC_CheckField: { - OS << " MCD::OPC_CheckField, "; + case OPC_CheckField: { // ULEB128 encoded start value. - emitULEB128(I, OS); + unsigned Start = DecodeAndEmitULEB128(I, OS); + // 8-bit length. unsigned Len = *I++; OS << Len << ", "; + // ULEB128 encoded field value. 
- emitULEB128(I, OS); - OS << '\n'; + uint64_t FieldVal = DecodeAndEmitULEB128(I, OS); + + StartComment(); + OS << "if Inst{"; + if (Len > 1) + OS << (Start + Len - 1) << '-'; + OS << Start << "} != " << format_hex(FieldVal, 0) << " pop scope"; break; } - case MCD::OPC_CheckPredicate: { - OS << " MCD::OPC_CheckPredicate, "; - emitULEB128(I, OS); - OS << '\n'; + case OPC_CheckPredicate: { + unsigned PIdx = DecodeAndEmitULEB128(I, OS); + StartComment(); + OS << "if !checkPredicate(" << PIdx << ") pop scope"; break; } - case MCD::OPC_Decode: - case MCD::OPC_TryDecode: { - bool IsTry = DecoderOp == MCD::OPC_TryDecode; + case OPC_Decode: { // Decode the Opcode value. - const char *ErrMsg = nullptr; - unsigned Opc = decodeULEB128(&*I, nullptr, EndPtr, &ErrMsg); - assert(ErrMsg == nullptr && "ULEB128 value too large!"); - - OS << " MCD::OPC_" << (IsTry ? "Try" : "") << "Decode, "; - emitULEB128(I, OS); + unsigned Opc = DecodeAndEmitULEB128(I, OS); // Decoder index. - unsigned DecodeIdx = decodeULEB128(&*I, nullptr, EndPtr, &ErrMsg); - assert(ErrMsg == nullptr && "ULEB128 value too large!"); - emitULEB128(I, OS); + unsigned DecodeIdx = DecodeAndEmitULEB128(I, OS); auto EncI = OpcodeToEncodingID.find(Opc); assert(EncI != OpcodeToEncodingID.end() && "no encoding entry"); auto EncodingID = EncI->second; - if (!IsTry) { - OS << "// Opcode: " << Encodings[EncodingID].getName() - << ", DecodeIdx: " << DecodeIdx << '\n'; - break; - } - OS << '\n'; + StartComment(); + OS << "Opcode: " << Encodings[EncodingID].getName() + << ", DecodeIdx: " << DecodeIdx; break; } - case MCD::OPC_SoftFail: { - OS << " MCD::OPC_SoftFail, "; + case OPC_SoftFail: { // Decode the positive mask. - const char *ErrMsg = nullptr; - uint64_t PositiveMask = decodeULEB128(&*I, nullptr, EndPtr, &ErrMsg); - assert(ErrMsg == nullptr && "ULEB128 value too large!"); - emitULEB128(I, OS); + uint64_t PositiveMask = DecodeAndEmitULEB128(I, OS); // Decode the negative mask. 
- uint64_t NegativeMask = decodeULEB128(&*I, nullptr, EndPtr, &ErrMsg); - assert(ErrMsg == nullptr && "ULEB128 value too large!"); - emitULEB128(I, OS); - OS << "// +ve mask: 0x"; - OS.write_hex(PositiveMask); - OS << ", -ve mask: 0x"; - OS.write_hex(NegativeMask); - OS << '\n'; + uint64_t NegativeMask = DecodeAndEmitULEB128(I, OS); + + StartComment(); + OS << "positive mask: " << format_hex(PositiveMask, 0) + << "negative mask: " << format_hex(NegativeMask, 0); break; } } + OS << '\n'; } OS << "};\n\n"; - - return OpcodeMask; } void DecoderEmitter::emitInstrLenTable(formatted_raw_ostream &OS, @@ -924,17 +832,17 @@ void DecoderEmitter::emitInstrLenTable(formatted_raw_ostream &OS, OS << "};\n\n"; } -void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS, - PredicateSet &Predicates) const { +void DecoderEmitter::emitPredicateFunction( + formatted_raw_ostream &OS, const PredicateSet &Predicates) const { // The predicate function is just a big switch statement based on the // input predicate index. 
OS << "static bool checkDecoderPredicate(unsigned Idx, const FeatureBitset " - "&Bits) {\n"; + "&FB) {\n"; OS << " switch (Idx) {\n"; OS << " default: llvm_unreachable(\"Invalid index!\");\n"; for (const auto &[Index, Predicate] : enumerate(Predicates)) { OS << " case " << Index << ":\n"; - OS << " return (" << Predicate << ");\n"; + OS << " return " << Predicate << ";\n"; } OS << " }\n"; OS << "}\n\n"; @@ -982,7 +890,6 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, PrintTemplate(); PrintDecodeFnName(Index); OS << "(" << DecodeParams << ") {\n"; - OS << " using namespace llvm::MCD;\n"; OS << " " << TmpTypeDecl; OS << " [[maybe_unused]] TmpType tmp;\n"; OS << Decoder; @@ -994,7 +901,6 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, OS << "// Handling " << Decoders.size() << " cases.\n"; PrintTemplate(); OS << "decodeToMCInst(unsigned Idx, " << DecodeParams << ") {\n"; - OS << " using namespace llvm::MCD;\n"; OS << " DecodeComplete = true;\n"; if (UseFnTableInDecodeToMCInst) { @@ -1089,8 +995,19 @@ FilterChooser::getIslands(const KnownBits &EncodingBits) const { return Islands; } -void DecoderTableBuilder::emitBinaryParser(raw_ostream &OS, indent Indent, - const OperandInfo &OpInfo) const { +static void emitBinaryParser(raw_ostream &OS, indent Indent, + const InstructionEncoding &Encoding, + const OperandInfo &OpInfo) { + if (OpInfo.HasNoEncoding) { + // If an operand has no encoding, the old behavior is to not decode it + // automatically and let the target do it. This is error-prone, so the + // new behavior is to report an error. + if (!IgnoreNonDecodableOperands) + PrintError(Encoding.getRecord()->getLoc(), + "could not find field for operand '" + OpInfo.Name + "'"); + return; + } + // Special case for 'bits<0>'. 
if (OpInfo.Fields.empty() && !OpInfo.InitValue) { if (IgnoreNonDecodableOperands) @@ -1104,31 +1021,33 @@ void DecoderTableBuilder::emitBinaryParser(raw_ostream &OS, indent Indent, return; } - if (OpInfo.Fields.empty() && OpInfo.InitValue && IgnoreFullyDefinedOperands) - return; - - // We need to construct the encoding of the operand from pieces if it is not - // encoded sequentially or has a non-zero constant part in the encoding. - bool UseInsertBits = OpInfo.numFields() > 1 || OpInfo.InitValue.value_or(0); - - if (UseInsertBits) { - OS << Indent << "tmp = 0x"; - OS.write_hex(OpInfo.InitValue.value_or(0)); - OS << ";\n"; - } - - for (const auto &[Base, Width, Offset] : OpInfo.fields()) { - OS << Indent; - if (UseInsertBits) - OS << "insertBits(tmp, "; - else - OS << "tmp = "; - OS << "fieldFromInstruction(insn, " << Base << ", " << Width << ')'; - if (UseInsertBits) - OS << ", " << Offset << ", " << Width << ')'; - else if (Offset != 0) + if (OpInfo.fields().empty()) { + // Only a constant part. The old behavior is to not decode this operand. + if (IgnoreFullyDefinedOperands) + return; + // Initialize `tmp` with the constant part. + OS << Indent << "tmp = " << format_hex(*OpInfo.InitValue, 0) << ";\n"; + } else if (OpInfo.fields().size() == 1 && !OpInfo.InitValue.value_or(0)) { + // One variable part and no/zero constant part. Initialize `tmp` with the + // variable part. + auto [Base, Width, Offset] = OpInfo.fields().front(); + OS << Indent << "tmp = fieldFromInstruction(insn, " << Base << ", " << Width + << ')'; + if (Offset) OS << " << " << Offset; OS << ";\n"; + } else { + // General case. Initialize `tmp` with the constant part, if any, and + // insert the variable parts into it. 
+ OS << Indent << "tmp = " << format_hex(OpInfo.InitValue.value_or(0), 0) + << ";\n"; + for (auto [Base, Width, Offset] : OpInfo.fields()) { + OS << Indent << "tmp |= fieldFromInstruction(insn, " << Base << ", " + << Width << ')'; + if (Offset) + OS << " << " << Offset; + OS << ";\n"; + } } StringRef Decoder = OpInfo.Decoder; @@ -1142,9 +1061,10 @@ void DecoderTableBuilder::emitBinaryParser(raw_ostream &OS, indent Indent, } } -void DecoderTableBuilder::emitDecoder(raw_ostream &OS, indent Indent, - unsigned EncodingID) const { - const InstructionEncoding &Encoding = Encodings[EncodingID]; +static std::string getDecoderString(const InstructionEncoding &Encoding) { + std::string Decoder; + raw_string_ostream OS(Decoder); + indent Indent(UseFnTableInDecodeToMCInst ? 2 : 4); // If a custom instruction decoder was specified, use that. StringRef DecoderMethod = Encoding.getDecoderMethod(); @@ -1153,138 +1073,46 @@ void DecoderTableBuilder::emitDecoder(raw_ostream &OS, indent Indent, << "(MI, insn, Address, Decoder))) { " << (Encoding.hasCompleteDecoder() ? "" : "DecodeComplete = false; ") << "return MCDisassembler::Fail; }\n"; - return; - } - - for (const OperandInfo &Op : Encoding.getOperands()) - emitBinaryParser(OS, Indent, Op); -} - -unsigned DecoderTableBuilder::getDecoderIndex(unsigned EncodingID) const { - // Build up the predicate string. - SmallString<256> Decoder; - // FIXME: emitDecoder() function can take a buffer directly rather than - // a stream. - raw_svector_ostream S(Decoder); - indent Indent(UseFnTableInDecodeToMCInst ? 2 : 4); - emitDecoder(S, Indent, EncodingID); - - // Using the full decoder string as the key value here is a bit - // heavyweight, but is effective. If the string comparisons become a - // performance concern, we can implement a mangling of the predicate - // data easily enough with a map back to the actual string. That's - // overkill for now, though. - - // Make sure the predicate is in the table. 
- DecoderSet &Decoders = TableInfo.Decoders; - Decoders.insert(CachedHashString(Decoder)); - // Now figure out the index for when we write out the table. - DecoderSet::const_iterator P = find(Decoders, Decoder.str()); - return std::distance(Decoders.begin(), P); -} - -// If ParenIfBinOp is true, print a surrounding () if Val uses && or ||. -bool DecoderTableBuilder::emitPredicateMatchAux(const Init &Val, - bool ParenIfBinOp, - raw_ostream &OS) const { - if (const auto *D = dyn_cast(&Val)) { - if (!D->getDef()->isSubClassOf("SubtargetFeature")) - return true; - OS << "Bits[" << Target.getName() << "::" << D->getAsString() << "]"; - return false; - } - if (const auto *D = dyn_cast(&Val)) { - std::string Op = D->getOperator()->getAsString(); - if (Op == "not" && D->getNumArgs() == 1) { - OS << '!'; - return emitPredicateMatchAux(*D->getArg(0), true, OS); - } - if ((Op == "any_of" || Op == "all_of") && D->getNumArgs() > 0) { - bool Paren = D->getNumArgs() > 1 && std::exchange(ParenIfBinOp, true); - if (Paren) - OS << '('; - ListSeparator LS(Op == "any_of" ? 
" || " : " && "); - for (auto *Arg : D->getArgs()) { - OS << LS; - if (emitPredicateMatchAux(*Arg, ParenIfBinOp, OS)) - return true; - } - if (Paren) - OS << ')'; - return false; - } + } else { + for (const OperandInfo &Op : Encoding.getOperands()) + emitBinaryParser(OS, Indent, Encoding, Op); } - return true; + return Decoder; } -bool DecoderTableBuilder::emitPredicateMatch(raw_ostream &OS, - unsigned EncodingID) const { - const ListInit *Predicates = - Encodings[EncodingID].getRecord()->getValueAsListInit("Predicates"); - bool IsFirstEmission = true; - for (unsigned i = 0; i < Predicates->size(); ++i) { - const Record *Pred = Predicates->getElementAsRecord(i); - if (!Pred->getValue("AssemblerMatcherPredicate")) - continue; - - if (!isa(Pred->getValue("AssemblerCondDag")->getValue())) - continue; +static std::string getPredicateString(const InstructionEncoding &Encoding, + StringRef TargetName) { + std::vector Predicates = + Encoding.getRecord()->getValueAsListOfDefs("Predicates"); + auto It = llvm::find_if(Predicates, [](const Record *R) { + return R->getValueAsBit("AssemblerMatcherPredicate"); + }); + if (It == Predicates.end()) + return std::string(); - if (!IsFirstEmission) - OS << " && "; - if (emitPredicateMatchAux(*Pred->getValueAsDag("AssemblerCondDag"), - Predicates->size() > 1, OS)) - PrintFatalError(Pred->getLoc(), "Invalid AssemblerCondDag!"); - IsFirstEmission = false; - } - return !Predicates->empty(); + std::string Predicate; + raw_string_ostream OS(Predicate); + SubtargetFeatureInfo::emitMCPredicateCheck(OS, TargetName, Predicates); + return Predicate; } -bool DecoderTableBuilder::doesOpcodeNeedPredicate(unsigned EncodingID) const { - const ListInit *Predicates = - Encodings[EncodingID].getRecord()->getValueAsListInit("Predicates"); - for (unsigned i = 0; i < Predicates->size(); ++i) { - const Record *Pred = Predicates->getElementAsRecord(i); - if (!Pred->getValue("AssemblerMatcherPredicate")) - continue; - - if 
(isa(Pred->getValue("AssemblerCondDag")->getValue())) - return true; - } - return false; -} +void DecoderTableBuilder::emitPredicateTableEntry(unsigned EncodingID) const { + const InstructionEncoding &Encoding = Encodings[EncodingID]; + std::string Predicate = getPredicateString(Encoding, Target.getName()); + if (Predicate.empty()) + return; -unsigned DecoderTableBuilder::getPredicateIndex(StringRef Predicate) const { // Using the full predicate string as the key value here is a bit // heavyweight, but is effective. If the string comparisons become a // performance concern, we can implement a mangling of the predicate // data easily enough with a map back to the actual string. That's // overkill for now, though. + TableInfo.insertPredicate(Predicate); + unsigned PredicateIndex = TableInfo.getPredicateIndex(Predicate); - // Make sure the predicate is in the table. - TableInfo.Predicates.insert(CachedHashString(Predicate)); - // Now figure out the index for when we write out the table. - PredicateSet::const_iterator P = find(TableInfo.Predicates, Predicate); - return (unsigned)(P - TableInfo.Predicates.begin()); -} - -void DecoderTableBuilder::emitPredicateTableEntry(unsigned EncodingID) const { - if (!doesOpcodeNeedPredicate(EncodingID)) - return; - - // Build up the predicate string. - SmallString<256> Predicate; - // FIXME: emitPredicateMatch() functions can take a buffer directly rather - // than a stream. - raw_svector_ostream PS(Predicate); - emitPredicateMatch(PS, EncodingID); - - // Figure out the index into the predicate table for the predicate just - // computed. 
- unsigned PIdx = getPredicateIndex(PS.str()); - - TableInfo.Table.insertOpcode(MCD::OPC_CheckPredicate); - TableInfo.Table.insertULEB128(PIdx); + TableInfo.Table.insertOpcode(OPC_CheckPredicate); + TableInfo.Table.insertULEB128(PredicateIndex); + TableInfo.HasCheckPredicate = true; } void DecoderTableBuilder::emitSoftFailTableEntry(unsigned EncodingID) const { @@ -1298,9 +1126,10 @@ void DecoderTableBuilder::emitSoftFailTableEntry(unsigned EncodingID) const { APInt PositiveMask = InstBits.Zero & SoftFailMask; APInt NegativeMask = InstBits.One & SoftFailMask; - TableInfo.Table.insertOpcode(MCD::OPC_SoftFail); + TableInfo.Table.insertOpcode(OPC_SoftFail); TableInfo.Table.insertULEB128(PositiveMask.getZExtValue()); TableInfo.Table.insertULEB128(NegativeMask.getZExtValue()); + TableInfo.HasSoftFail = true; } // Emits table entries to decode the singleton. @@ -1318,7 +1147,7 @@ void DecoderTableBuilder::emitSingletonTableEntry( // Check any additional encoding fields needed. for (const FilterChooser::Island &Ilnd : reverse(Islands)) { - TableInfo.Table.insertOpcode(MCD::OPC_CheckField); + TableInfo.Table.insertOpcode(OPC_CheckField); TableInfo.Table.insertULEB128(Ilnd.StartBit); TableInfo.Table.insertUInt8(Ilnd.NumBits); TableInfo.Table.insertULEB128(Ilnd.FieldVal); @@ -1327,23 +1156,19 @@ void DecoderTableBuilder::emitSingletonTableEntry( // Check for soft failure of the match. emitSoftFailTableEntry(EncodingID); - unsigned DIdx = getDecoderIndex(EncodingID); - - // Produce OPC_Decode or OPC_TryDecode opcode based on the information - // whether the instruction decoder is complete or not. If it is complete - // then it handles all possible values of remaining variable/unfiltered bits - // and for any value can determine if the bitpattern is a valid instruction - // or not. This means OPC_Decode will be the final step in the decoding - // process. 
If it is not complete, then the Fail return code from the - // decoder method indicates that additional processing should be done to see - // if there is any other instruction that also matches the bitpattern and - // can decode it. - const MCD::DecoderOps DecoderOp = - Encoding.hasCompleteDecoder() ? MCD::OPC_Decode : MCD::OPC_TryDecode; - TableInfo.Table.insertOpcode(DecoderOp); + // Using the full decoder string as the key value here is a bit + // heavyweight, but is effective. If the string comparisons become a + // performance concern, we can implement a mangling of the predicate + // data easily enough with a map back to the actual string. That's + // overkill for now, though. + std::string Decoder = getDecoderString(Encoding); + TableInfo.insertDecoder(Decoder); + unsigned DecoderIndex = TableInfo.getDecoderIndex(Decoder); + + TableInfo.Table.insertOpcode(MCD::OPC_Decode); const Record *InstDef = Encodings[EncodingID].getInstruction()->TheDef; TableInfo.Table.insertULEB128(Target.getInstrIntValue(InstDef)); - TableInfo.Table.insertULEB128(DIdx); + TableInfo.Table.insertULEB128(DecoderIndex); } std::unique_ptr @@ -1628,7 +1453,7 @@ void DecoderTableBuilder::emitTableEntries(const FilterChooser &FC) const { // known don't, enter a scope so that they have a chance. size_t FixupLoc = 0; if (FC.VariableFC) { - Table.insertOpcode(MCD::OPC_Scope); + Table.insertOpcode(OPC_Scope); FixupLoc = Table.insertNumToSkip(); } @@ -1642,7 +1467,7 @@ void DecoderTableBuilder::emitTableEntries(const FilterChooser &FC) const { // If there is only one possible field value, emit a combined OPC_CheckField // instead of OPC_ExtractField + OPC_FilterValue. 
const auto &[FilterVal, Delegate] = *FC.FilterChooserMap.begin(); - Table.insertOpcode(MCD::OPC_CheckField); + Table.insertOpcode(OPC_CheckField); Table.insertULEB128(FC.StartBit); Table.insertUInt8(FC.NumBits); Table.insertULEB128(FilterVal); @@ -1651,13 +1476,13 @@ void DecoderTableBuilder::emitTableEntries(const FilterChooser &FC) const { emitTableEntries(*Delegate); } else { // The general case: emit a switch over the field value. - Table.insertOpcode(MCD::OPC_ExtractField); + Table.insertOpcode(OPC_ExtractField); Table.insertULEB128(FC.StartBit); Table.insertUInt8(FC.NumBits); // Emit switch cases for all but the last element. for (const auto &[FilterVal, Delegate] : drop_end(FC.FilterChooserMap)) { - Table.insertOpcode(MCD::OPC_FilterValueOrSkip); + Table.insertOpcode(OPC_FilterValueOrSkip); Table.insertULEB128(FilterVal); size_t FixupPos = Table.insertNumToSkip(); @@ -1671,7 +1496,7 @@ void DecoderTableBuilder::emitTableEntries(const FilterChooser &FC) const { // Emit a switch case for the last element. It never falls through; // if it doesn't match, we leave the current scope. const auto &[FilterVal, Delegate] = *FC.FilterChooserMap.rbegin(); - Table.insertOpcode(MCD::OPC_FilterValue); + Table.insertOpcode(OPC_FilterValue); Table.insertULEB128(FilterVal); // Emit table entries for the last case. @@ -1684,430 +1509,10 @@ void DecoderTableBuilder::emitTableEntries(const FilterChooser &FC) const { } } -static std::string findOperandDecoderMethod(const Record *Record) { - std::string Decoder; - - const RecordVal *DecoderString = Record->getValue("DecoderMethod"); - const StringInit *String = - DecoderString ? dyn_cast(DecoderString->getValue()) : nullptr; - if (String) { - Decoder = String->getValue().str(); - if (!Decoder.empty()) - return Decoder; - } - - if (Record->isSubClassOf("RegisterOperand")) - // Allows use of a DecoderMethod in referenced RegisterClass if set. 
- return findOperandDecoderMethod(Record->getValueAsDef("RegClass")); - - if (Record->isSubClassOf("RegisterClass")) { - Decoder = "Decode" + Record->getName().str() + "RegisterClass"; - } else if (Record->isSubClassOf("PointerLikeRegClass")) { - Decoder = "DecodePointerLikeRegClass" + - utostr(Record->getValueAsInt("RegClassKind")); - } - - return Decoder; -} - -OperandInfo getOpInfo(const Record *TypeRecord) { - const RecordVal *HasCompleteDecoderVal = - TypeRecord->getValue("hasCompleteDecoder"); - const BitInit *HasCompleteDecoderBit = - HasCompleteDecoderVal - ? dyn_cast(HasCompleteDecoderVal->getValue()) - : nullptr; - bool HasCompleteDecoder = - HasCompleteDecoderBit ? HasCompleteDecoderBit->getValue() : true; - - return OperandInfo(findOperandDecoderMethod(TypeRecord), HasCompleteDecoder); -} - -void InstructionEncoding::parseVarLenEncoding(const VarLenInst &VLI) { - InstBits = KnownBits(VLI.size()); - SoftFailMask = APInt(VLI.size(), 0); - - // Parse Inst field. - unsigned I = 0; - for (const EncodingSegment &S : VLI) { - if (const auto *SegmentBits = dyn_cast(S.Value)) { - for (const Init *V : SegmentBits->getBits()) { - if (const auto *B = dyn_cast(V)) { - if (B->getValue()) - InstBits.One.setBit(I); - else - InstBits.Zero.setBit(I); - } - ++I; - } - } else if (const auto *B = dyn_cast(S.Value)) { - if (B->getValue()) - InstBits.One.setBit(I); - else - InstBits.Zero.setBit(I); - ++I; - } else { - I += S.BitWidth; - } - } - assert(I == VLI.size()); -} - -void InstructionEncoding::parseFixedLenEncoding( - const BitsInit &RecordInstBits) { - // For fixed length instructions, sometimes the `Inst` field specifies more - // bits than the actual size of the instruction, which is specified in `Size`. - // In such cases, we do some basic validation and drop the upper bits. - unsigned BitWidth = EncodingDef->getValueAsInt("Size") * 8; - unsigned InstNumBits = RecordInstBits.getNumBits(); - - // Returns true if all bits in `Bits` are zero or unset. 
- auto CheckAllZeroOrUnset = [&](ArrayRef Bits, - const RecordVal *Field) { - bool AllZeroOrUnset = llvm::all_of(Bits, [](const Init *Bit) { - if (const auto *BI = dyn_cast(Bit)) - return !BI->getValue(); - return isa(Bit); - }); - if (AllZeroOrUnset) - return; - PrintNote([Field](raw_ostream &OS) { Field->print(OS); }); - PrintFatalError(EncodingDef, Twine(Name) + ": Size is " + Twine(BitWidth) + - " bits, but " + Field->getName() + - " bits beyond that are not zero/unset"); - }; - - if (InstNumBits < BitWidth) - PrintFatalError(EncodingDef, Twine(Name) + ": Size is " + Twine(BitWidth) + - " bits, but Inst specifies only " + - Twine(InstNumBits) + " bits"); - - if (InstNumBits > BitWidth) { - // Ensure that all the bits beyond 'Size' are 0 or unset (i.e., carry no - // actual encoding). - ArrayRef UpperBits = - RecordInstBits.getBits().drop_front(BitWidth); - const RecordVal *InstField = EncodingDef->getValue("Inst"); - CheckAllZeroOrUnset(UpperBits, InstField); - } - - ArrayRef ActiveInstBits = - RecordInstBits.getBits().take_front(BitWidth); - InstBits = KnownBits(BitWidth); - SoftFailMask = APInt(BitWidth, 0); - - // Parse Inst field. - for (auto [I, V] : enumerate(ActiveInstBits)) { - if (const auto *B = dyn_cast(V)) { - if (B->getValue()) - InstBits.One.setBit(I); - else - InstBits.Zero.setBit(I); - } - } - - // Parse SoftFail field. - const RecordVal *SoftFailField = EncodingDef->getValue("SoftFail"); - if (!SoftFailField) - return; - - const auto *SFBits = dyn_cast(SoftFailField->getValue()); - if (!SFBits || SFBits->getNumBits() != InstNumBits) { - PrintNote(EncodingDef->getLoc(), "in record"); - PrintFatalError(SoftFailField, - formatv("SoftFail field, if defined, must be " - "of the same type as Inst, which is bits<{}>", - InstNumBits)); - } - - if (InstNumBits > BitWidth) { - // Ensure that all upper bits of `SoftFail` are 0 or unset. 
- ArrayRef UpperBits = SFBits->getBits().drop_front(BitWidth); - CheckAllZeroOrUnset(UpperBits, SoftFailField); - } - - ArrayRef ActiveSFBits = SFBits->getBits().take_front(BitWidth); - for (auto [I, V] : enumerate(ActiveSFBits)) { - if (const auto *B = dyn_cast(V); B && B->getValue()) { - if (!InstBits.Zero[I] && !InstBits.One[I]) { - PrintNote(EncodingDef->getLoc(), "in record"); - PrintError(SoftFailField, - formatv("SoftFail{{{0}} = 1 requires Inst{{{0}} " - "to be fully defined (0 or 1, not '?')", - I)); - } - SoftFailMask.setBit(I); - } - } -} - -void InstructionEncoding::parseVarLenOperands(const VarLenInst &VLI) { - SmallVector TiedTo; - - for (const auto &[Idx, Op] : enumerate(Inst->Operands)) { - if (Op.MIOperandInfo && Op.MIOperandInfo->getNumArgs() > 0) - for (auto *Arg : Op.MIOperandInfo->getArgs()) - Operands.push_back(getOpInfo(cast(Arg)->getDef())); - else - Operands.push_back(getOpInfo(Op.Rec)); - - int TiedReg = Op.getTiedRegister(); - TiedTo.push_back(-1); - if (TiedReg != -1) { - TiedTo[Idx] = TiedReg; - TiedTo[TiedReg] = Idx; - } - } - - unsigned CurrBitPos = 0; - for (const auto &EncodingSegment : VLI) { - unsigned Offset = 0; - StringRef OpName; - - if (const StringInit *SI = dyn_cast(EncodingSegment.Value)) { - OpName = SI->getValue(); - } else if (const DagInit *DI = dyn_cast(EncodingSegment.Value)) { - OpName = cast(DI->getArg(0))->getValue(); - Offset = cast(DI->getArg(2))->getValue(); - } - - if (!OpName.empty()) { - auto OpSubOpPair = Inst->Operands.parseOperandName(OpName); - unsigned OpIdx = Inst->Operands.getFlattenedOperandNumber(OpSubOpPair); - Operands[OpIdx].addField(CurrBitPos, EncodingSegment.BitWidth, Offset); - if (!EncodingSegment.CustomDecoder.empty()) - Operands[OpIdx].Decoder = EncodingSegment.CustomDecoder.str(); - - int TiedReg = TiedTo[OpSubOpPair.first]; - if (TiedReg != -1) { - unsigned OpIdx = Inst->Operands.getFlattenedOperandNumber( - {TiedReg, OpSubOpPair.second}); - Operands[OpIdx].addField(CurrBitPos, 
EncodingSegment.BitWidth, Offset); - } - } - - CurrBitPos += EncodingSegment.BitWidth; - } -} - -static void debugDumpRecord(const Record &Rec) { - // Dump the record, so we can see what's going on. - PrintNote([&Rec](raw_ostream &OS) { - OS << "Dumping record for previous error:\n"; - OS << Rec; - }); -} - -/// For an operand field named OpName: populate OpInfo.InitValue with the -/// constant-valued bit values, and OpInfo.Fields with the ranges of bits to -/// insert from the decoded instruction. -static void addOneOperandFields(const Record *EncodingDef, - const BitsInit &InstBits, - std::map &TiedNames, - const Record *OpRec, StringRef OpName, - OperandInfo &OpInfo) { - // Find a field with the operand's name. - const RecordVal *OpEncodingField = EncodingDef->getValue(OpName); - - // If there is no such field, try tied operand's name. - if (!OpEncodingField) { - if (auto I = TiedNames.find(OpName); I != TiedNames.end()) - OpEncodingField = EncodingDef->getValue(I->second); - - // If still no luck, the old behavior is to not decode this operand - // automatically and let the target do it. This is error-prone, so - // the new behavior is to report an error. - if (!OpEncodingField) { - if (!IgnoreNonDecodableOperands) - PrintError(EncodingDef->getLoc(), - "could not find field for operand '" + OpName + "'"); - return; - } - } - - // Some or all bits of the operand may be required to be 0 or 1 depending - // on the instruction's encoding. Collect those bits. 
- if (const auto *OpBit = dyn_cast(OpEncodingField->getValue())) { - OpInfo.InitValue = OpBit->getValue(); - return; - } - if (const auto *OpBits = dyn_cast(OpEncodingField->getValue())) { - if (OpBits->getNumBits() == 0) { - if (OpInfo.Decoder.empty()) { - PrintError(EncodingDef->getLoc(), "operand '" + OpName + "' of type '" + - OpRec->getName() + - "' must have a decoder method"); - } - return; - } - for (unsigned I = 0; I < OpBits->getNumBits(); ++I) { - if (const auto *OpBit = dyn_cast(OpBits->getBit(I))) - OpInfo.InitValue = OpInfo.InitValue.value_or(0) | - static_cast(OpBit->getValue()) << I; - } - } - - // Find out where the variable bits of the operand are encoded. The bits don't - // have to be consecutive or in ascending order. For example, an operand could - // be encoded as follows: - // - // 7 6 5 4 3 2 1 0 - // {1, op{5}, op{2}, op{1}, 0, op{4}, op{3}, ?} - // - // In this example the operand is encoded in three segments: - // - // Base Width Offset - // op{2...1} 4 2 1 - // op{4...3} 1 2 3 - // op{5} 6 1 5 - // - for (unsigned I = 0, J = 0; I != InstBits.getNumBits(); I = J) { - const VarInit *Var; - unsigned Offset = 0; - for (; J != InstBits.getNumBits(); ++J) { - const Init *BitJ = InstBits.getBit(J); - if (const auto *VBI = dyn_cast(BitJ)) { - Var = dyn_cast(VBI->getBitVar()); - if (I == J) - Offset = VBI->getBitNum(); - else if (VBI->getBitNum() != Offset + J - I) - break; - } else { - Var = dyn_cast(BitJ); - } - if (!Var || - (Var->getName() != OpName && Var->getName() != TiedNames[OpName])) - break; - } - if (I == J) - ++J; - else - OpInfo.addField(I, J - I, Offset); - } -} - -void InstructionEncoding::parseFixedLenOperands(const BitsInit &Bits) { - // Search for tied operands, so that we can correctly instantiate - // operands that are not explicitly represented in the encoding. 
- std::map TiedNames; - for (const auto &Op : Inst->Operands) { - for (const auto &[J, CI] : enumerate(Op.Constraints)) { - if (!CI.isTied()) - continue; - std::pair SO = - Inst->Operands.getSubOperandNumber(CI.getTiedOperand()); - StringRef TiedName = Inst->Operands[SO.first].SubOpNames[SO.second]; - if (TiedName.empty()) - TiedName = Inst->Operands[SO.first].Name; - StringRef MyName = Op.SubOpNames[J]; - if (MyName.empty()) - MyName = Op.Name; - - TiedNames[MyName] = TiedName; - TiedNames[TiedName] = MyName; - } - } - - // For each operand, see if we can figure out where it is encoded. - for (const CGIOperandList::OperandInfo &Op : Inst->Operands) { - // Lookup the decoder method and construct a new OperandInfo to hold our - // result. - OperandInfo OpInfo = getOpInfo(Op.Rec); - - // If we have named sub-operands... - if (Op.MIOperandInfo && !Op.SubOpNames[0].empty()) { - // Then there should not be a custom decoder specified on the top-level - // type. - if (!OpInfo.Decoder.empty()) { - PrintError(EncodingDef, - "DecoderEmitter: operand \"" + Op.Name + "\" has type \"" + - Op.Rec->getName() + - "\" with a custom DecoderMethod, but also named " - "sub-operands."); - continue; - } - - // Decode each of the sub-ops separately. - for (auto [SubOpName, SubOp] : - zip_equal(Op.SubOpNames, Op.MIOperandInfo->getArgs())) { - const Record *SubOpRec = cast(SubOp)->getDef(); - OperandInfo SubOpInfo = getOpInfo(SubOpRec); - addOneOperandFields(EncodingDef, Bits, TiedNames, SubOpRec, SubOpName, - SubOpInfo); - Operands.push_back(std::move(SubOpInfo)); - } - continue; - } - - // Otherwise, if we have an operand with sub-operands, but they aren't - // named... - if (Op.MIOperandInfo && OpInfo.Decoder.empty()) { - // If we have sub-ops, we'd better have a custom decoder. - // (Otherwise we don't know how to populate them properly...) 
- if (Op.MIOperandInfo->getNumArgs()) { - PrintError(EncodingDef, - "DecoderEmitter: operand \"" + Op.Name + - "\" has non-empty MIOperandInfo, but doesn't " - "have a custom decoder!"); - debugDumpRecord(*EncodingDef); - continue; - } - } - - addOneOperandFields(EncodingDef, Bits, TiedNames, Op.Rec, Op.Name, OpInfo); - Operands.push_back(std::move(OpInfo)); - } -} - -InstructionEncoding::InstructionEncoding(const Record *EncodingDef, - const CodeGenInstruction *Inst) - : EncodingDef(EncodingDef), Inst(Inst) { - const Record *InstDef = Inst->TheDef; - - // Give this encoding a name. - if (EncodingDef != InstDef) - Name = (EncodingDef->getName() + Twine(':')).str(); - Name.append(InstDef->getName()); - - DecoderNamespace = EncodingDef->getValueAsString("DecoderNamespace"); - DecoderMethod = EncodingDef->getValueAsString("DecoderMethod"); - if (!DecoderMethod.empty()) - HasCompleteDecoder = EncodingDef->getValueAsBit("hasCompleteDecoder"); - - const RecordVal *InstField = EncodingDef->getValue("Inst"); - if (const auto *DI = dyn_cast(InstField->getValue())) { - VarLenInst VLI(DI, InstField); - parseVarLenEncoding(VLI); - // If the encoding has a custom decoder, don't bother parsing the operands. - if (DecoderMethod.empty()) - parseVarLenOperands(VLI); - } else { - const auto *BI = cast(InstField->getValue()); - parseFixedLenEncoding(*BI); - // If the encoding has a custom decoder, don't bother parsing the operands. - if (DecoderMethod.empty()) - parseFixedLenOperands(*BI); - } - - if (DecoderMethod.empty()) { - // A generated decoder is always successful if none of the operand - // decoders can fail (all are always successful). - HasCompleteDecoder = all_of(Operands, [](const OperandInfo &Op) { - // By default, a generated operand decoder is assumed to always succeed. - // This can be overridden by the user. - return Op.Decoder.empty() || Op.HasCompleteDecoder; - }); - } -} - // emitDecodeInstruction - Emit the templated helper function // decodeInstruction(). 
static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst, - unsigned OpcodeMask) { - const bool HasTryDecode = OpcodeMask & (1 << MCD::OPC_TryDecode); - const bool HasCheckPredicate = OpcodeMask & (1 << MCD::OPC_CheckPredicate); - const bool HasSoftFail = OpcodeMask & (1 << MCD::OPC_SoftFail); - + const DecoderTableInfo &TableInfo) { OS << R"( static unsigned decodeNumToSkip(const uint8_t *&Ptr) { unsigned NumToSkip = *Ptr++; @@ -2128,9 +1533,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, "llvm::function_ref makeUp"; } OS << ") {\n"; - if (HasCheckPredicate) + if (TableInfo.HasCheckPredicate) OS << " const FeatureBitset &Bits = STI.getFeatureBits();\n"; - OS << " using namespace llvm::MCD;\n"; OS << " const uint8_t *Ptr = DecodeTable;\n"; if (SpecializeDecodersPerBitwidth) { @@ -2155,7 +1559,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, errs() << Loc << ": Unexpected decode table opcode: " << (int)DecoderOp << '\n'; return MCDisassembler::Fail; - case MCD::OPC_Scope: { + case OPC_Scope: { unsigned NumToSkip = decodeNumToSkip(Ptr); const uint8_t *SkipTo = Ptr + NumToSkip; ScopeStack.push_back(SkipTo); @@ -2163,7 +1567,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, << ")\n"); break; } - case MCD::OPC_ExtractField: { + case OPC_ExtractField: { // Decode the start value. unsigned Start = decodeULEB128AndIncUnsafe(Ptr); unsigned Len = *Ptr++;)"; @@ -2175,7 +1579,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, << Len << "): " << CurFieldValue << "\n"); break; } - case MCD::OPC_FilterValueOrSkip: { + case OPC_FilterValueOrSkip: { // Decode the field value. 
uint64_t Val = decodeULEB128AndIncUnsafe(Ptr); bool Failed = Val != CurFieldValue; @@ -2192,7 +1596,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, } break; } - case MCD::OPC_FilterValue: { + case OPC_FilterValue: { // Decode the field value. uint64_t Val = decodeULEB128AndIncUnsafe(Ptr); bool Failed = Val != CurFieldValue; @@ -2210,7 +1614,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, } break; } - case MCD::OPC_CheckField: { + case OPC_CheckField: { // Decode the start value. unsigned Start = decodeULEB128AndIncUnsafe(Ptr); unsigned Len = *Ptr;)"; @@ -2238,9 +1642,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, } break; })"; - if (HasCheckPredicate) { + if (TableInfo.HasCheckPredicate) { OS << R"( - case MCD::OPC_CheckPredicate: { + case OPC_CheckPredicate: { // Decode the Predicate Index value. unsigned PIdx = decodeULEB128AndIncUnsafe(Ptr); // Check the predicate. @@ -2261,7 +1665,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, })"; } OS << R"( - case MCD::OPC_Decode: { + case OPC_Decode: { // Decode the Opcode value. unsigned Opc = decodeULEB128AndIncUnsafe(Ptr); unsigned DecodeIdx = decodeULEB128AndIncUnsafe(Ptr); @@ -2274,51 +1678,31 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, << " makeUp(insn, Len);"; } OS << R"( - S = decodeToMCInst(DecodeIdx, S, insn, MI, Address, DisAsm, DecodeComplete); - assert(DecodeComplete); - + S = decodeToMCInst(DecodeIdx, S, insn, MI, Address, DisAsm, + DecodeComplete); LLVM_DEBUG(dbgs() << Loc << ": OPC_Decode: opcode " << Opc - << ", using decoder " << DecodeIdx << ": " - << (S != MCDisassembler::Fail ? "PASS\n" : "FAIL\n")); - return S; - })"; - if (HasTryDecode) { - OS << R"( - case MCD::OPC_TryDecode: { - // Decode the Opcode value. 
- unsigned Opc = decodeULEB128AndIncUnsafe(Ptr); - unsigned DecodeIdx = decodeULEB128AndIncUnsafe(Ptr); - - // Perform the decode operation. - MCInst TmpMI; - TmpMI.setOpcode(Opc); - bool DecodeComplete; - S = decodeToMCInst(DecodeIdx, S, insn, TmpMI, Address, DisAsm, DecodeComplete); - LLVM_DEBUG(dbgs() << Loc << ": OPC_TryDecode: opcode " << Opc - << ", using decoder " << DecodeIdx << ": "); + << ", using decoder " << DecodeIdx << ": " + << (S ? "PASS, " : "FAIL, ")); if (DecodeComplete) { - // Decoding complete. - LLVM_DEBUG(dbgs() << (S != MCDisassembler::Fail ? "PASS\n" : "FAIL\n")); - MI = TmpMI; + LLVM_DEBUG(dbgs() << "decoding complete\n"); return S; } assert(S == MCDisassembler::Fail); if (ScopeStack.empty()) { - LLVM_DEBUG(dbgs() << "FAIL, returning FAIL\n"); + LLVM_DEBUG(dbgs() << "returning Fail\n"); return MCDisassembler::Fail; } Ptr = ScopeStack.pop_back_val(); - LLVM_DEBUG(dbgs() << "FAIL, continuing at " << Ptr - DecodeTable << '\n'); + LLVM_DEBUG(dbgs() << "continuing at " << Ptr - DecodeTable << '\n'); // Reset decode status. This also drops a SoftFail status that could be // set before the decode attempt. S = MCDisassembler::Success; break; })"; - } - if (HasSoftFail) { + if (TableInfo.HasSoftFail) { OS << R"( - case MCD::OPC_SoftFail: { + case OPC_SoftFail: { // Decode the mask values. uint64_t PositiveMask = decodeULEB128AndIncUnsafe(Ptr); uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr); @@ -2575,9 +1959,8 @@ template constexpr uint32_t InsnBitWidth = 0; // Entries in `EncMap` are already sorted by bitwidth. So bucketing per // bitwidth can be done on-the-fly as we iterate over the map. 
- DecoderTableInfo TableInfo; + DecoderTableInfo TableInfo{}; DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo); - unsigned OpcodeMask = 0; bool HasConflict = false; for (const auto &[BitWidth, BWMap] : EncMap) { @@ -2602,8 +1985,8 @@ template constexpr uint32_t InsnBitWidth = 0; TableBuilder.buildTable(FC, BitWidth); // Print the table to the output stream. - OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID, - BitWidth, EncodingIDs); + emitTable(OS, TableInfo, DecoderNamespace, HwModeID, BitWidth, + EncodingIDs); } // Each BitWidth get's its own decoders and decoder function if @@ -2622,14 +2005,12 @@ template constexpr uint32_t InsnBitWidth = 0; if (!SpecializeDecodersPerBitwidth) emitDecoderFunction(OS, TableInfo.Decoders, 0); - const bool HasCheckPredicate = OpcodeMask & (1 << MCD::OPC_CheckPredicate); - // Emit the predicate function. - if (HasCheckPredicate) + if (TableInfo.HasCheckPredicate) emitPredicateFunction(OS, TableInfo.Predicates); // Emit the main entry point for the decoder, decodeInstruction(). 
- emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask); + emitDecodeInstruction(OS, IsVarLenInst, TableInfo); OS << "\n} // namespace\n"; } diff --git a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn index 14e6671f7d9a8..b0c2ca333cfab 100644 --- a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn @@ -16,6 +16,7 @@ unittest("CoreTests") { ] sources = [ "BinaryContext.cpp", + "ClusteredRows.cpp", "DynoStats.cpp", "MCPlusBuilder.cpp", "MemoryMaps.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index 0d1f91061af05..877dfce428c80 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -31,6 +31,7 @@ static_library("bugprone") { "CopyConstructorInitCheck.cpp", "CrtpConstructorAccessibilityCheck.cpp", "DanglingHandleCheck.cpp", + "DerivedMethodShadowingBaseMethodCheck.cpp", "DynamicStaticInitializersCheck.cpp", "EasilySwappableParametersCheck.cpp", "EmptyCatchCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn index 93c0f3c51fe89..57e9300159971 100644 --- a/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn @@ -21,6 +21,7 @@ static_library("Format") { "MacroExpander.cpp", "MatchFilePath.cpp", "NamespaceEndCommentsFixer.cpp", + "NumericLiteralCaseFixer.cpp", "NumericLiteralInfo.cpp", "ObjCPropertyAttributeOrderFixer.cpp", "QualifierAlignmentFixer.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn index 88521a8e59da2..c501f121df4a8 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn +++ 
b/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn @@ -38,6 +38,7 @@ unittest("FormatTests") { "MacroExpanderTest.cpp", "MatchFilePathTest.cpp", "NamespaceEndCommentsFixerTest.cpp", + "NumericLiteralCaseTest.cpp", "NumericLiteralInfoTest.cpp", "ObjCPropertyAttributeOrderFixerTest.cpp", "QualifierFixerTest.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/Interpreter/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Interpreter/BUILD.gn index 103954e5756d3..74fcb8427511b 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Interpreter/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Interpreter/BUILD.gn @@ -18,6 +18,7 @@ unittest("ClangReplInterpreterTests") { "IncrementalProcessingTest.cpp", "InterpreterExtensionsTest.cpp", "InterpreterTest.cpp", + "OutOfProcessInterpreterTests.cpp", ] # Support plugins. diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn index 6b1cac748558c..e82fe2d11b75d 100644 --- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn @@ -142,6 +142,7 @@ write_lit_cfg("lit_shell_site_cfg") { "LLDB_TOOL_LLDB_SERVER_BUILD=1", "LLDB_TOOLS_DIR=" + rebase_path("$root_out_dir/bin"), "LLDB_USE_SYSTEM_DEBUGSERVER=1", # XXX port //lldb/tools/debugserver (?) + "LLVM_ENABLE_DIA_SDK=0", # FIXME: option? just enable on windows? 
"LLVM_HOST_TRIPLE=$llvm_current_triple", "LLVM_USE_SANITIZER=", "Python3_EXECUTABLE=$python_path", diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 499ded9422dab..022cd87a5b303 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -301,6 +301,7 @@ write_cmake_config("llvm-config") { "LLVM_BUILD_SHARED_LIBS=", "LLVM_ENABLE_LLVM_C_EXPORT_ANNOTATIONS=", "LLVM_ENABLE_TELEMETRY=", + "LLVM_ENABLE_ONDISK_CAS=", "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple", "LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE=", "LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN=", diff --git a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn index 8b67ac3a8888d..2f692d752ee18 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn @@ -5,6 +5,8 @@ static_library("CAS") { "ActionCaches.cpp", "BuiltinCAS.cpp", "InMemoryCAS.cpp", + "MappedFileRegionArena.cpp", "ObjectStore.cpp", + "OnDiskCommon.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/LSP/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/LSP/BUILD.gn new file mode 100644 index 0000000000000..c510891dca092 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/Support/LSP/BUILD.gn @@ -0,0 +1,9 @@ +static_library("LSP") { + output_name = "LLVMSupportLSP" + deps = [ "//llvm/lib/Support" ] + sources = [ + "Logging.cpp", + "Protocol.cpp", + "Transport.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn index 83fd0aa8de422..6d85c7fb67477 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn @@ -108,6 +108,7 @@ static_library("LLVMAArch64CodeGen") { "//llvm/lib/CodeGen/SelectionDAG", "//llvm/lib/IR", 
"//llvm/lib/MC", + "//llvm/lib/Passes", "//llvm/lib/Support", "//llvm/lib/Target", "//llvm/lib/TargetParser", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/Disassembler/BUILD.gn index 196e4a6ae6826..c445ef2bacc62 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/Disassembler/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/Disassembler/BUILD.gn @@ -5,7 +5,6 @@ tablegen("AArch64GenDisassemblerTables") { args = [ "-gen-disassembler", "-ignore-non-decodable-operands", - "-ignore-fully-defined-operands", ] td_file = "../AArch64.td" } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AVR/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AVR/Disassembler/BUILD.gn index bbae270d24c46..dded556b786fb 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AVR/Disassembler/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AVR/Disassembler/BUILD.gn @@ -2,10 +2,7 @@ import("//llvm/utils/TableGen/tablegen.gni") tablegen("AVRGenDisassemblerTables") { visibility = [ ":Disassembler" ] - args = [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ] + args = [ "-gen-disassembler" ] td_file = "../AVR.td" } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn index aa594df8c164a..6dc75540731a0 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn @@ -79,6 +79,7 @@ static_library("LLVMBPFCodeGen") { "BPFRegisterInfo.cpp", "BPFSelectionDAGInfo.cpp", "BPFSubtarget.cpp", + "BPFTargetLoweringObjectFile.cpp", "BPFTargetMachine.cpp", "BTFDebug.cpp", "GISel/BPFCallLowering.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/Disassembler/BUILD.gn index 924317d20eee6..f47fe7ac28cee 100644 --- 
a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/Disassembler/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/Disassembler/BUILD.gn @@ -2,10 +2,7 @@ import("//llvm/utils/TableGen/tablegen.gni") tablegen("BPFGenDisassemblerTables") { visibility = [ ":Disassembler" ] - args = [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ] + args = [ "-gen-disassembler" ] td_file = "../BPF.td" } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/Disassembler/BUILD.gn index 2d21060086036..35a5d86c7e135 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/Disassembler/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/Disassembler/BUILD.gn @@ -2,10 +2,7 @@ import("//llvm/utils/TableGen/tablegen.gni") tablegen("HexagonGenDisassemblerTables") { visibility = [ ":Disassembler" ] - args = [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ] + args = [ "-gen-disassembler" ] td_file = "../Hexagon.td" } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index 306e4d3f9f6b8..a1f5b475e2096 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -109,6 +109,7 @@ static_library("LLVMRISCVCodeGen") { "//llvm/lib/CodeGen/SelectionDAG", "//llvm/lib/IR", "//llvm/lib/MC", + "//llvm/lib/Passes", "//llvm/lib/Support", "//llvm/lib/Target", "//llvm/lib/TargetParser", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn index 94b6de7af6044..447a67af6be7b 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn @@ -5,7 +5,6 @@ tablegen("RISCVGenDisassemblerTables") { args = [ "-gen-disassembler", "-specialize-decoders-per-bitwidth", 
- "-ignore-non-decodable-operands", ] td_file = "../RISCV.td" } diff --git a/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn index 183fa57d47a63..73ed834599e02 100644 --- a/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn @@ -17,6 +17,7 @@ static_library("TargetParser") { "RISCVISAInfo.cpp", "RISCVTargetParser.cpp", "SubtargetFeature.cpp", + "TargetDataLayout.cpp", "TargetParser.cpp", "Triple.cpp", "X86TargetParser.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn index bf650c38e150a..186d2ef96c19b 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn @@ -29,6 +29,7 @@ static_library("Utils") { "CtorUtils.cpp", "DXILUpgrade.cpp", "Debugify.cpp", + "DebugSSAUpdater.cpp", "DeclareRuntimeLibcalls.cpp", "DemoteRegToStack.cpp", "EntryExitInstrumenter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn index 22026664ae596..9ca6715f155c3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn @@ -51,6 +51,7 @@ group("unittests") { "SandboxIR:SandboxIRTests", "Support:SupportTests", "Support/DynamicLibrary:DynamicLibraryTests", + "Support/LSP:LSPTests", "TableGen:TableGenTests", "Target:TargetMachineCTests", "TargetParser:TargetParserTests", diff --git a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn index efd4f19b675f2..de6de0b119e9e 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn @@ -10,5 +10,6 @@ unittest("CASTests") { "ActionCacheTest.cpp", "CASTestConfig.cpp", "ObjectStoreTest.cpp", + "ProgramTest.cpp", ] } diff --git 
a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn index 229586a9b0e3c..1f6c7e75d989f 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn @@ -30,6 +30,7 @@ unittest("IRTests") { "DominatorTreeTest.cpp", "DroppedVariableStatsIRTest.cpp", "FunctionTest.cpp", + "GlobalObjectTest.cpp", "IRBuilderTest.cpp", "InstructionsTest.cpp", "IntrinsicsTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/LSP/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/LSP/BUILD.gn new file mode 100644 index 0000000000000..3b59ef1288652 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/LSP/BUILD.gn @@ -0,0 +1,9 @@ +import("//third-party/unittest/unittest.gni") + +unittest("LSPTests") { + deps = [ "//llvm/lib/Support/LSP" ] + sources = [ + "Protocol.cpp", + "Transport.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn index 380ed71a2bc01..b090552c87bfd 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn @@ -19,6 +19,7 @@ unittest("UtilsTests") { "CodeExtractorTest.cpp", "CodeLayoutTest.cpp", "CodeMoverUtilsTest.cpp", + "DebugSSAUpdaterTest.cpp", "DebugifyTest.cpp", "FunctionComparatorTest.cpp", "IntegerDivisionTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn index db11e56e550f9..1a8bec13a6580 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn @@ -25,6 +25,7 @@ static_library("Common") { "GlobalISel/PatternParser.cpp", "GlobalISel/Patterns.cpp", "InfoByHwMode.cpp", + "InstructionEncoding.cpp", "OptEmitter.cpp", "PredicateExpander.cpp", 
"SubtargetFeatureInfo.cpp", diff --git a/llvm/utils/lit/CMakeLists.txt b/llvm/utils/lit/CMakeLists.txt index d22a778e2e531..97b1d7c022fd5 100644 --- a/llvm/utils/lit/CMakeLists.txt +++ b/llvm/utils/lit/CMakeLists.txt @@ -22,7 +22,7 @@ add_custom_target(prepare-check-lit # Add rules for lit's own test suite add_lit_testsuite(check-lit "Running lit's tests" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS "FileCheck" "not" "prepare-check-lit" + DEPENDS "FileCheck" "not" "split-file" "prepare-check-lit" ) # For IDEs diff --git a/llvm/utils/lit/lit/DiffUpdater.py b/llvm/utils/lit/lit/DiffUpdater.py index de0001a94f0ba..a29c46fb8508f 100644 --- a/llvm/utils/lit/lit/DiffUpdater.py +++ b/llvm/utils/lit/lit/DiffUpdater.py @@ -1,37 +1,143 @@ import shutil +import os +import shlex +import pathlib """ This file provides the `diff_test_updater` function, which is invoked on failed RUN lines when lit is executed with --update-tests. It checks whether the failed command is `diff` and, if so, uses heuristics to determine which file is the checked-in reference file and which file is output from the test case. The heuristics are currently as follows: + - if exactly one file originates from the `split-file` command, that file is the reference file and the other is the output file - if exactly one file ends with ".expected" (common pattern in LLVM), that file is the reference file and the other is the output file - if exactly one file path contains ".tmp" (e.g. because it contains the expansion of "%t"), that file is the reference file and the other is the output file If the command matches one of these patterns the output file content is copied to the reference file to make the test pass. +If the reference file originated in `split-file`, the output file content is instead copied to the corresponding slice of the test file. Otherwise the test is ignored. 
Possible improvements: - Support stdin patterns like "my_binary %s | diff expected.txt" - - Scan RUN lines to see if a file is the source of output from a previous command. + - Scan RUN lines to see if a file is the source of output from a previous command (other than `split-file`). If it is then it is not a reference file that can be copied to, regardless of name, since the test will overwrite it anyways. - Only update the parts that need updating (based on the diff output). Could help avoid noisy updates when e.g. whitespace changes are ignored. """ -def get_source_and_target(a, b): +class NormalFileTarget: + def __init__(self, target): + self.target = target + + def copyFrom(self, source): + shutil.copy(source, self.target) + + def __str__(self): + return self.target + + +class SplitFileTarget: + def __init__(self, slice_start_idx, test_path, lines): + self.slice_start_idx = slice_start_idx + self.test_path = test_path + self.lines = lines + + def copyFrom(self, source): + lines_before = self.lines[: self.slice_start_idx + 1] + self.lines = self.lines[self.slice_start_idx + 1 :] + slice_end_idx = None + for i, l in enumerate(self.lines): + if SplitFileTarget._get_split_line_path(l) != None: + slice_end_idx = i + break + if slice_end_idx is not None: + lines_after = self.lines[slice_end_idx:] + else: + lines_after = [] + with open(source, "r") as f: + new_lines = lines_before + f.readlines() + lines_after + with open(self.test_path, "w") as f: + for l in new_lines: + f.write(l) + + def __str__(self): + return f"slice in {self.test_path}" + + @staticmethod + def get_target_dir(commands, test_path): + # posix=True breaks Windows paths because \ is treated as an escaping character + for cmd in commands: + split = shlex.split(cmd, posix=False) + if "split-file" not in split: + continue + start_idx = split.index("split-file") + split = split[start_idx:] + if len(split) < 3: + continue + p = unquote(split[1].strip()) + if not test_path.samefile(p): + continue + return 
unquote(split[2].strip()) + return None + + @staticmethod + def create(path, commands, test_path, target_dir): + path = pathlib.Path(path) + with open(test_path, "r") as f: + lines = f.readlines() + for i, l in enumerate(lines): + p = SplitFileTarget._get_split_line_path(l) + if p and path.samefile(os.path.join(target_dir, p)): + idx = i + break + else: + return None + return SplitFileTarget(idx, test_path, lines) + + @staticmethod + def _get_split_line_path(l): + if len(l) < 6: + return None + if l.startswith("//"): + l = l[2:] + else: + l = l[1:] + if l.startswith("--- "): + l = l[4:] + else: + return None + return l.rstrip() + + +def unquote(s): + if len(s) > 1 and s[0] == s[-1] and (s[0] == '"' or s[0] == "'"): + return s[1:-1] + return s + + +def get_source_and_target(a, b, test_path, commands): """ Try to figure out which file is the test output and which is the reference. """ + split_target_dir = SplitFileTarget.get_target_dir(commands, test_path) + if split_target_dir: + a_target = SplitFileTarget.create(a, commands, test_path, split_target_dir) + b_target = SplitFileTarget.create(b, commands, test_path, split_target_dir) + if a_target and b_target: + return None + if a_target: + return b, a_target + if b_target: + return a, b_target + expected_suffix = ".expected" if a.endswith(expected_suffix) and not b.endswith(expected_suffix): - return b, a + return b, NormalFileTarget(a) if b.endswith(expected_suffix) and not a.endswith(expected_suffix): - return a, b + return a, NormalFileTarget(b) tmp_substr = ".tmp" if tmp_substr in a and not tmp_substr in b: - return a, b + return a, NormalFileTarget(b) if tmp_substr in b and not tmp_substr in a: - return b, a + return b, NormalFileTarget(a) return None @@ -40,16 +146,16 @@ def filter_flags(args): return [arg for arg in args if not arg.startswith("-")] -def diff_test_updater(result, test): +def diff_test_updater(result, test, commands): args = filter_flags(result.command.args) if len(args) != 3: return None [cmd, 
a, b] = args if cmd != "diff": return None - res = get_source_and_target(a, b) + res = get_source_and_target(a, b, pathlib.Path(test.getFilePath()), commands) if not res: return f"update-diff-test: could not deduce source and target from {a} and {b}" source, target = res - shutil.copy(source, target) + target.copyFrom(source) return f"update-diff-test: copied {source} to {target}" diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 0e32838eea1cb..90c2c6479b004 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -92,11 +92,12 @@ class ShellEnvironment(object): we maintain a dir stack for pushd/popd. """ - def __init__(self, cwd, env, umask=-1): + def __init__(self, cwd, env, umask=-1, ulimit={}): self.cwd = cwd self.env = dict(env) self.umask = umask self.dirStack = [] + self.ulimit = ulimit def change_dir(self, newdir): if os.path.isabs(newdir): @@ -595,6 +596,27 @@ def executeBuiltinUmask(cmd, shenv): return ShellCommandResult(cmd, "", "", 0, False) +def executeBuiltinUlimit(cmd, shenv): + """executeBuiltinUlimit - Change the current limits.""" + if os.name != "posix": + raise InternalShellError(cmd, "'ulimit' not supported on this system") + if len(cmd.args) != 3: + raise InternalShellError(cmd, "'ulimit' requires two arguments") + try: + new_limit = int(cmd.args[2]) + except ValueError as err: + raise InternalShellError(cmd, "Error: 'ulimit': %s" % str(err)) + if cmd.args[1] == "-v": + shenv.ulimit["RLIMIT_AS"] = new_limit * 1024 + elif cmd.args[1] == "-n": + shenv.ulimit["RLIMIT_NOFILE"] = new_limit + else: + raise InternalShellError( + cmd, "'ulimit' does not support option: %s" % cmd.args[1] + ) + return ShellCommandResult(cmd, "", "", 0, False) + + def executeBuiltinColon(cmd, cmd_shenv): """executeBuiltinColon - Discard arguments and exit with status 0.""" return ShellCommandResult(cmd, "", "", 0, False) @@ -749,6 +771,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): "popd": 
executeBuiltinPopd, "pushd": executeBuiltinPushd, "rm": executeBuiltinRm, + "ulimit": executeBuiltinUlimit, "umask": executeBuiltinUmask, ":": executeBuiltinColon, } @@ -899,14 +922,8 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): # Replace uses of /dev/null with temporary files. if kAvoidDevNull: - # In Python 2.x, basestring is the base class for all string (including unicode) - # In Python 3.x, basestring no longer exist and str is always unicode - try: - str_type = basestring - except NameError: - str_type = str for i, arg in enumerate(args): - if isinstance(arg, str_type) and kDevNull in arg: + if isinstance(arg, str) and kDevNull in arg: f = tempfile.NamedTemporaryFile(delete=False) f.close() named_temp_files.append(f.name) @@ -920,6 +937,19 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): if kIsWindows: args = quote_windows_command(args) + # Handle any resource limits. We do this by launching the command with + # a wrapper that sets the necessary limits. We use a wrapper rather than + # setting the limits in process as we cannot reraise the limits back to + # their defaults without elevated permissions. 
+ if cmd_shenv.ulimit: + executable = sys.executable + args.insert(0, sys.executable) + args.insert(1, os.path.join(builtin_commands_dir, "_launch_with_limit.py")) + for limit in cmd_shenv.ulimit: + cmd_shenv.env["LIT_INTERNAL_ULIMIT_" + limit] = str( + cmd_shenv.ulimit[limit] + ) + try: # TODO(boomanaiden154): We currently wrap the subprocess.Popen with # os.umask as the umask argument in subprocess.Popen is not @@ -1241,7 +1271,7 @@ def executeScriptInternal( ): for test_updater in litConfig.test_updaters: try: - update_output = test_updater(result, test) + update_output = test_updater(result, test, commands) except Exception as e: output = out output += err diff --git a/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py new file mode 100644 index 0000000000000..33d2d59ff0dbe --- /dev/null +++ b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py @@ -0,0 +1,25 @@ +import sys +import subprocess +import resource +import os + +ULIMIT_ENV_VAR_PREFIX = "LIT_INTERNAL_ULIMIT_" + + +def main(argv): + command_args = argv[1:] + for env_var in os.environ: + if env_var.startswith(ULIMIT_ENV_VAR_PREFIX): + limit_str = env_var[len(ULIMIT_ENV_VAR_PREFIX) :] + limit_value = int(os.environ[env_var]) + limit = (limit_value, limit_value) + if limit_str == "RLIMIT_AS": + resource.setrlimit(resource.RLIMIT_AS, limit) + elif limit_str == "RLIMIT_NOFILE": + resource.setrlimit(resource.RLIMIT_NOFILE, limit) + process_output = subprocess.run(command_args) + sys.exit(process_output.returncode) + + +if __name__ == "__main__": + main(sys.argv) diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py index 56aa5eb64fa36..2f2df68ac0cff 100644 --- a/llvm/utils/lit/lit/llvm/config.py +++ b/llvm/utils/lit/lit/llvm/config.py @@ -198,6 +198,9 @@ def __init__(self, lit_config, config): if gmalloc_path_str is not None: self.with_environment("DYLD_INSERT_LIBRARIES", gmalloc_path_str) + if not 
platform.system() == "Windows": + features.add("symlinks") + def _find_git_windows_unix_tools(self, tools_needed): assert sys.platform == "win32" import winreg @@ -230,7 +233,7 @@ def with_environment(self, variable, value, append_path=False): # For paths, we should be able to take a list of them and process # all of them. paths_to_add = value - if lit.util.is_string(paths_to_add): + if isinstance(paths_to_add, str): paths_to_add = [paths_to_add] def norm(x): @@ -259,7 +262,7 @@ def norm(x): self.config.environment[variable] = value def with_system_environment(self, variables, append_path=False): - if lit.util.is_string(variables): + if isinstance(variables, str): variables = [variables] for v in variables: value = os.environ.get(v) @@ -401,7 +404,7 @@ def add_tool_substitutions(self, tools, search_dirs=None): if not search_dirs: search_dirs = [self.config.llvm_tools_dir] - if lit.util.is_string(search_dirs): + if isinstance(search_dirs, str): search_dirs = [search_dirs] tools = [x if isinstance(x, ToolSubst) else ToolSubst(x) for x in tools] diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py index b03fd8bc22693..ce4c3c2df3436 100644 --- a/llvm/utils/lit/lit/util.py +++ b/llvm/utils/lit/lit/util.py @@ -13,14 +13,6 @@ import threading -def is_string(value): - try: - # Python 2 and Python 3 are different here. 
- return isinstance(value, basestring) - except NameError: - return isinstance(value, str) - - def pythonize_bool(value): if value is None: return False @@ -28,7 +20,7 @@ def pythonize_bool(value): return value if isinstance(value, numbers.Number): return value != 0 - if is_string(value): + if isinstance(value, str): if value.lower() in ("1", "true", "on", "yes"): return True if value.lower() in ("", "0", "false", "off", "no"): diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/.gitignore b/llvm/utils/lit/tests/Inputs/diff-test-update/.gitignore index dd373bf9e0c66..aea8ee3be4982 100644 --- a/llvm/utils/lit/tests/Inputs/diff-test-update/.gitignore +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/.gitignore @@ -1,2 +1,10 @@ ; diff-tmp-dir.test clobbers this file empty.txt +; these test cases are clobbered when run, so they're recreated each time +single-split-file.test +single-split-file-populated.test +multiple-split-file.test +multiple-split-file-populated.test +single-split-file-no-expected.test +split-c-comments.test +split whitespace.test diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file-populated.in b/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file-populated.in new file mode 100644 index 0000000000000..e218ed6a0c6ea --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file-populated.in @@ -0,0 +1,17 @@ +# RUN: split-file %s %t +# RUN: cp %S/1.in %t/out.txt +# RUN: diff %t/test3.expected %t/out.txt + +#--- test1.expected +unrelated +#--- test2.expected +#--- test3.expected +BAR + +BAZ + +#--- test4.expected +filler +#--- test5.expected + + diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file.in b/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file.in new file mode 100644 index 0000000000000..c47db99912c24 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file.in @@ -0,0 +1,13 @@ +# RUN: split-file %s %t +# RUN: cp 
%S/1.in %t/out.txt +# RUN: diff %t/test3.expected %t/out.txt + +#--- test1.expected +unrelated +#--- test2.expected +#--- test3.expected +#--- test4.expected +filler +#--- test5.expected + + diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file.out b/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file.out new file mode 100644 index 0000000000000..c1d2782d3c2d4 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/multiple-split-file.out @@ -0,0 +1,14 @@ +# RUN: split-file %s %t +# RUN: cp %S/1.in %t/out.txt +# RUN: diff %t/test3.expected %t/out.txt + +#--- test1.expected +unrelated +#--- test2.expected +#--- test3.expected +FOO +#--- test4.expected +filler +#--- test5.expected + + diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-no-expected.in b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-no-expected.in new file mode 100644 index 0000000000000..510dc7afba16b --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-no-expected.in @@ -0,0 +1,6 @@ +# RUN: split-file %s %t +# RUN: cp %S/1.in %t/out.txt +# RUN: diff %t/test.txt %t/out.txt + +#--- test.txt + diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-no-expected.out b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-no-expected.out new file mode 100644 index 0000000000000..f52e3004aee15 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-no-expected.out @@ -0,0 +1,6 @@ +# RUN: split-file %s %t +# RUN: cp %S/1.in %t/out.txt +# RUN: diff %t/test.txt %t/out.txt + +#--- test.txt +FOO diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-populated.in b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-populated.in new file mode 100644 index 0000000000000..63042cf9b86bc --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file-populated.in @@ -0,0 +1,7 @@ +# RUN: split-file 
%s %t +# RUN: cp %S/1.in %t/out.txt +# RUN: diff %t/test.expected %t/out.txt + +#--- test.expected +BAR + diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file.in b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file.in new file mode 100644 index 0000000000000..422ccf2ef6813 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file.in @@ -0,0 +1,5 @@ +# RUN: split-file %s %t +# RUN: cp %S/1.in %t/out.txt +# RUN: diff %t/test.expected %t/out.txt + +#--- test.expected diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file.out b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file.out new file mode 100644 index 0000000000000..5552ad328ec5c --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/single-split-file.out @@ -0,0 +1,6 @@ +# RUN: split-file %s %t +# RUN: cp %S/1.in %t/out.txt +# RUN: diff %t/test.expected %t/out.txt + +#--- test.expected +FOO diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/split-both.test b/llvm/utils/lit/tests/Inputs/diff-test-update/split-both.test new file mode 100644 index 0000000000000..f564f446cc94b --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/split-both.test @@ -0,0 +1,11 @@ +# RUN: split-file %s %t +# RUN: diff %t/split-both.expected %t/split-both.out + +# ignore the fact that it's called ".expected" +# when comparing two files originating in split-file + +#--- split-both.expected +FOO +#--- split-both.out +BAR + diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/split-c-comments.in b/llvm/utils/lit/tests/Inputs/diff-test-update/split-c-comments.in new file mode 100644 index 0000000000000..3cda60118f5ba --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/split-c-comments.in @@ -0,0 +1,6 @@ +// RUN: split-file %s %t +// RUN: cp %S/1.in %t/out.txt +// RUN: diff %t/test.txt %t/out.txt +// +//--- test.txt + diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/split-c-comments.out 
b/llvm/utils/lit/tests/Inputs/diff-test-update/split-c-comments.out new file mode 100644 index 0000000000000..5020804f198b1 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/split-c-comments.out @@ -0,0 +1,6 @@ +// RUN: split-file %s %t +// RUN: cp %S/1.in %t/out.txt +// RUN: diff %t/test.txt %t/out.txt +// +//--- test.txt +FOO diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/split-whitespace.in b/llvm/utils/lit/tests/Inputs/diff-test-update/split-whitespace.in new file mode 100644 index 0000000000000..ad48d2ae4953c --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/split-whitespace.in @@ -0,0 +1,6 @@ +// RUN: split-file "%s" "%t" +// RUN: cp %S/1.in "%t/out.txt" +// RUN: diff "%t/test.txt" "%t/out.txt" +// +//--- test.txt + diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/split-whitespace.out b/llvm/utils/lit/tests/Inputs/diff-test-update/split-whitespace.out new file mode 100644 index 0000000000000..cb28124101ac6 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/split-whitespace.out @@ -0,0 +1,6 @@ +// RUN: split-file "%s" "%t" +// RUN: cp %S/1.in "%t/out.txt" +// RUN: diff "%t/test.txt" "%t/out.txt" +// +//--- test.txt +FOO diff --git a/llvm/utils/lit/tests/Inputs/diff-test-update/unrelated-split.test b/llvm/utils/lit/tests/Inputs/diff-test-update/unrelated-split.test new file mode 100644 index 0000000000000..b04eff36721de --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/diff-test-update/unrelated-split.test @@ -0,0 +1,11 @@ +# the fact that this test runs split-file is unrelated +# to the diffed files + +# RUN: mkdir %t +# RUN: split-file %s %t +# RUN: cp %S/1.in %t/unrelated-split.expected +# RUN: cp %S/2.in %t/unrelated-split.txt +# RUN: diff %t/unrelated-split.expected %t/unrelated-split.txt + +#--- distraction.txt + diff --git a/llvm/utils/lit/tests/Inputs/pass-test-update/should_not_run.py b/llvm/utils/lit/tests/Inputs/pass-test-update/should_not_run.py index 0fda62c832f08..5b39d208a2ed6 100644 
--- a/llvm/utils/lit/tests/Inputs/pass-test-update/should_not_run.py +++ b/llvm/utils/lit/tests/Inputs/pass-test-update/should_not_run.py @@ -1,2 +1,2 @@ -def should_not_run(foo, bar): +def should_not_run(foo, bar, baz): raise Exception("this test updater should only run on failure") diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/lit.cfg new file mode 100644 index 0000000000000..c7bdc7e7b6bc0 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/lit.cfg @@ -0,0 +1,8 @@ +import lit.formats + +config.name = "shtest-ulimit" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest(execute_external=False) +config.test_source_root = None +config.test_exec_root = None +config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt new file mode 100644 index 0000000000000..dbdd0037e70a7 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt @@ -0,0 +1,4 @@ +# RUN: ulimit -v 1048576 +# RUN: %{python} %S/../shtest-ulimit/print_limits.py +# Fail the test so that we can assert on the output. 
+# RUN: not echo return diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-ulimit/lit.cfg new file mode 100644 index 0000000000000..c7bdc7e7b6bc0 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/lit.cfg @@ -0,0 +1,8 @@ +import lit.formats + +config.name = "shtest-ulimit" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest(execute_external=False) +config.test_source_root = None +config.test_exec_root = None +config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py new file mode 100644 index 0000000000000..632f954fa8fde --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py @@ -0,0 +1,4 @@ +import resource + +print("RLIMIT_AS=" + str(resource.getrlimit(resource.RLIMIT_AS)[0])) +print("RLIMIT_NOFILE=" + str(resource.getrlimit(resource.RLIMIT_NOFILE)[0])) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit-bad-arg.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit-bad-arg.txt new file mode 100644 index 0000000000000..efa22881047e9 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit-bad-arg.txt @@ -0,0 +1 @@ +# RUN: ulimit -n diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt new file mode 100644 index 0000000000000..4edf1c303a092 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt @@ -0,0 +1,4 @@ +# RUN: ulimit -n 50 +# RUN: %{python} %S/print_limits.py +# Fail the test so that we can assert on the output. 
+# RUN: not echo return diff --git a/llvm/utils/lit/tests/diff-test-update.py b/llvm/utils/lit/tests/diff-test-update.py index c37d0dccc727c..8b9f4610f7f95 100644 --- a/llvm/utils/lit/tests/diff-test-update.py +++ b/llvm/utils/lit/tests/diff-test-update.py @@ -1,10 +1,29 @@ +# RUN: cp %S/Inputs/diff-test-update/single-split-file.in %S/Inputs/diff-test-update/single-split-file.test +# RUN: cp %S/Inputs/diff-test-update/single-split-file-populated.in %S/Inputs/diff-test-update/single-split-file-populated.test +# RUN: cp %S/Inputs/diff-test-update/multiple-split-file.in %S/Inputs/diff-test-update/multiple-split-file.test +# RUN: cp %S/Inputs/diff-test-update/multiple-split-file-populated.in %S/Inputs/diff-test-update/multiple-split-file-populated.test +# RUN: cp %S/Inputs/diff-test-update/single-split-file-no-expected.in %S/Inputs/diff-test-update/single-split-file-no-expected.test +# RUN: cp %S/Inputs/diff-test-update/split-c-comments.in %S/Inputs/diff-test-update/split-c-comments.test +# RUN: cp %S/Inputs/diff-test-update/split-whitespace.in "%S/Inputs/diff-test-update/split whitespace.test" + # RUN: not %{lit} --update-tests -v %S/Inputs/diff-test-update | FileCheck %s +# RUN: diff --strip-trailing-cr %S/Inputs/diff-test-update/single-split-file.out %S/Inputs/diff-test-update/single-split-file.test +# RUN: diff --strip-trailing-cr %S/Inputs/diff-test-update/single-split-file.out %S/Inputs/diff-test-update/single-split-file-populated.test +# RUN: diff --strip-trailing-cr %S/Inputs/diff-test-update/multiple-split-file.out %S/Inputs/diff-test-update/multiple-split-file.test +# RUN: diff --strip-trailing-cr %S/Inputs/diff-test-update/multiple-split-file.out %S/Inputs/diff-test-update/multiple-split-file-populated.test +# RUN: diff --strip-trailing-cr %S/Inputs/diff-test-update/single-split-file-no-expected.out %S/Inputs/diff-test-update/single-split-file-no-expected.test +# RUN: diff --strip-trailing-cr %S/Inputs/diff-test-update/split-c-comments.out 
%S/Inputs/diff-test-update/split-c-comments.test +# RUN: diff --strip-trailing-cr %S/Inputs/diff-test-update/split-whitespace.out "%S/Inputs/diff-test-update/split whitespace.test" + + # CHECK: # update-diff-test: could not deduce source and target from {{.*}}1.in and {{.*}}2.in # CHECK: # update-diff-test: could not deduce source and target from {{.*}}1.txt and {{.*}}2.txt # CHECK: # update-diff-test: copied {{.*}}my-file.txt to {{.*}}my-file.expected # CHECK: # update-diff-test: copied {{.*}}1.txt to {{.*}}empty.txt # CHECK: # update-diff-test: copied {{.*}}diff-tmp.test.tmp.txt to {{.*}}diff-t-out.txt +# CHECK: # update-diff-test: could not deduce source and target from {{.*}}split-both.expected and {{.*}}split-both.out +# CHECK: # update-diff-test: copied {{.*}}unrelated-split.txt to {{.*}}unrelated-split.expected -# CHECK: Failed: 5 (100.00%) +# CHECK: Failed: 14 (100.00%) diff --git a/llvm/utils/lit/tests/pass-test-update.py b/llvm/utils/lit/tests/pass-test-update.py index 00a4025be660e..2e9f1be2bccab 100644 --- a/llvm/utils/lit/tests/pass-test-update.py +++ b/llvm/utils/lit/tests/pass-test-update.py @@ -12,7 +12,7 @@ # CHECK: Exception occurred in test updater: # CHECK: Traceback (most recent call last): # CHECK: File {{.*}}, line {{.*}}, in {{.*}} -# CHECK: update_output = test_updater(result, test) +# CHECK: update_output = test_updater(result, test, commands) # CHECK: File "{{.*}}{{/|\\}}should_not_run.py", line {{.*}}, in should_not_run # CHECK: raise Exception("this test updater should only run on failure") # CHECK: Exception: this test updater should only run on failure diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py new file mode 100644 index 0000000000000..2661a2c8d6448 --- /dev/null +++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py @@ -0,0 +1,13 @@ +# Check the ulimit command + +# ulimit does not work on non-POSIX platforms. 
+# These tests are specific to options that Darwin does not support. +# UNSUPPORTED: system-windows, system-darwin + +# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s + +# CHECK: -- Testing: 1 tests{{.*}} + +# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}}) +# CHECK: ulimit -v 1048576 +# CHECK: RLIMIT_AS=1073741824 diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py new file mode 100644 index 0000000000000..e84327772d3a1 --- /dev/null +++ b/llvm/utils/lit/tests/shtest-ulimit.py @@ -0,0 +1,18 @@ +# Check the ulimit command + +# ulimit does not work on non-POSIX platforms. +# Solaris for some reason does not respect ulimit -n, so mark it unsupported +# as well. +# UNSUPPORTED: system-windows, system-solaris + +# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit | FileCheck %s + +# CHECK: -- Testing: 2 tests{{.*}} + +# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit-bad-arg.txt ({{[^)]*}}) +# CHECK: ulimit -n +# CHECK: 'ulimit' requires two arguments + +# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}}) +# CHECK: ulimit -n 50 +# CHECK: RLIMIT_NOFILE=50 diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index e1ee7c3664a51..482848842aa05 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -830,7 +830,6 @@ Transforms/IndVarSimplify/invalidate-modified-lcssa-phi.ll Transforms/IndVarSimplify/pr45835.ll Transforms/IndVarSimplify/preserving-debugloc-rem-div.ll Transforms/Inline/optimization-remarks-hotness-threshold.ll -Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll Transforms/InstCombine/2004-09-20-BadLoadCombine.ll Transforms/InstCombine/2005-04-07-UDivSelectCrash.ll Transforms/InstCombine/2011-02-14-InfLoop.ll @@ -1303,6 +1302,7 @@ Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll Transforms/LoopVectorize/first-order-recurrence-complex.ll Transforms/LoopVectorize/first-order-recurrence.ll 
Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll Transforms/LoopVectorize/float-induction.ll Transforms/LoopVectorize/float-minmax-instruction-flag.ll Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll diff --git a/llvm/utils/update_any_test_checks.py b/llvm/utils/update_any_test_checks.py index 76fe336593929..ec277f140a34f 100755 --- a/llvm/utils/update_any_test_checks.py +++ b/llvm/utils/update_any_test_checks.py @@ -63,7 +63,7 @@ def expand_listfile_args(arg_list): return exp_arg_list -def utc_lit_plugin(result, test): +def utc_lit_plugin(result, test, commands): testname = test.getFilePath() if not testname: return None diff --git a/mlir/Maintainers.md b/mlir/Maintainers.md index 02e93eb658279..5d3b576c2e751 100644 --- a/mlir/Maintainers.md +++ b/mlir/Maintainers.md @@ -46,7 +46,7 @@ dialects, build system and language bindings. * ‘ptr’ Dialect ([fabianmcg](https://github.com/fabianmcg)) #### Basic Compute Dialects -* ‘arith’ Dialect (core) +* ‘arith’ Dialect (core + [kuhar](https://github.com/kuhar)) * ‘math’ Dialect (core) * Rewrite System Dialects (core) * Transform Dialect ([martin-luecke](https://github.com/martin-luecke), [ftynse](https://github.com/ftynse), [rolfmorel](https://github.com/rolfmorel)) diff --git a/mlir/docs/Dialects/Affine.md b/mlir/docs/Dialects/Affine.md index 0b6d7747e8a6f..90aa43a85d877 100644 --- a/mlir/docs/Dialects/Affine.md +++ b/mlir/docs/Dialects/Affine.md @@ -81,9 +81,9 @@ dimension is either static or a dynamic one in turn bound to a valid symbol. Note that as a result of rule (3) above, symbol validity is sensitive to the location of the SSA use. 
Dimensions may be bound not only to anything that a symbol is bound to, but also to induction variables of enclosing -[`affine.for`](#affinefor-mliraffineforop) and -[`affine.parallel`](#affineparallel-mliraffineparallelop) operations, and the result -of an [`affine.apply` operation](#affineapply-mliraffineapplyop) (which recursively +[`affine.for`](#affinefor-affineaffineforop) and +[`affine.parallel`](#affineparallel-affineaffineparallelop) operations, and the result +of an [`affine.apply` operation](#affineapply-affineaffineapplyop) (which recursively may use other dimensions and symbols). ### Affine Expressions @@ -158,7 +158,7 @@ dimension indices and symbols into a list of results, with affine expressions combining the indices and symbols. Affine maps distinguish between [indices and symbols](#dimensions-and-symbols) because indices are inputs to the affine map when the map is called (through an operation such as -[affine.apply](#affineapply-mliraffineapplyop)), whereas symbols are bound when the +[affine.apply](#affineapply-affineaffineapplyop)), whereas symbols are bound when the map is established (e.g. when a memref is formed, establishing a memory [layout map](Builtin.md/#layout)). diff --git a/mlir/docs/Dialects/SPIR-V.md b/mlir/docs/Dialects/SPIR-V.md index 1e8c1c7be9f6a..716dd7773aefa 100644 --- a/mlir/docs/Dialects/SPIR-V.md +++ b/mlir/docs/Dialects/SPIR-V.md @@ -1375,7 +1375,7 @@ the proper file in test/Dialect/SPIRV/. The generated op will automatically gain the logic for (de)serialization. However, tests still need to be coupled with the change to make sure no -surprises. Serialization tests live in test/Dialect/SPIRV/Serialization. +surprises (see [Add a new test](#add-a-new-test) below). ### Add a new enum @@ -1416,6 +1416,40 @@ conversion][MlirDialectConversionSignatureConversion] might be needed as well. operations contained within its region are valid operations in the SPIR-V dialect. 
+### Add a new test
+
+Currently the SPIR-V dialect has three types of tests that should be added or
+updated accordingly:
+
+1. **Dialect tests** - Those tests check different aspects of the op in isolation.
+   They should include both positive and negative cases, and exercise the verifier,
+   parser and printer. Dialect tests do not have to form valid SPIR-V code and
+   should be kept as simple as possible. They are run with `mlir-opt`; and are
+   also used to test transformations.
+
+2. **Target tests** - Those tests are designed to exercise serialization and
+   deserialization, so each module should be a valid SPIR-V module. (De)serialization
+   is tested using the `mlir-translate --test-spirv-roundtrip` option.
+
+   To ensure that the SPIR-V MLIR forms and serializes into a valid SPIR-V, the
+   `spirv-val` tool should be run on a serialized binary (`--serialize-spirv`).
+   This can be automated by adding a conditional validation section to the test:
+
+   ```
+   // RUN: %if spirv-tools %{ rm -rf %t %}
+   // RUN: %if spirv-tools %{ mkdir %t %}
+   // RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %}
+   // RUN: %if spirv-tools %{ spirv-val %t %}
+   ```
+
+   This sequence serializes and dumps each MLIR SPIR-V module into a separate
+   SPIR-V binary (MLIR allows multiple modules per file, however the SPIR-V
+   spec restricts each binary to a single module), and then runs `spirv-val`
+   on each of the files.
+
+3. **Integration tests** - Those tests execute the MLIR code using the `mlir-runner`
+   to verify its functional correctness.
+ ## Operation definitions [include "Dialects/SPIRVOps.md"] diff --git a/mlir/docs/Tutorials/transform/Ch0.md b/mlir/docs/Tutorials/transform/Ch0.md index dc4b753f98caa..0d7a70364742d 100644 --- a/mlir/docs/Tutorials/transform/Ch0.md +++ b/mlir/docs/Tutorials/transform/Ch0.md @@ -134,7 +134,7 @@ Furthermore, the operation now contains a region that explicitly specifies the m ## “Loop” Fusion -Since the region of the `linalg.generic` operation can contain arbitrarily many operations, we can use it to express “fusion” of the implicit loops by simply having more operations chained in the region. For example, the common machine learning rectified linear unit layer (ReLU), which can be defined as `relu(x) = max(0, x)`, can be defined be expressed using the “compare-and-select” idiom in one `linalg.generic` operation, without the temporary buffer for the comparison result and without repeating the outer operation: +Since the region of the `linalg.generic` operation can contain arbitrarily many operations, we can use it to express “fusion” of the implicit loops by simply having more operations chained in the region. 
For example, the common machine learning rectified linear unit layer (ReLU), which can be defined as `relu(x) = max(0, x)`, can be expressed using the “compare-and-select” idiom in one `linalg.generic` operation, without the temporary buffer for the comparison result and without repeating the outer operation: ```mlir linalg.generic { diff --git a/mlir/examples/standalone/CMakeLists.txt b/mlir/examples/standalone/CMakeLists.txt index 88dfa3e5d57a3..03627c0c10496 100644 --- a/mlir/examples/standalone/CMakeLists.txt +++ b/mlir/examples/standalone/CMakeLists.txt @@ -60,5 +60,7 @@ if(MLIR_ENABLE_BINDINGS_PYTHON) endif() add_subdirectory(test) add_subdirectory(standalone-opt) -add_subdirectory(standalone-plugin) +if(NOT WIN32) + add_subdirectory(standalone-plugin) +endif() add_subdirectory(standalone-translate) diff --git a/mlir/examples/standalone/python/CMakeLists.txt b/mlir/examples/standalone/python/CMakeLists.txt index a0eca9c095775..1ab27ce3b533a 100644 --- a/mlir/examples/standalone/python/CMakeLists.txt +++ b/mlir/examples/standalone/python/CMakeLists.txt @@ -26,6 +26,8 @@ declare_mlir_python_extension(StandalonePythonSources.Pybind11Extension ADD_TO_PARENT StandalonePythonSources SOURCES StandaloneExtensionPybind11.cpp + PRIVATE_LINK_LIBS + LLVMSupport EMBED_CAPI_LINK_LIBS StandaloneCAPI PYTHON_BINDINGS_LIBRARY pybind11 @@ -36,6 +38,8 @@ declare_mlir_python_extension(StandalonePythonSources.NanobindExtension ADD_TO_PARENT StandalonePythonSources SOURCES StandaloneExtensionNanobind.cpp + PRIVATE_LINK_LIBS + LLVMSupport EMBED_CAPI_LINK_LIBS StandaloneCAPI PYTHON_BINDINGS_LIBRARY nanobind diff --git a/mlir/examples/standalone/test/CMakeLists.txt b/mlir/examples/standalone/test/CMakeLists.txt index fdde159064287..8864563df8a33 100644 --- a/mlir/examples/standalone/test/CMakeLists.txt +++ b/mlir/examples/standalone/test/CMakeLists.txt @@ -14,8 +14,10 @@ set(STANDALONE_TEST_DEPENDS standalone-capi-test standalone-opt standalone-translate - StandalonePlugin ) +if(NOT 
WIN32) + list(APPEND STANDALONE_TEST_DEPENDS StandalonePlugin) +endif() if(MLIR_ENABLE_BINDINGS_PYTHON) list(APPEND STANDALONE_TEST_DEPENDS StandalonePythonModules) endif() diff --git a/mlir/examples/standalone/test/Standalone/standalone-pass-plugin.mlir b/mlir/examples/standalone/test/Standalone/standalone-pass-plugin.mlir index 1d652dc45830c..3020097dc1640 100644 --- a/mlir/examples/standalone/test/Standalone/standalone-pass-plugin.mlir +++ b/mlir/examples/standalone/test/Standalone/standalone-pass-plugin.mlir @@ -1,3 +1,4 @@ +// UNSUPPORTED: system-windows // RUN: mlir-opt %s --load-pass-plugin=%standalone_libs/StandalonePlugin%shlibext --pass-pipeline="builtin.module(standalone-switch-bar-foo)" | FileCheck %s module { diff --git a/mlir/examples/standalone/test/Standalone/standalone-plugin.mlir b/mlir/examples/standalone/test/Standalone/standalone-plugin.mlir index 468932b81a529..900b524c1feb7 100644 --- a/mlir/examples/standalone/test/Standalone/standalone-plugin.mlir +++ b/mlir/examples/standalone/test/Standalone/standalone-plugin.mlir @@ -1,3 +1,4 @@ +// UNSUPPORTED: system-windows // RUN: mlir-opt %s --load-dialect-plugin=%standalone_libs/StandalonePlugin%shlibext --pass-pipeline="builtin.module(standalone-switch-bar-foo)" | FileCheck %s module { diff --git a/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h index 2250db823b551..3b2914cdd4c98 100644 --- a/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h @@ -17,6 +17,7 @@ #include "mlir/Analysis/DataFlowFramework.h" #include "mlir/IR/SymbolTable.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "llvm/ADT/SmallPtrSet.h" #include @@ -200,6 +201,13 @@ class DeadCodeAnalysis : public DataFlowAnalysis { /// which are live from the current block. void visitBranchOperation(BranchOpInterface branch); + /// Visit region branch edges from `predecessorOp` to a list of successors. 
+ /// For each edge, mark the successor program point as executable, and record + /// the predecessor information in its `PredecessorState`. + void visitRegionBranchEdges(RegionBranchOpInterface regionBranchOp, + Operation *predecessorOp, + const SmallVector &successors); + /// Visit the given region branch operation, which defines regions, and /// compute any necessary lattice state. This also resolves the lattice state /// of both the operation results and any nested regions. @@ -229,6 +237,13 @@ class DeadCodeAnalysis : public DataFlowAnalysis { /// considered an external callable. Operation *analysisScope; + /// Whether the analysis scope has a symbol table. This is used to avoid + /// resolving callables outside the analysis scope. + /// It is updated when recursing into a region in case where the top-level + /// operation does not have a symbol table, but one is encountered in a nested + /// region. + bool hasSymbolTable = false; + /// A symbol table used for O(1) symbol lookups during simplification. 
SymbolTableCollection symbolTable; }; diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td index 06fb8511774e8..4d19fa5415ef0 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td @@ -201,9 +201,6 @@ class ArmSME_IntrCountOp /*traits*/[PredOpTrait<"`res` is i64", TypeIsPred<"res", I64>>], /*numResults=*/1, /*overloadedResults=*/[]>; -def LLVM_aarch64_sme_cntsb : ArmSME_IntrCountOp<"cntsb">; -def LLVM_aarch64_sme_cntsh : ArmSME_IntrCountOp<"cntsh">; -def LLVM_aarch64_sme_cntsw : ArmSME_IntrCountOp<"cntsw">; def LLVM_aarch64_sme_cntsd : ArmSME_IntrCountOp<"cntsd">; #endif // ARMSME_INTRINSIC_OPS diff --git a/mlir/include/mlir/Dialect/ArmSME/Utils/Utils.h b/mlir/include/mlir/Dialect/ArmSME/Utils/Utils.h index 1f40eb6fc693c..b57b27de4e1de 100644 --- a/mlir/include/mlir/Dialect/ArmSME/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/ArmSME/Utils/Utils.h @@ -32,6 +32,9 @@ namespace mlir::arm_sme { constexpr unsigned MinStreamingVectorLengthInBits = 128; +/// Return the size represented by arm_sme::TypeSize in bytes. +unsigned getSizeInBytes(TypeSize type); + /// Return minimum number of elements for the given element `type` in /// a vector of SVL bits. unsigned getSMETileSliceMinNumElts(Type type); diff --git a/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td b/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td index 4062f310c6521..b64b3fcdb275b 100644 --- a/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td +++ b/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td @@ -134,4 +134,30 @@ def ReplaceFuncSignatureOp }]; } +def DeduplicateFuncArgsOp + : Op, + DeclareOpInterfaceMethods]> { + let description = [{ + This transform takes a module and a function name, and deduplicates + the arguments of the function. The function is expected to be defined in + the module. 
+ + This transform will emit a silenceable failure if: + - The function with the given name does not exist in the module. + - The function does not have duplicate arguments. + - The function does not have a single call. + }]; + + let arguments = (ins TransformHandleTypeInterface:$module, + SymbolRefAttr:$function_name); + let results = (outs TransformHandleTypeInterface:$transformed_module, + TransformHandleTypeInterface:$transformed_function); + + let assemblyFormat = [{ + $function_name + `at` $module attr-dict `:` functional-type(operands, results) + }]; +} + #endif // FUNC_TRANSFORM_OPS diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h index 2e8b6723a0e53..3576126a487ac 100644 --- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h @@ -18,32 +18,49 @@ #include "mlir/IR/PatternMatch.h" #include "llvm/ADT/ArrayRef.h" +#include namespace mlir { +class ModuleOp; + namespace func { class FuncOp; class CallOp; /// Creates a new function operation with the same name as the original -/// function operation, but with the arguments reordered according to -/// the `newArgsOrder` and `newResultsOrder`. +/// function operation, but with the arguments mapped according to +/// the `oldArgToNewArg` and `oldResToNewRes`. /// The `funcOp` operation must have exactly one block. /// Returns the new function operation or failure if `funcOp` doesn't /// have exactly one block. -FailureOr -replaceFuncWithNewOrder(RewriterBase &rewriter, FuncOp funcOp, - llvm::ArrayRef newArgsOrder, - llvm::ArrayRef newResultsOrder); +/// Note: the method asserts that the `oldArgToNewArg` and `oldResToNewRes` +/// maps the whole function arguments and results. 
+mlir::FailureOr replaceFuncWithNewMapping( + mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp, + ArrayRef oldArgIdxToNewArgIdx, ArrayRef oldResIdxToNewResIdx); /// Creates a new call operation with the values as the original -/// call operation, but with the arguments reordered according to -/// the `newArgsOrder` and `newResultsOrder`. -CallOp replaceCallOpWithNewOrder(RewriterBase &rewriter, CallOp callOp, - llvm::ArrayRef newArgsOrder, - llvm::ArrayRef newResultsOrder); +/// call operation, but with the arguments mapped according to +/// the `oldArgToNewArg` and `oldResToNewRes`. +/// Note: the method asserts that the `oldArgToNewArg` and `oldResToNewRes` +/// maps the whole call operation arguments and results. +mlir::func::CallOp replaceCallOpWithNewMapping( + mlir::RewriterBase &rewriter, mlir::func::CallOp callOp, + ArrayRef oldArgIdxToNewArgIdx, ArrayRef oldResIdxToNewResIdx); + +/// This utility function examines all call operations within the given +/// `moduleOp` that target the specified `funcOp`. It identifies duplicate +/// operands in the call operations, creates mappings to deduplicate them, and +/// then applies the transformation to both the function and its call sites. For +/// now, it only supports one call operation for the function operation. The +/// function returns a pair containing the new funcOp and the new callOp. Note: +/// after the transformation, the original funcOp and callOp will be erased. 
+mlir::FailureOr> +deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp, + mlir::ModuleOp moduleOp); } // namespace func } // namespace mlir -#endif // MLIR_DIALECT_FUNC_UTILS_H +#endif // MLIR_DIALECT_FUNC_UTILS_H \ No newline at end of file diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index a8c9ef790cfbd..75bce6b0a0e54 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -30,6 +30,7 @@ class LLVM_Attr ]> { let summary = "LLVM address space"; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h index fafccf304e1b4..ce62f0751d876 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h @@ -93,6 +93,14 @@ class TBAANodeAttr : public Attribute { using cconv::CConv; using linkage::Linkage; using tailcallkind::TailCallKind; + +namespace detail { +/// Checks whether the given type is an LLVM type that can be loaded or stored. +bool isValidLoadStoreImpl(Type type, ptr::AtomicOrdering ordering, + std::optional alignment, + const ::mlir::DataLayout *dataLayout, + function_ref emitError); +} // namespace detail } // namespace LLVM } // namespace mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td index 60235bcb35561..e05fb6a9bac7d 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td @@ -533,6 +533,24 @@ def LLVM_DIRecursiveTypeAttrInterface ]; } +def LLVM_LLVMAddrSpaceAttrInterface : + AttrInterface<"LLVMAddrSpaceAttrInterface"> { + let description = [{ + An interface for attributes that represent LLVM address spaces. + Implementing attributes should provide access to the address space value + as an unsigned integer. 
+ }]; + let cppNamespace = "::mlir::LLVM"; + let methods = [ + InterfaceMethod< + /*description=*/"Returns the address space as an unsigned integer.", + /*retTy=*/"unsigned", + /*methodName=*/"getAddressSpace", + /*args=*/(ins) + > + ]; +} + def LLVM_TargetAttrInterface : AttrInterface<"TargetAttrInterface", [DLTIQueryInterface]> { let description = [{ diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index dd00d67974d28..d77bf0f51b637 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -1162,9 +1162,13 @@ def LLVM_Annotation // Trap intrinsics. // -def LLVM_Trap : LLVM_ZeroResultIntrOp<"trap">; +def LLVM_Trap : LLVM_ZeroResultIntrOp<"trap"> { + let assemblyFormat = "attr-dict"; +} -def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap">; +def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap"> { + let assemblyFormat = "attr-dict"; +} def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap", /*overloadedOperands=*/[], /*traits=*/[], @@ -1172,6 +1176,8 @@ def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap", /*requiresArgAndResultAttrs=*/0, /*requiresOpBundles=*/0, /*immArgPositions=*/[0], /*immArgAttrNames=*/["failureKind"]> { let arguments = (ins I8Attr:$failureKind); + + let assemblyFormat = "attr-dict"; } /// Create a call to vscale intrinsic. 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h index 6137bb087c576..6bd582d66ed25 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h @@ -19,6 +19,7 @@ #include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/NVVMRequiresSMTraits.h" +#include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/InferIntRangeInterface.h" @@ -30,31 +31,23 @@ namespace mlir { namespace NVVM { +/// Utility functions to compare NVVMMemorySpace with unsigned values. +inline bool operator==(unsigned as, NVVMMemorySpace memSpace) { + return as == static_cast(memSpace); +} +inline bool operator==(NVVMMemorySpace memSpace, unsigned as) { + return static_cast(memSpace) == as; +} +inline bool operator!=(unsigned as, NVVMMemorySpace memSpace) { + return as != static_cast(memSpace); +} +inline bool operator!=(NVVMMemorySpace memSpace, unsigned as) { + return static_cast(memSpace) != as; +} // Shared memory has 128-bit alignment constexpr int kSharedMemoryAlignmentBit = 128; -/// NVVM memory space identifiers. -enum NVVMMemorySpace { - /// Generic memory space identifier. - kGenericMemorySpace = 0, - /// Global memory space identifier. - kGlobalMemorySpace = 1, - /// Shared memory space identifier. - kSharedMemorySpace = 3, - /// Constant memory space identifier. - kConstantMemorySpace = 4, - /// Local memory space identifier. - kLocalMemorySpace = 5, - /// Tensor memory space identifier. - /// Tensor memory is available only in arch-accelerated - /// variants from sm100 onwards. - kTensorMemorySpace = 6, - /// Distributed shared memory space identifier. - /// Distributed shared memory is available only in sm90+. 
- kSharedClusterMemorySpace = 7, -}; - /// A pair type of LLVM's Intrinsic ID and args (which are llvm values). /// This type is returned by the getIntrinsicIDAndArgs() methods. using IDArgPair = diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 854b4d26b4368..797f8ada9f238 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -17,6 +17,7 @@ include "mlir/IR/EnumAttr.td" include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Dialect/LLVMIR/NVVMRequiresSMTraits.td" +include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" include "mlir/Interfaces/InferIntRangeInterface.td" @@ -192,6 +193,40 @@ def CacheEvictionPriorityAttr : EnumAttr; +/// Global memory space identifier. +def MemSpaceGlobal : I32EnumCase<"Global", 1, "global">; +/// Shared memory space identifier. +def MemSpaceShared : I32EnumCase<"Shared", 3, "shared">; +/// Constant memory space identifier. +def MemSpaceConstant : I32EnumCase<"Constant", 4, "constant">; +/// Local memory space identifier. +def MemSpaceLocal : I32EnumCase<"Local", 5, "local">; +/// Tensor memory space identifier. +/// Tensor memory is available only in arch-accelerated +/// variants from sm100 onwards. +def MemSpaceTensor : I32EnumCase<"Tensor", 6, "tensor">; +/// Distributed shared memory space identifier. +/// Distributed shared memory is available only in sm90+. 
+def MemSpaceSharedCluster : I32EnumCase<"SharedCluster", 7, "shared_cluster">; + +def NVVMMemorySpace : I32Enum<"NVVMMemorySpace", "NVVM Memory Space", + [MemSpaceGeneric, MemSpaceGlobal, MemSpaceShared, + MemSpaceConstant, MemSpaceLocal, MemSpaceTensor, + MemSpaceSharedCluster]> { + let cppNamespace = "::mlir::NVVM"; +} + +def NVVMMemorySpaceAttr : + EnumAttr, + DeclareAttrInterfaceMethods + ]> { + let assemblyFormat = "`<` $value `>`"; +} + //===----------------------------------------------------------------------===// // NVVM intrinsic operations //===----------------------------------------------------------------------===// @@ -3592,7 +3627,7 @@ def NVVM_MapaOp: NVVM_Op<"mapa", string llvmBuilder = [{ int addrSpace = llvm::cast(op.getA().getType()).getAddressSpace(); - bool isSharedMemory = addrSpace == NVVM::NVVMMemorySpace::kSharedMemorySpace; + bool isSharedMemory = addrSpace == static_cast (NVVM::NVVMMemorySpace::Shared); auto intId = isSharedMemory? llvm::Intrinsic::nvvm_mapa_shared_cluster : llvm::Intrinsic::nvvm_mapa; $res = createIntrinsicCall(builder, intId, {$a, $b}); @@ -4396,6 +4431,116 @@ def NVVM_DotAccumulate2WayOp : NVVM_Op<"dot.accumulate.2way"> { }]; } +//===----------------------------------------------------------------------===// +// NVVM clusterlaunchcontrol Ops. +//===----------------------------------------------------------------------===// + +def NVVM_ClusterLaunchControlTryCancelOp + : NVVM_Op<"clusterlaunchcontrol.try.cancel", [NVVMRequiresSM<100>]> { + let summary = "Request atomically canceling the launch of a cluster that has not started running yet"; + let description = [{ + `clusterlaunchcontrol.try.cancel` requests atomically canceling the launch + of a cluster that has not started running yet. It asynchronously writes an + opaque response to shared memory indicating whether the operation succeeded + or failed. 
+ + Operand `smemAddress` specifies the naturally aligned address of the + 16-byte wide shared memory location where the request's response is written. + + Operand `mbarrier` specifies the mbarrier object used to track the + completion of the asynchronous operation. + + If `multicast` is specified, the response is asynchronously written to the + corresponding local shared memory location (specifed by `addr`) of each CTA + in the requesting cluster. + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-try-cancel) + }]; + + let arguments = (ins UnitAttr:$multicast, + LLVM_PointerShared: $smemAddress, + LLVM_PointerShared: $mbarrier); + + let assemblyFormat = "(`multicast` $multicast^ `,`)? $smemAddress `,` $mbarrier attr-dict"; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + string llvmBuilder = [{ + auto [id, args] = + NVVM::ClusterLaunchControlTryCancelOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); + }]; +} + +def ClusterLaunchControlIsCanceled + : I32EnumCase<"IS_CANCELED", 0, "is_canceled">; +def ClusterLaunchControlGetFirstCTAIDX + : I32EnumCase<"GET_FIRST_CTA_ID_X", 1, "get_first_cta_id_x">; +def ClusterLaunchControlGetFirstCTAIDY + : I32EnumCase<"GET_FIRST_CTA_ID_Y", 2, "get_first_cta_id_y">; +def ClusterLaunchControlGetFirstCTAIDZ + : I32EnumCase<"GET_FIRST_CTA_ID_Z", 3, "get_first_cta_id_z">; + +def ClusterLaunchControlQueryType + : I32Enum<"ClusterLaunchControlQueryType", + "NVVM ClusterLaunchControlQueryType", + [ClusterLaunchControlIsCanceled, ClusterLaunchControlGetFirstCTAIDX, + ClusterLaunchControlGetFirstCTAIDY, ClusterLaunchControlGetFirstCTAIDZ]> { + let cppNamespace = "::mlir::NVVM"; +} + +def ClusterLaunchControlQueryTypeAttr + : EnumAttr { + let 
assemblyFormat = "$value"; +} + +def NVVM_ClusterLaunchControlQueryCancelOp + : NVVM_Op<"clusterlaunchcontrol.query.cancel", [NVVMRequiresSM<100>]> { + let summary = "Query the response of a clusterlaunchcontrol.try.cancel operation"; + let description = [{ + `clusterlaunchcontrol.query.cancel` queries the response of a + `clusterlaunchcontrol.try.cancel` operation specified by operand + `try_cancel_response`. + + Operand `query_type` specifies the type of query to perform and can be one + of the following: + - `is_canceled` : Returns true if the try cancel request succeeded, + and false otherwise. + - `get_first_cta_id_{x/y/z}` : Returns the x, y, or z coordinate of the + first CTA in the canceled cluster. Behaviour is defined only if the try + cancel request succeeded. + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-query-cancel) + }]; + + let arguments = (ins ClusterLaunchControlQueryTypeAttr:$query_type, + I128:$try_cancel_response); + let results = (outs AnyTypeOf<[I1, I32]>:$res); + + let assemblyFormat = "`query` `=` $query_type `,` $try_cancel_response attr-dict `:` type($res)"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + string llvmBuilder = [{ + auto [id, args] = + NVVM::ClusterLaunchControlQueryCancelOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); + }]; +} + //===----------------------------------------------------------------------===// // NVVM target attribute. 
//===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td index f457f47d56219..514b01a69fb9b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td @@ -69,6 +69,7 @@ class XeVM_Op traits = []> } def XeVM_ElemType : AnyTypeOf<[AnyI8, AnyI16, AnyI32, F32, TF32, F16, BF16]>; +def XeVM_1DBlockElemType : AnyTypeOf<[I8, I16, I32, I64]>; //===----------------------------------------------------------------------===// // XeVM Load Cache Control @@ -187,6 +188,81 @@ def XeVM_StoreCacheControlAttr let assemblyFormat = "`<` $value `>`"; } +def XeVM_BlockLoadOp + : XeVM_Op<"blockload">, + Results<( + outs FixedVectorOfRankAndType<[1], [XeVM_1DBlockElemType]>:$res)>, + Arguments<(ins Arg:$ptr, + OptionalAttr:$cache_control)> { + let summary = "subgroup block load"; + let description = [{ + Reads one or more components of Result data for each invocation + in the subgroup from the specified `ptr` as a block operation. + The data is read strided, so the first value read is: + ``` + ptr[ SubgroupLocalInvocationId ] + ``` + and the second value read is: + ``` + ptr[ SubgroupLocalInvocationId + SubgroupMaxSize ] + ``` + Result type may be a scalar or vector type of scalar element type. + + The parameters are: + * `ptr` - the base address to load from. Must be uniform across subgroup. 
+ * `cache_control` - an enumerator that sets the cache behaviour + + Example: + ```mlir + %loaded_a = xevm.blockload %src, + <{cache_control=#xevm.load_cache_control}> + : (!llvm.ptr<1>) -> vector<4xi16> + ``` + }]; + let assemblyFormat = [{ + operands prop-dict attr-dict `:` functional-type(operands, results) + }]; + let hasVerifier = 1; +} + +def XeVM_BlockStoreOp + : XeVM_Op<"blockstore">, + Arguments<(ins Arg:$ptr, + FixedVectorOfRankAndType<[1], [XeVM_1DBlockElemType]>:$val, + OptionalAttr:$cache_control)> { + let summary = "subgroup block store"; + let description = [{ + Writes one or more components of `val` for each invocation + in the subgroup to the specified `ptr` as a block operation. + The data is written strided, so the first value is written to: + ``` + ptr[ SubgroupLocalInvocationId ] + ``` + and the second value is written to: + ``` + ptr[ SubgroupLocalInvocationId + SubgroupMaxSize ] + ``` + `val` type may be a scalar or vector type of scalar element type. + + The parameters are: + * `ptr` - the base address to store to. Must be uniform across subgroup. 
+ * `val` - the value to store + * `cache_control` - an enumerator that sets the cache behaviour + + Example: + ```mlir + xevm.blockstore %ptr, %val + <{cache_control=#xevm.store_cache_control}> + : (!llvm.ptr<1>, vector<4xi16>) + ``` + }]; + + let assemblyFormat = [{ + operands prop-dict attr-dict `:` `(` type(operands) `)` + }]; + let hasVerifier = 1; +} + def XeVM_BlockLoad2dOp : XeVM_Op<"blockload2d">, Results<(outs FixedVectorOfRankAndType<[1], [XeVM_ElemType]>:$res)>, diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index 513a9a18198a3..671cc05e963b4 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -1163,7 +1163,7 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> { MemRefTypeAttr:$type, OptionalAttr:$initial_value, UnitAttr:$constant, - OptionalAttr:$alignment); + OptionalAttr>:$alignment); let assemblyFormat = [{ ($sym_visibility^)? @@ -1231,8 +1231,7 @@ def LoadOp : MemRef_Op<"load", [MemRead]>:$memref, Variadic:$indices, DefaultValuedOptionalAttr:$nontemporal, - ConfinedAttr, - [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); + OptionalAttr>:$alignment); let builders = [ OpBuilder<(ins "Value":$memref, @@ -1965,8 +1964,7 @@ def MemRef_StoreOp : MemRef_Op<"store", [MemWrite]>:$memref, Variadic:$indices, DefaultValuedOptionalAttr:$nontemporal, - ConfinedAttr, - [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); + OptionalAttr>:$alignment); let builders = [ OpBuilder<(ins "Value":$valueToStore, diff --git a/mlir/include/mlir/Dialect/MemRef/Transforms/Transforms.h b/mlir/include/mlir/Dialect/MemRef/Transforms/Transforms.h index 33e3d94f02b1c..8b76930aed35a 100644 --- a/mlir/include/mlir/Dialect/MemRef/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/MemRef/Transforms/Transforms.h @@ -145,6 +145,10 @@ FailureOr multiBuffer(memref::AllocOp allocOp, /// ``` void 
populateExtractAddressComputationsPatterns(RewritePatternSet &patterns); +/// Patterns for flattening multi-dimensional memref operations into +/// one-dimensional memref operations. +void populateFlattenVectorOpsOnMemrefPatterns(RewritePatternSet &patterns); +void populateFlattenMemrefOpsPatterns(RewritePatternSet &patterns); void populateFlattenMemrefsPatterns(RewritePatternSet &patterns); /// Build a new memref::AllocaOp whose dynamic sizes are independent of all diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td b/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td index 4542f57a62d79..78006d2dec40d 100644 --- a/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td +++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td @@ -22,6 +22,34 @@ class Ptr_Attr + ]> { + let summary = "Address attribute"; + let description = [{ + The `address` attribute represents a raw memory address, expressed in bytes. + + Example: + + ```mlir + #ptr.address<0x1000> : !ptr.ptr<#ptr.generic_space> + ``` + }]; + let parameters = (ins AttributeSelfTypeParameter<"", "PtrType">:$type, + APIntParameter<"">:$value); + let builders = [ + AttrBuilderWithInferredContext<(ins "PtrType":$type, + "const llvm::APInt &":$value), [{ + return $_get(type.getContext(), type, value); + }]> + ]; + let assemblyFormat = "`<` $value `>`"; +} + //===----------------------------------------------------------------------===// // GenericSpaceAttr //===----------------------------------------------------------------------===// @@ -37,16 +65,42 @@ def Ptr_GenericSpaceAttr : - Load and store operations are always valid, regardless of the type. - Atomic operations are always valid, regardless of the type. - Cast operations to `generic_space` are always valid. 
- + Example: ```mlir - #ptr.generic_space + #ptr.generic_space : !ptr.ptr<#ptr.generic_space> ``` }]; let assemblyFormat = ""; } +//===----------------------------------------------------------------------===// +// NullAttr +//===----------------------------------------------------------------------===// + +def Ptr_NullAttr : Ptr_Attr<"Null", "null", [ + DeclareAttrInterfaceMethods + ]> { + let summary = "Null pointer attribute"; + let description = [{ + The `null` attribute represents a null pointer. + + Example: + + ```mlir + #ptr.null + ``` + }]; + let parameters = (ins AttributeSelfTypeParameter<"", "PtrType">:$type); + let builders = [ + AttrBuilderWithInferredContext<(ins "PtrType":$type), [{ + return $_get(type.getContext(), type); + }]> + ]; + let assemblyFormat = ""; +} + //===----------------------------------------------------------------------===// // SpecAttr //===----------------------------------------------------------------------===// @@ -62,7 +116,7 @@ def Ptr_SpecAttr : Ptr_Attr<"Spec", "spec"> { - [Optional] index: bitwidth that should be used when performing index computations for the type. Setting the field to `kOptionalSpecValue`, means the field is optional. - + Furthermore, the attribute will verify that all present values are divisible by 8 (number of bits in a byte), and that `preferred` > `abi`. 
diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrs.h b/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrs.h index bb01ceaaeea54..c252f9efd0471 100644 --- a/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrs.h +++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrs.h @@ -21,6 +21,12 @@ #include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h" #include "mlir/Dialect/Ptr/IR/PtrEnums.h" +namespace mlir { +namespace ptr { +class PtrType; +} // namespace ptr +} // namespace mlir + #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/Ptr/IR/PtrOpsAttrs.h.inc" diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td b/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td index 7407d74ce3a87..c98df5775195a 100644 --- a/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td +++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td @@ -21,6 +21,18 @@ include "mlir/IR/OpBase.td" def Ptr_Dialect : Dialect { let name = "ptr"; let summary = "Pointer dialect"; + let description = [{ + The pointer dialect provides types and operations for representing and + interacting with pointer values in MLIR, such as loading and storing values + from/to memory addresses. + + The dialect's main type is an opaque pointer (`ptr`) that can be + parameterized by a memory space. This type represents a handle to an object + in memory, or target-dependent values like `nullptr`. Further, the dialect + assumes that the minimum addressable unit by a pointer is a byte. However, + the dialect does not make assumptions about the size of a byte, which is + considered a target-specific property. 
+ }]; let cppNamespace = "::mlir::ptr"; let useDefaultTypePrinterParser = 1; let useDefaultAttributePrinterParser = 1; diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td b/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td index 3ac12978b947c..468a3004d5c62 100644 --- a/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td +++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td @@ -36,7 +36,7 @@ class Ptr_ShapedValueType allowedTypes, list preds = []> : /*cppType=*/"::mlir::ShapedType">; // A ptr-like type, either scalar or shaped type with value semantics. -def Ptr_PtrLikeType : +def Ptr_PtrLikeType : AnyTypeOf<[Ptr_ShapedValueType<[Ptr_PtrType], [HasRankPred]>, Ptr_PtrType]>; // An int-like type, either scalar or shaped type with value semantics. @@ -57,6 +57,31 @@ def Ptr_Mask1DType : def Ptr_Ptr1DType : Ptr_ShapedValueType<[Ptr_PtrType], [HasAnyRankOfPred<[1]>]>; +//===----------------------------------------------------------------------===// +// ConstantOp +//===----------------------------------------------------------------------===// + +def Ptr_ConstantOp : Pointer_Op<"constant", [ + ConstantLike, Pure, AllTypesMatch<["value", "result"]> + ]> { + let summary = "Pointer constant operation"; + let description = [{ + The `constant` operation produces a pointer constant. The attribute must be + a typed attribute of pointer type. 
+ + Example: + + ```mlir + // Create a null pointer + %null = ptr.constant #ptr.null : !ptr.ptr<#ptr.generic_space> + ``` + }]; + let arguments = (ins TypedAttrInterface:$value); + let results = (outs Ptr_PtrType:$result); + let assemblyFormat = "attr-dict $value"; + let hasFolder = 1; +} + //===----------------------------------------------------------------------===// // FromPtrOp //===----------------------------------------------------------------------===// @@ -81,7 +106,7 @@ def Ptr_FromPtrOp : Pointer_Op<"from_ptr", [ ```mlir %typed_ptr = ptr.from_ptr %ptr : !ptr.ptr<#ptr.generic_space> -> !my.ptr %memref = ptr.from_ptr %ptr metadata %md : !ptr.ptr<#ptr.generic_space> -> memref - + // Cast the `%ptr` to a memref without utilizing metadata. %memref = ptr.from_ptr %ptr : !ptr.ptr<#ptr.generic_space> -> memref ``` @@ -361,13 +386,13 @@ def Ptr_PtrAddOp : Pointer_Op<"ptr_add", [ // Scalar base and offset %x_off = ptr.ptr_add %x, %off : !ptr.ptr<#ptr.generic_space>, i32 %x_off0 = ptr.ptr_add nusw %x, %off : !ptr.ptr<#ptr.generic_space>, i32 - + // Shaped base with scalar offset %ptrs_off = ptr.ptr_add %ptrs, %off : vector<4x!ptr.ptr<#ptr.generic_space>>, i32 - + // Scalar base with shaped offset %x_offs = ptr.ptr_add %x, %offs : !ptr.ptr<#ptr.generic_space>, vector<4xi32> - + // Both base and offset are shaped %ptrs_offs = ptr.ptr_add %ptrs, %offs : vector<4x!ptr.ptr<#ptr.generic_space>>, vector<4xi32> ``` @@ -382,7 +407,7 @@ def Ptr_PtrAddOp : Pointer_Op<"ptr_add", [ }]; let hasFolder = 1; let extraClassDeclaration = [{ - /// `ViewLikeOp::getViewSource` method. + /// `ViewLikeOp::getViewSource` method. Value getViewSource() { return getBase(); } /// Returns the ptr type of the operation. 
@@ -418,7 +443,7 @@ def Ptr_ScatterOp : Pointer_Op<"scatter", [ // Scatter values to multiple memory locations ptr.scatter %value, %ptrs, %mask : vector<4xf32>, vector<4x!ptr.ptr<#ptr.generic_space>> - + // Scatter with alignment ptr.scatter %value, %ptrs, %mask alignment = 8 : vector<4xf32>, vector<4x!ptr.ptr<#ptr.generic_space>> diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td index 88df54174da24..fadd3fc10bfc4 100644 --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -152,7 +152,7 @@ def ForOp : SCF_Op<"for", [AutomaticAllocationScope, DeclareOpInterfaceMethods, AllTypesMatch<["lowerBound", "upperBound", "step"]>, @@ -654,7 +654,7 @@ def ForallOp : SCF_Op<"forall", [ def InParallelOp : SCF_Op<"forall.in_parallel", [ Pure, Terminator, - DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, HasParent<"ForallOp">, ] # GraphRegionNoTerminator.traits> { let summary = "terminates a `forall` block"; @@ -679,8 +679,6 @@ def InParallelOp : SCF_Op<"forall.in_parallel", [ OpBuilder<(ins)>, ]; - // TODO: Add a `InParallelOpInterface` interface for ops that can - // appear inside in_parallel. 
let extraClassDeclaration = [{ ::llvm::SmallVector<::mlir::BlockArgument> getDests(); ::llvm::iterator_range<::mlir::Block::iterator> getYieldingOps(); diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td index fd75532ae3d70..827ac901d22de 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td @@ -129,7 +129,7 @@ def SPIRV_KHRCooperativeMatrixLoadOp : SPIRV_KhrVendorOp<"CooperativeMatrixLoad" SPIRV_KHR_CooperativeMatrixLayoutAttr:$matrix_layout, SPIRV_Integer:$stride, OptionalAttr:$memory_operand, - OptionalAttr:$alignment + OptionalAttr>:$alignment ); let results = (outs @@ -214,7 +214,7 @@ def SPIRV_KHRCooperativeMatrixStoreOp : SPIRV_KhrVendorOp<"CooperativeMatrixStor SPIRV_KHR_CooperativeMatrixLayoutAttr:$matrix_layout, SPIRV_Integer:$stride, OptionalAttr:$memory_operand, - OptionalAttr:$alignment + OptionalAttr>:$alignment ); let results = (outs); diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td index 6253601a7c2b2..6108decdb9706 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td @@ -121,9 +121,9 @@ def SPIRV_CopyMemoryOp : SPIRV_Op<"CopyMemory", []> { SPIRV_AnyPtr:$target, SPIRV_AnyPtr:$source, OptionalAttr:$memory_access, - OptionalAttr:$alignment, + OptionalAttr>:$alignment, OptionalAttr:$source_memory_access, - OptionalAttr:$source_alignment + OptionalAttr>:$source_alignment ); let results = (outs); diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index 7d396e5c64c28..2453cf5b5b5a4 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -1470,24 +1470,25 @@ def Tensor_PadOp : Tensor_Op<"pad", [ // 
ParallelInsertSliceOp //===----------------------------------------------------------------------===// -// TODO: Implement InParallelOpInterface. def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [ AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface, + DeclareOpInterfaceMethods, // TODO: Cannot use an interface here atm, verify this manually for now. - // HasParent<"ParallelCombiningOpInterface"> + // HasParent<"InParallelOpInterface"> ]> { let summary = [{ Specify the tensor slice update of a single thread of a parent - ParallelCombiningOpInterface op. + InParallelOpInterface op. }]; let description = [{ The `parallel_insert_slice` yields a subset tensor value to its parent - ParallelCombiningOpInterface. These subset tensor values are aggregated to + InParallelOpInterface. These subset tensor values are aggregated to in some unspecified order into a full tensor value returned by the parent parallel iterating op. The `parallel_insert_slice` is one such op allowed in the - ParallelCombiningOpInterface op. + InParallelOpInterface op. 
Conflicting writes result in undefined semantics, in that the indices written to by multiple parallel updates might contain data from any of the updates, @@ -1569,8 +1570,8 @@ def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [ return ::llvm::cast(getDest().getType()); } - ParallelCombiningOpInterface getParallelCombiningParent() { - return dyn_cast( + InParallelOpInterface getParallelCombiningParent() { + return dyn_cast( getOperation()->getParentOp()); } diff --git a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h index 77c376fb9973a..2e7f85cce4654 100644 --- a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h +++ b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h @@ -105,6 +105,10 @@ OpFoldResult getAsIndexOpFoldResult(MLIRContext *ctx, int64_t val); SmallVector getAsIndexOpFoldResult(MLIRContext *ctx, ArrayRef values); +/// If ofr is a constant integer or an IntegerAttr, return the integer. +/// The second return value indicates whether the value is an index type +/// and thus the bitwidth is not defined (the APInt will be set with 64bits). +std::optional> getConstantAPIntValue(OpFoldResult ofr); /// If ofr is a constant integer or an IntegerAttr, return the integer. std::optional getConstantIntValue(OpFoldResult ofr); /// If all ofrs are constant integers or IntegerAttrs, return the integers. @@ -201,9 +205,26 @@ foldDynamicOffsetSizeList(SmallVectorImpl &offsetsOrSizes); LogicalResult foldDynamicStrideList(SmallVectorImpl &strides); /// Return the number of iterations for a loop with a lower bound `lb`, upper -/// bound `ub` and step `step`. -std::optional constantTripCount(OpFoldResult lb, OpFoldResult ub, - OpFoldResult step); +/// bound `ub` and step `step`. The `isSigned` flag indicates whether the loop +/// comparison between lb and ub is signed or unsigned. 
A negative step or a +/// lower bound greater than the upper bound are considered invalid and will +/// yield a zero trip count. +/// The `computeUbMinusLb` callback is invoked to compute the difference between +/// the upper and lower bound when not constant. It can be used by the client +/// to compute a static difference when the bounds are not constant. +/// +/// For example, the following code: +/// +/// %ub = arith.addi nsw %lb, %c16_i32 : i32 +/// %1 = scf.for %arg0 = %lb to %ub ... +/// +/// where %ub is computed as a static offset from %lb. +/// Note: the matched addition should be nsw/nuw (matching the loop comparison) +/// to avoid overflow, otherwise an overflow would imply a zero trip count. +std::optional constantTripCount( + OpFoldResult lb, OpFoldResult ub, OpFoldResult step, bool isSigned, + llvm::function_ref(Value, Value, bool)> + computeUbMinusLb); /// Idiomatic saturated operations on values like offsets, sizes, and strides. struct SaturatedInteger { diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 65ba7e0ad549f..26d06624cb976 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -675,7 +675,7 @@ def Vector_ExtractOp : }]; let arguments = (ins - AnyVectorOfAnyRank:$vector, + AnyVectorOfAnyRank:$source, Variadic:$dynamic_position, DenseI64ArrayAttr:$static_position ); @@ -692,7 +692,7 @@ def Vector_ExtractOp : let extraClassDeclaration = extraPoisonClassDeclaration # [{ VectorType getSourceVectorType() { - return ::llvm::cast(getVector().getType()); + return ::llvm::cast(getSource().getType()); } /// Return a vector with all the static and dynamic position indices. @@ -709,12 +709,17 @@ def Vector_ExtractOp : bool hasDynamicPosition() { return !getDynamicPosition().empty(); } + + /// Wrapper for getSource, which replaced getVector. 
+ [[deprecated("Use getSource instead!")]] ::mlir::Value getVector() { + return getSource(); + } }]; let assemblyFormat = [{ - $vector `` + $source `` custom($dynamic_position, $static_position) - attr-dict `:` type($result) `from` type($vector) + attr-dict `:` type($result) `from` type($source) }]; let hasCanonicalizer = 1; @@ -1023,6 +1028,10 @@ def Vector_ScalableExtractOp : VectorType getResultVectorType() { return ::llvm::cast(getResult().getType()); } + /// Wrapper for getSource, which replaced getVector. + [[deprecated("Use getSource instead!")]] ::mlir::Value getVector() { + return getSource(); + } }]; } @@ -1174,7 +1183,7 @@ def Vector_ExtractStridedSliceOp : Vector_Op<"extract_strided_slice", [Pure, PredOpTrait<"operand and result have same element type", TCresVTEtIsSameAsOpBase<0, 0>>]>, - Arguments<(ins AnyVectorOfNonZeroRank:$vector, I64ArrayAttr:$offsets, + Arguments<(ins AnyVectorOfNonZeroRank:$source, I64ArrayAttr:$offsets, I64ArrayAttr:$sizes, I64ArrayAttr:$strides)>, Results<(outs AnyVectorOfNonZeroRank)> { let summary = "extract_strided_slice operation"; @@ -1209,7 +1218,7 @@ def Vector_ExtractStridedSliceOp : ]; let extraClassDeclaration = [{ VectorType getSourceVectorType() { - return ::llvm::cast(getVector().getType()); + return ::llvm::cast(getSource().getType()); } void getOffsets(SmallVectorImpl &results); bool hasNonUnitStrides() { @@ -1217,11 +1226,15 @@ def Vector_ExtractStridedSliceOp : return ::llvm::cast(attr).getInt() != 1; }); } + /// Wrapper for getSource, which replaced getVector. 
+ [[deprecated("Use getSource instead!")]] ::mlir::Value getVector() { + return getSource(); + } }]; let hasCanonicalizer = 1; let hasFolder = 1; let hasVerifier = 1; - let assemblyFormat = "$vector attr-dict `:` type($vector) `to` type(results)"; + let assemblyFormat = "$source attr-dict `:` type($source) `to` type(results)"; } // TODO: Tighten semantics so that masks and inbounds can't be used @@ -3006,6 +3019,7 @@ def Vector_ScanOp : def Vector_StepOp : Vector_Op<"step", [ Pure, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods ]> { let summary = "A linear sequence of values from 0 to N"; diff --git a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td index 07a4117a37b2c..03d25505dc65c 100644 --- a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td +++ b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td @@ -85,6 +85,20 @@ def ApplyDropUnitDimWithShapeCastPatternsOp : Op]> { + let description = [{ + Apply vector patterns to drop the innermost unit dims from + vector.transfer_read and vector.transfer_write Ops by taking a subview (via + memref.subview) of the original source/destination MemRef. Since it + requires the input/output to be MemRefs, this Op is only helpful + post-bufferization. + }]; + + let assemblyFormat = "attr-dict"; +} + def ApplyTransferPermutationPatternsOp : Op]> { @@ -265,6 +279,17 @@ def ApplyUnrollFromElementsPatternsOp : Op]> { + let description = [{ + Indicates that vector to_elements operations should be unrolled + along the outermost dimension. 
+ }]; + + let assemblyFormat = "attr-dict"; +} + def ApplyLowerScanPatternsOp : Op]> { diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h index 47f96112a9433..b896506f29eef 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h @@ -293,6 +293,9 @@ void populateVectorBitCastLoweringPatterns(RewritePatternSet &patterns, int64_t targetRank = 1, PatternBenefit benefit = 1); +void populateVectorShuffleLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + /// Populates a pattern that rank-reduces n-D FMAs into (n-1)-D FMAs where /// n > 1. void populateVectorRankReducingFMAPattern(RewritePatternSet &patterns); @@ -311,6 +314,12 @@ void populateVectorToFromElementsToShuffleTreePatterns( void populateVectorFromElementsLoweringPatterns(RewritePatternSet &patterns, PatternBenefit benefit = 1); +/// Populate the pattern set with the following patterns: +/// +/// [UnrollToElements] +void populateVectorToElementsLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + /// Populate the pattern set with the following patterns: /// /// [ContractionOpToMatmulOpLowering] diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h index 0138f477cadea..08f439222a9a0 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h @@ -383,6 +383,16 @@ void populateVectorNarrowTypeEmulationPatterns( const arith::NarrowTypeEmulationConverter &typeConverter, RewritePatternSet &patterns, bool disableAtomicRMW = false); +/// Populates patterns for both MemRef flattening and Vector narrow type +/// emulation. 
+/// +/// Patterns for narrow-type-emulation require "flattened" MemRef(s), so this +/// composite populate* method can be used for narrow-type-emulation for Ops +/// operating on MemRef(s) that are rank > 2. +void populateMemRefFlattenAndVectorNarrowTypeEmulationPatterns( + arith::NarrowTypeEmulationConverter &typeConverter, + RewritePatternSet &patterns); + /// Rewrite a vector `bitcast(trunci)` to use a more efficient sequence of /// vector operations comprising `shuffle` and `bitwise` ops. /// Warning: these patterns currently only work for little endian targets. diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h index ace26990601c8..97163c4532378 100644 --- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h +++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h @@ -255,6 +255,12 @@ using UnrollVectorOpFn = LogicalResult unrollVectorOp(Operation *op, PatternRewriter &rewriter, UnrollVectorOpFn unrollFn); +/// Generic utility for unrolling values of type vector +/// to N values of type vector using vector.extract. If the input +/// is rank-1 or has leading scalable dimension, failure is returned. 
+FailureOr> unrollVectorValue(TypedValue, + RewriterBase &); + } // namespace vector /// Constructs a permutation map of invariant memref indices to vector diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index cfe3e800484ce..1f1d367118365 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -194,26 +194,29 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Get the num of effective subgroups", "int64_t", "getNumSubgroups", (ins), [{ - std::optional> sgLayout = llvm::cast(tablegen_opaque_val).getSgLayoutAsInt(); + std::optional> sgLayout = llvm::cast(tablegen_opaque_val).getEffectiveSgLayoutAsInt(); if (sgLayout.has_value()) return computeProduct(*sgLayout); return 0; }], [{}]>, - InterfaceMethod<"Get the SgLayout field of the attribute as integer array", + InterfaceMethod<"Get the order of the layout attribute", + "DenseI32ArrayAttr", + "getOrder">, + InterfaceMethod<"Get the effective SgLayout of the layout attribute as integer array", "SmallVector", - "getSgLayoutAsInt">, - InterfaceMethod<"Get the SgData field of the attribute as integer array", + "getEffectiveSgLayoutAsInt">, + InterfaceMethod<"Get the effective SgData of the layout attribute as integer array", "SmallVector", - "getSgDataAsInt">, - InterfaceMethod<"Get the InstData field of the attribute as integer array", + "getEffectiveSgDataAsInt">, + InterfaceMethod<"Get the effective InstData of the layout attribute as integer array", "SmallVector", - "getInstDataAsInt">, - InterfaceMethod<"Get the LaneLayout field of the attribute as integer array", + "getEffectiveInstDataAsInt">, + InterfaceMethod<"Get the effective LaneLayout of the layout attribute as integer array", "SmallVector", - "getLaneLayoutAsInt">, - InterfaceMethod<"Get the LaneData field of the attribute as integer array", + "getEffectiveLaneLayoutAsInt">, + InterfaceMethod<"Get 
the effective LaneData of the layout attribute as integer array", "SmallVector", - "getLaneDataAsInt">, + "getEffectiveLaneDataAsInt">, InterfaceMethod<"Derive a new layout by dropping sgLayout and sgData", "xegpu::DistributeLayoutAttr", "dropSgLayoutAndData">, @@ -231,7 +234,11 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { multiple blocks according to round-robin distribution rules.}], "FailureOr>>", "getOffsets", - (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef":$shape)> + (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef":$shape)>, + InterfaceMethod ]; } @@ -391,31 +398,31 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { getLaneLayout(), getLaneData(), getOrder()); } - SmallVector getSgLayoutAsInt() const { + SmallVector getEffectiveSgLayoutAsInt() const { if (DenseI32ArrayAttr layout = getSgLayout()) return llvm::to_vector_of(layout.asArrayRef()); return {}; } - SmallVector getSgDataAsInt() const { + SmallVector getEffectiveSgDataAsInt() const { if (DenseI32ArrayAttr data = getSgData()) return llvm::to_vector_of(data.asArrayRef()); return {}; } - SmallVector getInstDataAsInt() const { + SmallVector getEffectiveInstDataAsInt() const { if (DenseI32ArrayAttr inst = getInstData()) return llvm::to_vector_of(inst.asArrayRef()); return {}; } - SmallVector getLaneLayoutAsInt() const { + SmallVector getEffectiveLaneLayoutAsInt() const { if (DenseI32ArrayAttr layout = getLaneLayout()) return llvm::to_vector_of(layout.asArrayRef()); return {}; } - SmallVector getLaneDataAsInt() const { + SmallVector getEffectiveLaneDataAsInt() const { if (DenseI32ArrayAttr data = getLaneData()) return llvm::to_vector_of(data.asArrayRef()); return {}; @@ -433,6 +440,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { FailureOr>> getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef shape); + /// Check if this is slice of some other 
layout. + bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; } + }]; let assemblyFormat = "`<` struct(params) `>`"; @@ -499,10 +509,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Returns the SgLayout of the attribute, computed by applying /// the slice dimensions to the underlying LayoutAttr. - SmallVector getSgLayoutAsInt() const { + SmallVector getEffectiveSgLayoutAsInt() const { SliceAttr attr = flatten(); auto parent = dyn_cast(attr.getParent()); - auto layout = parent.getSgLayoutAsInt(); + auto layout = parent.getEffectiveSgLayoutAsInt(); if (layout.size()) { ArrayRef dims = attr.getDims().asArrayRef(); return XeGPUDialect::slice(ArrayRef(layout), dims); @@ -512,10 +522,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Returns the SgData of the attribute, computed by applying /// the slice dimensions to the underlying LayoutAttr. - SmallVector getSgDataAsInt() const { + SmallVector getEffectiveSgDataAsInt() const { SliceAttr attr = flatten(); auto parent = dyn_cast(attr.getParent()); - auto data = parent.getSgDataAsInt(); + auto data = parent.getEffectiveSgDataAsInt(); if (data.size()) { ArrayRef dims = attr.getDims().asArrayRef(); return XeGPUDialect::slice(ArrayRef(data), dims); @@ -525,10 +535,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Returns the InstData of the attribute, computed by applying /// the slice dimensions to the underlying LayoutAttr. 
- SmallVector getInstDataAsInt() const { + SmallVector getEffectiveInstDataAsInt() const { SliceAttr attr = flatten(); auto parent = dyn_cast(attr.getParent()); - auto inst = parent.getInstDataAsInt(); + auto inst = parent.getEffectiveInstDataAsInt(); if (inst.size()) { ArrayRef dims = attr.getDims().asArrayRef(); return XeGPUDialect::slice(llvm::ArrayRef(inst), dims); @@ -538,10 +548,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Returns the LaneLayout of the attribute, computed by applying /// the slice dimensions to the underlying LayoutAttr. - SmallVector getLaneLayoutAsInt() const { + SmallVector getEffectiveLaneLayoutAsInt() const { SliceAttr attr = flatten(); auto parent = dyn_cast(attr.getParent()); - auto layout = parent.getLaneLayoutAsInt(); + auto layout = parent.getEffectiveLaneLayoutAsInt(); if (layout.size()) { ArrayRef dims = attr.getDims().asArrayRef(); return XeGPUDialect::slice(llvm::ArrayRef(layout), dims); @@ -551,10 +561,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Returns the LaneData of the attribute, computed by applying /// the slice dimensions to the underlying LayoutAttr. - SmallVector getLaneDataAsInt() const { + SmallVector getEffectiveLaneDataAsInt() const { SliceAttr attr = flatten(); auto parent = dyn_cast(attr.getParent()); - auto data = parent.getLaneDataAsInt(); + auto data = parent.getEffectiveLaneDataAsInt(); if (data.size()) { ArrayRef dims = attr.getDims().asArrayRef(); return XeGPUDialect::slice(llvm::ArrayRef(data), dims); @@ -594,6 +604,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { FailureOr>> getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef shape); + /// Check if this is a slice of some other layout. 
+ bool isSliceOf(const xegpu::DistributeLayoutAttr &other); + }]; let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`"; diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index ddf6b4ac85a90..59dca9f0d852a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -27,6 +27,10 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> { }]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"]; + let options = [Option< + "enableSGReductions", "enable-sg-reductions", "bool", + /*default=*/"true", + "Enable subgroup reductions using subgroup shuffles.">]; } def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { diff --git a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h index d66d757cb7a8e..e9471731afe13 100644 --- a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h +++ b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h @@ -164,19 +164,17 @@ class OwningMemRef { int64_t nElements = 1; for (int64_t s : shapeAlloc) nElements *= s; - auto [data, alignedData] = + auto [allocatedPtr, alignedData] = detail::allocAligned(nElements, allocFun, alignment); - descriptor = detail::makeStridedMemRefDescriptor(data, alignedData, - shape, shapeAlloc); + descriptor = detail::makeStridedMemRefDescriptor( + allocatedPtr, alignedData, shape, shapeAlloc); if (init) { for (StridedMemrefIterator it = descriptor.begin(), end = descriptor.end(); it != end; ++it) init(*it, it.getIndices()); } else { - memset(descriptor.data, 0, - nElements * sizeof(T) + - alignment.value_or(detail::nextPowerOf2(sizeof(T)))); + memset(alignedData, 0, nElements * sizeof(T)); } } /// Take ownership of an existing descriptor with a custom deleter. 
diff --git a/mlir/include/mlir/IR/DialectImplementation.h b/mlir/include/mlir/IR/DialectImplementation.h index f45b88dc6deca..0b4f91cd750b8 100644 --- a/mlir/include/mlir/IR/DialectImplementation.h +++ b/mlir/include/mlir/IR/DialectImplementation.h @@ -103,10 +103,11 @@ struct FieldParser< /// Parse any integer. template -struct FieldParser::value, IntT>> { +struct FieldParser::value || + std::is_same_v), + IntT>> { static FailureOr parse(AsmParser &parser) { - IntT value = 0; + IntT value{}; if (parser.parseInteger(value)) return failure(); return value; diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index 7b0b9cef9c5bd..576481a6e7215 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -312,6 +312,9 @@ struct OpOrInterfaceRewritePatternBase : public RewritePattern { template struct OpRewritePattern : public mlir::detail::OpOrInterfaceRewritePatternBase { + /// Type alias to allow derived classes to inherit constructors with + /// `using Base::Base;`. + using Base = OpRewritePattern; /// Patterns must specify the root operation name they match against, and can /// also specify the benefit of the pattern matching and a list of generated @@ -328,6 +331,9 @@ struct OpRewritePattern template struct OpInterfaceRewritePattern : public mlir::detail::OpOrInterfaceRewritePatternBase { + /// Type alias to allow derived classes to inherit constructors with + /// `using Base::Base;`. + using Base = OpInterfaceRewritePattern; OpInterfaceRewritePattern(MLIRContext *context, PatternBenefit benefit = 1) : mlir::detail::OpOrInterfaceRewritePatternBase( @@ -341,6 +347,10 @@ struct OpInterfaceRewritePattern template